# Modified Gale Shapley Algorithm for Generative Data Comparison

## Set-Up
We'll primarily be using pandas to work with the data. We start off by importing pandas and loading the data as pandas DataFrames.

In [None]:
import pandas as pd
import numpy as np
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import hamming

In [None]:
# Load both datasets from CSV files given by relative path:
# `syndata` is the synthetic (generated) data, `realdata` is the real data it is compared against.
syndata = pd.read_csv('synData9.csv')
realdata = pd.read_csv('kag_risk_factors_cervical_cancer.csv')

In [None]:
# Quick reference for the different ways of indexing into the frame.
syndata.iloc[0] #Row 0, selected by integer position
syndata.loc[0, : ] #Row 0, selected by index label
x = syndata.loc[0,"Age"] #A single cell: the "Age" value of row 0 (not a whole column)
syndata.columns[0] #Name of the first column

In [None]:
#`med` lists the continuous columns and `bin` lists the binary columns (in the real data).
#norm() reads `med` as a global to decide which columns to rescale.
#NOTE: `bin` shadows Python's built-in bin(); the name is kept because other cells refer to it.

med = ['Age', 'Number of sexual partners', 'First sexual intercourse',
        'Num of pregnancies', 'Smokes (years)','Smokes (packs/year)','Hormonal Contraceptives (years)','IUD (years)',
        'STDs (number)', 'STDs: Number of diagnosis',
        'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']

bin = ['Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis',
        'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
        'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
        'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
        'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
        'STDs:Hepatitis B', 'STDs:HPV','Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
        'Citology', 'Biopsy']

In [None]:
#Histogram grids of the real and the synthetic data as loaded, before any cleaning.
realdata.hist(figsize=(25,15)), syndata.hist(figsize=(25,15))

## Data Cleaning and Helper Functions

In [None]:
def impute(data, graph=False):
    """
    Replaces the missing values (marked with '?') by per-column imputed values

    Continuous columns are filled with the column median and binary columns
    with the column mode (the most frequent value).

    data: DataFrame holding every continuous and binary column listed below.
          It is modified in place and also returned.
    graph: If True, plots a histogram grid of the data after imputation

    Returns the same DataFrame object that was passed in.
    Raises KeyError if one of the expected columns is missing.
    """
    #Alias, not a copy: the caller's frame is updated in place
    temp_data = data
    temp_data[temp_data == '?'] = np.nan

    #Columns that contained '?' hold text; convert them to numbers.
    #Both the object dtype and the dedicated string dtype are handled.
    for col in temp_data.columns:
        col_dtype = temp_data[col].dtype
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            temp_data[col] = temp_data[col].astype('float')

    continuous_cols = ['Age', 'Number of sexual partners', 'First sexual intercourse',
        'Num of pregnancies', 'Smokes (years)','Smokes (packs/year)','Hormonal Contraceptives (years)','IUD (years)',
        'STDs (number)', 'STDs: Number of diagnosis',
        'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']

    for col in continuous_cols:
        #Assign the result back instead of a chained inplace fillna, which
        #does not reach the parent frame when pandas copy-on-write is active
        temp_data[col] = temp_data[col].fillna(temp_data[col].median())

    binary_cols = ['Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis',
        'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
        'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
        'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
        'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
        'STDs:Hepatitis B', 'STDs:HPV','Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
        'Citology', 'Biopsy']

    for col in binary_cols:
        #mode() returns a Series (ties are possible). Passing that Series to
        #fillna would align on the index and fill only the rows whose label
        #matches, so the first mode is taken as a scalar instead.
        modes = temp_data[col].mode()
        if not modes.empty:
            temp_data[col] = temp_data[col].fillna(modes.iloc[0])

    if graph:
        temp_data.hist(figsize=(25,15))

    return temp_data

In [None]:
#Run the imputation on the real data and plot the result.
#impute() works on the frame it is given, so `realdata` itself is changed here even though the return value is not assigned.
impute(realdata, True)

In [None]:
def mean_for_column(data1, column):
    """
    Finds the mean for a specific column

    Cells holding the missing-value markers "?" or -1 are left out of both
    the sum and the count, so they do not pull the mean towards zero.

    data1: DataFrame to read from
    column: Name of the column to average

    Returns the mean as a float, or NaN when the column has no usable values.
    """
    tot = 0.0
    counted = 0
    #Iterate over the values directly so the result does not depend on the
    #index labels being 0..len(data1)-1
    for value in data1[column]:
        if value == "?" or value == -1:
            continue
        tot = tot + float(value)
        counted += 1
    if counted == 0:
        return float("nan")
    return tot / counted
    
def mean_for_cols(data1):
    """
    Returns a list with the mean of every column, in column order
    """
    return [mean_for_column(data1, name) for name in data1.columns]

def norm(data1):
    """
    Returns a copy of the data in which every continuous column
    (those named in the global `med` list) is min-max normalized.
    All other columns are copied over unchanged.
    """
    scaled = data1.copy()
    for name in scaled.columns:
        if name not in med:
            continue
        column = scaled[name]
        lowest = column.min()
        spread = column.max() - lowest
        scaled[name] = (column - lowest) / spread
    return scaled

##Old version of the clean function.
#Replaced by impute function; still called by the "OLD WAY" cell.
def old_clean(data1):
    """
    Replaces the missing data with a -1 value and converts the strings into floats

    data1: DataFrame to clean. Cells are overwritten through .loc, so the
           frame that is passed in is modified as well.

    Returns a float-typed DataFrame whose index is relabelled 0..len(data1)-1.
    """
    #Rows are addressed by the label i, so this assumes the index labels
    #already are 0..len(data1)-1 -- TODO confirm for other inputs
    for i in range(len(data1)):
        for f in data1.columns:
            #print(data1.loc[i, f])
            if data1.loc[i, f] == "?":
                #"?" marks a missing value; -1 is the placeholder written in its place
                data1.loc[i, f] = float(-1)
                continue 
            if isinstance(data1.loc[i, f], str):
                data1.loc[i, f] = float(data1.loc[i, f])
    #Relabel the index as 0..n-1; from here on data1 is the frame returned by set_axis
    data1 =data1.set_axis([f for f in range(len(data1))], axis='index')
    #print(data1.dtypes)
    return data1.astype('float')

In [None]:
#Impute the missing values of both datasets (and plot the result).
realdata = impute(realdata, True)
syndata = impute(syndata, True)

#Per-column means of the imputed data.
syndata_means = mean_for_cols(syndata)
realdata_means = mean_for_cols(realdata)

#Min-max normalize the continuous columns.
#For syndata
syndataC = norm(syndata)
#For realdata
realdataC = norm(realdata)

#Replace any NaN that is still left with -1. norm() divides by (max - min),
#so a column whose values are all equal would presumably end up as NaN.
syndataC = syndataC.fillna(-1)
realdataC = realdataC.fillna(-1)
#They both should be False, this is important
syndataC.isnull().values.any(), realdataC.isnull().values.any()

In [None]:
#Histogram grids of the imputed and normalized real and synthetic data.
realdataC.hist(figsize=(25,15)), syndataC.hist(figsize=(25,15))

In [None]:
## OLD WAY
#Runs the superseded old_clean() pipeline so its output can be compared with the impute() based one.

#Back up of the data.
#copy() is required: a plain assignment only creates a second name for the
#same frame, and old_clean() writes into the frame it is given.
syndata_safe = syndata.copy()
realdata_safe = realdata.copy()

realdata_clean = old_clean(realdata)
syndata_clean = old_clean(syndata)

#Restore the data from the back up
realdata = realdata_safe
syndata = syndata_safe


#For syndata
syndata_clean_norm = norm(syndata_clean)
#For realdata
realdata_clean_norm = norm(realdata_clean)

#Replace any NaN left after normalization with -1
syndata_clean_norm = syndata_clean_norm.fillna(-1)
realdata_clean_norm = realdata_clean_norm.fillna(-1)
#They both should be False, this is important
syndata_clean_norm.isnull().values.any(), realdata_clean_norm.isnull().values.any()

## Weights

In [None]:
#We start off by getting the column names
#print(realdata_clean_norm.columns)
#print(syndata_clean_norm.columns)

In [None]:
#Both datasets share the same columns, so a single list of column names is used.
#The weight lists below are positional: entry i belongs to cols[i].
cols = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives',
    'Hormonal Contraceptives (years)',
    'IUD',
    'IUD (years)',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'STDs: Number of diagnosis',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:Cancer',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

#Init: every column starts with a weight of 1
real_weights = [1] * len(cols)
syn_weights = [1] * len(cols)
test_weights = [1] * len(cols)

#Columns per importance tier: high -> 3, medium -> 2, low -> 1
high_weights = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes',
]
med_weights = [
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives',
    'Hormonal Contraceptives (years)',
    'IUD',
    'IUD (years)',
    'STDs',
    'STDs (number)',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
]
low_weights = [
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'STDs: Number of diagnosis',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:Cancer',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

#The three checks are independent, so a column listed in more than one tier
#ends up with the weight of the last tier that matches.
for i in range(len(cols)):
    if cols[i] in high_weights:
        real_weights[i] = syn_weights[i] = 3
    if cols[i] in med_weights:
        real_weights[i] = syn_weights[i] = 2
    if cols[i] in low_weights:
        real_weights[i] = syn_weights[i] = 1

## Comparison Function
The comparison function is the main function that compares the data. It compares every row of the synthetic data to one specific row of the real data and returns a list with one score per synthetic row (or only the smallest score when `mini` is set). A lower score means a closer match.

We also compute the values without using the functions.

In [None]:
def compare(data1, data2, row_comp, weights, mini=False):
    """
    Scores one fixed row of data1 against every row of data2
    data1: Dataset that holds the fixed row
    data2: Dataset whose rows are each compared with the fixed row
    row_comp: The row of data1 we want compared
    weights: Per-column weights, handed on to similiar
    mini: If True only the smallest score is returned, otherwise the list of all scores
    """
    #One score per row of data2, in row order
    scores = [similiar(data1, data2, idx, row_comp, weights) for idx in range(len(data2))]
    return min(scores) if mini else scores

def similiar(data1, data2, ind, row_comp, weights):
    """
    Computes how far apart two rows are, as a weighted sum of the absolute
    per-column differences (a lower score means the rows are more similar)
    data1: First dataset
    data2: Second dataset
    ind: Index label of the row taken from data2
    row_comp: Index label of the row taken from data1
    weights: One weight per column of data1, in column order
    """
    #Pull each row out once instead of once per column
    row1 = data1.loc[row_comp, :]
    row2 = data2.loc[ind, :]
    score = 0
    for i in range(len(data1.columns)):
        #.iloc makes the positional lookup explicit; plain row[i] on a row with
        #string column labels relies on a deprecated pandas fallback
        score = score + (abs(row1.iloc[i] - row2.iloc[i]) * weights[i])
    return score


def compNew(data1, data2, row_comp, weights):
    """
    Scores one fixed row of data1 against every row of data2 without calling
    the helper functions. Each score is the weighted sum of the absolute
    per-column differences.
    data1: Dataset that holds the fixed row
    data2: Dataset whose rows are each compared with the fixed row
    row_comp: Index label of the fixed row in data1
    weights: One weight per column of data1, in column order

    Returns a list with one score per row of data2.
    """
    lst = []
    n_cols = len(data1.columns)
    #The fixed row does not change, so it is extracted once
    fixed_row = data1.loc[row_comp, :]

    for i in range(len(data2)):
        other_row = data2.loc[i, :]
        score = 0
        for y in range(n_cols):
            #.iloc makes the positional lookup explicit; plain row[y] on a row
            #with string column labels relies on a deprecated pandas fallback
            score = score + (abs(fixed_row.iloc[y] - other_row.iloc[y]) * weights[y])
        lst.append(score)
    return lst

In [None]:
#scipy's hamming() returns the fraction of positions that differ (1 of 4 here),
#so multiplying by the length turns it into the number of differing positions.
hamming([1,1,1,0], [1,1,1,1]) * len([1,1,1,0])

#You need to multiply hamming by the length of an array to get a count instead of a proportion

In [None]:
#bin
#med

In [None]:
#syndataC[bin] - realdataC[0, bin]

##Hamming distance for binary values
#Number of binary columns whose value differs between synthetic row 0 and real row 1.
hamming(syndataC.loc[0,bin].values.tolist(), realdataC.loc[1, bin].values.tolist()) * len(realdataC.loc[1, bin].values.tolist())

In [None]:
#Uniform weights (all ones), one per continuous column.
test_weights2 = np.ones(len(syndataC.loc[0,med]))

In [None]:
#Weighted sum of the absolute differences over the continuous columns, between synthetic row 0 and real row 0.
np.matmul(abs(syndataC.loc[0,med] - realdataC.loc[0,med]).values.tolist(),  np.transpose(test_weights2))

In [None]:
#Combined distance between synthetic row 0 and real row 0:
#weighted absolute difference over the continuous columns plus the number of
#differing binary columns. Both parts must look at the same real row; the
#binary part previously read real row 1 while the continuous part read row 0.
(
    np.matmul(abs(syndataC.loc[0, med] - realdataC.loc[0, med]).values.tolist(), np.transpose(test_weights2))
    + hamming(syndataC.loc[0, bin].values.tolist(), realdataC.loc[0, bin].values.tolist()) * len(bin)
)