# Datasets

This is used for testing all the datasets and visualizing all the different ones.

## Set-Up
We'll primarily be using pandas to work with the data. We start off by importing pandas and loading each dataset as a pandas DataFrame.

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.spatial.distance import hamming

In [None]:
# Real data: the same CSV is read twice so that one copy (realdata34) can be
# modified independently of the other.
realdata = pd.read_csv('kag_risk_factors_cervical_cancer.csv')
realdata34 = pd.read_csv('kag_risk_factors_cervical_cancer.csv')

# Synthetic datasets.
syndata = pd.read_csv('synData9.csv')
ctgan = pd.read_csv('Syn data/ctgan_5000.csv')
gauscop = pd.read_csv('Syn data/gauscop_5000.csv')
synthpop = pd.read_csv('Syn data/synthpop_5000.csv')
tvae = pd.read_csv('Syn data/tvae_5000.csv')

In [None]:
# Remove both "time since diagnosis" columns from this copy of the real data.
realdata34.drop(
    columns=['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'],
    inplace=True,
)

In [None]:
# Number of columns in every loaded frame, shown as the cell output.
tuple(
    len(frame.columns)
    for frame in (realdata, realdata34, ctgan, gauscop, synthpop, tvae, syndata)
)

In [None]:
# `med*` lists hold the continuous variables and `bin` holds the binary
# variables (as they appear in the real data).

# Continuous columns for frames that lack the two "time since diagnosis" columns.
med34 = [
    'Age',
    'Number of sexual partners',
    'First sexual intercourse',
    'Num of pregnancies',
    'Smokes (years)',
    'Smokes (packs/year)',
    'Hormonal Contraceptives (years)',
    'IUD (years)',
    'STDs (number)',
    'STDs: Number of diagnosis',
]

# Full continuous list: the shared columns plus the two "time since" columns.
med = med34 + [
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
]

# NOTE: this name shadows the builtin `bin`, but other cells refer to it.
bin = [
    'Smokes',
    'Hormonal Contraceptives',
    'IUD',
    'STDs',
    'STDs:condylomatosis',
    'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis',
    'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis',
    'STDs:pelvic inflammatory disease',
    'STDs:genital herpes',
    'STDs:molluscum contagiosum',
    'STDs:AIDS',
    'STDs:HIV',
    'STDs:Hepatitis B',
    'STDs:HPV',
    'Dx:Cancer',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]

## Data Cleaning and Helper Functions

In [None]:
def impute(data, graph=False):
    """Clean ``data`` in place and fill its missing values.

    Steps, in order:

    1. Every ``'?'`` cell becomes ``NaN``.
    2. Every non-numeric column is cast to ``float``.
    3. Missing values in the continuous columns are filled with the column
       median.
    4. Missing values in the binary columns are filled with the column mode
       (the smallest one when several values tie).

    Args:
        data: DataFrame to clean. It is modified in place. A frame with
            exactly 34 columns is treated as lacking the two
            "time since diagnosis" columns; any other width is expected to
            contain them.
        graph: When true, draw a histogram of every column after imputation.

    Returns:
        The same DataFrame object that was passed in.
    """
    continuous = [
        'Age', 'Number of sexual partners', 'First sexual intercourse',
        'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
        'Hormonal Contraceptives (years)', 'IUD (years)',
        'STDs (number)', 'STDs: Number of diagnosis',
    ]
    if len(data.columns) != 34:
        continuous += [
            'STDs: Time since first diagnosis',
            'STDs: Time since last diagnosis',
        ]

    binary = [
        'Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis',
        'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
        'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
        'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
        'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
        'STDs:Hepatitis B', 'STDs:HPV', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx',
        'Hinselmann', 'Schiller', 'Citology', 'Biopsy',
    ]

    data[data == '?'] = np.nan

    # Columns that contained '?' were read as text; make them numeric.
    for col in data.columns:
        if not pd.api.types.is_numeric_dtype(data[col]):
            data[col] = data[col].astype('float')

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to update the frame.
    for col in continuous:
        data[col] = data[col].fillna(float(data[col].median()))

    for col in binary:
        modes = data[col].mode()
        # mode() returns a Series: empty for an all-missing column, and with
        # several entries on ties. Take the first (smallest) entry.
        if not modes.empty:
            data[col] = data[col].fillna(float(modes.iloc[0]))

    if graph:
        data.hist(figsize=(25, 15))

    return data

def norm(data1):
    """
    Return a copy of ``data1`` whose continuous columns are min-max scaled.

    Only columns named in the module-level ``med`` list are touched. A column
    whose maximum equals its minimum is set to 0 rather than divided by zero.
    The input frame itself is left unchanged.
    """
    scaled = data1.copy()
    for name in scaled.columns:
        if name not in med:
            continue
        lowest = scaled[name].min()
        spread = scaled[name].max() - lowest
        if spread == 0:
            scaled[name] = 0
        else:
            scaled[name] = (scaled[name] - lowest) / spread
    return scaled

In [None]:
# Impute, then min-max normalise, every dataset. The "A" suffix marks the
# processed version used by the comparison cells below.
realdataA = norm(impute(realdata))
syndataA = norm(impute(syndata))
ctganA = norm(impute(ctgan))
gauscopA = norm(impute(gauscop))
synthpopA = norm(impute(synthpop))
tvaeA = norm(impute(tvae))
realdata34A = norm(impute(realdata34))

# Sanity check on the processed frames: any True means missing values remain.
# All seven entries now inspect the processed frames rather than a mix of raw
# and processed ones.
syndataA.isnull().values.any(), realdata34A.isnull().values.any(), realdataA.isnull().values.any(), ctganA.isnull().values.any(), gauscopA.isnull().values.any(), synthpopA.isnull().values.any(), tvaeA.isnull().values.any()

## Weights

In [None]:
def weights(data):
    """Return a vector of unit weights, one per continuous column of ``data``.

    Args:
        data: DataFrame with either 36 columns (continuous columns listed in
            ``med``) or 34 columns (continuous columns listed in ``med34``).

    Returns:
        A 1-D numpy array of ones whose length is the number of continuous
        columns.

    Raises:
        ValueError: If ``data`` has neither 34 nor 36 columns. Previously this
            case silently returned ``None``.
        KeyError: If an expected continuous column is missing from ``data``.
    """
    n_cols = len(data.columns)
    if n_cols == 36:
        cont_cols = med
    elif n_cols == 34:
        cont_cols = med34
    else:
        raise ValueError(
            f"weights() expects a frame with 34 or 36 columns, got {n_cols}"
        )
    # Select by column only, so the result does not depend on a row labelled 0
    # existing in the index.
    return np.ones(data.loc[:, cont_cols].shape[1])

## Comparison Function
The comparison function is the main function that compares the data. It compares every row of the synthetic data to one specific row of the real data and returns a list of dissimilarity scores, one per synthetic row (a lower score means a closer match).

We also compute the values without using the functions.

In [None]:
# The binary columns are not weighted: each mismatch contributes exactly 1.
def _comp_scores(data1, data2, row_comp, weights, cont_cols):
    """Score every row of ``data1`` against row ``row_comp`` of ``data2``.

    The score of a row is the weighted sum of absolute differences over
    ``cont_cols`` plus the number of columns in the module-level ``bin`` list
    whose values differ (Hamming fraction times the number of binary columns).
    """
    target = data2.loc[row_comp]

    # Weighted L1 distance over the continuous columns, all rows at once.
    cont_diff = (data1[cont_cols] - target[cont_cols]).abs().to_numpy(dtype=float)
    cont_part = cont_diff @ np.ravel(np.asarray(weights, dtype=float))

    # Count of mismatching binary columns per row.
    bin_part = (data1[bin].to_numpy() != target[bin].to_numpy()).sum(axis=1)

    return (cont_part + bin_part).tolist()


def compMain(data1, data2, row_comp, weights):
    """Score each row of ``data1`` against one row of ``data2`` (columns in ``med``).

    Args:
        data1: Frame whose rows are scored.
        data2: Frame holding the reference row.
        row_comp: Index label of the reference row in ``data2``.
        weights: One weight per column in ``med``.

    Returns:
        A list with one score per row of ``data1``, in row order.
    """
    return _comp_scores(data1, data2, row_comp, weights, med)


def compMain34(data1, data2, row_comp, weights):
    """Same as ``compMain`` but over the continuous columns in ``med34``.

    Args:
        data1: Frame whose rows are scored.
        data2: Frame holding the reference row.
        row_comp: Index label of the reference row in ``data2``.
        weights: One weight per column in ``med34``.

    Returns:
        A list with one score per row of ``data1``, in row order.
    """
    return _comp_scores(data1, data2, row_comp, weights, med34)

In [None]:
#Weights
syndataAW = weights(syndataA)
ctganAW = weights(ctganA)
gauscopAW = weights(gauscopA)
synthpopAW = weights(synthpopA)
tvaeAW = weights(tvaeA)

#realdata34A, realdataA

In [None]:
# One list of scores per real row: every synthetic row scored against it.
syndata_lst = [
    compMain(syndataA, realdataA, i, syndataAW) for i in range(len(realdataA))
]

arr = np.array(syndata_lst)
# Rows are real records and columns are synthetic records. The labels come
# from the actual frame sizes instead of the hard-coded 858 and 5000.
index_values = list(range(len(realdataA)))
column_values = list(range(len(syndataA)))
df = pd.DataFrame(data=arr, index=index_values, columns=column_values)
df.to_csv('syndataR.csv', index=True)

In [None]:
# One list of scores per real row: every CTGAN row scored against it.
ctganA_lst = [
    compMain34(ctganA, realdata34A, i, ctganAW) for i in range(len(realdata34A))
]

arr = np.array(ctganA_lst)
# Rows are real records and columns are synthetic records. The labels come
# from the actual frame sizes instead of the hard-coded 858 and 5000.
index_values = list(range(len(realdata34A)))
column_values = list(range(len(ctganA)))
df = pd.DataFrame(data=arr, index=index_values, columns=column_values)
df.to_csv('ctganR.csv', index=True)

In [None]:
# One list of scores per real row: every Gaussian-copula row scored against it.
gauscopA_lst = [
    compMain34(gauscopA, realdata34A, i, gauscopAW) for i in range(len(realdata34A))
]

arr = np.array(gauscopA_lst)
# Rows are real records and columns are synthetic records. The labels come
# from the actual frame sizes instead of the hard-coded 858 and 5000.
index_values = list(range(len(realdata34A)))
column_values = list(range(len(gauscopA)))
df = pd.DataFrame(data=arr, index=index_values, columns=column_values)
df.to_csv('gauscopR.csv', index=True)

In [None]:
# One list of scores per real row: every synthpop row scored against it.
# This cell previously scored gauscopA and saved gauscopA_lst, so
# synthpopR.csv did not contain synthpop results.
synthpopA_lst = [
    compMain34(synthpopA, realdata34A, i, synthpopAW) for i in range(len(realdata34A))
]

arr = np.array(synthpopA_lst)
# Rows are real records and columns are synthetic records. The labels come
# from the actual frame sizes instead of the hard-coded 858 and 5000.
index_values = list(range(len(realdata34A)))
column_values = list(range(len(synthpopA)))
df = pd.DataFrame(data=arr, index=index_values, columns=column_values)
df.to_csv('synthpopR.csv', index=True)

In [None]:
# One list of scores per real row: every TVAE row scored against it.
tvaeA_lst = [
    compMain34(tvaeA, realdata34A, i, tvaeAW) for i in range(len(realdata34A))
]

arr = np.array(tvaeA_lst)
# Rows are real records and columns are synthetic records. The labels come
# from the actual frame sizes instead of the hard-coded 858 and 5000.
index_values = list(range(len(realdata34A)))
column_values = list(range(len(tvaeA)))
df = pd.DataFrame(data=arr, index=index_values, columns=column_values)
df.to_csv('tvaeR.csv', index=True)

## Data Splitting

In [None]:
#Data Splitting
# NOTE(review): syndataC and realdataC are not defined in any cell shown in
# this notebook - confirm where they come from before running this cell.
# The synthetic frame is cut into 100 pieces and the real frame into 17;
# only the first piece of each is used below.
syndataCS = np.array_split(syndataC, 100)
realdataCS = np.array_split(realdataC, 17)
synsplit1 = syndataCS[0]
# Remove the row labelled 50 from the first real piece.
realsplit1 = realdataCS[0].drop(50)
len(realsplit1), len(synsplit1)


# When working with a later piece, its index labels will not start at 0.
# Something like the following relabels the rows 0..n-1:
#data1 = data1.set_axis(list(range(len(data1))), axis='index')

## Heatmap

In [None]:
# Scores of every synthetic row in the split against each real row in turn.
# NOTE(review): test_weights is not defined in any cell shown here - confirm.
xasda = [
    compMain(synsplit1, realsplit1, i, test_weights)
    for i in range(len(realsplit1))
]

In [None]:
# Tick labels "1".."50" for the synthetic and real axes of the heatmap.
synD = [str(k) for k in range(1, 51)]
realD = [str(k) for k in range(1, 51)]

# heatscores[r] holds the scores of every synthetic row against real row r.
# NOTE(review): test_weights is not defined in any cell shown here - confirm.
heatscores = [
    compMain(synsplit1, realsplit1, i, test_weights)
    for i in range(len(realsplit1))
]

In [None]:
fig, ax = plt.subplots()
# The colormap is passed by name; mpl.cm.get_cmap is deprecated and no longer
# available in recent Matplotlib releases.
im = ax.imshow(heatscores, cmap='cividis_r')

# heatscores has one row per real record and one column per synthetic record,
# so the x axis (columns) takes the synthetic labels and the y axis (rows)
# takes the real labels.
ax.set_xticks(np.arange(len(synD)), labels=synD)
ax.set_yticks(np.arange(len(realD)), labels=realD)

# Tilt the x tick labels so that 50 of them stay legible.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

ax.set_title("Syn vs Real")

plt.colorbar(im, ax=ax)

plt.show()

## Gale Shapley

heatscores #The comparison arrays (50 x 50)

synsplit1 #The synthetic data

realsplit1 #The real data

In [None]:
# Work on a row-by-row copy so that later in-place edits to heatscores2 do not
# overwrite the scores stored in heatscores.
heatscores2 = [list(row) for row in heatscores]

In [None]:
def rank(ar):
    """
    Return the rank of every element of ``ar`` as a new list.

    The smallest element gets rank 0, the next smallest rank 1, and so on.
    Equal elements receive consecutive ranks in their order of appearance.
    ``ar`` itself is not modified.

    The previous version wrote ranks back into ``ar`` while still searching it
    with ``.index()``, so an already-written rank such as ``1`` could be found
    in place of a score equal to ``1.0``, producing wrong ranks.
    """
    # Positions of ar ordered by value; sorted() is stable, so ties keep
    # their original relative order.
    order = sorted(range(len(ar)), key=lambda idx: ar[idx])
    ranks = [0] * len(ar)
    for position, idx in enumerate(order):
        ranks[idx] = position
    return ranks

def synrank(ar):
    """
    Transpose a list of rows into a list of columns.

    Element ``[j][i]`` of the result is ``ar[i][j]``; the number of columns is
    taken from the first row.
    """
    width = len(ar[0])
    return [[row[j] for row in ar] for j in range(width)]

In [None]:
dictR = {}
dictS = {}

# Transpose the score matrix once, before anything is ranked, so the
# per-synthetic-row view is built from raw scores only. Previously it was
# rebuilt on every iteration from a matrix whose earlier rows had already
# been replaced by ranks.
by_syn = synrank(heatscores2)

# Sizes come from the matrix itself instead of a hard-coded 50.
keys = range(len(heatscores2))
for i in keys:
    # rank() receives a copy, so the score matrix is never altered by ranking.
    dictR[i] = rank(list(heatscores2[i]))

for j in range(len(by_syn)):
    dictS[j] = rank(list(by_syn[j]))

In [None]:
# Show dictS (filled in by the cell above) as this cell's output.
dictS

In [None]:
# Run Gale-Shapley with dictS as the first argument and dictR as the second,
# then show the result as the cell output.
# NOTE(review): `matching` is a third-party package; confirm that the installed
# version still exposes `matching.algorithms.galeshapley`.
# NOTE(review): dictS/dictR hold rank positions per candidate (entry j is the
# rank of candidate j). If galeshapley expects preference lists ordered from
# most to least preferred, these are the inverse permutation - verify against
# the library documentation.
from matching.algorithms import galeshapley
matching = galeshapley(dictS, dictR)
matching