In [1]:
import scipy
import scipy.io as sio
from scipy.io import savemat
import pandas as pd
import numpy as np
import math

In [2]:
# Read the raw KNOT table; it holds many rows per subject ID, each carrying the
# subject's demographic and *_5D columns alongside a source/target page pair.
KNOT_all_raw = pd.read_csv(
    '/Volumes/My Passport/Curiosity/Data/KNOT_data_raw.csv'
)
# Show the first five rows as a sanity check.
KNOT_all_raw.head(n=5)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8
3,101,/wiki/Cloud_infrastructure,/wiki/Information_technology,1,4,yes,0.8,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,4,0.2
4,101,/wiki/Information_technology,/wiki/Computer_language,1,5,no,0.6,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,5,0.4


In [3]:
# Read the per-subject Reinforcement and Regularity values (one row per ID is
# expected downstream, where each value is fetched with `.item()`).
reinf_reg = pd.read_csv(
    '/Volumes/My Passport/Curiosity/Data/KNOT_reinf_reg.csv'
)
# Show the first five rows as a sanity check.
reinf_reg.head(n=5)

Unnamed: 0,ID,Reinforcement,Regularity
0,101,50.086906,2.095558
1,104,36.52771,1.871805
2,105,39.451567,1.932047
3,106,46.694123,2.089505
4,107,36.949026,2.308878


In [4]:
# Load the processed KNOT measures from the .mat file and pull each stored
# variable out into its own notebook-level name.
mat_contents = sio.loadmat(
    '/Volumes/My Passport/Curiosity/v8/Data/KNOT/processed_KNOT_data.mat'
)

# Subject identifiers and per-subject network summaries.
ID, density, num_nodes, num_edges = (
    mat_contents[key] for key in ('ID', 'density', 'num_nodes', 'num_edges')
)

# Per-subject curves, indexed downstream as [subject, filtration index]:
#   all_d     - embedding dimensionality curves
#   all_DoF_C - conformational degrees of freedom curves
all_C, all_d = (mat_contents[key] for key in ('all_C', 'all_d'))

# Number of dimensionality increments for each subject.
num_change_points = mat_contents['num_change_points']

all_DoF_C = mat_contents['all_DoF_C']

# Betti curves for dimensions 0, 1 and 2.
all_Betti_0, all_Betti_1, all_Betti_2 = (
    mat_contents[key] for key in ('all_Betti_0', 'all_Betti_1', 'all_Betti_2')
)

In [5]:
# Build the long-format table `df`: one row per (subject, filtration index) that
# has data, holding the curve values at that index together with the
# subject-level network summaries, demographics, *_5D scores and the
# Regularity / Reinforcement values repeated on every row of that subject.

# Output column -> source column in KNOT_all_raw for the per-subject covariates.
raw_covariates = {
    'Age': 'AgeYears',
    'Sex': 'SexOrient',
    'Race': 'Race',
    'Gender': 'GenderFactor',
    'Education': 'EducDeg',
    'Income': 'Income',
    'JE': 'JE_5D',
    'DS': 'DS_5D',
    'ST': 'ST_5D',
    'SC': 'SC_5D',
    'TS': 'TS_5D',
}

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row re-copies the whole frame on every iteration.
records = []

# np.ravel (rather than np.squeeze) keeps a 1-D sequence even when the file
# holds a single subject, so enumerate() always receives a list.
for i, subj_ID in enumerate(np.ravel(ID).tolist()):
    print('Processing subject %d.' % subj_ID)
    subj_C = all_C[i, :]
    subj_d = all_d[i, :]
    subj_DoF_C = all_DoF_C[i, :]
    subj_Betti_0 = all_Betti_0[i, :]
    subj_Betti_1 = all_Betti_1[i, :]
    subj_Betti_2 = all_Betti_2[i, :]

    # Subject-level values are the same for every filtration index, so they are
    # looked up once per subject instead of once per row.
    subj_raw = KNOT_all_raw.loc[KNOT_all_raw['ID'] == subj_ID]
    subj_reinf_reg = reinf_reg.loc[reinf_reg['ID'] == subj_ID]
    subj_fields = {
        'density': density[i][0],
        'num_nodes': num_nodes[i][0],
        'num_edges': num_edges[i][0],
        'num_change_points': num_change_points[i][0],
        # .item() raises unless exactly one row matches this subject.
        'Regularity': subj_reinf_reg['Regularity'].item(),
        'Reinforcement': subj_reinf_reg['Reinforcement'].item(),
    }
    # The raw table has many rows per subject; the covariates are read from the
    # first matching row (.iloc[0] raises IndexError if the subject is absent).
    subj_fields.update(
        {out_col: subj_raw[raw_col].iloc[0] for out_col, raw_col in raw_covariates.items()}
    )

    for j in range(subj_C.shape[0]):
        curve_values = {
            'C': subj_C[j],
            'd': subj_d[j],
            'DoF_C': subj_DoF_C[j],
            'Betti_0': subj_Betti_0[j],
            'Betti_1': subj_Betti_1[j],
            'Betti_2': subj_Betti_2[j],
        }
        # if no data exists for a given filtration index, the network's size must be less than the max size.
        if all(math.isnan(value) for value in curve_values.values()):
            continue
        # Node is the 1-based filtration index.
        records.append({'ID': int(subj_ID), 'Node': int(j + 1), **subj_fields, **curve_values})

# `columns` fixes the column order (and yields an empty, correctly labelled
# frame if no rows were collected).
df = pd.DataFrame(records,
                  columns = ['ID', 'Node', 'density', 'num_nodes', 'num_edges',
                             'C', 'd', 'num_change_points', 'DoF_C', 'Betti_0', 'Betti_1', 'Betti_2',
                             'Age', 'Sex', 'Race', 'Gender', 'Education', 'Income',
                             'JE', 'DS', 'ST', 'SC', 'TS',
                             'Regularity', 'Reinforcement'
                            ])

Processing subject 101.
Processing subject 104.
Processing subject 105.
Processing subject 106.
Processing subject 107.
Processing subject 108.
Processing subject 109.
Processing subject 112.
Processing subject 114.
Processing subject 115.
Processing subject 117.
Processing subject 119.
Processing subject 120.
Processing subject 121.
Processing subject 122.
Processing subject 126.
Processing subject 127.
Processing subject 128.
Processing subject 130.
Processing subject 131.
Processing subject 132.
Processing subject 135.
Processing subject 138.
Processing subject 139.
Processing subject 140.
Processing subject 141.
Processing subject 146.
Processing subject 150.
Processing subject 153.
Processing subject 154.
Processing subject 155.
Processing subject 156.
Processing subject 157.
Processing subject 158.
Processing subject 159.
Processing subject 162.
Processing subject 164.
Processing subject 165.
Processing subject 167.
Processing subject 169.
Processing subject 171.
Processing subje

In [6]:
# Preview the first 50 rows of the long-format table.
df.head(n=50)

Unnamed: 0,ID,Node,density,num_nodes,num_edges,C,d,num_change_points,DoF_C,Betti_0,...,Gender,Education,Income,JE,DS,ST,SC,TS,Regularity,Reinforcement
0,101,1,0.01065,285,431,,,1,,0.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
1,101,2,0.01065,285,431,,,1,,1.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
2,101,3,0.01065,285,431,,,1,,2.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
3,101,4,0.01065,285,431,,,1,,3.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
4,101,5,0.01065,285,431,,,1,,4.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
5,101,6,0.01065,285,431,,1.0,1,0.0,5.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
6,101,7,0.01065,285,431,0.666667,1.0,1,0.0,5.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
7,101,8,0.01065,285,431,0.57782,1.0,1,0.0,5.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
8,101,9,0.01065,285,431,0.438752,2.0,1,2.0,5.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
9,101,10,0.01065,285,431,0.415203,2.0,1,3.0,5.0,...,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906


In [7]:
# Write the long-format table to disk without the row index.
df.to_csv(
    '/Volumes/My Passport/Curiosity/v8/Data/KNOT/processed_KNOT_data.csv',
    index=False,
)

In [8]:
# Build the subject-level table `df_2`: one row per subject with the network
# summaries, demographics, *_5D scores and Regularity / Reinforcement values,
# without any of the per-filtration-index curves.

# Output column -> source column in KNOT_all_raw for the per-subject covariates.
raw_covariates = {
    'Age': 'AgeYears',
    'Sex': 'SexOrient',
    'Race': 'Race',
    'Gender': 'GenderFactor',
    'Education': 'EducDeg',
    'Income': 'Income',
    'JE': 'JE_5D',
    'DS': 'DS_5D',
    'ST': 'ST_5D',
    'SC': 'SC_5D',
    'TS': 'TS_5D',
}

# Rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row re-copies the whole frame on every iteration.
subject_records = []

# np.ravel (rather than np.squeeze) keeps a 1-D sequence even when the file
# holds a single subject, so enumerate() always receives a list.
for i, subj_ID in enumerate(np.ravel(ID).tolist()):
    print('Processing subject %d.' % subj_ID)

    # Select this subject's rows once and reuse the selection for every column.
    subj_raw = KNOT_all_raw.loc[KNOT_all_raw['ID'] == subj_ID]
    subj_reinf_reg = reinf_reg.loc[reinf_reg['ID'] == subj_ID]

    subject_row = {
        'ID': int(subj_ID),
        'density': density[i][0],
        'num_nodes': num_nodes[i][0],
        'num_edges': num_edges[i][0],
        'num_change_points': num_change_points[i][0],
        # .item() raises unless exactly one row matches this subject.
        'Regularity': subj_reinf_reg['Regularity'].item(),
        'Reinforcement': subj_reinf_reg['Reinforcement'].item(),
    }
    # The raw table has many rows per subject; the covariates are read from the
    # first matching row (.iloc[0] raises IndexError if the subject is absent).
    subject_row.update(
        {out_col: subj_raw[raw_col].iloc[0] for out_col, raw_col in raw_covariates.items()}
    )
    subject_records.append(subject_row)

# `columns` fixes the column order (and yields an empty, correctly labelled
# frame if no rows were collected).
df_2 = pd.DataFrame(subject_records,
                    columns = ['ID', 'density', 'num_nodes', 'num_edges', 'num_change_points',
                               'Age', 'Sex', 'Race', 'Gender', 'Education', 'Income',
                               'JE', 'DS', 'ST', 'SC', 'TS',
                               'Regularity', 'Reinforcement'
                              ])

Processing subject 101.
Processing subject 104.
Processing subject 105.
Processing subject 106.
Processing subject 107.
Processing subject 108.
Processing subject 109.
Processing subject 112.
Processing subject 114.
Processing subject 115.
Processing subject 117.
Processing subject 119.
Processing subject 120.
Processing subject 121.
Processing subject 122.
Processing subject 126.
Processing subject 127.
Processing subject 128.
Processing subject 130.
Processing subject 131.
Processing subject 132.
Processing subject 135.
Processing subject 138.
Processing subject 139.
Processing subject 140.
Processing subject 141.
Processing subject 146.
Processing subject 150.
Processing subject 153.
Processing subject 154.
Processing subject 155.
Processing subject 156.
Processing subject 157.
Processing subject 158.
Processing subject 159.
Processing subject 162.
Processing subject 164.
Processing subject 165.
Processing subject 167.
Processing subject 169.
Processing subject 171.
Processing subje

In [9]:
# Preview the first 50 rows of the subject-level table.
df_2.head(n=50)

Unnamed: 0,ID,density,num_nodes,num_edges,num_change_points,Age,Sex,Race,Gender,Education,Income,JE,DS,ST,SC,TS,Regularity,Reinforcement
0,101,0.01065,285,431,1,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2.095558,50.086906
1,104,0.021935,125,170,1,30.84384,Heterosexual,White,0,BachDegree,under20k,3.2,2.5,2.4,4.0,0.0,1.871805,36.52771
2,105,0.024383,84,85,1,31.26849,Heterosexual,White,0,Master,150to199k,3.0,3.0,3.0,2.4,1.0,1.932047,39.451567
3,106,0.012459,317,624,2,25.99452,Heterosexual,AsiaAm,0,BachDegree,100to149k,5.6,3.5,1.4,5.2,1.25,2.089505,46.694123
4,107,0.012026,174,181,1,65.23562,Heterosexual,White,0,Master,50to74k,3.4,2.75,2.5,2.4,0.75,2.308878,36.949026
5,108,0.015725,111,96,1,51.58082,Heterosexual,White,0,Master,150to199k,4.8,5.0,3.0,4.8,4.5,2.073689,36.680929
6,109,0.010873,109,64,1,30.02466,Heterosexual,White,0,BachDegree,50to74k,2.8,3.75,0.4,1.6,1.75,2.09014,42.432988
7,112,0.034953,69,82,3,34.21918,Heterosexual,White,1,Master,50to74k,4.4,2.0,1.2,3.4,3.0,2.088085,33.653904
8,114,0.023256,86,85,1,20.45753,Heterosexual,AsiaAm,0,SomeCollegeNoDegree,200andup,4.4,4.5,4.0,4.6,4.0,1.870814,47.644795
9,115,0.015787,289,657,2,22.40274,Heterosexual,AsiaAm,0,BachDegree,PreferNotToAnswer,3.8,4.0,3.6,4.0,3.6,2.385505,42.2456


In [10]:
# Write the subject-level table to disk without the row index.
df_2.to_csv(
    '/Volumes/My Passport/Curiosity/v8/Data/KNOT/processed_KNOT_data_no_curves.csv',
    index=False,
)