In [40]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import csv


# Data normalization

### connectome_mean80_fibercount.csv

- Drop the regions missing from the seeds dataframe 
- Reorder columns alphabetically
- Save modified dataframe into a new CSV file 






In [41]:
# Reorder columns alphabetically for the connectome
connectome = pd.read_csv("./data/connectome_mean80_fibercount.csv")
connectome = connectome.reindex(sorted(connectome.columns), axis=1)


In [42]:
# Get the columns of the connectome
columns_array = connectome.columns.tolist()
columns_array

['Left-Accumbens-area',
 'Left-Amygdala',
 'Left-Caudate',
 'Left-Cerebellum-Cortex',
 'Left-Hippocampus',
 'Left-Pallidum',
 'Left-Putamen',
 'Left-Thalamus-Proper',
 'Left-VentralDC',
 'Right-Accumbens-area',
 'Right-Amygdala',
 'Right-Caudate',
 'Right-Cerebellum-Cortex',
 'Right-Hippocampus',
 'Right-Pallidum',
 'Right-Putamen',
 'Right-Thalamus-Proper',
 'Right-VentralDC',
 'ctx-lh-bankssts',
 'ctx-lh-caudalanteriorcingulate',
 'ctx-lh-caudalmiddlefrontal',
 'ctx-lh-cuneus',
 'ctx-lh-entorhinal',
 'ctx-lh-frontalpole',
 'ctx-lh-fusiform',
 'ctx-lh-inferiorparietal',
 'ctx-lh-inferiortemporal',
 'ctx-lh-insula',
 'ctx-lh-isthmuscingulate',
 'ctx-lh-lateraloccipital',
 'ctx-lh-lateralorbitofrontal',
 'ctx-lh-lingual',
 'ctx-lh-medialorbitofrontal',
 'ctx-lh-middletemporal',
 'ctx-lh-paracentral',
 'ctx-lh-parahippocampal',
 'ctx-lh-parsopercularis',
 'ctx-lh-parsorbitalis',
 'ctx-lh-parstriangularis',
 'ctx-lh-pericalcarine',
 'ctx-lh-postcentral',
 'ctx-lh-posteriorcingulate',
 'ct

### Tau_with_Deomographics.csv


Here, the average of Tau between the left and reight cerebellum cortex are average. 
These values are subtracted in both the left and right Cerebellum columns. 
Reorder columns alphabetically





In [43]:
twd_df = pd.read_csv("./data/Tau_with_Demographics.csv")
twd_df['Avg-Cerebellum-Cortex'] =  twd_df.loc[:,['Left-Cerebellum-Cortex', 'Right-Cerebellum-Cortex']].mean(axis=1)

# Subtracts average from Left cerebellum, then makes negative values 0.

twd_df["Left-Cerebellum-Cortex"] = twd_df["Left-Cerebellum-Cortex"].subtract(twd_df['Avg-Cerebellum-Cortex'], axis=0).clip(lower=0)

# Subtracts average from Right cerebellum, then makes negative values 0.

twd_df["Right-Cerebellum-Cortex"] = twd_df["Right-Cerebellum-Cortex"].subtract(twd_df['Avg-Cerebellum-Cortex'], axis=0).clip(lower=0)





In [44]:
# Order the dempgraphics regional tau values in alaphabetical order
twd_df = twd_df.reindex(sorted(twd_df.columns), axis=1)


#### Check for duplicates 




In [45]:
twd_df_columns = twd_df.columns.values
twd_df_columns

array(['ABETA', 'ADAS11_bl', 'ADAS11_bl.1', 'ADAS13_bl', 'ADAS13_bl.1',
       'AGE', 'APOE4', 'AV45', 'Avg-Cerebellum-Cortex', 'CDRSB',
       'Diagnosis', 'FDG', 'Left-Accumbens-area', 'Left-Amygdala',
       'Left-Caudate', 'Left-Cerebellum-Cortex', 'Left-Hippocampus',
       'Left-Pallidum', 'Left-Putamen', 'Left-Thalamus-Proper',
       'Left-VentralDC', 'MMSE_bl', 'MMSE_bl.1', 'PIB', 'PTAU',
       'PTEDUCAT', 'PTETHCAT', 'PTGENDER', 'PTMARRY', 'PTRACCAT', 'RID',
       'RID.1', 'Right-Accumbens-area', 'Right-Amygdala', 'Right-Caudate',
       'Right-Cerebellum-Cortex', 'Right-Hippocampus', 'Right-Pallidum',
       'Right-Putamen', 'Right-Thalamus-Proper', 'Right-VentralDC', 'TAU',
       'Unnamed: 0', 'Unnamed: 127', 'W_ADAS11', 'W_average_frontal',
       'W_average_hippo', 'W_average_occipital', 'W_average_parietal',
       'W_average_tau', 'W_average_temporal', 'best_DX', 'best_DX.1',
       'ctx-lh-bankssts', 'ctx-lh-caudalanteriorcingulate',
       'ctx-lh-caudalmiddlefront

#### Min-Max Scaling the Ml_stage 

In [46]:
twd_df["ml_stage"] =  (twd_df['ml_stage'] - twd_df['ml_stage'].min()) / (twd_df['ml_stage'].max() - twd_df['ml_stage'].min())
twd_df["ml_stage"]

0      0.200000
1      0.266667
2      0.266667
3      0.266667
4      0.266667
         ...   
814    0.400000
815    0.666667
816    0.000000
817    0.666667
818    0.066667
Name: ml_stage, Length: 819, dtype: float64

In [47]:
# Drop the unnamed columns 
columns_to_drop = filter(lambda x: x.startswith('Unnamed'), twd_df.columns)
columns_to_drop = list(columns_to_drop)
twd_df.drop(columns=columns_to_drop, inplace=True)


In [48]:
# Check for duplicates
duplicates_demographics  = twd_df[twd_df.duplicated(subset='RID', keep=False)]

if not duplicates_demographics.empty:
    print("There are duplicates within the same column in the DataFrame.")
    print("Duplicate values in 'RID':")
    print(duplicates_demographics['RID'].unique())
else:
    print("There are no duplicates within the same column in the DataFrame.")

There are duplicates within the same column in the DataFrame.
Duplicate values in 'RID':
[  31   56   69   96  112  377  467  668  731  734  751  800 1190 1280
 1378 1427 2002 2155 2180 2234 2239 2245 2263 2301 2304 2332 2392 4003
 4036 4094 4100 4114 4176 4179 4197 4198 4210 4214 4229 4277 4278 4281
 4290 4292 4293 4301 4309 4332 4356 4367 4384 4387 4399 4400 4410 4422
 4427 4429 4446 4448 4464 4488 4489 4510 4513 4538 4548 4576 4587 4598
 4604 4620 4637 4649 4654 4674 4706 4722 4723 4742 4744 4765 4767 4782
 4799 4813 4817 4835 4842 4855 4862 4868 4869 4874 4919 5004 5093 5097
 5100 5177 5185 5194 5198 5200 5203 5230 5259 5265 5266 5273 5277 5289
 5290 6007 6008 6013 6038 6039 6056 6061 6073 6076 6088 6100 6104 6110
 6116 6120 6133 6141 6178 6179 6185 6209 6255 6258 6284 6298]


In [49]:
# Make values for unique and actual RID numbers 
RID_demographicsUnique = twd_df['RID'].unique()
RID_demographicsvalues = twd_df['RID'].values


# Tau Demographics Categorical Binning
Categorical binning for ages and education 

**Values of Age Bins**

55-64

65-74

75-91

**Values of Years of Education Bins**

8-12

13-15 

16-20


In [50]:
#bin ages 
bin_ages = [55,65,75,90]
twd_df["age_bin"] = pd.cut(twd_df["AGE"], bins=bin_ages, labels=['55-64', '65-74', '75-91'])

In [51]:
# bin years of education 

bin_education = [7, 12, 15, 20]
twd_df["education_bin"] = pd.cut(twd_df["PTEDUCAT"],  bins=bin_education, labels=['8-12', '13-15', '16-20'])

### OneHotEncode of Age Binning

In [52]:
#OneHotEncoder Age 

# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)


# Fit and transform the Gender column
encoded_data_age = encoder.fit_transform(twd_df[['age_bin']])

# Create DataFrame from encoded data
encoded_twd_df_age = pd.DataFrame(encoded_data_age, columns=encoder.get_feature_names_out(['age_bin']))

# Concatenate the original twd DataFrame with the encoded DataFrame
twd_df_encoded_age = pd.concat([encoded_twd_df_age, twd_df], axis=1)


### OneHotEncode Gender

In [53]:
#OneHotEncoder Gender 

# Fit and transform the Gender column
encoded_data = encoder.fit_transform(twd_df[['PTGENDER']])

# Create DataFrame from encoded data
encoded_twd_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['PTGENDER']))

# Concatenate the original twd DataFrame with the encoded DataFrame
twd_df_encoded_gender = pd.concat([encoded_twd_df, twd_df_encoded_age], axis=1)


### OneHotEncode Cognitive score

In [54]:
#OneHotEncoder cogntive progression  
# Fit and transform the Cognitive scores column
encoded_data_scores = encoder.fit_transform(twd_df[['best_DX.1']])

# Create DataFrame from encoded data
best_DX_demographics_scores = pd.DataFrame(encoded_data_scores, columns=encoder.get_feature_names_out(['best_DX.1']))




#### Concat the OneHotEncoded DataFrames with the original data

In [55]:
# Concatenate the original twd DataFrame with the encoded DataFrame
twd_df = pd.concat([best_DX_demographics_scores, twd_df_encoded_gender], axis=1)



### Seeding patterns.csv

Download CSV and check for duplicates 

In [56]:
seeds =  pd.read_csv("./data/Seeding patterns.csv")
# Reorder columns alphabetically
seeds = seeds.reindex(sorted(seeds.columns), axis=1)



In [57]:
# Make values for unique and actual RID numbers for seed Dataframe
RID_seedsUnique = seeds["RID"].unique()
RID_seedsvalue = seeds["RID"].values


In [58]:
# Check for duplicates
duplicates_seeds  = seeds[seeds.duplicated(subset='RID', keep=False)]

if not duplicates_demographics.empty:
    print("There are duplicates within the same column in the DataFrame.")
    print("Duplicate values in 'RID':")
    print(duplicates_demographics['RID'].unique())
else:
    print("There are no duplicates within the same column in the DataFrame.")

There are duplicates within the same column in the DataFrame.
Duplicate values in 'RID':
[  31   56   69   96  112  377  467  668  731  734  751  800 1190 1280
 1378 1427 2002 2155 2180 2234 2239 2245 2263 2301 2304 2332 2392 4003
 4036 4094 4100 4114 4176 4179 4197 4198 4210 4214 4229 4277 4278 4281
 4290 4292 4293 4301 4309 4332 4356 4367 4384 4387 4399 4400 4410 4422
 4427 4429 4446 4448 4464 4488 4489 4510 4513 4538 4548 4576 4587 4598
 4604 4620 4637 4649 4654 4674 4706 4722 4723 4742 4744 4765 4767 4782
 4799 4813 4817 4835 4842 4855 4862 4868 4869 4874 4919 5004 5093 5097
 5100 5177 5185 5194 5198 5200 5203 5230 5259 5265 5266 5273 5277 5289
 5290 6007 6008 6013 6038 6039 6056 6061 6073 6076 6088 6100 6104 6110
 6116 6120 6133 6141 6178 6179 6185 6209 6255 6258 6284 6298]


In [59]:
# Function to check if the columns are in the seed csv 

def columns_not_found(csv_file, desired_column_names):
    not_found = []
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)
        header_row = next(reader)  # Read the first row as header
        for column_name in desired_column_names:
            if column_name not in header_row:
                not_found.append(column_name)
    return not_found



In [60]:
seeds_csv = "./data/Seeding patterns.csv"

In [61]:
not_found_columns = columns_not_found(seeds_csv, columns_array)
if not_found_columns:
    print("Columns not found in the CSV file:", ', '.join(not_found_columns))
else:
    print("All column names found in the CSV file.")

Columns not found in the CSV file: Left-Accumbens-area, Left-Caudate, Left-Pallidum, Left-Putamen, Right-Accumbens-area, Right-Caudate, Right-Pallidum, Right-Putamen


In [62]:
# Drop all the unnamed columns

seed_columns_to_drop = filter(lambda x: x.startswith('Unnamed'), seeds.columns)
seed_columns_to_drop = list(seed_columns_to_drop)
seeds.drop(columns=seed_columns_to_drop, inplace=True)


### Compare the RID from the Seed Dataframe (See) and the Demographic Dataframe 

In [63]:
def compare_arrays(a, b):
    common_numbers = set(a) & set(b)
    unique_to_a = set(a) - set(b)
    unique_to_b = set(b) - set(a)
    return common_numbers, unique_to_a, unique_to_b

common, unique_to_a, unique_to_b = compare_arrays(RID_seedsvalue, RID_demographicsvalues)

print("Common numbers:", common)
print("Numbers unique to array 'a':", unique_to_a)
print("Numbers unique to array 'b':", unique_to_b)


Common numbers: {6144, 6145, 6146, 6147, 4100, 6148, 6151, 4105, 6157, 6158, 6159, 6161, 4114, 4115, 2068, 21, 6163, 4119, 6164, 6168, 6173, 31, 6175, 6178, 6179, 6180, 6183, 6184, 6185, 6186, 6187, 6188, 6189, 4148, 4149, 6197, 6199, 56, 6200, 6202, 59, 6203, 6204, 6206, 6207, 4160, 6209, 4162, 6211, 4164, 69, 6212, 2119, 6213, 2121, 4169, 2123, 6216, 6221, 6222, 4175, 4176, 4177, 2130, 4179, 6224, 2133, 6226, 6227, 6228, 6229, 6231, 4187, 6232, 6233, 6234, 6236, 96, 6237, 6240, 6243, 4090, 4197, 4198, 4199, 4200, 6250, 2155, 6251, 6252, 6253, 6255, 112, 6256, 4210, 6257, 4212, 4213, 4214, 6258, 120, 6259, 6260, 6264, 6266, 6268, 6269, 127, 4224, 6271, 6274, 6275, 2180, 4229, 6277, 2183, 6278, 6279, 4234, 2187, 6281, 6282, 142, 6283, 6284, 6287, 6288, 6289, 6291, 6292, 6293, 6294, 2200, 6297, 6298, 156, 6300, 4254, 6303, 6304, 6306, 6307, 6308, 4262, 6310, 6313, 6314, 2219, 2220, 6315, 6316, 4271, 4272, 6317, 6318, 6319, 6320, 4277, 4278, 6321, 6323, 2233, 2234, 4281, 6326, 6328, 2238

### Merge the seeds and tau demographics 


In [64]:
# Merge demographics and seed DataFrames with custom suffixes using inner join
merged_twd_seeds = pd.merge(seeds, twd_df, on='RID', suffixes=('_seed', '_demographics'), how='inner')

# Drop duplicates in the merged DataFrame based on the 'RID' column
merged_twd_seeds.drop_duplicates(subset=['RID'], keep='first', inplace=True)

print("Number of rows after dropping duplicates:", len(merged_twd_seeds))
merged_twd_seeds

Number of rows after dropping duplicates: 652


Unnamed: 0,Left-Amygdala_seed,Left-Cerebellum-Cortex_seed,Left-Hippocampus_seed,Left-Thalamus-Proper_seed,Left-VentralDC_seed,RID,Right-Amygdala_seed,Right-Cerebellum-Cortex_seed,Right-Hippocampus_seed,Right-Thalamus-Proper_seed,...,merge_DX,merge_DX.1,merge_RID,merge_RID.1,ml_stage,ml_subtype,prob_ml_stage,prob_ml_subtype,age_bin,education_bin
0,0.000000,0.100782,0.000000,0.000000,0.000000,21,0.000000,0.000000,0.000000,0.000000,...,CN,CN,21,21,0.200000,0.0,0.166798,1.0,65-74,16-20
1,0.000000,0.000000,0.000000,0.000000,0.000000,31,0.000000,0.000000,0.000000,0.000000,...,CN,CN,31,31,0.266667,0.0,0.149829,1.0,75-91,16-20
3,0.000000,0.000000,0.000000,0.000000,0.000000,56,0.000000,0.000000,0.000000,0.000000,...,MCI,MCI,56,56,0.266667,0.0,0.149921,1.0,65-74,13-15
5,0.000000,0.000000,0.000000,0.000000,0.000000,59,0.000000,0.000000,0.000000,0.000000,...,MCI,MCI,59,59,0.266667,0.0,0.150045,1.0,65-74,13-15
6,0.093932,0.237063,0.111021,0.122752,0.121115,69,0.135576,0.163374,0.028771,0.162657,...,MCI,MCI,69,69,0.000000,0.0,0.244147,1.0,65-74,16-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,0.000000,0.000000,0.000000,0.000000,0.000000,6704,0.549741,0.000000,0.000000,0.000000,...,CN,CN,6704,6704,0.400000,0.0,0.168199,1.0,65-74,8-12
809,0.000000,0.000000,0.000000,0.000000,0.062053,6705,0.083348,0.000000,0.000000,0.000000,...,Dementia,Dementia,6705,6705,0.666667,0.0,0.375510,1.0,65-74,16-20
810,0.000000,0.000000,0.000000,0.000000,0.000000,6713,0.000000,0.000000,0.000000,0.000000,...,Dementia,Dementia,6713,6713,0.000000,0.0,0.201921,1.0,65-74,16-20
811,0.000000,0.000000,0.000000,0.000000,0.216880,6721,0.065006,0.000000,0.000000,0.000000,...,Dementia,Dementia,6721,6721,0.666667,0.0,,1.0,75-91,16-20


In [65]:
# Save to CSV
merged_twd_seeds.to_csv('./data/Merged_Tau_with_Demographics_and_Seeds.csv')

### After EDA drop the missing regions and rows from the connectome 


In [66]:
# Drop missing regions from the connectome
connectome.drop(columns=not_found_columns, inplace=True)

connectome = connectome.iloc[:78]


In [67]:
# Save to CSV
connectome.to_csv('./data/connectome_mean80_fibercount_ordered.csv')