In [1]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
# import plotly.express as px
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from get_mapping import get_mapping

In [2]:
file_names = ["GDS4987", "GDS4399", "GDS4133", "GDS4132", "GDS3841", "GDS3104", "GDS2084", "GDS1051", "GDS1050"]
file_names.sort()
print("Current Location:", os.getcwd())
os.chdir("../datasets/")
print("Changed Directory To:", os.getcwd())

Current Location: /home/sowmya/Desktop/cs6024/project/codes
Changed Directory To: /home/sowmya/Desktop/cs6024/project/datasets


In [3]:
# Assign mapping between samples and sample type
PCOS_mapping = {'GDS1050': ['GSM27536', 'GSM27537', 'GSM27538', 'GSM27540', 'GSM27541'], 'GDS1051': ['GSM29645', 'GSM29646', 'GSM29647', 'GSM29648', 'GSM29649'], 'GDS2084': ['GSM114834', 'GSM114842', 'GSM114843', 'GSM114847', 'GSM114848', 'GSM114850', 'GSM114852', 'GSM114853'], 'GDS3104': ['GSM156186', 'GSM156187', 'GSM156510', 'GSM156511', 'GSM156512', 'GSM156749', 'GSM156750', 'GSM156751', 'GSM156752', 'GSM156753', 'GSM156763', 'GSM156946', 'GSM156948', 'GSM156949', 'GSM156950', 'GSM156951'], 'GDS3841': ['GSM277460', 'GSM277459', 'GSM277458', 'GSM277457', 'GSM277456', 'GSM277455', 'GSM277454', 'GSM277453', 'GSM277452', 'GSM277451', 'GSM277450', 'GSM277449'], 'GDS4132': ['GSM201542', 'GSM201543', 'GSM201544', 'GSM201545', 'GSM201829', 'GSM201830', 'GSM201831', 'GSM201832', 'GSM201833', 'GSM201834'], 'GDS4133': ['GSM201863', 'GSM201864', 'GSM201865', 'GSM201866', 'GSM201867', 'GSM201868', 'GSM201869', 'GSM201870', 'GSM201871', 'GSM201872'], 'GDS4399': ['GSM850530', 'GSM850531', 'GSM850532', 'GSM850533', 'GSM850534', 'GSM850535', 'GSM850536'], 'GDS4987': ['GSM1174423', 'GSM1174424', 'GSM1174425', 'GSM1174426', 'GSM1174427', 'GSM1174428', 'GSM1174429', 'GSM1174430', 'GSM1174431', 'GSM1174432', 'GSM1174433', 'GSM1174434', 'GSM1174435', 'GSM1174436']}
control_mapping = {'GDS1050': ['GSM27531', 'GSM27532', 'GSM27533', 'GSM27534', 'GSM27543', 'GSM27546', 'GSM27548', 'GSM27549'], 'GDS1051': ['GSM29537', 'GSM29638', 'GSM29643', 'GSM29644', 'GSM29650', 'GSM29651', 'GSM29652', 'GSM29653'], 'GDS2084': ['GSM114841', 'GSM114844', 'GSM114845', 'GSM114849', 'GSM114851', 'GSM114854', 'GSM114855'], 'GDS3104': ['GSM155631', 'GSM155643', 'GSM155644', 'GSM155729', 'GSM156170', 'GSM156171', 'GSM156176', 'GSM156177', 'GSM156178', 'GSM156179', 'GSM156180', 'GSM156181', 'GSM156184'], 'GDS3841': ['GSM277448', 'GSM277447', 'GSM277446', 'GSM277445', 'GSM277444', 'GSM277443', 'GSM277442', 'GSM277441', 'GSM277440', 'GSM277439', 'GSM277438'], 'GDS4132': [''], 'GDS4133': ['GSM201849', 'GSM201850', 'GSM201851', 'GSM201852', 'GSM201853', 'GSM201854', 'GSM201855', 'GSM201856', 'GSM201857', 'GSM201858', 'GSM201859', 'GSM201861', 'GSM201862'], 'GDS4399': ['GSM850527', 'GSM850528', 'GSM850529'], 'GDS4987': ['GSM1174423', 'GSM1174424', 'GSM1174425', 'GSM1174426', 'GSM1174427', 'GSM1174428', 'GSM1174429', 'GSM1174430', 'GSM1174431', 'GSM1174432', 'GSM1174433', 'GSM1174434', 'GSM1174435', 'GSM1174436']}

# Get mapping between the IDs provided and the Entrez Gene IDs
mapping, exclude_mapping = get_mapping(flag=True)
unique_geneids = list(set(mapping.values()))

def get_float_or_str(j):
    if type(j) == float:
        return int(j)
    elif type(j) == str and len(j.split("_")) == 1:
        return (int(j))
    else:
        return j
unique_geneids = set([get_float_or_str(i) for i in unique_geneids])
print(len(unique_geneids))

Number of IDs excluded: 20665
Number of genes mapped: 67313
Number of unique gene IDs: 25180
24979


In [4]:
df_list = []
for i in file_names:
    print(i)
    fin = open(i + "_full.txt")
    complete_data = fin.read().splitlines()
    fin.close()
    
    # Find position from which the dataset tabular form begins
    pos = complete_data.index("!dataset_table_begin")
    data = [i.split("\t") for i in complete_data[pos+1:-1]]
    df = pd.DataFrame(data[1:], columns=data[0])
    
    # Remove the last 19 columns
    new_columns = list(df.columns[:-19])
    # Remove the column at position 1 - IDENTIFIER
    new_columns.pop(1)
    PCOS = []
    
    # Additionally ignore the last 1 column
    # Get the mapping - PCOS, Normal, Other == [1, 0, -1]
    for j in new_columns[1:-1]:
        if j in PCOS_mapping[i]: 
            PCOS.append(1)
        elif j in control_mapping[i]: 
            PCOS.append(0)
        else:
            PCOS.append(-1)
    
    # Set df to be only equal to the required columns
    df = df[new_columns[:-1]]

    # Replace the ID_REF using the geneid_mapping
    # df.replace({"ID_REF":mapping}, inplace=True) # Has a large overhead. Using map instead
    df["ID_REF"] = df["ID_REF"].map(lambda x: mapping.get(x, np.nan))
    print("Dimensions now:", df.shape)
    
    # Remove all columns that have a mapping to null :)
    # Basically remove columns that don't map to a valid Entrez ID
    print("Removing all columns that have no Entrez mapping ...")
    df.drop(index=np.where(pd.isnull(df["ID_REF"]))[0], inplace=True)
    print("Dimensions now:", df.shape)
    
    print("Unique IDs:", len(set(df["ID_REF"])), "Actual Size:", len(df["ID_REF"]))
    # a = list(np.where(df["ID_REF"].duplicated())[0])
    # values = df["ID_REF"].reset_index().loc[a]["ID_REF"]
    # print("Duplicates:", values)
    
    for col in new_columns[1:-1]:
        try:
            df[col] = df[col].astype(float)
        except:
            df[col] = df[col].str.replace("null", str(np.nan)).astype(float)
    
    print("Eliminating all duplicates, replacing them with averages ...")
    df = df.groupby("ID_REF").mean().reset_index()
    print("Dimensions now:", df.shape)
    
    print("Checking for spurious gene IDs (if any) ...")
    for name in df["ID_REF"]:
        if name not in unique_geneids:
            print(name)
    
    print("Dimensions now:", df.shape)
    
    for col in new_columns[1:-1]:
        try:
            df[col] = df[col].astype(float)
        except:
            df[col] = df[col].str.replace("null", str(np.nan)).astype(float)
    
    print("Eliminating all duplicates, replacing them with averages ...")
    df = df.groupby("ID_REF").mean().reset_index()
            
    print("Dimensions now:", df.shape)
    
    # Transpose and add PCOS mapping column
    df = df[new_columns[:-1]].T 
    df.columns = df.iloc[0]
    df = df[1:]
    
    print("Scanning for additional columns to add ...")
    time.sleep(2)
    for j in tqdm(unique_geneids):
        if (type(j) == str or type(j) == int) and (j not in df.columns):
            df[j] = np.nan
    
    df = df[unique_geneids]
    
    df["PCOS"] = PCOS
    df.index.rename('sample_id', inplace=True)
    
    display(df.head(2))
    print("Dimensions now:", df.shape)
    
    print("="*100)
    
    df.to_csv(i + ".csv")
    df_list.append(df)

GDS1050
Dimensions now: (22283, 14)
Removing all columns that have no Entrez mapping ...
Dimensions now: (21156, 14)
Unique IDs: 13299 Actual Size: 21156
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (13299, 14)
Checking for spurious gene IDs (if any) ...
Dimensions now: (13299, 14)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (13299, 14)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:23<00:00, 1047.34it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM27536,,33.2,,,67.6,15,442.6,7.4,365.0,5.5,...,5.9,,,55.5,554.5,,,,,1
GSM27537,,46.8,,,140.1,18,1061.4,8.1,200.5,4.3,...,16.9,,,69.3,363.9,,,,,1


Dimensions now: (13, 24980)
GDS1051
Dimensions now: (22645, 14)
Removing all columns that have no Entrez mapping ...
Dimensions now: (16667, 14)
Unique IDs: 10759 Actual Size: 16667
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (10759, 14)
Checking for spurious gene IDs (if any) ...
Dimensions now: (10759, 14)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (10759, 14)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:28<00:00, 871.20it/s] 


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM29645,309.3,,211.8,58.0,,,,,,,...,75.2667,164.5,,429.7,,3.5,15.75,13.45,,1
GSM29646,383.6,,164.5,155.7,,,,,,,...,91.2,37.0,,582.1,,40.7,12.8,63.45,,1


Dimensions now: (13, 24980)
GDS2084
Dimensions now: (22283, 16)
Removing all columns that have no Entrez mapping ...
Dimensions now: (21156, 16)
Unique IDs: 13299 Actual Size: 21156
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (13299, 16)
Checking for spurious gene IDs (if any) ...
Dimensions now: (13299, 16)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (13299, 16)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:25<00:00, 985.17it/s] 


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM114841,,2151.2,,,71.0,8.6,25.8,212.2,193.8,2.7,...,87.9,,,42.1,616.5,,,,,0
GSM114844,,1537.6,,,52.6,7.5,35.4,59.2,123.7,3.5,...,78.3,,,51.9,572.3,,,,,0


Dimensions now: (15, 24980)
GDS3104
Dimensions now: (54675, 30)
Removing all columns that have no Entrez mapping ...
Dimensions now: (45118, 30)
Unique IDs: 22189 Actual Size: 45118
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 30)
Checking for spurious gene IDs (if any) ...
Dimensions now: (22189, 30)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 30)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:08<00:00, 2825.33it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM155631,104.35,670.221,163.193,88.8521,19.622,72.9238,251.359,99.4758,247.801,64.7368,...,98.8929,38.8509,,156.605,1972.94,23.0387,92.8232,51.9724,,0
GSM155643,91.2421,590.509,153.165,93.6215,15.5295,100.396,141.757,94.3241,219.067,49.4561,...,71.7802,37.303,,149.458,1429.28,18.002,79.438,41.8261,,0


Dimensions now: (29, 24980)
GDS3841
Dimensions now: (54675, 24)
Removing all columns that have no Entrez mapping ...
Dimensions now: (45118, 24)
Unique IDs: 22189 Actual Size: 45118
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 24)
Checking for spurious gene IDs (if any) ...
Dimensions now: (22189, 24)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 24)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:07<00:00, 3478.53it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM277438,5.75855,5.11752,4.6908,5.9242,5.39236,4.17492,8.66519,10.1912,7.5254,3.35787,...,5.93128,3.98704,,6.39367,10.0155,3.61285,3.54311,2.73325,,0
GSM277439,5.31011,3.85633,3.49743,5.93624,6.44435,3.93217,8.52584,9.67913,7.45636,3.62648,...,7.22788,4.00439,,6.3163,11.8838,3.28341,3.19928,2.71341,,0


Dimensions now: (23, 24980)
GDS4132
Dimensions now: (54675, 21)
Removing all columns that have no Entrez mapping ...
Dimensions now: (45118, 21)
Unique IDs: 22189 Actual Size: 45118
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 21)
Checking for spurious gene IDs (if any) ...
Dimensions now: (22189, 21)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 21)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:07<00:00, 3543.57it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201542,157.416,619.794,259.838,92.6128,21.0529,407.872,225.69,122.944,328.8,99.1529,...,128.486,52.1319,,198.349,2274.41,23.4929,93.2298,61.0159,,1
GSM201543,151.416,528.307,282.33,83.2014,17.2249,440.123,179.091,121.109,335.461,89.1627,...,100.918,58.2103,,186.683,2126.79,22.1188,67.7522,59.5491,,1


Dimensions now: (20, 24980)
GDS4133
Dimensions now: (54675, 24)
Removing all columns that have no Entrez mapping ...
Dimensions now: (45118, 24)
Unique IDs: 22189 Actual Size: 45118
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 24)
Checking for spurious gene IDs (if any) ...
Dimensions now: (22189, 24)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 24)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:07<00:00, 3393.04it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201849,118.261,666.033,187.786,121.851,17.1161,132.988,157.22,123.814,279.946,64.6772,...,185.722,49.069,,189.303,1565.4,30.5327,78.0891,53.8534,,0
GSM201850,131.127,805.182,201.122,106.731,21.7864,88.0687,167.941,124.759,311.144,78.3015,...,237.105,52.4979,,196.058,2295.58,37.079,97.8779,65.314,,0


Dimensions now: (23, 24980)
GDS4399
Dimensions now: (54675, 11)
Removing all columns that have no Entrez mapping ...
Dimensions now: (45118, 11)
Unique IDs: 22189 Actual Size: 45118
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 11)
Checking for spurious gene IDs (if any) ...
Dimensions now: (22189, 11)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (22189, 11)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:07<00:00, 3500.79it/s]


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM850527,132.221,281.916,426.536,1763.09,1136.28,14.4495,424.969,5980.19,207.0,30.0271,...,183.376,110.007,,339.691,1225.72,39.1957,23.6702,9.83481,,0
GSM850528,140.994,385.727,6.58768,507.824,1218.55,6.1793,2083.93,4655.95,726.953,4.10135,...,1190.19,56.0966,,390.308,5005.75,32.3426,17.1339,25.6234,,0


Dimensions now: (10, 24980)
GDS4987
Dimensions now: (33297, 30)
Removing all columns that have no Entrez mapping ...
Dimensions now: (8383, 30)
Unique IDs: 7932 Actual Size: 8383
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (7932, 30)
Checking for spurious gene IDs (if any) ...
100129884
10294
10328
10329
113246
118672
121227
121599
121665
128240
128338
140691
196968
2188
219902
220929
22978
23254
23478
25911
25912
2662
28232
283537
2950
29880
29928
317772
349196
374354
3799
4259
440068
440104
440200
440279
4914
5033
51108
51234
51643
51706
5176
54939
54996
55657
56339
57146
5757
57611
57661
5826
5916
5920
60626
6171
6232
6302
63967
642559
643332
646029
64801
650669
656
6886
7253
79038
79050
80150
80975
84243
84304
84441
84817
84838
84879
85414
8711
9045
9445
9556
9698
9742
9789
Dimensions now: (7932, 30)
Eliminating all duplicates, replacing them with averages ...
Dimensions now: (7932, 30)
Scanning for additional columns to add ...


100%|██████████| 24979/24979 [00:26<00:00, 926.73it/s] 


ID_REF,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1174425,,2.81815,,,,,0.0,,,,...,,,,,,,,,0.373983,1
GSM1174429,,2.88769,,,,,0.0144272,,,,...,,,,,,,,,1.14427,1


Dimensions now: (29, 24980)


In [5]:
df_normalised_list = []
str_unique_geneids = [str(i) for i in unique_geneids]

for j,i in enumerate(file_names):
    df = pd.read_csv(i+".csv", index_col=0)
    X = df_list[j].to_numpy()

    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)
    
    df_norm = pd.DataFrame(X, columns=df.columns, index=df.index)
    df_norm["PCOS"] = df["PCOS"]
    
    print("Normalized")
    display(df_norm.head(2))
    print("Dimensions now:", df_norm.shape)
    
    df_norm.to_csv(i + "_normalized.csv")
    df_normalised_list.append(df_norm)

Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM27536,,0.356383,,,0.0,0.384615,0.305418,0.020958,1.0,0.607143,...,0.222222,,,0.309761,0.30704,,,,,1
GSM27537,,0.597518,,,0.969251,0.48951,0.919428,0.041916,0.017324,0.392857,...,0.901235,,,0.447211,0.13522,,,,,1


Dimensions now: (13, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM29645,0.359551,,0.694396,0.018298,,,,,,,...,0.009889,1.0,,0.542205,,0.0,0.302326,0.08969,,1
GSM29646,0.579243,,0.421144,0.912168,,,,,,,...,0.050292,0.082073,,1.0,,1.0,0.17759,0.927913,,1


Dimensions now: (13, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM114841,,1.0,,,1.0,0.367089,0.143934,0.563865,1.0,0.169811,...,1.0,,,0.788337,0.372115,,,,,0
GSM114844,,0.208973,,,0.414013,0.227848,0.216277,0.146288,0.100128,0.245283,...,0.862857,,,1.0,0.301767,,,,,0


Dimensions now: (15, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM155631,0.653276,0.810938,0.168546,0.350665,0.272959,0.341838,0.730945,0.744826,0.736345,0.711208,...,0.722419,0.714947,,0.682032,0.626154,0.375688,0.508493,0.30879,,0
GSM155643,0.353426,0.61021,0.090093,0.405875,0.126684,0.6732,0.149307,0.669677,0.297598,0.381989,...,0.228773,0.668479,,0.565364,0.157341,0.072098,0.201682,0.118327,,0


Dimensions now: (29, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM277438,1.0,1.0,1.0,0.322851,0.0,0.532115,0.479324,0.667352,0.593257,0.248661,...,0.250757,0.457888,,0.827365,0.54705,0.693762,0.718163,0.33255,,0
GSM277439,0.653275,0.16758,0.165049,0.327351,0.34417,0.342526,0.449352,0.539547,0.540561,0.601389,...,0.775408,0.48136,,0.740126,1.0,0.266477,0.0,0.288548,,0


Dimensions now: (23, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201542,0.826925,0.151088,0.55927,0.097592,0.234521,0.427557,0.338108,0.342172,0.621765,0.778657,...,0.469126,0.680423,,0.636389,0.212357,0.553655,0.502746,0.14677,,1
GSM201543,0.750406,0.036972,0.728029,0.0,0.09592,0.537423,0.0,0.294047,0.683663,0.654951,...,0.0,0.851633,,0.49629,0.086864,0.483539,0.0,0.125473,,1


Dimensions now: (20, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201849,0.458835,0.378227,0.108546,0.464012,0.065336,0.848337,0.795135,0.724952,0.480945,0.416413,...,0.199512,0.540349,,0.75652,0.057868,0.0,0.009699,0.139814,,0
GSM201850,0.62777,0.64632,0.181504,0.310362,0.237945,0.215371,1.0,0.752945,0.782331,0.603324,...,0.541623,0.624514,,0.849769,0.682092,0.217251,0.387122,0.311004,,0


Dimensions now: (23, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM850527,0.434441,0.0,1.0,1.0,0.695249,0.294968,0.012769,1.0,0.0,1.0,...,0.0,0.984177,,0.228134,0.0,0.377126,0.193806,0.059685,,0
GSM850528,0.480817,0.104285,0.002012,0.284588,0.785497,0.077018,0.495981,0.534207,0.433449,0.000879,...,0.710545,0.439039,,0.416488,0.431332,0.277298,0.102602,0.40066,,0


Dimensions now: (10, 24980)
Normalized


Unnamed: 0_level_0,1,2,3,131076,9,10,12,13,14,15,...,130872,130888,393046,130916,101059938_339044_9284_100288332_642799_101059953_642778_101930075,130940,130951,131034,100996712_23380_653464_647135,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1174425,,0.955117,,,,,0.128648,,,,...,,,,,,,,,0.616121,1
GSM1174429,,0.96552,,,,,0.131916,,,,...,,,,,,,,,0.945128,1


Dimensions now: (29, 24980)
