In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [2]:
file_names = ["GDS4987", "GDS4399", "GDS4133", "GDS4132", "GDS3841", "GDS3104", "GDS2084", "GDS1051", "GDS1050"]
file_names.sort()
print("Current Location:", os.getcwd())
os.chdir("../datasets/")
print("Changed Directory To:", os.getcwd())

Current Location: /home/sowmya/Desktop/cs6024/research_activity/report1/codes
Changed Directory To: /home/sowmya/Desktop/cs6024/research_activity/report1/datasets


In [3]:
PCOS_mapping = {'GDS1050': ['GSM27536', 'GSM27537', 'GSM27538', 'GSM27540', 'GSM27541'], 'GDS1051': ['GSM29645', 'GSM29646', 'GSM29647', 'GSM29648', 'GSM29649'], 'GDS2084': ['GSM114834', 'GSM114842', 'GSM114843', 'GSM114847', 'GSM114848', 'GSM114850', 'GSM114852', 'GSM114853'], 'GDS3104': ['GSM156186', 'GSM156187', 'GSM156510', 'GSM156511', 'GSM156512', 'GSM156749', 'GSM156750', 'GSM156751', 'GSM156752', 'GSM156753', 'GSM156763', 'GSM156946', 'GSM156948', 'GSM156949', 'GSM156950', 'GSM156951'], 'GDS3841': ['GSM277460', 'GSM277459', 'GSM277458', 'GSM277457', 'GSM277456', 'GSM277455', 'GSM277454', 'GSM277453', 'GSM277452', 'GSM277451', 'GSM277450', 'GSM277449'], 'GDS4132': ['GSM201542', 'GSM201543', 'GSM201544', 'GSM201545', 'GSM201829', 'GSM201830', 'GSM201831', 'GSM201832', 'GSM201833', 'GSM201834'], 'GDS4133': ['GSM201863', 'GSM201864', 'GSM201865', 'GSM201866', 'GSM201867', 'GSM201868', 'GSM201869', 'GSM201870', 'GSM201871', 'GSM201872'], 'GDS4399': ['GSM850530', 'GSM850531', 'GSM850532', 'GSM850533', 'GSM850534', 'GSM850535', 'GSM850536'], 'GDS4987': ['GSM1174423', 'GSM1174424', 'GSM1174425', 'GSM1174426', 'GSM1174427', 'GSM1174428', 'GSM1174429', 'GSM1174430', 'GSM1174431', 'GSM1174432', 'GSM1174433', 'GSM1174434', 'GSM1174435', 'GSM1174436']}
control_mapping = {'GDS1050': ['GSM27531', 'GSM27532', 'GSM27533', 'GSM27534', 'GSM27543', 'GSM27546', 'GSM27548', 'GSM27549'], 'GDS1051': ['GSM29537', 'GSM29638', 'GSM29643', 'GSM29644', 'GSM29650', 'GSM29651', 'GSM29652', 'GSM29653'], 'GDS2084': ['GSM114841', 'GSM114844', 'GSM114845', 'GSM114849', 'GSM114851', 'GSM114854', 'GSM114855'], 'GDS3104': ['GSM155631', 'GSM155643', 'GSM155644', 'GSM155729', 'GSM156170', 'GSM156171', 'GSM156176', 'GSM156177', 'GSM156178', 'GSM156179', 'GSM156180', 'GSM156181', 'GSM156184'], 'GDS3841': ['GSM277448', 'GSM277447', 'GSM277446', 'GSM277445', 'GSM277444', 'GSM277443', 'GSM277442', 'GSM277441', 'GSM277440', 'GSM277439', 'GSM277438'], 'GDS4132': [''], 'GDS4133': ['GSM201849', 'GSM201850', 'GSM201851', 'GSM201852', 'GSM201853', 'GSM201854', 'GSM201855', 'GSM201856', 'GSM201857', 'GSM201858', 'GSM201859', 'GSM201861', 'GSM201862'], 'GDS4399': ['GSM850527', 'GSM850528', 'GSM850529'], 'GDS4987': ['GSM1174423', 'GSM1174424', 'GSM1174425', 'GSM1174426', 'GSM1174427', 'GSM1174428', 'GSM1174429', 'GSM1174430', 'GSM1174431', 'GSM1174432', 'GSM1174433', 'GSM1174434', 'GSM1174435', 'GSM1174436']}

# for i in file_names:
#     print(i)
#     data = input("Enter PCOS labels: ")
#     PCOS_mapping[i] = data.split(",")
#     data = input("Enter control labels: ")
#     control_mapping[i] = data.split(",")
#     print("="*50, "\n")
# GDS1050
# PCOS: GSM27536,GSM27537,GSM27538,GSM27540,GSM27541
# Control: GSM27531,GSM27532,GSM27533,GSM27534,GSM27543,GSM27546,GSM27548,GSM27549
# GDS1051
# PCOS: GSM29645,GSM29646,GSM29647,GSM29648,GSM29649
# Control: GSM29537,GSM29638,GSM29643,GSM29644,GSM29650,GSM29651,GSM29652,GSM29653
# GDS2084
# PCOS: GSM114834,GSM114842,GSM114843,GSM114847,GSM114848,GSM114850,GSM114852,GSM114853
# Control: GSM114841,GSM114844,GSM114845,GSM114849,GSM114851,GSM114854,GSM114855
# GDS3104
# PCOS: GSM156186,GSM156187,GSM156510,GSM156511,GSM156512,GSM156749,GSM156750,GSM156751,GSM156752,GSM156753,GSM156763,GSM156946,GSM156948,GSM156949,GSM156950,GSM156951
# Control: GSM155631,GSM155643,GSM155644,GSM155729,GSM156170,GSM156171,GSM156176,GSM156177,GSM156178,GSM156179,GSM156180,GSM156181,GSM156184
# GDS3841
# PCOS: GSM277460,GSM277459,GSM277458,GSM277457,GSM277456,GSM277455,GSM277454,GSM277453,GSM277452,GSM277451,GSM277450,GSM277449
# Control: GSM277448,GSM277447,GSM277446,GSM277445,GSM277444,GSM277443,GSM277442,GSM277441,GSM277440,GSM277439,GSM277438
# GDS4132
# PCOS: GSM201542,GSM201543,GSM201544,GSM201545,GSM201829,GSM201830,GSM201831,GSM201832,GSM201833,GSM201834
# Control: 
# GDS4133
# PCOS: GSM201863,GSM201864,GSM201865,GSM201866,GSM201867,GSM201868,GSM201869,GSM201870,GSM201871,GSM201872
# Control: GSM201849,GSM201850,GSM201851,GSM201852,GSM201853,GSM201854,GSM201855,GSM201856,GSM201857,GSM201858,GSM201859,GSM201861,GSM201862
# GDS4399
# PCOS: GSM850530,GSM850531,GSM850532,GSM850533,GSM850534,GSM850535,GSM850536
# Control: GSM850527,GSM850528,GSM850529
# GDS4987
# PCOS: GSM1174423,GSM1174424,GSM1174425,GSM1174426,GSM1174427,GSM1174428,GSM1174429,GSM1174430,GSM1174431,GSM1174432,GSM1174433,GSM1174434,GSM1174435,GSM1174436
# Control: GSM1174408,GSM1174409,GSM1174410,GSM1174411,GSM1174412,GSM1174413,GSM1174414,GSM1174415,GSM1174416,GSM1174417,GSM1174418,GSM1174419,GSM1174420,GSM1174421,GSM1174422

In [4]:
for i in file_names:
    print(i)
    fin = open(i + "_full.txt")
    complete_data = fin.read().splitlines()
    fin.close()

    pos = complete_data.index("!dataset_table_begin")
    data = [i.split("\t") for i in complete_data[pos+1:-1]]
    df = pd.DataFrame(data[1:], columns=data[0])
    new_columns = df.columns[:-19]
    PCOS = []
    for j in new_columns[2:-1]:
        if j in PCOS_mapping[i]: PCOS.append(1)
        else: PCOS.append(0)
    
    df = df[new_columns[2:-1]].T 
    df["PCOS"] = PCOS
    df.index.rename('sample_id', inplace=True)
    
    display(df.head())
    print("="*100)
    
    df.to_csv(i + ".csv")

GDS1050


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,22274,22275,22276,22277,22278,22279,22280,22281,22282,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM27536,540.3,98.9,75.8,561.3,38.2,203.3,48.6,23.4,2.2,31.7,...,65.7,9194.0,8178.3,4.4,5.7,17.4,2.0,8.2,3.7,1
GSM27537,801.0,48.2,39.5,433.6,26.3,223.0,46.2,49.8,1.7,28.8,...,121.5,10675.2,9301.4,4.1,4.3,7.5,0.9,5.4,2.9,1
GSM27538,701.4,90.9,36.2,395.6,22.0,244.1,41.4,30.9,22.1,38.7,...,50.5,8469.8,7840.6,3.1,3.5,12.0,8.3,14.6,6.8,1
GSM27540,540.9,53.8,21.6,414.6,17.8,278.2,41.8,28.3,97.8,29.7,...,37.4,8727.7,7610.4,4.1,1.7,17.9,0.6,2.1,1.4,1
GSM27541,570.0,57.5,56.2,606.6,24.5,179.5,68.5,51.1,7.1,40.6,...,31.9,9857.7,8474.0,4.1,2.8,20.7,4.7,26.9,6.4,1


GDS1051


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,22636,22637,22638,22639,22640,22641,22642,22643,22644,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM29645,2291.7,9096.1,21086.9,27971.5,6570.2,6553.2,11778.6,8333.4,1583.7,4471.5,...,260.1,29727.2,24685.0,15.1,52.5,57.1,36.7,23.1,4.7,1
GSM29646,1203.4,5326.1,12777.1,20778.2,6469.9,3466.8,7409.4,6658.2,1469.0,4418.5,...,206.4,28925.9,19619.9,14.8,23.4,13.6,19.5,14.5,6.2,1
GSM29647,1553.1,6787.8,12945.5,19074.7,5711.8,3829.5,8626.0,6397.4,1808.2,4594.3,...,164.6,24719.0,22135.4,17.4,11.1,13.2,3.9,15.3,5.4,1
GSM29648,969.5,5953.7,13443.0,18613.0,5408.9,3097.9,8440.4,6835.5,1505.3,4325.0,...,229.5,31156.9,23934.5,13.3,12.3,7.2,21.7,33.8,3.9,1
GSM29649,1234.7,6260.2,10435.5,18431.2,8110.2,3705.1,7638.2,6695.7,2019.2,3987.4,...,138.3,26179.9,21176.6,14.0,14.0,16.5,8.3,17.5,9.5,1


GDS2084


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,22274,22275,22276,22277,22278,22279,22280,22281,22282,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM114841,222.6,35.5,41.5,229.8,14.3,150.8,29.7,35.1,47.8,22.5,...,19.9,3231.0,2542.6,3.4,1.9,4.3,0.5,2.6,1.3,0
GSM114844,252.7,24.5,53.3,419.6,13.0,116.0,35.4,44.8,53.2,24.9,...,1.2,3303.6,2947.0,3.1,2.2,2.5,0.5,2.0,1.2,0
GSM114845,219.3,23.4,31.3,274.5,29.6,89.9,53.0,28.8,41.7,38.7,...,58.9,3203.0,2572.3,8.1,10.1,9.6,4.1,6.1,5.0,0
GSM114849,258.9,31.4,43.0,227.1,16.3,125.1,42.0,39.7,116.9,14.9,...,9.0,3256.7,2750.3,2.0,2.9,4.4,0.6,1.2,0.9,0
GSM114851,239.0,20.6,65.5,271.6,4.6,89.2,44.1,30.9,43.3,19.6,...,17.8,3253.5,2905.0,2.7,1.5,1.8,0.3,1.2,1.2,0


GDS3104


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54666,54667,54668,54669,54670,54671,54672,54673,54674,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM155631,494.091,70.1196,169.272,433.047,23.6958,190.536,582.988,76.7257,97.6912,20.3144,...,4349.59,10608.0,7981.63,43.0217,34.7916,13.3606,10.0436,24.0888,27.7694,0
GSM155643,429.377,50.927,181.858,470.4,22.7677,139.201,506.538,70.7865,39.1372,18.7627,...,4983.1,11824.1,9674.27,35.5562,18.7041,11.6228,12.2426,15.9718,21.0326,0
GSM155644,483.314,57.5347,203.872,433.667,22.1392,139.37,577.318,73.9636,55.5885,17.1864,...,4493.86,10969.6,8592.58,41.8618,21.6336,12.9994,12.9526,20.667,22.6861,0
GSM155729,370.79,68.8989,144.182,416.779,25.5855,144.213,391.381,67.7056,125.812,13.4048,...,4842.34,10973.7,8606.99,36.8131,33.941,11.2103,10.3604,14.4338,15.8809,0
GSM156170,440.023,88.3869,134.05,395.866,25.0873,191.459,567.038,89.6631,77.8286,19.249,...,5316.63,12584.2,9971.58,41.158,34.1386,10.0443,9.32941,20.5394,23.0927,0


GDS3841


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54666,54667,54668,54669,54670,54671,54672,54673,54674,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM277438,9.22931,5.39148,4.70996,7.6806,4.01431,5.86275,4.83529,5.39932,3.07253,3.27844,...,11.0498,13.2461,13.0183,5.94438,4.13642,3.83022,2.546,3.0265,3.24679,0
GSM277439,10.0032,5.29239,3.97881,6.77488,3.95995,6.9071,4.91009,4.39151,2.96424,3.91685,...,10.5238,12.5086,12.2226,7.59906,3.85175,5.38626,2.64385,2.93269,3.0314,0
GSM277440,9.30767,5.62825,4.2565,6.61363,4.54025,6.43471,4.7625,5.02412,3.02973,3.63199,...,10.3128,13.0451,12.639,3.90759,3.11438,3.22656,2.6668,3.00462,3.04596,0
GSM277441,9.0761,6.44412,4.16672,6.633,3.80671,7.06877,4.25926,4.56714,2.86435,3.85604,...,10.6329,12.7192,12.2778,7.32951,3.4906,4.8024,2.68168,3.07677,3.18272,0
GSM277442,9.11024,6.52364,3.99349,6.38885,3.79425,6.69611,4.38182,4.66655,3.09806,3.78027,...,11.5474,13.1436,12.786,7.46809,3.66457,4.77174,2.76723,3.18352,3.12361,0


GDS4132


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54666,54667,54668,54669,54670,54671,54672,54673,54674,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201542,564.0,89.9426,227.829,819.361,33.2482,171.582,690.06,118.682,69.9397,20.3993,...,5512.45,13489.5,10837.7,37.1135,37.3114,15.8929,11.1081,40.1626,55.9619,1
GSM201543,602.786,80.9309,240.16,877.348,31.7194,152.814,782.983,117.017,61.0156,24.3013,...,5211.86,12773.2,9674.75,29.5291,36.845,13.4949,13.9345,36.0268,39.8973,1
GSM201544,748.264,132.771,191.737,1064.08,36.789,181.894,1014.61,142.079,89.5705,20.7396,...,5092.92,13242.4,10486.9,37.0986,42.6567,18.0175,9.93043,28.1912,34.779,1
GSM201545,677.546,118.68,200.043,1120.31,46.8138,186.127,913.275,142.023,126.21,19.594,...,4730.04,12365.2,9634.63,37.0756,47.2945,16.5661,9.44505,26.7917,31.6785,1
GSM201829,640.927,89.1658,217.188,902.493,35.7579,166.472,782.863,140.18,87.9341,24.8939,...,5237.65,12531.8,9826.49,19.3392,38.6019,14.2826,11.7873,28.6702,49.7037,1


GDS4133


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54666,54667,54668,54669,54670,54671,54672,54673,54674,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM201849,483.409,66.9167,219.597,755.41,29.3928,127.664,422.687,91.608,51.3111,181.239,...,5347.41,12696.4,10359.1,35.783,33.0276,13.6201,14.3735,17.3238,19.0954,0
GSM201850,590.238,83.4537,210.304,683.005,26.7009,137.976,510.516,94.6009,122.923,231.476,...,5040.14,12351.1,9257.2,48.5689,50.0206,15.471,12.0054,27.5912,29.6003,0
GSM201851,543.701,78.4425,248.096,717.778,29.0265,126.366,480.698,99.7509,74.5672,190.576,...,4790.6,11486.0,9080.12,50.6883,32.2976,12.9002,15.6121,26.1336,27.1602,0
GSM201852,482.75,74.2555,176.755,870.41,26.9254,144.465,404.405,74.8484,154.936,259.128,...,5680.61,12736.2,9972.56,38.8801,66.637,11.0992,9.96302,14.5015,16.2584,0
GSM201853,557.421,104.467,159.189,892.417,29.2165,133.658,527.972,106.448,92.3223,207.577,...,6726.34,15494.6,12391.8,44.1424,52.7042,11.4834,10.6655,23.0248,22.0425,0


GDS4399


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54666,54667,54668,54669,54670,54671,54672,54673,54674,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM850527,1827.7,98.7153,53.1415,510.73,95.8522,188.842,263.427,29.4203,5544.43,74.6677,...,23305.5,112224.0,76785.6,142.772,24.4004,4.4367,4.70751,14.5444,11.3604,0
GSM850528,2735.92,361.32,552.695,476.999,75.366,661.84,46.3222,193.988,5945.68,40.5147,...,8376.29,38106.8,28217.1,120.297,7.4676,38.5328,1.85965,9.62345,2.79069,0
GSM850529,3528.27,308.85,505.498,535.266,92.8731,583.556,109.413,153.369,5937.47,60.7556,...,9466.6,49463.5,34278.5,177.618,3.27977,41.5537,33.1628,5.66041,2.67271,0
GSM850530,2697.7,266.351,693.408,631.601,44.9358,547.97,149.737,106.504,4367.22,21.7508,...,15511.5,64373.0,46428.0,131.742,8.09418,52.556,3.50057,6.05977,4.43519,1
GSM850531,1474.9,446.131,1321.53,459.353,36.2481,682.971,116.228,170.82,3000.35,26.5737,...,10360.4,42177.9,31892.4,273.665,5.01694,12.3979,23.0304,14.1844,25.4986,1


GDS4987


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,33288,33289,33290,33291,33292,33293,33294,33295,33296,PCOS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM1174425,0.426693,0.0790515,-0.0371127,0.102249,,0.691976,0.0931196,0.278441,0.879943,-0.113919,...,,,,,,,,,,1
GSM1174429,0.0471764,0.101923,0.0815725,1.48231,,0.165609,-0.347932,-0.20964,-1.30048,0.134601,...,,,,,,,,,,1
GSM1174436,0.0611334,0.273007,0.151132,-1.08528,,1.61301,3.81034,2.96815,1.57339,1.03149,...,,,,,,,,,,1
GSM1174427,0.417477,0.197721,0.455904,0.665998,,0.192279,-0.402528,-0.0986814,-0.409827,-0.0326066,...,,,,,,,,,,1
GSM1174430,-0.0449924,0.211543,0.362855,0.987918,,1.16286,0.0,0.429622,0.55779,-0.0490508,...,,,,,,,,,,1




In [6]:
for i in file_names:
    print(i, "\tPCOS:", len(PCOS_mapping[i]), "\tControl:", len(control_mapping[i]))

GDS1050 	PCOS: 5 	Control: 8
GDS1051 	PCOS: 5 	Control: 8
GDS2084 	PCOS: 8 	Control: 7
GDS3104 	PCOS: 16 	Control: 13
GDS3841 	PCOS: 12 	Control: 11
GDS4132 	PCOS: 10 	Control: 1
GDS4133 	PCOS: 10 	Control: 13
GDS4399 	PCOS: 7 	Control: 3
GDS4987 	PCOS: 14 	Control: 14


In [7]:
for i in file_names:
    df = pd.read_csv(i+".csv", index_col=0)
    X = df.drop(["PCOS"], axis=1).to_numpy()
    y = df["PCOS"]
    
    print(i)
    pca = PCA(n_components=2)
    X[np.isnan(X)] = 0
    pca.fit(X)
    X_pca = pca.transform(X)
    df_pca = pd.DataFrame(X_pca)
    df_pca = df_pca.set_index(df.index)
    df_pca["PCOS"] = y
    df_pca["PCOS"] = df_pca["PCOS"].astype("str")
    df_pca = df_pca.rename(columns={0:"PC1", 1:"PC2"})

    fig = px.scatter(df_pca, x="PC1", y="PC2", color="PCOS", title="Data for "+i+", in 2D")
    fig.show()
    print("\n", "="*100)
    
    df['C'] = "0"
    df.loc[df["PCOS"]==0, 'C'] = "r"
    df.loc[df["PCOS"]==1, 'C'] = "g"
    
    plt.figure(figsize=[14,12])
    plt.scatter(df_pca["PC1"], df_pca["PC2"], color=df["C"].tolist(), alpha=0.6)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("Data for "+i+", in 2D")
    plt.grid(True)
    plt.show()

GDS1050


NameError: name 'px' is not defined

In [None]:
X[np.isnan(X)] = 0
print(np.isnan(X))
(np.where(np.isnan(X)))