In [1]:
import warnings
# Silence every warning for the rest of the session. No category or module is
# given, so this also hides deprecation notices raised by the libraries below.
warnings.filterwarnings("ignore")

import missingno
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from missingpy import MissForest

In [2]:
# GEO dataset accession ids to consolidate, held in ascending lexical order.
file_names = sorted(
    (
        "GDS4987",
        "GDS4399",
        "GDS4133",
        "GDS4132",
        "GDS3841",
        "GDS3104",
        "GDS2084",
        "GDS1051",
        "GDS1050",
    )
)

In [3]:
# Read every per-dataset normalized CSV and stack them into a single frame.
# The frames are collected first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop recopied all rows
# gathered so far on every iteration.
frames = []
row_total = 0
seen_columns = set()
for position, dataset_name in enumerate(file_names):
    frame = pd.read_csv("../datasets/" + dataset_name + "_normalized.csv")
    frames.append(frame)

    # Track the running shape (rows so far, union of columns so far) so the
    # progress line matches what the incremental append used to report.
    row_total += frame.shape[0]
    seen_columns.update(frame.columns)
    if position > 0:
        print("Shape: ", (row_total, len(seen_columns)), "; Addition: ", frame.shape[0], sep="")

# Outer-join on columns (the pd.concat default) with a fresh 0..n-1 row index,
# which is what append(..., ignore_index=True) produced.
df = pd.concat(frames, ignore_index=True)

# Written with the pandas default index=True, so the row index becomes the
# first, unnamed column of the CSV.
df.to_csv("../datasets/consolidated_normalized.csv")

Shape: (26, 24981); Addition: 13
Shape: (41, 24981); Addition: 15
Shape: (70, 24981); Addition: 29
Shape: (93, 24981); Addition: 23
Shape: (113, 24981); Addition: 20
Shape: (136, 24981); Addition: 23
Shape: (146, 24981); Addition: 10
Shape: (175, 24981); Addition: 29


In [4]:
# Summarize the consolidated frame: row index range, column count, dtype
# breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Columns: 24981 entries, sample_id to PCOS
dtypes: float64(24979), int64(1), object(1)
memory usage: 33.4+ MB


In [5]:
# Drop columns that have all null values
all_null_values = [i for i in df.columns[1:] if df[i].isnull().all()]
df.drop(all_null_values, axis=1, inplace=True)
print("Number of columns will all NaNs:", len(all_null_values))

# Removing additional columns that have nulls
more_than_0_null = [i for i in df.columns[1:] if df[i].isnull().sum() > 0]
df.drop(more_than_0_null, axis=1, inplace=True)
print("Number of additional columns dropped (>0% NaNs):", len(more_than_0_null))
print("Size of the dataframe now:", df.shape)

# Preprocessing
df["PCOS"] = df["PCOS"].astype('category')
cat_pos = list(np.where(df[df.columns[1:]].dtypes=='category')[0])
print("Categorical Variables:", cat_pos)

Number of columns will all NaNs: 1555
Number of additional columns dropped (>0% NaNs): 21757
Size of the dataframe now: (175, 1669)
Categorical Variables: [1667]


In [6]:
# Sanity check after filtering: show the null count of each column, then the
# grand total over the whole frame.
nulls_per_column = df.isnull().sum()
display(nulls_per_column)
print(nulls_per_column.sum())

sample_id    0
27           0
36           0
59           0
87           0
            ..
388336       0
259266       0
317762       0
261726       0
PCOS         0
Length: 1669, dtype: int64

0


In [7]:
# Persist the filtered frame. to_csv is called with its defaults, so the row
# index is written as the first, unnamed column of the file.
df.to_csv("../datasets/common_normalized.csv")

In [8]:
# Display the frame as the cell output (pandas truncates wide/long frames).
df

Unnamed: 0,sample_id,27,36,59,87,94,105,153,164,159,...,254359,254531,100132341,100287932_100652748,387893,388336,259266,317762,261726,PCOS
0,GSM27536,0.693258,0.125461,0.336077,0.044463,0.267819,0.467742,0.490196,0.370576,0.008907,...,0.396190,1.000000,0.412466,0.601428,0.000000,0.312354,0.198387,0.770810,0.120213,1
1,GSM27537,0.214607,0.487085,0.589704,0.104294,0.000000,0.106452,0.000000,0.476058,0.631829,...,0.564190,0.497418,0.325390,0.850926,0.356499,0.173660,0.430645,0.000000,1.000000,1
2,GSM27538,0.241573,0.446494,0.548203,0.247916,0.033477,1.000000,0.137255,0.529493,0.457245,...,0.712381,0.686747,0.252062,1.000000,0.371943,0.097902,0.517742,0.676169,0.673404,1
3,GSM27540,0.000000,0.173432,0.000000,0.139978,0.242981,0.000000,0.117647,0.761277,0.350356,...,0.354286,0.778830,0.195234,0.791564,0.444015,0.132867,0.156452,0.729761,0.730851,1
4,GSM27541,0.723596,0.845018,0.373149,0.446988,0.768898,0.212903,0.352941,0.722415,1.000000,...,0.108190,0.254733,0.000000,0.267351,0.549550,0.343823,1.000000,0.329532,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,GSM1174420,0.581038,0.637359,1.000000,1.000000,0.185537,0.404005,0.550455,0.423211,0.683348,...,0.115631,0.237789,0.250726,0.765354,0.759678,0.277592,0.570973,0.200546,0.557889,-1
171,GSM1174410,0.266673,0.783178,0.487781,0.475529,0.342318,0.581647,0.313639,0.341682,0.616150,...,0.478896,0.360149,0.547353,0.633198,0.386967,0.683394,0.712426,0.648506,0.671575,-1
172,GSM1174411,0.132671,0.812518,0.490907,0.276122,0.299019,0.514624,0.455273,0.348759,0.707459,...,0.386777,0.281866,0.375745,0.966289,0.086710,0.436739,0.775867,0.504792,1.000000,-1
173,GSM1174415,0.219215,0.935999,0.589718,0.443145,0.302450,0.435289,0.000000,0.693894,0.578225,...,0.105437,0.000000,0.283685,0.956116,0.228903,0.929545,0.710650,0.238808,0.872641,-1
