In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used in later cells -- consider narrowing the filter.
warnings.filterwarnings("ignore")

import missingno
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from missingpy import MissForest

In [2]:
# Dataset identifiers used to build the "<id>_normalized.csv" file names,
# kept in ascending lexicographic order.
file_names = sorted(
    [
        "GDS4987",
        "GDS4399",
        "GDS4133",
        "GDS4132",
        "GDS3841",
        "GDS3104",
        "GDS2084",
        "GDS1051",
        "GDS1050",
    ]
)

In [3]:
# Load every per-dataset normalized CSV and stack the rows into one frame.
# The frames are collected first and concatenated once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending inside the loop
# re-copies all accumulated rows on every iteration.
frames = []
seen_columns = set()  # union of column labels so far (concat keeps the union)
total_rows = 0
for i, model in enumerate(file_names):
    temp = pd.read_csv("../datasets/" + model + "_normalized.csv")
    frames.append(temp)
    seen_columns.update(temp.columns)
    total_rows += temp.shape[0]
    if i > 0:
        # Progress line: running shape of the stacked data and rows just added.
        print("Shape: ", (total_rows, len(seen_columns)), "; Addition: ", temp.shape[0], sep="")

# ignore_index=True renumbers the rows 0..n-1 across all source files.
df = pd.concat(frames, ignore_index=True)

# Persist the raw consolidated table (the row index is written as the first column).
df.to_csv("../datasets/consolidated_normalized.csv")

Shape: (26, 24981); Addition: 13
Shape: (41, 24981); Addition: 15
Shape: (70, 24981); Addition: 29
Shape: (93, 24981); Addition: 23
Shape: (113, 24981); Addition: 20
Shape: (136, 24981); Addition: 23
Shape: (146, 24981); Addition: 10
Shape: (175, 24981); Addition: 29


In [4]:
# Print a concise summary of the consolidated frame: index range, number of
# columns, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Columns: 24981 entries, sample_id to PCOS
dtypes: float64(24979), int64(1), object(1)
memory usage: 33.4+ MB


In [5]:
# Drop columns that have all null values
all_null_values = [i for i in df.columns[1:] if df[i].isnull().all()]
df.drop(all_null_values, axis=1, inplace=True)
print("Number of columns will all NaNs:", len(all_null_values))

# Removing additional columns that have nulls
more_than_0_null = [i for i in df.columns[1:] if df[i].isnull().sum() > 0]
df.drop(more_than_0_null, axis=1, inplace=True)
print("Number of additional columns dropped (>0% NaNs):", len(more_than_0_null))
print("Size of the dataframe now:", df.shape)

# Preprocessing
df["PCOS"] = df["PCOS"].astype('category')
cat_pos = list(np.where(df[df.columns[1:]].dtypes=='category')[0])
print("Categorical Variables:", cat_pos)

Number of columns will all NaNs: 1555
Number of additional columns dropped (>0% NaNs): 18808
Size of the dataframe now: (175, 4618)
Categorical Variables: [4616]


In [6]:
# Missing values per column, followed by the grand total over the whole frame.
null_counts = df.isnull().sum()
display(null_counts)
print(null_counts.sum())

sample_id           0
2                   0
12                  0
16                  0
18                  0
                   ..
261726              0
261734              0
100534595_221092    0
100533181_486       0
PCOS                0
Length: 4618, dtype: int64

0


In [7]:
# Write the cleaned frame to disk. This is the same path the raw consolidated
# table was written to earlier in the notebook, so that file is overwritten.
# With the default settings the row index is saved as an unnamed first column.
df.to_csv("../datasets/consolidated_normalized.csv")

In [8]:
# Display the cleaned frame (pandas truncates the rendering for wide/long frames).
df

Unnamed: 0,sample_id,2,12,16,18,21,24,27,32,34,...,390502,390616,390999,653808,317762,261726,261734,100534595_221092,100533181_486,PCOS
0,GSM27536,0.356383,0.305418,0.477974,0.204301,0.405797,0.429752,0.693258,0.455461,0.000000,...,0.140998,0.765579,0.026634,0.243243,0.770810,0.120213,1.000000,0.156442,0.600733,1
1,GSM27537,0.597518,0.919428,0.348646,0.443441,0.536232,0.123967,0.214607,0.000000,0.770288,...,0.151844,0.379822,0.052462,0.087838,0.000000,1.000000,0.331117,0.000000,0.435897,1
2,GSM27538,0.365248,1.000000,0.179577,0.138925,0.980676,0.169421,0.241573,0.333850,0.513743,...,0.394794,0.163205,0.163035,1.000000,0.676169,0.673404,0.521277,0.325153,0.406593,1
3,GSM27540,0.150709,0.000000,0.019669,0.259785,0.000000,0.140496,0.000000,0.487219,0.657723,...,0.000000,0.169139,0.000000,0.101351,0.729761,0.730851,0.000000,0.444785,0.000000,1
4,GSM27541,0.000000,0.115201,0.192375,0.775484,0.241546,0.280992,0.723596,0.417506,1.000000,...,0.249458,0.620178,0.282486,0.148649,0.329532,0.000000,0.739362,0.239264,0.366300,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,GSM1174420,0.820160,0.170277,0.862773,0.440229,0.373087,0.263756,0.581038,0.626831,0.396902,...,0.116683,0.551667,0.394304,0.499936,0.200546,0.557889,0.738547,0.528741,0.297046,-1
171,GSM1174410,0.272481,0.036889,0.788318,0.435212,0.569510,0.124192,0.266673,0.436317,0.740972,...,0.119115,0.651566,0.387843,0.730485,0.648506,0.671575,0.945682,0.751633,0.950924,-1
172,GSM1174411,0.376323,0.036739,0.379657,0.432101,0.452168,0.176891,0.132671,0.322771,1.000000,...,0.162493,0.451815,0.626175,0.596608,0.504792,1.000000,0.227078,0.751568,0.490597,-1
173,GSM1174415,0.471400,0.114186,0.748434,0.700640,0.000000,0.569880,0.219215,0.462896,0.867065,...,0.181427,0.378690,0.471187,0.204347,0.238808,0.872641,0.667150,0.793771,0.331359,-1
