In [1]:
import numpy as np
import pandas as pd
import pickle as pkl

import time
from tqdm import tqdm

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Colab-only: mount Google Drive at /content/drive/ so the CSV inputs and the
# pickle output under /content/drive/MyDrive/ are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Candidate latent dimensionalities (k) to evaluate.
# The grid is dense for small k and gets coarser as k grows.
# A total of 28 latent dimensions are taken under consideration.

k_list = [
    *range(2, 10),         # 2 .. 9, step 1
    *range(10, 20, 2),     # 10 .. 18, step 2
    *range(20, 50, 5),     # 20 .. 45, step 5
    *range(50, 61, 10),    # 50, 60
    78,                    # one-off value; NOTE(review): rationale is not stated in this notebook
    *range(80, 100, 10),   # 80, 90
    *range(100, 176, 25),  # 100 .. 175, step 25
]

print("Latent dimensions:")
print(k_list)

Latent dimensions:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, 50, 60, 78, 80, 90, 100, 125, 150, 175]


In [4]:
# Load the two input tables from Drive.
#   - common_norm_df: the normalized data; the first CSV column becomes the index.
#   - df_new: per-sample annotations, namely
#       * PCOS/Control
#       * Cell type
#       * Dataset they belong to

common_norm_path = '/content/drive/MyDrive/aacb_project/datasets/common_normalized.csv'
mapping_path = "/content/drive/MyDrive/aacb_project/datasets/control_pcos_celltype_mapping.csv"

common_norm_df = pd.read_csv(common_norm_path, index_col=0)
df_new = pd.read_csv(mapping_path)
df_new.head()

Unnamed: 0,sample_id,PCOS/Control,cell_type,dataset
0,GSM1174425,PCOS,endothelial,GDS4987
1,GSM1174429,PCOS,endothelial,GDS4987
2,GSM1174436,PCOS,endothelial,GDS4987
3,GSM1174427,PCOS,epithelial,GDS4987
4,GSM1174430,PCOS,epithelial,GDS4987


In [5]:
# Rearrange the rows in df_new so that they follow the same sample order as
# common_norm_df.
# Map every sample_id to the row position of its first occurrence in df_new;
# a dict lookup replaces a linear list.index() scan per sample.
first_position = {}
for row_number, sample_id in enumerate(df_new["sample_id"]):
    first_position.setdefault(sample_id, row_number)

# A sample_id that is in common_norm_df but missing from df_new raises KeyError here.
position = [first_position[sample_id] for sample_id in common_norm_df["sample_id"]]

# `position` holds row positions (not labels), so select with iloc and
# renumber the index from 0 without keeping the old one as a column.
df_new = df_new.iloc[position].reset_index(drop=True)

# Merge the two dataframes together on sample_id; the last column of df_new
# is left out of the merge.
result = pd.merge(common_norm_df, df_new[df_new.columns[:-1]], how='inner', on='sample_id')
result.head()

Unnamed: 0,sample_id,27,36,59,87,94,105,153,159,164,226,288,290,311,330,334,335,345,355,359,377,382,389,392,394,405,408,420,430,443,463,476,487,488,515,533,552,567,572,582,...,92822,93164,93487,93974,6248_84301,8693_100528030,100506581,112399,112479,374655,375035,375057,113251,114088,114791,114882,116228,116285,116985,116986,51463_653519,118491,118987,120227,645644,100129482,100529257_55333,253512,122704,253959,254359,254531,100132341,387893,388336,259266,261726,PCOS,PCOS/Control,cell_type
0,GSM27536,0.693258,0.125461,0.336077,0.044463,0.267819,0.467742,0.490196,0.008907,0.370576,0.953515,0.0,0.060236,0.230814,0.078014,0.1193,1.0,1.0,0.0,0.712054,0.648867,0.535433,0.593551,0.65132,0.0,0.0,1.0,0.132791,1.0,0.0,0.263318,0.725872,0.701149,0.395953,0.302829,0.83933,0.349876,0.835372,0.502399,0.800414,...,0.146067,0.064516,0.915703,0.768041,0.925659,0.0,0.85574,0.130112,0.0,1.0,0.086751,0.288095,0.0,0.560748,0.965517,0.0,0.121212,0.0,1.0,0.261745,0.209231,0.061603,0.295745,0.489703,0.481948,0.113924,0.100254,0.0,0.369072,0.0,0.39619,1.0,0.412466,0.0,0.312354,0.198387,0.120213,1,PCOS,theca
1,GSM27537,0.214607,0.487085,0.589704,0.104294,0.0,0.106452,0.0,0.631829,0.476058,1.0,0.0409,0.038093,0.799387,0.131206,0.118719,0.0,0.025316,0.216724,0.023437,0.286346,0.619005,0.946925,0.854954,0.337931,0.352278,0.149398,0.636856,0.0,0.184211,0.238965,0.808785,0.609195,0.561854,0.916302,0.265155,0.28536,0.808202,0.291843,0.515512,...,0.070225,0.129032,0.512184,0.694845,0.0,0.536028,0.937385,0.0,0.809129,0.158155,0.586751,0.424603,0.644419,1.0,0.0,0.949077,0.994318,0.085938,0.427987,0.049664,0.763077,0.0,1.0,0.080092,0.0,0.082278,0.40736,0.418699,0.437113,0.367855,0.56419,0.497418,0.32539,0.356499,0.17366,0.430645,1.0,1,PCOS,theca
2,GSM27538,0.241573,0.446494,0.548203,0.247916,0.033477,1.0,0.137255,0.457245,0.529493,0.864979,0.321063,0.21261,0.564342,0.124113,0.0,0.119891,0.0,0.205272,0.253348,0.610282,1.0,0.764284,0.762722,0.244444,0.062224,0.227353,0.00271,0.0,0.338057,0.135464,1.0,0.452107,0.908008,0.264473,1.0,0.25062,0.634656,0.218459,0.446225,...,0.390449,0.322581,1.0,0.691409,0.534772,0.061511,0.709024,0.009294,0.578838,0.166392,0.323344,0.0,0.183544,0.88785,0.75431,0.677912,0.556818,0.035156,0.0,0.618792,0.622154,0.575527,0.753191,0.338673,0.245172,0.0,0.480964,0.317073,0.618557,0.229993,0.712381,0.686747,0.252062,0.371943,0.097902,0.517742,0.673404,1,PCOS,theca
3,GSM27540,0.0,0.173432,0.0,0.139978,0.242981,0.0,0.117647,0.350356,0.761277,0.958612,0.243354,1.0,0.103006,1.0,0.585332,0.035422,0.778481,0.864088,0.006696,0.321946,0.977631,0.541706,0.052429,0.422222,0.176384,0.0,0.439024,0.25,0.350202,0.324201,0.706619,0.689655,0.372847,0.309933,0.948579,0.0,1.0,0.230596,0.0,...,0.839888,0.612903,0.623574,1.0,0.633094,0.084359,0.448128,0.589219,0.929461,0.337727,0.160883,0.315079,0.539701,0.0,0.625,0.810312,0.0,1.0,0.135025,0.0,0.688,0.151899,0.555319,0.0,0.359362,0.120253,0.083756,0.158537,0.0,0.240081,0.354286,0.77883,0.195234,0.444015,0.132867,0.156452,0.730851,1,PCOS,theca
4,GSM27541,0.723596,0.845018,0.373149,0.446988,0.768898,0.212903,0.352941,1.0,0.722415,0.571534,0.26244,0.150497,0.178874,0.836879,1.0,0.169619,0.525316,1.0,0.444196,0.0,0.0,0.461032,0.0,1.0,0.516903,0.341622,0.116531,0.125,0.504049,1.0,0.265945,0.214559,0.254592,1.0,0.0,0.665012,0.44067,0.187412,0.415202,...,1.0,0.0,0.273651,0.334021,0.654676,0.557118,0.0,0.124535,1.0,0.3229,0.0,0.246825,1.0,0.186916,0.362069,0.969764,0.507576,0.136719,0.115385,0.053691,0.950154,0.858228,0.821277,0.819222,0.680101,0.120253,0.621827,0.20935,0.171134,1.0,0.10819,0.254733,0.0,0.54955,0.343823,1.0,0.0,1,PCOS,theca


In [6]:
def _soft_binary_cross_entropy(target, prediction):
    """Mean element-wise binary cross-entropy of `prediction` against `target`.

    `target` is used as-is (continuous values are NOT truncated to 0/1), and
    `prediction` is clipped to [eps, 1 - eps] so the logarithms stay finite
    even when a reconstruction falls outside [0, 1].
    """
    eps = np.finfo(np.float64).eps
    clipped = np.clip(prediction, eps, 1 - eps)
    per_element = target * np.log(clipped) + (1 - target) * np.log(1 - clipped)
    return float(-np.mean(per_element))


def get_cost_reconstruction(X, model, k_list=k_list):
    """Fit one decomposition per latent dimension and score its reconstruction.

    For every k in `k_list` the chosen estimator is fitted on X, X is projected
    to k components and mapped back, and the reconstruction is compared to X.

    Parameters
    ----------
    X : numpy.ndarray
        2-D data matrix. The cross-entropy is only meaningful for values in
        [0, 1] -- presumably X is min-max normalized; confirm against the
        preprocessing step.
    model : str
        One of "pca", "ica" or "nmf".
    k_list : list of int
        Latent dimensions to evaluate.

    Returns
    -------
    bce_loss : list of float
        Mean binary cross-entropy between X and its reconstruction, one per k.
        The targets are the continuous values of X; the earlier implementation
        cast X to int first, which turned every value below 1.0 into 0.
    l2_error : list of float
        Norm of (X - reconstruction) as returned by np.linalg.norm, one per k.
    output : dict
        Maps k to the fitted estimator's `components_`.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    # One constructor per supported algorithm; the settings match for all k.
    builders = {
        "pca": lambda k: PCA(n_components=k, random_state=4),
        "ica": lambda k: FastICA(n_components=k, random_state=4, max_iter=400),
        "nmf": lambda k: NMF(n_components=k, random_state=4, max_iter=400),
    }
    if model not in builders:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, sorted(builders))
        )

    bce_loss = []
    l2_error = []
    output = {}

    print("Calculating Reconstruction Error for:", model.upper())
    # Short pause so the line above is shown before tqdm starts drawing its bar.
    time.sleep(1)

    build = builders[model]
    for k in tqdm(k_list):
        estimator = build(k)
        # fit() followed by transform() is kept on purpose: for NMF,
        # transform(X) is not guaranteed to equal the output of fit_transform(X).
        estimator.fit(X)
        reduced = estimator.transform(X)
        reconstructed = estimator.inverse_transform(reduced)

        bce_loss.append(_soft_binary_cross_entropy(X, reconstructed))
        l2_error.append(np.linalg.norm(X - reconstructed))
        output[k] = estimator.components_

    return bce_loss, l2_error, output

In [7]:
model_list = ["pca", "ica", "nmf"]

# Feature matrix: every column of common_norm_df except the first and the last.
feature_columns = common_norm_df.columns[1:-1]
X = common_norm_df[feature_columns].to_numpy()

# bce_loss / l2_error collect one list per model, in model_list order;
# reconstruction_list is keyed by model name.
bce_loss, l2_error = [], []
reconstruction_list = {}

for model in model_list:
    model_bce, model_l2, model_components = get_cost_reconstruction(X, model)
    bce_loss.append(model_bce)
    l2_error.append(model_l2)
    reconstruction_list[model] = model_components

Calculating Reconstruction Error for: PCA


100%|██████████| 28/28 [00:04<00:00,  6.57it/s]


Calculating Reconstruction Error for: ICA


100%|██████████| 28/28 [00:09<00:00,  2.90it/s]


Calculating Reconstruction Error for: NMF


100%|██████████| 28/28 [03:17<00:00,  7.07s/it]


In [8]:
# Regroup the stored components from reconstruction_list[algo][k]
# into z_dict[k][algo].
z_dict = {}
for k in k_list:
    components_by_algo = {}
    for algo in model_list:
        components_by_algo[algo] = reconstruction_list[algo][k]
    z_dict[k] = components_by_algo

In [12]:
# Persist z_dict to Drive as a pickle file.
z_dict_path = '/content/drive/MyDrive/aacb_project/datasets/z_dict_pca_ica_nmf.p'
with open(z_dict_path, 'wb') as pickle_file:
    pkl.dump(z_dict, pickle_file)