# Dimensionality Reduction

In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the two cleaned inputs: the gene-expression table and the GDSC1
# cell-line table. Both paths are relative to the notebook's directory.
expression_df, gdsc1_cell_lines_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../data/expression_df_clean.parquet",
        "../data/gdsc1_cell_lines_df_clean.parquet",
    )
)

In [3]:
# Every column other than the identifier columns is treated as a gene column.
gene_cols = [
    column
    for column in expression_df.columns
    if column not in ('COSMIC_ID', 'GENE_SYMBOLS')
]
print(f"Genes: {len(gene_cols)}")

# Store all gene columns as float32.
expression_df[gene_cols] = expression_df[gene_cols].astype('float32')

Genes: 17419


## Method 1: Highly Variable Genes [Feature Selection]

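### A highly variable gene (HVG) is a gene whose expression varies strongly across the cell lines in the dataset. From the mean $\mu$ and standard deviation $\sigma$ of each gene column you can calculate the coefficient of variation, $CV = \sigma / \mu$; the genes with the largest CV are kept as features.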


Note: we might need another way to calculate highly variable genes.

In [4]:
# Highly variable genes (HVG), simplest option: rank genes by their
# coefficient of variation (CV = standard deviation / mean).
# TODO: this approach was suggested by a chat assistant and still needs to be
# confirmed; other HVG-based ways of reducing dimensionality may be worth trying.

# Slice the gene columns once so the wide frame is not copied for each statistic.
gene_expression = expression_df[gene_cols]
mu = gene_expression.mean()
sigma = gene_expression.std()

# The small epsilon avoids division by zero for a gene whose mean is exactly 0.
cv = sigma / (mu + 1e-6)

# Select the top 2000 genes by CV (coefficient of variation)
top_2000_genes = cv.nlargest(2000).index.tolist()

GENE_SYMBOLS
TSPAN6      0.282659
TNMD        0.089558
DPM1        0.053858
SCYL3       0.124744
C1orf112    0.116423
dtype: float32
float32


In [5]:
# Reduced expression table: the cell-line identifier plus the selected genes.
selected_cols = ['COSMIC_ID', *top_2000_genes]
expression_red = expression_df.loc[:, selected_cols].copy()

# Make sure the selected gene columns are float32 in the reduced copy.
expression_red[top_2000_genes] = expression_red[top_2000_genes].astype('float32')

expression_red.head()

GENE_SYMBOLS,COSMIC_ID,RPS4Y1,ITM2A,MIR205HG,HLA-DRA,TACSTD2,SPP1,KRT6A,LAPTM5,TSPAN8,...,ASS1,COL23A1,CCDC74A,HLX,ORM2,STAP2,DOK5,SH3GL2,SMIM3,GRAMD3
0,906826,3.29073,2.621757,4.288404,2.903513,11.081974,2.917451,2.746571,2.807994,2.687465,...,5.93969,2.970062,6.760531,4.39281,2.748087,5.789423,2.969498,2.849725,3.465793,6.921724
1,687983,8.145208,2.643407,3.018357,3.002582,2.833932,2.890623,2.78359,2.954887,2.75562,...,6.475314,4.64703,7.887856,3.661675,3.081276,4.066525,2.947699,2.724412,3.165486,3.482354
2,910927,3.33739,3.925171,2.903852,2.669335,2.866781,5.265876,2.837032,2.983895,4.717341,...,11.020127,2.807256,6.843297,3.064123,2.889762,5.335353,2.814388,3.459408,3.890452,5.766239
3,1240138,3.019027,2.735902,3.124749,2.8644,2.78187,3.585305,2.814675,2.812968,2.900259,...,11.061975,2.944432,6.405406,5.762559,2.691225,6.634615,2.962747,2.889696,4.160778,5.271458
4,1240139,11.226059,2.690414,3.014851,2.866014,2.849831,5.050464,2.809895,2.937513,3.048636,...,8.643678,2.610121,3.694187,3.929687,3.058894,3.959592,2.891671,2.876758,3.732006,7.038034


In [6]:
# Attach the reduced expression features to each GDSC1 row by COSMIC_ID.
# The default inner join keeps only COSMIC_IDs present in both tables.
df = gdsc1_cell_lines_df.merge(expression_red, on='COSMIC_ID')
df.head()

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,ASS1,COL23A1,CCDC74A,HLX,ORM2,STAP2,DOK5,SH3GL2,SMIM3,GRAMD3
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,...,8.616495,2.866278,7.210727,3.155348,2.974097,5.157155,3.009701,2.89728,3.40775,4.085085
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,...,12.583158,2.704864,5.801737,3.354692,2.839417,4.176773,4.320833,3.306872,3.367319,3.594063
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,...,7.404163,2.893073,6.602934,3.66746,2.954791,3.871328,2.91313,2.816141,3.520408,3.315893
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,...,11.642661,2.693721,7.023689,3.194075,2.903121,4.217533,3.624136,2.951353,3.526348,3.42438
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,...,4.878064,2.699771,3.644734,5.291128,2.926488,3.585454,3.144589,3.037634,3.273145,3.904578


In [7]:
# Number of merged rows per drug name, most frequent first.
drug_counts = df.loc[:, "DRUG_NAME"].value_counts()
print(drug_counts)

DRUG_NAME
Cisplatin      1786
AZD7762        1784
SN-38          1783
PLX-4720       1777
Avagacestat    1771
               ... 
torin2          121
HG-6-71-01      121
THZ-1-87         93
XMD11-50         93
THZ-2-98-01      93
Name: count, Length: 378, dtype: int64
