In [16]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
import random as rnd
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Load the CRISPR gene effect table (one row per DepMap_ID, as used by the merge below)
prob_df = pd.read_csv('CRISPR_gene_effect.csv')

# Load the sample annotations and keep only the identifier plus the
# two label columns used downstream
label_df = (
    pd.read_csv('sample_info.csv')
    .reset_index()
    .loc[:, ['DepMap_ID', 'lineage', 'sex']]
)

In [18]:
# Attach the labels to the gene effect rows; inner join on the shared DepMap_ID key
combined_df = prob_df.merge(label_df, on='DepMap_ID')

In [19]:
# Drop every row (cell line) that contains at least one NaN value.
# axis=0 removes rows, not columns. Reassigning instead of mutating in
# place keeps the cell safe to re-run.
combined_df = combined_df.dropna(axis=0)

In [20]:
# Keep only lineages represented by at least 2 rows; rarer lineages are discarded
combined_df = combined_df[combined_df.groupby('lineage')['lineage'].transform('size') >= 2]

In [21]:
# Distinct lineage labels, in order of first appearance
all_lineages = combined_df['lineage'].unique().tolist()

In [22]:
def cancer_binary_subset(df, target_lineage, random_state=None):
    """Build a class-balanced binary dataset: `target_lineage` vs. everything else.

    All rows whose 'lineage' equals `target_lineage` are kept as positives.
    An equally sized random sample of the remaining rows is drawn as
    negatives. If there are fewer remaining rows than positives, all of
    them are used (the result is then not perfectly balanced).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'lineage' and 'DepMap_ID'; every other
        column is treated as a feature.
    target_lineage : str
        Value of the 'lineage' column that defines the positive class.
    random_state : int or None, optional
        Seed for the negative sampling. With the default ``None`` the
        module-level ``rnd`` generator is used, so ``rnd.seed(...)`` still
        controls the draw.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with 'lineage' and 'DepMap_ID' removed and the
        remaining non-numeric columns one-hot encoded.
    y : pandas.Series of bool
        True where the row belongs to `target_lineage`.
    """
    # Subset data into target lineage and everything else
    is_target = df['lineage'] == target_lineage
    cancer_df = df[is_target]
    not_cancer_df = df[~is_target]

    # rnd.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available negatives.
    n_negatives = min(len(cancer_df), len(not_cancer_df))

    # Sample row positions rather than index labels, so a DataFrame with
    # duplicate index labels cannot yield extra rows.
    rng = rnd if random_state is None else rnd.Random(random_state)
    sample_positions = rng.sample(range(len(not_cancer_df)), k=n_negatives)
    not_cancer_sample = not_cancer_df.iloc[sample_positions]

    # Concatenate the dataframes (positives first, then sampled negatives)
    total_df = pd.concat([cancer_df, not_cancer_sample], axis=0)

    # Separate the features
    X = total_df.drop(columns=['lineage', 'DepMap_ID'])
    X = pd.get_dummies(X)

    # Separate the targets
    y = total_df['lineage'] == target_lineage

    return X, y

In [23]:
# One-vs-rest k-nearest-neighbours baseline, evaluated separately per lineage
knn = KNeighborsClassifier(n_neighbors=4)

# cancer_binary_subset draws its negatives with the `rnd` module; seeding it
# here makes the balanced subsets (and therefore the scores) repeatable.
rnd.seed(0)
for lineage in all_lineages:
    X, y = cancer_binary_subset(combined_df, lineage)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=True)
    knn.fit(X_train, y_train)
    # For classifiers, score() is the mean accuracy on the held-out split (not R^2)
    score = knn.score(X_test, y_test)

    print(f'Accuracy for {lineage}: {score}')

R2 value for ovary: 0.5263157894736842
R2 value for blood: 0.717948717948718
R2 value for colorectal: 0.8235294117647058
R2 value for urinary_tract: 0.6
R2 value for lung: 0.47674418604651164
R2 value for breast: 0.84375
R2 value for pancreas: 0.5333333333333333
R2 value for plasma_cell: 1.0
R2 value for central_nervous_system: 0.6078431372549019
R2 value for soft_tissue: 0.6428571428571429
R2 value for bone: 0.7692307692307693
R2 value for gastric: 0.5
R2 value for lymphocyte: 0.84
R2 value for peripheral_nervous_system: 0.7083333333333334
R2 value for kidney: 0.8888888888888888
R2 value for prostate: 0.5
R2 value for thyroid: 0.6666666666666666
R2 value for upper_aerodigestive: 0.8444444444444444
R2 value for unknown: 0.5
R2 value for uterus: 0.6086956521739131
R2 value for bile_duct: 0.6111111111111112
R2 value for liver: 0.6
R2 value for skin: 0.5581395348837209
R2 value for esophagus: 0.5714285714285714
R2 value for cervix: 0.7272727272727273
R2 value for eye: 0.8


In [24]:
# One-vs-rest logistic regression, evaluated separately per lineage
logreg = LogisticRegression()

# cancer_binary_subset draws its negatives with the `rnd` module; seeding it
# here makes the balanced subsets (and therefore the scores) repeatable.
rnd.seed(0)
for lineage in all_lineages:
    X, y = cancer_binary_subset(combined_df, lineage)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=True)
    logreg.fit(X_train, y_train)
    # For classifiers, score() is the mean accuracy on the held-out split (not R^2)
    score = logreg.score(X_test, y_test)

    print(f'Accuracy for {lineage}: {score}')

R2 value for ovary: 0.7894736842105263
R2 value for blood: 1.0
R2 value for colorectal: 0.7941176470588235
R2 value for urinary_tract: 0.75
R2 value for lung: 0.6162790697674418
R2 value for breast: 0.8125
R2 value for pancreas: 0.8666666666666667
R2 value for plasma_cell: 0.9230769230769231
R2 value for central_nervous_system: 0.8823529411764706
R2 value for soft_tissue: 0.6785714285714286
R2 value for bone: 0.8076923076923077
R2 value for gastric: 0.7727272727272727
R2 value for lymphocyte: 0.88
R2 value for peripheral_nervous_system: 0.875
R2 value for kidney: 0.8333333333333334
R2 value for prostate: 1.0
R2 value for thyroid: 0.6666666666666666
R2 value for upper_aerodigestive: 0.9111111111111111
R2 value for unknown: 1.0
R2 value for uterus: 0.6956521739130435
R2 value for bile_duct: 0.6111111111111112
R2 value for liver: 0.6
R2 value for skin: 0.9069767441860465
R2 value for esophagus: 0.7619047619047619
R2 value for cervix: 0.8181818181818182
R2 value for eye: 0.9


In [25]:
# One-vs-rest random forest, evaluated separately per lineage.
# random_state pins the forest's own randomness so re-runs give the same scores.
rfc = RandomForestClassifier(random_state=0)

# cancer_binary_subset draws its negatives with the `rnd` module; seeding it
# here makes the balanced subsets (and therefore the scores) repeatable.
rnd.seed(0)
for lineage in all_lineages:
    X, y = cancer_binary_subset(combined_df, lineage)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, shuffle=True)
    rfc.fit(X_train, y_train)
    # For classifiers, score() is the mean accuracy on the held-out split (not R^2)
    score = rfc.score(X_test, y_test)

    print(f'Accuracy for {lineage}: {score}')

R2 value for ovary: 0.7368421052631579
R2 value for blood: 0.8974358974358975
R2 value for colorectal: 0.7352941176470589
R2 value for urinary_tract: 0.55
R2 value for lung: 0.6511627906976745
R2 value for breast: 0.78125
R2 value for pancreas: 0.7333333333333333
R2 value for plasma_cell: 0.8461538461538461
R2 value for central_nervous_system: 0.7647058823529411
R2 value for soft_tissue: 0.6785714285714286
R2 value for bone: 0.6923076923076923
R2 value for gastric: 0.7272727272727273
R2 value for lymphocyte: 0.92
R2 value for peripheral_nervous_system: 0.8333333333333334
R2 value for kidney: 0.8333333333333334
R2 value for prostate: 0.6666666666666666
R2 value for thyroid: 0.3333333333333333
R2 value for upper_aerodigestive: 0.8888888888888888
R2 value for unknown: 0.5
R2 value for uterus: 0.6521739130434783
R2 value for bile_duct: 0.6666666666666666
R2 value for liver: 0.5333333333333333
R2 value for skin: 0.7674418604651163
R2 value for esophagus: 0.8571428571428571
R2 value for cerv