In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from pprint import pprint
import random as rnd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read in probability data (joined to the labels on 'DepMap_ID' further down).
prob_df = pd.read_csv('CRISPR_gene_dependency 3.06.36 PM.csv')

# Read in labels and keep only the join key plus the two label columns.
# No reset_index() here: read_csv already yields a fresh 0..n-1 index, and the
# extra 'index' column reset_index() would add is discarded by this selection.
label_df = pd.read_csv('sample_info.csv')[['DepMap_ID', 'lineage', 'sex']]

In [3]:
# Inner-join probabilities with their labels; rows whose DepMap_ID is missing
# from either table are dropped by the join.
combined_df = prob_df.merge(label_df, how='inner', on='DepMap_ID')

In [4]:
# Keep only rows that have a value in every column (features and labels).
combined_df = combined_df.dropna(axis='index', how='any')

In [5]:
# Partition the samples by lineage: blood vs. everything else.
blood_mask = combined_df['lineage'] == 'blood'
blood_df = combined_df.loc[blood_mask]
not_blood_df = combined_df.loc[~blood_mask]

In [6]:
# Balance the classes: draw exactly as many non-blood rows as there are blood rows.
size = len(blood_df)

# Sample with a dedicated, seeded generator so the down-sampled set (and every
# metric computed from it) is the same on each Restart & Run All, and so that
# re-running this cell is idempotent. The global `random` state is left untouched.
not_blood = rnd.Random(0).sample(list(not_blood_df.index), k=size)
not_blood_df2 = not_blood_df.loc[not_blood]

# Blood rows first, followed by the sampled non-blood rows.
blood_data = pd.concat([blood_df, not_blood_df2], axis=0)
blood_data

Unnamed: 0,DepMap_ID,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),...,ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009),lineage,sex
1,ACH-000004,0.012676,0.049011,0.075933,0.033215,0.013176,0.097497,0.005015,0.153166,0.007358,...,0.007125,0.021209,0.011203,0.060266,0.128375,0.005911,0.004645,0.042530,blood,Male
2,ACH-000005,0.053957,0.027968,0.010139,0.005448,0.018599,0.081636,0.005457,0.159904,0.050884,...,0.054525,0.033396,0.033416,0.034712,0.092832,0.012482,0.020843,0.050412,blood,Male
28,ACH-000045,0.032071,0.009133,0.016520,0.018493,0.015912,0.037355,0.107148,0.265936,0.049798,...,0.003826,0.003129,0.012569,0.053228,0.025654,0.040709,0.096456,0.408855,blood,Male
37,ACH-000070,0.006839,0.016416,0.011145,0.000823,0.249916,0.023081,0.011167,0.047596,0.014651,...,0.004846,0.001659,0.018769,0.045529,0.011120,0.050825,0.046322,0.347739,blood,Male
38,ACH-000074,0.022002,0.017107,0.005077,0.000649,0.018865,0.044079,0.009882,0.012521,0.018678,...,0.006785,0.012226,0.028397,0.043169,0.012278,0.007279,0.005885,0.156583,blood,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,ACH-001295,0.067002,0.041778,0.033993,0.048634,0.037873,0.128578,0.113695,0.160861,0.028161,...,0.015653,0.015170,0.037919,0.051443,0.059393,0.060943,0.237563,0.481471,eye,Female
948,ACH-001862,0.003301,0.010862,0.053514,0.115383,0.056457,0.610802,0.012984,0.508069,0.127964,...,0.056923,0.011835,0.083590,0.063044,0.064767,0.185720,0.174639,0.032133,bile_duct,Male
64,ACH-000120,0.152735,0.071259,0.015100,0.039175,0.030886,0.138717,0.017778,0.673827,0.026037,...,0.013832,0.002857,0.037749,0.014961,0.066051,0.089158,0.042005,0.558650,peripheral_nervous_system,Male
826,ACH-001536,0.103327,0.105757,0.023968,0.226432,0.148291,0.179185,0.117725,0.200721,0.051404,...,0.057576,0.086684,0.016250,0.111736,0.048491,0.096945,0.144653,0.226515,bile_duct,Female


In [7]:
# Features: everything except the target source column and the sample ID;
# any remaining non-numeric column (e.g. 'sex') is one-hot encoded.
X = pd.get_dummies(blood_data.drop(columns=['lineage', 'DepMap_ID']))

# Target: True for blood-lineage samples, False otherwise.
is_blood_lineage = blood_data['lineage'] == 'blood'
y = is_blood_lineage

In [12]:
# Hold out 33% of the balanced samples for evaluation; the fixed random_state
# makes the shuffle-split repeatable.
# NOTE(review): the split is not stratified and the solver runs with its
# default settings -- confirm the fit converges and both classes are well
# represented in the test fold.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    shuffle=True,
    random_state=0,
)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [13]:
# Mean accuracy of the fitted classifier on the held-out split.
logreg.score(X=X_test, y=y_test)

0.9487179487179487