In [1]:
# This notebook uses the nearest-neighbor method (the closest cell line) to predict the CERES score of each gene.

# Author: Yiyun

import time
from os.path import join

import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance

In [2]:
# Input directories: the Celligner alignment (used to find the closest cell line)
# and the DepMap 19Q3 release.
celligner_dir = '../data/DepMap/celligner'
q3_dir = '../data/DepMap/19Q3'

In [3]:
# Load the Celligner alignment table and the 19Q3 Achilles gene-effect matrix.
# The first CSV column becomes the index of each frame.
celligner_csv = join(celligner_dir, 'celligner_alignment.csv')
gene_effect_csv = join(q3_dir, 'Achilles_gene_effect.csv')

df_celligner = pd.read_csv(celligner_csv, index_col=0)
df_ref19q3 = pd.read_csv(gene_effect_csv, index_col=0)



# Check that the UMAP data is consistent with what is shown on the Celligner website
# df_cells.plot.scatter(x='UMAP_1',
#                       y='UMAP_2')

In [4]:
### Not all 19Q3 cell lines are in Celligner, so nearest neighbours are searched
### within the 19Q3 dataset only. The closest cell line found here is therefore
### not necessarily the closest one in the full Celligner alignment.

# Keep only cell-line rows (sample IDs starting with 'ACH-'). `na=False` treats a
# missing sampleID as "not a cell line" instead of breaking the boolean mask.
is_cell_line = df_celligner['sampleID'].str.startswith('ACH-', na=False)

# Reshape to one column per cell line, with rows UMAP_1 / UMAP_2. Moving sampleID
# into the index *before* transposing preserves the coordinate dtype; transposing
# with the string ID column still present turns every value into `object` dtype.
df_cells = (
    df_celligner.loc[is_cell_line, ['sampleID', 'UMAP_1', 'UMAP_2']]
    .set_index('sampleID')
    .T
)

In [5]:
# Split the 19Q3 cell lines by whether Celligner has coordinates for them.
# 3 cell lines are not found; they are treated as missing data downstream.
cells_19q3 = df_ref19q3.index
cells_celligner = df_cells.columns

# True for every 19Q3 cell line that also appears among the Celligner columns
in_celligner = cells_19q3.isin(cells_celligner)

# 19Q3 cell lines absent from Celligner
cells_NF = cells_19q3[~in_celligner].tolist()

# 19Q3 cell lines present in Celligner (the ones we can find neighbours for)
cells_F = cells_19q3[in_celligner].tolist()

In [6]:
# Restrict the Celligner coordinates to the 19Q3 cell lines found above
df_cells = df_cells.loc[:, cells_F]

In [7]:
### Find each cell line's nearest neighbour in Celligner UMAP space (Euclidean distance)
# One vectorised distance matrix replaces the pairwise Python loop over pandas
# columns, which was slow and relied on a magic "largest distance" sentinel.

# Rows of `umap_coords` are cell lines, columns are the UMAP coordinates. The
# explicit float cast copes with coordinates stored as `object` dtype.
umap_coords = df_cells.T.to_numpy(dtype=float)
n_cells = umap_coords.shape[0]

# Sanity checks: a neighbour only exists with >= 2 cell lines, and a NaN
# coordinate would silently poison every distance involving that cell line.
assert n_cells >= 2, "need at least two cell lines to find a nearest neighbour"
assert not np.isnan(umap_coords).any(), "UMAP coordinates contain missing values"

pairwise_dist = distance.cdist(umap_coords, umap_coords, metric='euclidean')

# A cell line must never be selected as its own neighbour.
np.fill_diagonal(pairwise_dist, np.inf)

# Ties go to the LAST candidate in column order, matching the original `<=`
# comparison: take argmin over the column-reversed matrix and map the position back.
nearest_pos = n_cells - 1 - pairwise_dist[:, ::-1].argmin(axis=1)
nearest_dist = pairwise_dist[np.arange(n_cells), nearest_pos]

# Same layout as before: one column per cell line,
# row 0 = ID of the closest cell line, row 1 = distance to it.
df_close_cells = pd.DataFrame(
    [df_cells.columns[nearest_pos].tolist(), nearest_dist.tolist()],
    columns=df_cells.columns,
)

KeyboardInterrupt: 

In [None]:
# Set up an empty prediction table with the same cell lines (rows) and genes
# (columns) as the reference gene-effect matrix
index_cellline, column_genes = df_ref19q3.index, df_ref19q3.columns

df_prediction = pd.DataFrame(columns=column_genes, index=index_cellline)

In [None]:
### Fill the prediction dataframe with the scores of each cell line's closest cell line
### Vectorised: replaces the per-(gene, cell line) `.loc` loop that ran ~30min

# Closest cell line ID for every reference cell line. Cell lines without an entry
# in `df_close_cells` (previously caught via KeyError) get NaN here.
nearest_by_cell = df_close_cells.loc[0].reindex(index_cellline)

# Fetch every neighbour's full gene-effect row at once. Labels that are missing
# from the reference (including the NaN placeholders) produce all-NaN rows, so
# the result stays numeric instead of becoming an object frame holding `None`.
df_prediction = df_ref19q3.reindex(index=nearest_by_cell.to_numpy(), columns=column_genes)

# Rows are in `index_cellline` order; label them with the cell line being
# predicted rather than with the neighbour whose scores were copied.
df_prediction.index = index_cellline

In [None]:
# df_prediction

In [None]:
# df_prediction.to_csv('prediction_040621.csv')

In [15]:
# df_prediction = pd.read_csv('prediction_040621.csv', index_col = 0)

In [16]:
# Show the prediction table (cell lines x genes); pandas abbreviates the
# display of a large frame with "..." rows and columns.
df_prediction

Unnamed: 0,A1BG (1),A1CF (29974),A2M (2),A2ML1 (144568),A3GALT2 (127550),A4GALT (53947),A4GNT (51146),AAAS (8086),AACS (65985),AADAC (13),...,ZWILCH (55055),ZWINT (11130),ZXDA (7789),ZXDB (158586),ZXDC (79364),ZYG11A (440590),ZYG11B (79699),ZYX (7791),ZZEF1 (23140),ZZZ3 (26009)
ACH-000004,-0.068759,0.218792,0.178252,0.158390,-0.193862,-0.324566,0.246220,-0.576495,-0.081217,0.016182,...,-0.176432,-0.391199,-0.182117,-0.108978,0.186545,-0.075884,-0.095781,0.029269,0.000945,-0.242038
ACH-000005,0.168684,0.089128,-0.196966,-0.021260,0.038541,-0.175141,0.349346,-0.441008,0.291208,0.147993,...,-0.135665,-0.461810,,,0.253495,0.233904,-0.406140,0.283878,0.230978,-0.135112
ACH-000007,0.048482,0.094373,-0.139037,0.238002,0.103259,0.062679,0.673452,-0.593827,-0.125634,0.804001,...,-0.253909,-0.705856,-0.017592,-0.155222,-0.365297,0.246678,-0.163414,0.290514,-0.239763,-0.378078
ACH-000009,0.078662,0.027718,-0.239604,0.216768,-0.203420,-0.142226,0.088980,-0.601941,-0.748862,0.207593,...,-0.276164,-0.795237,0.163436,0.075255,0.099157,-0.071944,-0.440533,0.054337,-0.270058,-0.274998
ACH-000011,0.318330,0.166838,-0.079634,0.158711,-0.069318,-0.140133,-0.008595,-0.680885,-0.237415,0.058600,...,-0.035171,-0.334166,-0.242778,0.026172,0.027567,0.161534,-0.144769,0.100855,-0.287420,-0.382177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ACH-001765,0.208843,0.153637,-0.134906,0.167583,-0.130067,-0.082006,0.108194,-0.511436,0.096255,0.091139,...,-0.006292,-0.598863,0.191322,0.145424,0.056416,0.125346,-0.089675,0.063647,0.123191,-0.459169
ACH-001814,0.052705,0.145646,-0.081237,0.226078,-0.298986,0.089380,0.247493,-0.585167,-0.016999,0.114008,...,-0.087530,-0.749758,-0.073984,0.105300,0.246149,0.153385,-0.123115,0.098683,-0.263423,-0.553399
ACH-001838,0.076815,0.044027,0.063829,0.244633,-0.170792,-0.146647,0.120449,-0.473052,0.157312,0.194676,...,-0.143366,-0.347620,0.072566,0.096851,0.161170,0.158999,-0.137031,0.256604,-0.169694,-0.283112
ACH-001956,,,,,,,,,,,...,,,,,,,,,,


In [None]:
### Regression plot of actual vs. predicted scores for every cell line,
### collecting the Pearson correlation of each one in `list_p`
list_p = []

for cell_lines in cells_F:
    # Gene-score rows for this cell line: measured values and their predictions
    score_actual = df_ref19q3.loc[cell_lines, :]
    score_predicted = df_prediction.loc[cell_lines, :]

    # Two-column frame indexed by gene; `predicted` is aligned on `actual`'s index
    df_score = pd.DataFrame({'actual': score_actual}).assign(predicted=score_predicted)

    ax = sns.regplot(data=df_score, x='actual', y='predicted')

    corr = df_score['actual'].corr(df_score['predicted'], method='pearson')
    list_p.append(corr)