In [None]:
### This is a simple script that formats the PC9 data for data preprocessing
# Author: Yiyun
import pandas as pd
import os
import shutil

***
### Drop PC9 from 19Q3 and 19Q4

In [None]:
### Input (q3/q4) and output folder paths
q3_dir = '../data/DepMap/19Q3'
q4_dir = '../data/DepMap/19Q4'

out_q3_dir = '../data/DepMap_DROP_PC9/19Q3'
out_q4_dir = '../data/DepMap_DROP_PC9/19Q4'

# Clone each release folder once; an output folder that already exists is left untouched.
for src_dir, dst_dir in ((q3_dir, out_q3_dir), (q4_dir, out_q4_dir)):
    if os.path.exists(dst_dir):
        continue
    shutil.copytree(src_dir, dst_dir)

In [None]:
# Load the 19Q3 and 19Q4 gene effect tables; their column names are the reference for genes
gene_effect_file = 'Achilles_gene_effect.csv'
df_ref19q3 = pd.read_csv(os.path.join(q3_dir, gene_effect_file), index_col=0)
df_ref19q4 = pd.read_csv(os.path.join(q4_dir, gene_effect_file), index_col=0)

In [None]:
# Remove the PC9 row from both gene effect tables.
# `drop` returns a new frame, so df_ref19q3 / df_ref19q4 keep the row.
pc9_row_id = 'ACH-000030'
df_ref19q3_m = df_ref19q3.drop(index=[pc9_row_id])
df_ref19q4_m = df_ref19q4.drop(index=[pc9_row_id])

In [None]:
# Save file
# df_ref19q3_m.to_csv(os.path.join(out_q3_dir,'Achilles_gene_effect.csv'), sep = ',')
# df_ref19q4_m.to_csv(os.path.join(out_q4_dir,'Achilles_gene_effect.csv'), sep = ',')

***
### Format external data as q3 input

In [None]:
### Load the input tables
# The 19Q3 gene effect file is needed for mapping the gene name + gene code
q3_dir = '../data/DepMap/19Q3'
q4_dir = '../data/DepMap/19Q4'
pc9_dir = '../data/ceres_external/PC9_corrected'
to_dir = '../data/ceres_external/To'

# Resolve the three file locations first, then read each one with its first column as the index
ref19q3_path = os.path.join(q3_dir, 'Achilles_gene_effect.csv')
pc9_path = os.path.join(pc9_dir, 'BatchCorrectedPritchard.csv')
to_path = os.path.join(to_dir, 'ToCellCERES.csv')

df_ref19q3 = pd.read_csv(ref19q3_path, index_col=0)
df_pc9 = pd.read_csv(pc9_path, index_col=0)
df_to = pd.read_csv(to_path, index_col=0)

In [None]:
### Label the columns with cell line IDs, then transpose so that genes become the columns
# NOTE: this cell reassigns df_pc9 / df_to to their transposes, so re-running it
# without reloading the tables first applies the labels to the already-transposed frames.
pc9_ids = ['ACH-000113']  # Assign a random ID for PC9
to_ids = [
    'ACH-000113', 'ACH-000067', 'ACH-000070', 'ACH-000075',
    'ACH-000108', 'ACH-000228', 'ACH-000233', 'ACH-000414',
]

df_pc9.columns = pc9_ids
df_pc9 = df_pc9.transpose()
df_to.columns = to_ids
df_to = df_to.transpose()

In [None]:
### Map 19q3 column names to external data
# Build a lookup from gene name to gene id out of the 19Q3 column labels.
# Each label is split on single spaces: the first token is the gene name and
# the second token is the gene id. When a name appears more than once, only
# the id from its first occurrence is kept.
# A label that contains no space raises ValueError at the unpacking step.
dict_code = {}
for column_label in df_ref19q3.columns:  # iterate labels directly; no need to transpose the frame
    name, idx = column_label.split(' ')[0:2]
    dict_code.setdefault(name, idx)

# Map gene in external data
def map_gene_names(df, code_map=None):
    """Relabel gene columns as '<name> <id>' and discard columns that cannot be used.

    Columns whose label is a key of the lookup are renamed to
    ``label + ' ' + lookup[label]``; every other column is removed.
    Afterwards, any remaining column that contains a missing value is
    removed as well. The input frame is never modified; a new frame is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labels are looked up in the mapping.
    code_map : dict, optional
        Mapping from gene name to gene id. When omitted, the notebook-level
        ``dict_code`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the mapped, fully populated columns, in
        their original order.
    """
    if code_map is None:
        # Fall back to the notebook-level lookup so existing one-argument calls still work.
        code_map = dict_code

    # One pass over the labels instead of a rename/drop (and a copy) per column.
    known = [label in code_map for label in df.columns]
    mapped = df.loc[:, known].rename(columns=lambda label: label + ' ' + code_map[label])
    return mapped.dropna(axis=1)

# Relabel the gene columns of both external tables using the 19Q3 lookup
df_pc9 = map_gene_names(df=df_pc9)
df_to = map_gene_names(df=df_to)

In [None]:
# df_pc9.to_csv(os.path.join(pc9_dir,'gene_effect.csv'), sep = ',')
# df_to.to_csv(os.path.join(to_dir,'gene_effect.csv'), sep = ',')

***
### Format L200 standalone score 

In [None]:
### Load the l200 standalone genes and scores
# NOTE: 'gene_effect.csv' must already exist in pc9_dir before this cell runs.
pc9_dir = '../data/ceres_external/PC9_corrected'
pc9_gene_effect_path = os.path.join(pc9_dir, 'gene_effect.csv')
l200_path = os.path.join(pc9_dir, 'PC9AvL200_CERES.csv')

df_pc9 = pd.read_csv(pc9_gene_effect_path, index_col=0)
df_l200 = pd.read_csv(l200_path)

# Replace the header read from the file with fixed column names
l200_column_names = ['landmark', 'PC9.1', 'PC9.2', 'mean', 'gene.type', 'scaled.score']
df_l200.columns = l200_column_names

In [None]:
### Assign gene id - use q3 column names as reference for genes
q3_dir = '../data/DepMap/19Q3'
df_ref19q3 = pd.read_csv(os.path.join(q3_dir,'Achilles_gene_effect.csv'), index_col = 0)

### Build the gene name -> gene id lookup
# Each 19Q3 column label is split on single spaces: the first token is the
# gene name and the second token is the gene id. When a name appears more than
# once, only the id from its first occurrence is kept.
# A label that contains no space raises ValueError at the unpacking step.
dict_code = {}
for column_label in df_ref19q3.columns:  # iterate labels directly; no need to transpose the frame
    name, idx = column_label.split(' ')[0:2]
    dict_code.setdefault(name, idx)
# Map gene id to gene
# Keep only the landmark genes that have an entry in dict_code and relabel them
# as '<name> <id>'. This is done in one vectorised pass rather than dropping rows
# one at a time, and it does not depend on df_l200 having a 0..n-1 index.
# NOTE: after this cell the landmark values contain a space and are no longer
# keys of dict_code, so re-running it without reloading df_l200 removes every row.
has_code = df_l200['landmark'].isin(list(dict_code))
df_l200 = df_l200.loc[has_code].copy()
df_l200['landmark'] = [gene + ' ' + dict_code[gene] for gene in df_l200['landmark']]
df_l200 = df_l200.reset_index(drop = True)

In [None]:
### Create standalone l200 q3/q4 folders
## -- Note, this is based on the PC9 drop version, so you have to drop PC9 beforehand
out_q3_dir = '../data/DepMap_DROP_PC9/19Q3'
out_q4_dir = '../data/DepMap_DROP_PC9/19Q4'

standalone_q3_dir = '../data/DepMap_PC9_Standalonel200/19Q3'
standalone_q4_dir = '../data/DepMap_PC9_Standalonel200/19Q4'

# Clone each PC9-dropped folder once; an existing standalone folder is left untouched.
for src_dir, dst_dir in ((out_q3_dir, standalone_q3_dir), (out_q4_dir, standalone_q4_dir)):
    if os.path.exists(dst_dir):
        continue
    shutil.copytree(src_dir, dst_dir)

In [None]:
### Change L200 gene scores in 19Q3 and 19Q4 (dropped pc9) -- read data
df_q3 = pd.read_csv(os.path.join(standalone_q3_dir, 'Achilles_gene_effect.csv'), index_col=0)
df_q4 = pd.read_csv(os.path.join(standalone_q4_dir, 'Achilles_gene_effect.csv'), index_col=0)

# Write each landmark gene's scaled score into the 'ACH-000113' row of both tables.
# Assigning through .loc one cell at a time creates the row or column if it is missing.
for gene, scaled in zip(df_l200['landmark'], df_l200['scaled.score']):
    df_q3.loc['ACH-000113', gene] = scaled
    df_q4.loc['ACH-000113', gene] = scaled
# Save file
# df_q3.to_csv(os.path.join(standalone_q3_dir,'Achilles_gene_effect.csv'), sep = ',')
# df_q4.to_csv(os.path.join(standalone_q4_dir,'Achilles_gene_effect.csv'), sep = ',')

In [None]:
### For comparing Brunello l200 and standalone l200
# Copy the 'ACH-000113' value of each landmark gene from df_pc9 into a new
# 'brunello.score.pc9' column; rows whose lookup fails are left unset.
for row_label, gene in df_l200['landmark'].items():
    try:
        brunello_score = df_pc9.loc['ACH-000113', gene]
        df_l200.loc[row_label, 'brunello.score.pc9'] = brunello_score
    except KeyError:
        # Some l200 genes are not in Brunello
        continue
# save l200 file
# df_l200.to_csv(os.path.join(pc9_dir,'formatted_PC9AvL200_CERES.csv'))