In [17]:
### This is a simple script that formats the PC9 data for data preprocessing
# Author: Yiyun
import pandas as pd
import os, shutil

***
### Drop PC9 from 19Q3

In [18]:
# Source folder of the 19Q3 release and destination folder for the PC9-free copy
q3_dir = '../data/DepMap/19Q3'
out_q3_dir = '../data/DepMap_DROP_PC9/19Q3'

# Seed the destination with a full copy of the 19Q3 folder the first time this runs
output_missing = not os.path.exists(out_q3_dir)
if output_missing:
    shutil.copytree(q3_dir, out_q3_dir)

# Load the 19Q3 gene effect table and remove the PC9 row -- 'ACH-000030' = PC14 = PC9
df_ref19q3 = pd.read_csv(
    os.path.join(q3_dir, 'Achilles_gene_effect.csv'),
    index_col=0,
)
df_ref19q3_m = df_ref19q3.drop(index=['ACH-000030'])

# Save file (left disabled, as in the original cell)
# df_ref19q3_m.to_csv(os.path.join(out_q3_dir,'Achilles_gene_effect.csv'), sep = ',')

***
### Format PC9 and To data to match 19Q3 CERES input

In [51]:
### Read files
# Folders: 19Q3 is needed for the gene name + gene ID mapping; PC9 and To hold the external screens
q3_dir = '../data/DepMap/19Q3'
pc9_dir = '../data/ceres_external/PC9_corrected'
to_dir = '../data/ceres_external/To'

# Load the three tables in order (19Q3, PC9, To), each with its first column as the row index
df_ref19q3, df_pc9, df_to = (
    pd.read_csv(os.path.join(folder, file_name), index_col=0)
    for folder, file_name in (
        (q3_dir, 'Achilles_gene_effect.csv'),
        (pc9_dir, 'BatchCorrectedPritchard.csv'),
        (to_dir, 'ToCellCERES.csv'),
    )
)

In [20]:
### Start formatting...

### 1. Label the PC9 score column (To keeps its original column names)
df_pc9.columns = ['PC9']

### 2. Flip both tables so that genes become the columns, as in 19Q3
df_pc9 = df_pc9.T
df_to = df_to.T

### 3. Rename gene columns in PC9 and To data to match those in 19Q3
#   3.1 Build a gene name -> gene id lookup from the 19Q3 column labels:
#       the first two space-separated pieces of a label are the name and the id,
#       e.g. 'A1BG (1)' gives dict_code['A1BG'] = '(1)'. The first label seen for a name wins.
dict_code = {}
for column_label in df_ref19q3.columns:
    name, idx = column_label.split(' ')[0:2]
    dict_code.setdefault(name, idx)
#   3.2 Go through the external dataframe columns and rename the gene columns to gene+gene id
def map_gene_names(df, gene_codes=None):
    """Relabel gene columns as '<gene> <id>' and discard columns that cannot be used.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labels are bare gene names (e.g. 'A1BG').
    gene_codes : dict, optional
        Mapping from gene name to the id text appended after a single space.
        When omitted, the notebook-level ``dict_code`` is used, which keeps
        existing ``map_gene_names(df)`` calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column found in ``gene_codes`` is renamed
        (e.g. A1BG to A1BG (1)), columns not found are removed, and any
        remaining column containing NA is dropped. ``df`` itself is not modified.
    """
    if gene_codes is None:
        gene_codes = dict_code  # lookup built from the 19Q3 column labels
    # One pass to decide which columns survive; a few genes are not found in the Q3 dataset
    is_known = [label in gene_codes for label in df.columns]
    # Select and rename once instead of renaming/dropping column by column,
    # which copied the frame per column and partially renamed the caller's frame in place
    renamed = df.loc[:, is_known].rename(
        columns=lambda label: label + ' ' + gene_codes[label]
    )
    # Drop columns with NA. There is one NA in To data.
    return renamed.dropna(axis=1)
#   3.3 Run the renaming on both external tables (PC9 first, then To)
df_pc9, df_to = (map_gene_names(frame) for frame in (df_pc9, df_to))

### Finish formatting...

In [21]:
### Save the formatted files 
# df_pc9.to_csv(os.path.join(pc9_dir,'gene_effect.csv'), sep = ',')
# df_to.to_csv(os.path.join(to_dir,'gene_effect.csv'), sep = ',')

***
### Format L200 standalone score and replace the pooled L200 score 

In [28]:
### Load the standalone l200 table, which is stored in the PC9 folder
pc9_dir = '../data/ceres_external/PC9_corrected'
df_l200 = pd.read_csv(
    os.path.join(pc9_dir, 'PC9AvL200_CERES.csv')
)

In [31]:
### Start formatting ...

### 1. Rename columns for the l200 dataframe
df_l200.columns = ['landmark','PC9.1','PC9.2','mean','gene.type','scaled.score']

### 2. Rename gene names to match those in 19Q3 - A1BG to A1BG (1)
#   2.1 Look up the 19Q3 id of every landmark gene in one vectorized pass
#       (dict_code is the gene name -> gene id lookup built from the 19Q3 columns).
#       This replaces a row-by-row loop that copied the frame on every dropped row
#       and only worked with default 0..n-1 row labels.
landmark = df_l200['landmark']
gene_id = landmark.map(dict_code)  # NaN where the gene is not found in the Q3 dataset
#   2.2 Rows that already carry the 19Q3 label (i.e. this cell was run before) are
#       kept unchanged, so re-running the cell no longer drops every row
already_formatted = landmark.isin(
    {gene + ' ' + code for gene, code in dict_code.items()}
)
needs_rename = gene_id.notna() & ~already_formatted
df_l200.loc[needs_rename, 'landmark'] = landmark[needs_rename] + ' ' + gene_id[needs_rename]

### 3. Drop the few genes not found in the Q3 dataset and reset the index
df_l200 = df_l200[needs_rename | already_formatted].reset_index(drop = True)

### Finish formatting ...

In [38]:
### Start replacing the pooled L200 score...
# Load the PC9 gene effect table from the PC9 folder
df_pc9_l200 = pd.read_csv(
    os.path.join(pc9_dir, 'gene_effect.csv'),
    index_col=0,
)
# Walk the l200 rows in order and write each standalone score into the 'PC9' row
# under that gene's column
for name, score in zip(df_l200['landmark'], df_l200['scaled.score']):
    df_pc9_l200.loc['PC9', name] = score
### Finish replacing the pooled L200 score...

In [41]:
# Save file
# df_pc9_l200.to_csv(os.path.join(pc9_dir,'gene_effect_standalone.csv'), sep = ',')

In [50]:
### Note, there are more standalone l200 genes than the PC9 pooled l200 genes
# Print every column of the standalone table that the pooled PC9 table does not have
pooled_columns = df_pc9.columns
for gene_column in df_pc9_l200.columns:
    if gene_column in pooled_columns:
        continue
    print(gene_column)

TBC1D3 (729873)
MARCH5 (54708)
ICE1 (23379)
GOLGA6L1 (283767)
SPAG11B (10407)
RIC1 (57589)
OR4N4 (283694)
FOXO3 (2309)
ZNF658 (26149)
KDF1 (126695)
OST4 (100128731)
FKBP6 (8468)
HSD17B7 (51478)
NAIP (4671)
