In [1]:
### This notebook prepares data for figure6 regression analysis
# patient, syn, nsyn, cnv, categorical expression
# Author: Yiyun

import pandas as pd
import os
import pickle

In [2]:
### Directories
# Read expression data
exp_dir = '../anno_ref/icgc/pcawg_rnaseq/'
gene_tophatuq = 'tophat_star_fpkm_uq.v2_aliquot_gl.tsv'
gene_tophat = 'tophat_star_fpkm.v2.aliquot_gl.tsv'

# maf dir -- only for histology VCAN now
maf_dir = '../maf_out/maf_cohorts_060121/histology'

# cnv dir
cnv_dir = '../anno_ref/icgc/consensus_cnv/annotated_cnv'

In [3]:
### Read expression data
# Read aliquot id information
df_exp_info = pd.read_csv(os.path.join(exp_dir,'rnaseq.metadata.tsv'), sep = '\t')
df_exp_info = df_exp_info[df_exp_info['tumor.normal'] == 'tumor']
df_ids = df_exp_info[['aliquot_id','tcga_sample_uuid','wgs_aliquot_id']]
# df_aliquot_tcgaid = df_exp_info[['aliquot_id','tcga_sample_uuid']].set_index('aliquot_id')

# Read expression information
df_exp_uq = pd.read_csv(os.path.join(exp_dir,gene_tophatuq),sep = '\t', index_col = 0)
df_exp = pd.read_csv(os.path.join(exp_dir,gene_tophat),sep = '\t', index_col = 0)
# Read organ-aliquot id list
dict_tumor = pickle.load(open('./pcawg_exp/tumor_aliquotid_dict.pkl','rb'))
# Read cnv file
df_cnv = pd.read_csv('../anno_ref/icgc/consensus_cnv/consensus_CN.by_gene.170214.txt', sep = '\t')

In [4]:
###------------Functions----------
# Get patients with silent mutations
def get_cancer_patients(feat, gene_name, subtype = None):
    global maf_dir
    # read maf file
    df_maf= pd.read_csv(os.path.join(maf_dir,feat+'.csv'),sep = '\t')
    df_gene = df_maf[df_maf['Hugo_Symbol'] == gene_name]
    
    # get patients, initialize dataframe
    patients = df_gene['Tumor_Sample_Barcode'].unique().tolist()
    df_data = pd.DataFrame(index = patients)
    for p in patients:
        df_donor = df_gene[df_gene['Tumor_Sample_Barcode'] == p]
        if len(df_donor[df_donor['Variant_Classification'] == 'Silent']) > 0:
            df_data.loc[p,'has_syn'] = 1
        else:
            df_data.loc[p,'has_syn'] = 0
        if any(i in df_donor['Variant_Classification'].unique() for i in \
               ['Missense_Mutation', 'Nonsense_Mutation','Splice_Site', 'Nonstop_Mutation','Start_Codon_SNP',\
               'De_novo_Start_OutOfFrame','De_novo_Start_InFrame','Frame_Shift_Del','Frame_Shift_Ins',\
                'In_Frame_Del','In_Frame_Ins','Stop_Codon_Ins']):
            print(df_donor['Variant_Classification'].unique())
            df_data.loc[p,'has_nsyn'] = 1
        else:
            df_data.loc[p,'has_nsyn'] = 0
    return patients, df_data

# Get copy number 
def get_cn(df_data, gene_name, patient):
    global df_cnv
    df_cnv_p = df_cnv[df_cnv['Gene Symbol'] == 'VCAN'][patient]
    idx = df_cnv_p.index.tolist()[0]
    for p in patient:
        df_data.loc[p,'cnv'] = df_cnv_p.loc[idx, p]
    return df_data

# Convert the tumor aliquot id to id in expression files, some patients not found
def convert_exp_id(patient):
    dict_expp = {}
    for p in lp:
        idx_wgs = df_ids.loc[df_ids['wgs_aliquot_id'] == p].index
        idx_tcga = df_ids.loc[df_ids['tcga_sample_uuid'] == p].index
        if (len(idx_wgs) ==0) and (len(idx_tcga) > 0):
            exp_id = df_ids.loc[idx_tcga,'aliquot_id'].tolist()[0]
            dict_expp[exp_id] = df_ids.loc[idx_tcga,'tcga_sample_uuid'].tolist()[0]
        elif (len(idx_wgs) >0) and (len(idx_tcga) == 0):
            exp_id = df_ids.loc[idx_wgs,'aliquot_id'].tolist()[0]
            dict_expp[exp_id] = df_ids.loc[idx_wgs,'wgs_aliquot_id'].tolist()[0]
        elif (len(idx_wgs) ==0) and (len(idx_tcga) == 0):
            print(f'NOT FOUND {p}')
        elif (len(idx_wgs) >0) and (len(idx_tcga) >0):
            if idx_wgs == idx_tcga:
                exp_id = df_ids.loc[idx_wgs,'aliquot_id'].tolist()[0]
                dict_expp[exp_id] = df_ids.loc[idx_wgs,'wgs_aliquot_id'].tolist()[0]
            else:
                print(f'ERROR: DIFFERENT INDEX {p}')
                break
    return dict_expp

# Get gene from expression data
def get_gene_exp(gene_id, df_expression):
    df = df_expression.loc[df_expression.index.str.contains(rf'^{gene_id}'),:]
    
    return df

# get patient expressions
def get_patient_exp(df_gene_exp, dict_expp, df_data):
    exp_p = dict_expp.keys()
    df_gene_exp_patient = df_gene_exp[exp_p]
    for p in exp_p:
        idx = df_gene_exp_patient.index.tolist()[0]
        exp_val = df_gene_exp_patient.loc[idx,p]

        icgc_id = dict_expp[p]
        df_data.loc[icgc_id,'exp_uq'] = exp_val
    return df_data

In [5]:
lp, df_dat= get_cancer_patients('Kidney-RCC', 'VCAN')
df_dat = get_cn(df_dat, 'VCAN', lp)
dict_exp_id = convert_exp_id(lp)
df_exp_gene = get_gene_exp('ENSG00000038427', df_exp_uq)
df_dat = get_patient_exp(df_exp_gene, dict_exp_id, df_dat)

  if (await self.run_code(code, result,  async_=asy)):


['Intron' 'Frame_Shift_Ins']
['Missense_Mutation']
['Missense_Mutation']
NOT FOUND 5c156f63-6537-4d93-a6c2-4155618cf638
NOT FOUND 675a5a32-b405-4f03-bfcd-756343d1dfaf
NOT FOUND 759e20be-6a2b-4c54-aca0-6d358598d6f5
NOT FOUND a2034620-e4b5-4d4f-ac05-7fc1c098eb40
NOT FOUND b30dfb8b-8288-4e5a-afc2-3d5bd7bfa26c
NOT FOUND c19a1388-95e6-4708-a24c-3738f1908071
NOT FOUND d4c6061b-5019-4564-806d-4e75910a4690


In [8]:
df_dat

Unnamed: 0,has_syn,has_nsyn,cnv,exp_uq
2c581a74-7716-4c20-b366-a8e1d9a901f6,1.0,0.0,2.0,110.871777
2e8ad1cd-d5d5-4bb4-8bbd-e9bef45088da,0.0,0.0,2.0,55.534959
39c55051-e1bc-4081-962a-17205645de45,0.0,1.0,2.0,3.545059
3feac02a-c99f-4dd9-9eff-e4f5e87f7dc5,1.0,0.0,2.0,18.298739
51206157-94cc-4702-8247-c98283b487aa,0.0,0.0,2.0,17.1399
5c156f63-6537-4d93-a6c2-4155618cf638,0.0,0.0,2.0,
5ecc88f7-8391-4168-af11-07a6bf9b3652,0.0,1.0,3.0,0.624052
675a5a32-b405-4f03-bfcd-756343d1dfaf,0.0,0.0,4.0,
68b7fdb1-22b5-4152-9e69-a41151640cd8,0.0,0.0,2.0,44.085253
759e20be-6a2b-4c54-aca0-6d358598d6f5,0.0,0.0,2.0,


In [7]:
# df_dat.to_csv('./figure6/data/VCAN.csv')