In [1]:
### This script create patient dictionary for cohorts and meta cohorts. The result dataframe will be used for merging maf and coverage file
# Author: Yiyun

import pandas as pd
import os
from datetime import date
import pickle
import re

today = date.today()
today = today.strftime("%b-%d-%Y")

In [4]:
### Read data
f_info = '../anno_ref/PCAWG_sample_info.txt'
df_info = pd.read_csv(f_info, sep = '\t')

out_dir = '../anno_ref/cohorts'
#Read all patients data
lp = pickle.load(open('../anno_ref/list_all_patients.pkl','rb'))
#Filter patients
df_info = df_info[df_info['tumour_specimen_aliquot_id'].isin(lp)]

In [6]:
p = [i.split('.')[0] for i in os.listdir('/gpfs/scratch/yur97/compressed_coverage_tracks')]

In [8]:
set(p) == set(lp)

False

In [10]:
len(p)

2761

In [11]:
len(lp)

2571

In [16]:
f_info = '../anno_ref/PCAWG_sample_info.txt'
df_info = pd.read_csv(f_info, sep = '\t')

In [19]:
lp = df_info['tumour_specimen_aliquot_id'].unique().tolist()

In [20]:
len(lp)

2583

In [22]:
len(set(lp).intersection(set(p)))

2572

***
Cohorts to include:   
1. Histology -- histology
2. Histology_tier2 (Organ types) -- organ
3. BY cell type of origin -- origin
4. BY Organ system -- system
5. Pan-cancer 

### Pan-cancer

In [3]:
df_pan = df_info[['tumour_specimen_aliquot_id','icgc_donor_id']]
df_pan.columns = ['tumor_aliquot_id', 'donor_id']
df_pan['pancancer'] = 'pancancer'
df_pan = df_pan[df_pan['tumor_aliquot_id'].isin(lp)]
df_pan = df_pan.reset_index(drop = True)
# df_pan.to_csv(os.path.join(out_dir,'pancancer.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pan['pancancer'] = 'pancancer'


### By histology type

In [4]:
### Get aliquot_id, donor_id, histology abbreviation df
df_his = df_info[['tumour_specimen_aliquot_id','icgc_donor_id','histology_abbreviation']]
df_his.columns = ['tumor_aliquot_id', 'donor_id','histology']
# find histology type with patients<20
l_p = []
for his in df_his['histology'].unique():
    df = df_his[df_his['histology'] == his]
    n_p = len(df)
    if n_p <20:
        l_p.append(his)
# df_his_filtered = df_his[df_his['histology'].isin(l_p)]
df_his = df_his[df_his['tumor_aliquot_id'].isin(lp)]
df_his = df_his.reset_index(drop = True)

In [5]:
df_his

Unnamed: 0,tumor_aliquot_id,donor_id,histology
0,0009b464-b376-4fbc-8a56-da538269a02f,DO46416,Ovary-AdenoCA
1,003819bc-c415-4e76-887c-931d60ed39e7,DO36062,CNS-PiloAstro
2,0040b1b6-b07a-4b6e-90ef-133523eaf412,DO45049,Liver-HCC
3,00493087-9d9d-40ca-86d5-936f1b951c93,DO22145,CNS-Oligo
4,00508f2b-36bf-44fc-b66b-97e1f3e40bfa,DO48578,Panc-Endocrine
...,...,...,...
2566,ff870342-f0d6-4450-8f9c-344c046a0baf,DO51079,Prost-AdenoCA
2567,ffa976f0-aa60-4867-842e-361afa7d68ac,DO52704,Lymph-CLL
2568,ffad9288-c622-11e3-bf01-24c6515278c0,DO23552,Liver-HCC
2569,ffdd4feb-aca3-4104-b1e8-954d705a6450,DO720,Bladder-TCC


In [63]:
# Save the cohort that have patients < 20
# pickle.dump(l_p, open(os.path.join(out_dir,'low_patient_histology.pkl'),'wb'))

In [61]:
### Save the filtered histologies
# df_his.to_csv(os.path.join(out_dir,'histology.csv'))

### By organ type (histology_tier2)

In [6]:
### Get aliquot_id, donor_id, histology abbrebiation df
df_org = df_info[['tumour_specimen_aliquot_id','icgc_donor_id','histology_tier2']]
df_org.columns = ['tumor_aliquot_id', 'donor_id','organ']

# Rename org with'/'
rename_dict = {}
for org in df_org['organ'].unique():
    if '/' in org:
        rename_dict[org] = re.sub('/', '&', org)
df_org = df_org.replace(rename_dict)

# Save the organ type cohort
df_org = df_org[df_org['tumor_aliquot_id'].isin(lp)]
df_org = df_org.reset_index(drop = True)
# df_org.to_csv(os.path.join(out_dir,'organ.csv'))

In [None]:
df_org

### By cell type of origin 
[From supp of PCAWG noncoding driver detection paper](https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-020-1965-x/MediaObjects/41586_2020_1965_MOESM1_ESM.pdf)

In [24]:
### Define function to format the organ types
def format_str(string):
    l_str = string.replace(' ','').split(',')
    l_str = [re.sub(r'Ca$','CA', i) for i in l_str]
    return l_str

In [25]:
carcinoma = 'Bladder-TCC, Biliary-AdenoCa,Breast-AdenoCa, Breast-LobularCa, Cervix-AdenoCa, \
ColoRect-AdenoCa, Eso-AdenoCa, Kidney-ChRCC, Kidney-RCC, Liver-HCC, Lung-AdenoCa, Ovary-AdenoCa,\
Panc-AdenoCa, Panc-Endocrine, Prost-AdenoCa, Stomach-AdenoCa, Thy-AdenoCa,Uterus-AdenoCa,Head-SCC, \
Cervix-SCC, Lung-SCC'
adenocarcinoma = 'Biliary-AdenoCa,Breast-AdenoCa, Breast-LobularCa, Cervix-AdenoCa, ColoRect-AdenoCa,\
Eso-AdenoCa, Kidney-ChRCC, Kidney-RCC, Liver-HCC, Lung-AdenoCa, Ovary-AdenoCa, Panc-AdenoCa, Prost-AdenoCa, \
Stomach-AdenoCa, Thy-AdenoCa, Uterus-AdenoCa'
squamous_epithelium = 'Head-SCC, Cervix-SCC, Lung-SCC'
sarcoma = 'Bone-Cart, Bone-Epith, Bone-Leiomyo, Bone-Osteosarc'
glioma = 'CNS-PiloAstro, CNS-Oligo, CNS-GBM'
hematopoietic_system = 'Lymph-BNHL, Lymph-CLL, Lymph-NOS, Myeloid-AML,Myeloid-MDS, Myeloid-MPN'

In [29]:
dict_origin = {}
dict_origin['carcinoma'] = format_str(carcinoma)
dict_origin['adeno'] = format_str(adenocarcinoma)
dict_origin['squamous'] = format_str(squamous_epithelium)
dict_origin['sarcoma'] = format_str(sarcoma)
dict_origin['glioma'] = format_str(glioma)
dict_origin['hematopoietic'] = format_str(hematopoietic_system)

In [51]:
# pickle.dump(dict_origin, open(os.path.join(out_dir,'dict_origin_histology.pkl'),'wb'))

In [30]:
df_origin = pd.DataFrame(columns = ['tumor_aliquot_id', 'donor_id','histology_abb', 'origin'])
for k in dict_origin:

    for v in dict_origin[k]:
        df = df_info[df_info['histology_abbreviation']==v]
        df_temp = df[['tumour_specimen_aliquot_id','icgc_donor_id','histology_abbreviation']]
        df_temp.columns = ['tumor_aliquot_id', 'donor_id','histology_abb']
        df_temp['origin'] = k
        df_origin = pd.concat([df_origin, df_temp])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['origin'] = k


In [31]:
df_origin = df_origin[df_origin['tumor_aliquot_id'].isin(lp)]
df_origin = df_origin.reset_index(drop = True)
# df_origin.to_csv(os.path.join(out_dir,'origin.csv'))

### By organ system

In [32]:
digestive_tract = 'Liver-HCC, ColoRect-AdenoCa, Panc-AdenoCa, Eso-AdenoCa,Stomach-AdenoCa, Biliary-AdenoCa'
kidney = 'Kidney-RCC, Kidney-ChRCC'
lung = 'Lung-AdenoCa, Lung-SCC'
lymphatic_system = 'Lymph-BNHL, Lymph-CLL, LymphNOS'
myeloid = 'Myeloid-AML, Myeloid-MDS, Myeloid-MPN'
breast = 'Breast-AdenoCa,Breast-LobularCa'
female_reproductive_system = 'Breast-AdenoCa, Breast-LobularCa,Cervix-AdenoCa, Cervix-SCC,\
Ovary-AdenoCa, Uterus-AdenoCa'
central_nervous_system = 'CNS-PiloAstro, CNS-Oligo, CNS-Medullo, CNS-GBM'

In [33]:
dict_system = {}
dict_system['digestive'] = format_str(digestive_tract)
dict_system['kidney'] = format_str(kidney)
dict_system['lung'] = format_str(lung)
dict_system['lymphatic'] = format_str(lymphatic_system)
dict_system['breast'] = format_str(glioma)
dict_system['female'] = format_str(female_reproductive_system)
dict_system['cns'] = format_str(central_nervous_system)

In [56]:
# pickle.dump(dict_system,open(os.path.join(out_dir,'dict_system_histology.pkl'),'wb'))

In [34]:
df_system = pd.DataFrame(columns = ['tumor_aliquot_id', 'donor_id','histology_abb', 'system'])
for k in dict_system:

    for v in dict_system[k]:
        df = df_info[df_info['histology_abbreviation']==v]
        if len(df) == 0 :print(v)
        df_temp = df[['tumour_specimen_aliquot_id','icgc_donor_id','histology_abbreviation']]
        df_temp.columns = ['tumor_aliquot_id', 'donor_id','histology_abb']
        df_temp['system'] = k
        df_system = pd.concat([df_system, df_temp])

LymphNOS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['system'] = k


In [35]:
# Save file
df_system = df_system[df_system['tumor_aliquot_id'].isin(lp)]
df_system = df_system.reset_index(drop = True)
# df_system.to_csv(os.path.join(out_dir,'system.csv'))

### Below code is not related

In [2]:
fmaf = '../maf_out/maf_cohorts_060121/histology/ColoRect-AdenoCA.csv'

In [3]:
df = pd.read_csv(fmaf, sep = '\t')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
df = df[df['Hugo_Symbol'] == 'CCDC168']

In [10]:
df[df['Variant_Classification'].str.contains('Flank')]

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,...,i_signature_R1,i_signature_R2,i_snv_near_indel,t_alt_count,t_ref_count,i_model_score,i_n_vaf,Project_Code,Donor_ID,categ
58369,CCDC168,13,103391599,103391599,+,5'Flank,SNP,C,C,A,...,False,False,False,20.0,60.0,,,ColoRect-AdenoCA,DO10172,4
58370,CCDC168,13,103391658,103391658,+,5'Flank,SNP,C,C,A,...,False,False,False,24.0,73.0,,,ColoRect-AdenoCA,DO10172,4
3604850,CCDC168,13,103389198,103389198,+,5'Flank,SNP,A,A,C,...,False,False,False,35.0,74.0,,,ColoRect-AdenoCA,DO36556,6
3604851,CCDC168,13,103389294,103389294,+,5'Flank,SNP,T,T,C,...,False,False,False,29.0,62.0,,,ColoRect-AdenoCA,DO36556,5
3604852,CCDC168,13,103391153,103391153,+,5'Flank,SNP,T,T,A,...,False,False,False,21.0,39.0,,,ColoRect-AdenoCA,DO36556,6
3604853,CCDC168,13,103391877,103391877,+,5'Flank,SNP,A,A,C,...,False,False,False,30.0,64.0,,,ColoRect-AdenoCA,DO36556,6
3856608,CCDC168,13,103389602,103389602,+,5'Flank,SNP,G,G,T,...,False,False,False,28.0,42.0,,,ColoRect-AdenoCA,DO8898,4
3856609,CCDC168,13,103391205,103391205,+,5'Flank,SNP,C,C,A,...,False,False,False,39.0,52.0,,,ColoRect-AdenoCA,DO8898,4
4396936,CCDC168,13,103389412,103389412,+,5'Flank,SNP,T,T,G,...,False,False,False,6.0,65.0,,,ColoRect-AdenoCA,DO8264,6
4396937,CCDC168,13,103389615,103389615,+,5'Flank,SNP,C,C,A,...,False,False,False,10.0,55.0,,,ColoRect-AdenoCA,DO8264,4


In [11]:
df[df['Variant_Classification'] == "Silent"]

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,...,i_signature_R1,i_signature_R2,i_snv_near_indel,t_alt_count,t_ref_count,i_model_score,i_n_vaf,Project_Code,Donor_ID,categ
4396930,CCDC168,13,103384255,103384255,+,Silent,SNP,G,G,A,...,False,False,False,17.0,64.0,,,ColoRect-AdenoCA,DO8264,3
4396934,CCDC168,13,103386553,103386553,+,Silent,SNP,T,T,C,...,False,False,False,11.0,67.0,,,ColoRect-AdenoCA,DO8264,5
6972150,CCDC168,13,103383811,103383811,+,Silent,SNP,T,T,C,...,False,False,False,8.0,88.0,,,ColoRect-AdenoCA,DO9042,5
7200080,CCDC168,13,103382662,103382662,+,Silent,SNP,A,A,G,...,False,False,False,15.0,54.0,,,ColoRect-AdenoCA,DO9876,5
7200081,CCDC168,13,103385743,103385743,+,Silent,SNP,C,C,T,...,False,False,False,8.0,92.0,,,ColoRect-AdenoCA,DO9876,3
7200083,CCDC168,13,103386634,103386634,+,Silent,SNP,C,C,T,...,False,False,False,5.0,79.0,,,ColoRect-AdenoCA,DO9876,3
