In [1]:
### This notebook specifies which patients are used and the cohort each patient belongs to
#1. Identification of white-list patients and their cohorts
    # 1.1 White-list patients -- FROM PCAWG PAPER SUPP1
    # 1.2 Patients that have coverage information
#2. Identification of hypermutated patients, and filtering them out of the cohorts to generate no-hypermutator cohort files
#3. Generate cohort files
#4. Save ICGC and TCGA patient lists

In [1]:
import os,pickle
import pandas as pd

In [2]:
### Input / output locations used throughout the notebook
finfo = 'PCAWG_sample_info.txt'  # file name of the PCAWG sample-info table
dir_anno = '../data/anno_refs'  # annotation directory (the sample-info table is read from here)
dir_proc = '../data/proc_refs'  # directory the pickled wig file list is loaded from

dir_out = '../data/proc_refs/'  # output directory

# Patient wig directory
# The wig files take a lot of storage, so their directory is not permanent;
# the list of file names is pickled instead of keeping the files around.
# The two lines below only need to be run once:
# dir_wig = '/gpfs/scratch/yur97/compressed_coverage_tracks'
# pickle.dump(os.listdir(dir_wig), open(os.path.join(dir_anno, 'wigf_list.pkl'),'wb'))
# NOTE(review): the dump above writes into dir_anno, while the loading cell reads
# 'wigf_list.pkl' from dir_proc -- confirm which location is intended.

***1.1 All white list patients from the PCAWG sample-info file***

In [3]:
# Read the PCAWG sample-info table (tab-separated) and collect the distinct
# tumour aliquot ids it lists; these are the white-list patients.
info_path = os.path.join(dir_anno, finfo)
df_info = pd.read_csv(info_path, sep='\t')
aliquot_ids = df_info['tumour_specimen_aliquot_id']
listPat_info = aliquot_ids.unique().tolist()

In [4]:
# Report how many distinct aliquot ids the info file contains.
n_whitelist = len(listPat_info)
print(f'{n_whitelist} whitelist PCAWG in info file')

2583 whitelist PCAWG in info file


***1.2 All patients having wig files***

In [5]:
# Load the pickled list of wig file names.
# The context manager closes the file handle as soon as the list has been read
# (the bare `pickle.load(open(...))` form left it open).
# NOTE: pickle.load can execute arbitrary code -- only load a pickle produced by this project.
with open(os.path.join(dir_proc, 'wigf_list.pkl'), 'rb') as fh:
    lwigf = pickle.load(fh)
# The patient id is the part of each file name before the first '.'
listPat_wig = [fname.split('.')[0] for fname in lwigf]

In [7]:
### The final patient list: white-list patients that also have a wig file
# sorted() makes the list order reproducible. Iterating a set of strings can yield a
# different order in every kernel session (string hash randomization), which would make
# the pickled list differ between runs. Every element of the intersection is also in
# listPat_wig (built with str.split), so all elements are strings and sort cleanly.
lfinalp = sorted(set(listPat_info) & set(listPat_wig))
print("Number of white-list patients with wig files are: " + str(len(lfinalp)))
# lfinalp.remove('2bde43e2-fd4e-413e-adc7-c50bca75d358') ### Why did I do this?
# pickle the final patient list
# pickle.dump(lfinalp, open(os.path.join(dir_out,'list_all_patients_102121.pkl'),'wb'))

Number of white-list patients with wig files are: 2572


***2. Identify and filter hypermutators*** -- defined as > 50000 mutations (SNVs, MNVs, indels) per genome

In [24]:
### Select the rows of the sample-info table whose summed SNV + MNV + indel count exceeds 50000
n_mutations = df_info['all.SNVs'] + df_info['all.MNVs'] + df_info['all.Indels']
is_hyper = n_mutations > 50000
df_hype = df_info[is_hyper]
# Distinct aliquot ids of the selected rows
listPat_hype = df_hype['tumour_specimen_aliquot_id'].unique().tolist()

In [25]:
# Print the number of rows flagged as hypermutators, then display the
# per-histology row counts as the cell output.
n_hype = len(df_hype)
print(f'{n_hype} patients are hypermutators, break down into histology:')
df_hype.groupby('histology_abbreviation').size()

139 patients are hypermutators, break down into histology:


histology_abbreviation
Biliary-AdenoCA      1
Bladder-TCC          2
Breast-AdenoCA       2
CNS-GBM              1
ColoRect-AdenoCA     9
Eso-AdenoCA         10
Head-SCC             5
Liver-HCC            2
Lung-AdenoCA         8
Lung-SCC            18
Lymph-BNHL           2
Ovary-AdenoCA        1
Panc-AdenoCA         1
Skin-Melanoma       66
Stomach-AdenoCA      7
Uterus-AdenoCA       4
dtype: int64

***3. Save cohort files***

**3.1 Save all patient-cohort files**

In [26]:
### Get aliquot_id, donor_id, histology abbreviation df
df_his = df_info[['tumour_specimen_aliquot_id','icgc_donor_id','histology_abbreviation']]
df_his.columns = ['tumor_aliquot_id', 'donor_id','histology']
df_his = df_his[df_his['tumor_aliquot_id'].isin(lfinalp)]
df_his = df_his.reset_index(drop = True)
### Save the filtered histologies
# df_his.to_csv(os.path.join(dir_out,'histology.csv'))

In [28]:
# df_his.to_csv(os.path.join(dir_out,'histology.csv'))

**3.2 Save the no-hypermutator version of the patient-cohort files**

In [30]:
# Drop every patient flagged as a hypermutator and renumber the rows from 0.
is_hypermutator = df_his['tumor_aliquot_id'].isin(listPat_hype)
df_his_nohypermutator = df_his[~is_hypermutator].reset_index(drop=True)
# df_his_nohypermutator.to_csv(os.path.join(dir_out,'histology_nohypermutator.csv'))

***4. Save ICGC and TCGA patient list***  
These lists contain more patients than the final patient list above.

In [None]:
### Get the ICGC and TCGA patients (tumor sample barcodes) from the raw MAF files
# tqdm is used for the progress bar but is not imported in the notebook's import cell,
# so this cell raised NameError on a fresh kernel. Import it here; if tqdm is not
# installed, fall back to plain iteration so the cell still runs.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Fallback when tqdm is unavailable: return the iterable unchanged (no progress bar)."""
        return iterable

dir_maf = '../maf_raw/'


def _collect_sample_barcodes(maf_path, chunksize=10000):
    """Collect the Tumor_Sample_Barcode values of a tab-separated MAF file.

    The file is read in chunks of `chunksize` rows, and only the
    'Tumor_Sample_Barcode' column is parsed.

    Parameters
    ----------
    maf_path : str
        Path of the MAF file to read.
    chunksize : int, default 10000
        Number of rows read per chunk.

    Returns
    -------
    list
        The unique barcodes of each chunk, concatenated in file order. A barcode
        that occurs in several chunks appears once per chunk, so the result may
        still contain duplicates.
    """
    barcodes = []
    reader = pd.read_csv(
        maf_path,
        sep='\t',
        usecols=['Tumor_Sample_Barcode'],  # only this column is needed
        chunksize=chunksize,
    )
    for chunk in tqdm(reader):
        barcodes.extend(chunk['Tumor_Sample_Barcode'].unique().tolist())
    return barcodes


licgc = _collect_sample_barcodes(os.path.join(dir_maf, 'icgc.maf'))
ltcga = _collect_sample_barcodes(os.path.join(dir_maf, 'tcga.maf'))

In [None]:
# De-duplicate each barcode list and pickle it.
# The context managers flush and close each output file as soon as it is written
# (the bare `pickle.dump(obj, open(...))` form left the handles open).
licgc = list(set(licgc))
with open('../data/proc_refs/list_patient_icgc.pkl', 'wb') as fh:
    pickle.dump(licgc, fh)
ltcga = list(set(ltcga))
with open('../data/proc_refs/list_patient_tcga.pkl', 'wb') as fh:
    pickle.dump(ltcga, fh)