In [1]:
### This script processes MAF files. Prepare multiple cores for patients with a large number of mutations; a full run may take >12 hrs.
### 1. MAF files are split into individual patient files
### 2. Mutation categories are assigned to each individual patient file
### 3. Individual patient files are merged into per-histology files

In [1]:
from functools import partial
from maf_utils import*

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
### Define maf
class maf_process:
    """Three-stage MAF processing pipeline configured from a parameter dict.

    The stages are separate methods so that each can be run on its own:

    1. ``run_split_patient``  -- calls ``split_patient`` on every file found in
       ``dir_maf_split``.
    2. ``run_assign_categ``   -- calls ``categ_assign`` for each entry of a
       pickled run list.
    3. ``load_histology_info`` followed by ``merge_maf`` -- concatenates the
       per-patient category files of one histology into a gzipped,
       tab-separated file.

    ``split_patient`` and ``categ_assign`` are not defined in this class; they
    are expected to already be in scope (presumably via ``maf_utils``).
    """

    def __init__(self, params):
        """Record the configured paths and create all output directories.

        Parameters
        ----------
        params : dict
            Must contain the keys ``'dir_maf_split'``, ``'dir_out'``,
            ``'dir_out_intermediate'`` and ``'parallelize_core'``.

        Raises
        ------
        KeyError
            If one of the required keys is missing from ``params``.
        """
        self.dir_maf_split = params['dir_maf_split']
        self.dir_out = params['dir_out']
        self.dir_out_intermediate = params['dir_out_intermediate']
        self.dir_out_intermediate_ind_split = os.path.join(params['dir_out_intermediate'],'individual/split/')
        self.dir_out_intermediate_ind_categ = os.path.join(params['dir_out_intermediate'],'individual/categ/')
        self.ncore = params['parallelize_core']  # stored only; no method of this class reads it

        ### reference folder and files
        self.dir_ref = '../data/proc_refs/'
        self.fname_patient_list = 'list_all_patients_102121.pkl'
        self.histology_dfname = 'histology.csv'
        self.histology_nohype_dfname = 'histology_nohypermutator.csv'
        self.fname_lgene_old = 'gene_name_list_062121.pkl' # this is gene name list only, the old one has 19225 genes
        self.fname_lgene_new = 'gene_name_list_102121.pkl' # the new one has 18000ish genes

        ### output folder and intermediate folders
        for directory in (self.dir_out,
                          self.dir_out_intermediate,
                          self.dir_out_intermediate_ind_split,
                          self.dir_out_intermediate_ind_categ):
            # exist_ok avoids the check-then-create race of os.path.exists + makedirs
            os.makedirs(directory, exist_ok=True)

    @staticmethod
    def _load_pickle(path):
        """Load one pickled object from ``path`` and close the file afterwards."""
        # NOTE(security): pickle.load can execute arbitrary code -- only point
        # this at reference files that you produced yourself.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    ###-------------------------
    # 1. First create intermediate file individual patient maf files
    ###-------------------------
    def run_split_patient(self):
        """Call ``split_patient`` once for every entry of ``dir_maf_split``.

        Runs sequentially on purpose: multiprocessing was avoided here because
        of concern that concurrent file reads/writes would conflict.
        """
        # get the list of split maf files
        lmaf_split = os.listdir(self.dir_maf_split)
        for maf in tqdm(lmaf_split):
            split_patient(maf,dir_maf = self.dir_maf_split, dir_out = self.dir_out_intermediate_ind_split)
        print('Finish making individual patient maf files...')

    ###-------------------------
    # 2. Assign mutation category to individual patients
    ###-------------------------
    def run_assign_categ(self):
        """Call ``categ_assign`` for every entry of the pickled run list.

        Two pickles are read:

        * the full patient list from ``dir_ref`` (stored on
          ``self.patient_list``; it is not iterated here), and
        * ``'lruncateg.pkl'`` from the current working directory (stored on
          ``self.runcateg``), which is the list that is actually processed.

        A one-process-per-patient ``multiprocessing`` variant existed earlier
        but was disabled; processing is sequential.
        """
        ### Get the all patient lists
        self.patient_list = self._load_pickle(os.path.join(self.dir_ref, self.fname_patient_list))
        print('Start assigning categ to individual patients...')

        self.runcateg = self._load_pickle('lruncateg.pkl')

        ## Not multiprocessing
        for patients in tqdm(self.runcateg):
            categ_assign(patients,
                         dir_ind = self.dir_out_intermediate_ind_split,
                         dir_categ_out = self.dir_out_intermediate_ind_categ)

        print('Finish assigning categ to individual patients...')

    ###-------------------------
    # 3. Merge individual mutation file to histology mutation files
    ###-------------------------
    def load_histology_info(self, gene_name = 'old'):
        """Load the two histology cohort tables and one gene-name list.

        Parameters
        ----------
        gene_name : {'old', 'new'}
            Selects which pickled gene-name list in ``dir_ref`` is loaded into
            ``self.gene_name_list``.

        Raises
        ------
        ValueError
            If ``gene_name`` is neither ``'old'`` nor ``'new'``. Previously such
            a value was silently ignored, leaving ``gene_name_list`` undefined
            and causing a later failure in ``merge_maf(filter_gene=True)``.
        """
        gene_list_files = {'old': self.fname_lgene_old, 'new': self.fname_lgene_new}
        if gene_name not in gene_list_files:
            raise ValueError(
                f"gene_name must be one of {sorted(gene_list_files)}, got {gene_name!r}")

        print('Loading histology cohort & gene reference data')
        self.histology_df = pd.read_csv(os.path.join(self.dir_ref,self.histology_dfname))
        self.histology_nohype_df = pd.read_csv(os.path.join(self.dir_ref,self.histology_nohype_dfname))
        self.gene_name_list = self._load_pickle(os.path.join(self.dir_ref, gene_list_files[gene_name]))
        print('Finish Loading histology cohort & gene reference data')

    # merge patients
    def merge_maf(self, feature, hypermutator = False, filter_gene = False):
        """Merge the per-patient category files of one histology into one file.

        ``load_histology_info`` must have been called first, because the cohort
        tables (and, with ``filter_gene``, the gene list) are read from ``self``.

        Parameters
        ----------
        feature : str
            Value of the ``'histology'`` column to select; also the stem of
            the output file ``<feature>.csv.gz``.
        hypermutator : bool
            When True the *no-hypermutator* cohort table is used and output
            goes to ``dir_out + '_nohypermutator'``; otherwise the full cohort
            table and ``dir_out`` are used.
        filter_gene : bool
            When True only rows whose ``'Hugo_Symbol'`` is in
            ``self.gene_name_list`` are kept.

        Notes
        -----
        Nothing is written when the output file already exists or when the
        selected cohort table has no patient for ``feature``.
        """
        if hypermutator:
            df_sample = self.histology_nohype_df
            self.dir_out_merged = self.dir_out+'_nohypermutator'
        else:
            df_sample = self.histology_df
            self.dir_out_merged = self.dir_out
        os.makedirs(self.dir_out_merged, exist_ok=True)

        fname_merged = os.path.join(self.dir_out_merged, feature+'.csv.gz')
        if os.path.exists(fname_merged):
            print(f'exists{feature}')
            return

        # Get patient list for histology
        lp = df_sample.loc[df_sample['histology'] == feature, 'tumor_aliquot_id'].unique()
        if len(lp) == 0:
            # pd.concat raises ValueError on an empty list, so stop early instead
            print(f'No patients found for {feature}, nothing to merge')
            return

        # read patient files
        ldf = []
        for p in tqdm(lp):
            df_maf_ind = pd.read_csv(
                os.path.join(self.dir_out_intermediate_ind_categ, p+'.to_merge.categ.csv'),
                index_col = 0)
            if filter_gene:
                df_maf_ind = df_maf_ind[df_maf_ind['Hugo_Symbol'].isin(self.gene_name_list)] ## Filter genes
            ldf.append(df_maf_ind)

        df_maf_merged = pd.concat(ldf, axis = 0)
        df_maf_merged['categ'] = df_maf_merged['categ'].astype(int)
        df_maf_merged.to_csv(fname_merged, sep = '\t', index = False,
                             chunksize=100000, compression='gzip', encoding='utf-8')

        print('Finish Merging...')

In [11]:
# Pipeline configuration consumed by maf_process(...)
maf_params = dict(
    dir_maf_split='../maf_raw/maf_split',
    dir_out_intermediate='../data/maf/intermediate',
    dir_out='../data/maf/histology',
    parallelize_core=6,
)

In [None]:
### Stage 1: split the MAF files into per-patient files.
### Skip this cell if the split has already been run.
res = maf_process(maf_params)
res.run_split_patient()

In [None]:
### Stage 2: assign mutation categories to the individual patient files.
### Skip this cell if the assignment has already been run.
res = maf_process(maf_params)
res.run_assign_categ()

In [5]:
### Stage 3: merge each patient's categorised mutations into one file per histology cohort
maf_params = dict(
    dir_maf_split='../maf_raw/maf_split',
    dir_out_intermediate='../data/maf/intermediate',
    dir_out='../data/maf/histology_all',
    parallelize_core=6,
)
# every distinct histology label in the cohort table is merged in turn
df_cohort = pd.read_csv('../data/proc_refs/histology.csv')
lfeat = df_cohort['histology'].unique()
print(lfeat)

res = maf_process(maf_params)
res.load_histology_info(gene_name='new')

for histologies in lfeat:
    res.merge_maf(histologies, hypermutator=False, filter_gene=True)
# for histologies in lfeat:
#     res.merge_maf(histologies, True)

  2%|▏         | 2/109 [00:00<00:07, 15.03it/s]

['Ovary-AdenoCA' 'CNS-PiloAstro' 'Liver-HCC' 'CNS-Oligo' 'Panc-Endocrine'
 'Kidney-RCC' 'Prost-AdenoCA' 'Thy-AdenoCA' 'ColoRect-AdenoCA'
 'Lymph-BNHL' 'Uterus-AdenoCA' 'Breast-AdenoCA' 'Lung-AdenoCA'
 'Panc-AdenoCA' 'Eso-AdenoCA' 'Head-SCC' 'CNS-Medullo' 'CNS-GBM'
 'SoftTissue-Leiomyo' 'Cervix-SCC' 'Skin-Melanoma' 'Lymph-CLL'
 'SoftTissue-Liposarc' 'Kidney-ChRCC' 'Stomach-AdenoCA' 'Lung-SCC'
 'Bladder-TCC' 'Myeloid-AML' 'Biliary-AdenoCA' 'Breast-LobularCA'
 'Cervix-AdenoCA' 'Bone-Osteosarc' 'Breast-DCIS' 'Myeloid-MPN'
 'Myeloid-MDS' 'Bone-Cart' 'Bone-Osteoblast' 'Bone-Epith' 'Bone-Benign']
Loading histology cohort & gene reference data
Finish Loading histology cohort & gene reference data


100%|██████████| 109/109 [00:05<00:00, 18.47it/s]
  4%|▍         | 4/89 [00:00<00:02, 34.49it/s]

Finish Merging...


100%|██████████| 89/89 [00:02<00:00, 31.52it/s]
  0%|          | 1/312 [00:00<00:45,  6.77it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 312/312 [00:36<00:00,  8.49it/s]
 11%|█         | 2/18 [00:00<00:00, 19.94it/s]

Finish Merging...


100%|██████████| 18/18 [00:00<00:00, 20.58it/s]
  4%|▎         | 3/81 [00:00<00:03, 22.65it/s]

Finish Merging...


100%|██████████| 81/81 [00:04<00:00, 18.37it/s]
  1%|▏         | 2/143 [00:00<00:08, 16.31it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 143/143 [00:11<00:00, 12.67it/s]
  2%|▏         | 3/199 [00:00<00:08, 23.91it/s]

Finish Merging...


100%|██████████| 199/199 [00:10<00:00, 19.42it/s]
  8%|▊         | 4/48 [00:00<00:01, 36.47it/s]

Finish Merging...


100%|██████████| 48/48 [00:01<00:00, 28.30it/s]
  0%|          | 0/52 [00:00<?, ?it/s]

Finish Merging...


  mask |= (ar1 == a)
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 52/52 [00:45<00:00,  1.15it/s]
  0%|          | 0/107 [00:00<?, ?it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 107/107 [00:11<00:00,  9.08it/s]
  5%|▍         | 2/43 [00:00<00:02, 17.49it/s]

Finish Merging...


100%|██████████| 43/43 [00:06<00:00,  6.21it/s]
  1%|          | 1/193 [00:00<00:31,  6.16it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 193/193 [00:15<00:00, 12.43it/s]
  0%|          | 0/37 [00:00<?, ?it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 37/37 [00:08<00:00,  4.49it/s]
  0%|          | 0/230 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 230/230 [00:18<00:00, 12.75it/s]
  0%|          | 0/97 [00:00<?, ?it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 97/97 [00:19<00:00,  5.04it/s]
  4%|▎         | 2/56 [00:00<00:04, 11.03it/s]

Finish Merging...


100%|██████████| 56/56 [00:07<00:00,  7.76it/s]
  2%|▏         | 3/141 [00:00<00:05, 24.03it/s]

Finish Merging...


100%|██████████| 141/141 [00:06<00:00, 23.49it/s]
  5%|▌         | 2/39 [00:00<00:02, 13.12it/s]

Finish Merging...


100%|██████████| 39/39 [00:04<00:00,  9.21it/s]
 13%|█▎        | 2/15 [00:00<00:01, 12.40it/s]

Finish Merging...


100%|██████████| 15/15 [00:01<00:00, 14.02it/s]
  6%|▌         | 1/18 [00:00<00:01,  9.07it/s]

Finish Merging...


100%|██████████| 18/18 [00:01<00:00, 13.49it/s]
  0%|          | 0/106 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 106/106 [01:02<00:00,  1.70it/s]
  3%|▎         | 3/90 [00:00<00:03, 23.29it/s]

Finish Merging...


100%|██████████| 90/90 [00:04<00:00, 21.17it/s]
 11%|█         | 2/19 [00:00<00:01, 16.27it/s]

Finish Merging...


100%|██████████| 19/19 [00:01<00:00, 14.04it/s]
  2%|▏         | 1/43 [00:00<00:04,  8.98it/s]

Finish Merging...


100%|██████████| 43/43 [00:02<00:00, 20.64it/s]
  3%|▎         | 2/68 [00:00<00:05, 11.44it/s]

Finish Merging...


  if (await self.run_code(code, result,  async_=asy)):
100%|██████████| 68/68 [00:12<00:00,  5.64it/s]
  0%|          | 0/47 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 47/47 [00:12<00:00,  3.67it/s]
  4%|▍         | 1/23 [00:00<00:02,  8.41it/s]

Finish Merging...


100%|██████████| 23/23 [00:03<00:00,  6.21it/s]
 15%|█▌        | 2/13 [00:00<00:00, 19.81it/s]

Finish Merging...


100%|██████████| 13/13 [00:00<00:00, 22.45it/s]
  3%|▎         | 1/33 [00:00<00:05,  6.36it/s]

Finish Merging...


100%|██████████| 33/33 [00:04<00:00,  7.51it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.03it/s]

Finish Merging...


100%|██████████| 13/13 [00:01<00:00, 12.77it/s]
100%|██████████| 2/2 [00:00<00:00, 14.01it/s]

Finish Merging...



  6%|▌         | 2/35 [00:00<00:02, 13.66it/s]

Finish Merging...


100%|██████████| 35/35 [00:02<00:00, 15.71it/s]
100%|██████████| 3/3 [00:00<00:00, 23.93it/s]

Finish Merging...



 17%|█▋        | 4/23 [00:00<00:00, 32.03it/s]

Finish Merging...


100%|██████████| 23/23 [00:00<00:00, 25.60it/s]
100%|██████████| 2/2 [00:00<00:00, 21.14it/s]
  0%|          | 0/9 [00:00<?, ?it/s]

Finish Merging...
Finish Merging...


100%|██████████| 9/9 [00:00<00:00, 26.03it/s]
100%|██████████| 5/5 [00:00<00:00, 29.48it/s]

Finish Merging...
Finish Merging...



100%|██████████| 10/10 [00:00<00:00, 22.05it/s]
100%|██████████| 1/1 [00:00<00:00, 52.32it/s]

Finish Merging...
Finish Merging...





***Directly split maf file from icgc and tcga maf files***

In [1]:
import os,pickle
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory from which the icgc.maf and tcga.maf files are read below
dir_maf = '../maf_raw/'

In [None]:
# Load the ICGC MAF (tab-separated) from dir_maf into a single DataFrame
dficgc = pd.read_csv(os.path.join(dir_maf, 'icgc.maf'), sep = '\t')

In [6]:
# Split the ICGC MAF into one CSV per 'Tumor_Sample_Barcode'.
# The write had been commented out, which left the `if` branch without a body
# (a syntax error). It is restored here, but files that already exist are left
# untouched so re-running the cell does not overwrite a previous split.
maf_split = [pd.DataFrame(y) for x, y in dficgc.groupby('Tumor_Sample_Barcode', as_index=False)]
for df_split in tqdm(maf_split):
    patient = df_split['Tumor_Sample_Barcode'].unique().tolist()
    if len(patient) == 1:
        fname_split = os.path.join('../data/maf/intermediate/individual/split', patient[0]+'.to_merge.csv')
        if not os.path.exists(fname_split):
            df_split.to_csv(fname_split)
    else:
        # a group should hold exactly one barcode; report anything else
        print(f'ERROR{patient}')

100%|██████████| 1950/1950 [06:20<00:00,  5.13it/s]


In [3]:
# Load the TCGA MAF (tab-separated) from dir_maf into a single DataFrame
dftcga = pd.read_csv(os.path.join(dir_maf, 'tcga.maf'), sep = '\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Split the TCGA MAF into one CSV per 'Tumor_Sample_Barcode'.
# The write had been commented out, which left the `if` branch without a body
# (a syntax error). It is restored here, but files that already exist are left
# untouched so re-running the cell does not overwrite a previous split.
maf_split = [pd.DataFrame(y) for x, y in dftcga.groupby('Tumor_Sample_Barcode', as_index=False)]
for df_split in tqdm(maf_split):
    patient = df_split['Tumor_Sample_Barcode'].unique().tolist()
    if len(patient) == 1:
        fname_split = os.path.join('../data/maf/intermediate/individual/split', patient[0]+'.to_merge.csv')
        if not os.path.exists(fname_split):
            df_split.to_csv(fname_split)
    else:
        # a group should hold exactly one barcode; report anything else
        print(f'ERROR{patient}')

100%|██████████| 828/828 [08:29<00:00,  1.63it/s]  
