In [17]:
### This script is for maf file processing
### 1. MAF files are splitted into individual files
### 2. Mutation categories are assigned to individuals
### 3. Individual files are merged into histology files

In [18]:
from functools import partial
from maf_utils import*
import multiprocessing as mp

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
### Define maf
class maf_process:
    def __init__(self, params):
        self.dir_maf_split = params['dir_maf_split']
        self.dir_out = params['dir_out']
        self.dir_out_intermediate = params['dir_out_intermediate']
        self.dir_out_intermediate_ind_split = os.path.join(params['dir_out_intermediate'],'individual/split/')
        self.dir_out_intermediate_ind_categ = os.path.join(params['dir_out_intermediate'],'individual/categ/')
        self.ncore = params['parallelize_core']
        
        ### reference folder and files
        self.dir_ref = '../data/proc_refs/'
        self.fname_patient_list = 'list_all_patients_102121.pkl'
        self.histology_dfname = 'histology.csv'
        self.histology_nohype_dfname = 'histology_nohypermutator.csv'
        ### output folder and intermediate folder
        if not os.path.exists(self.dir_out):
            os.makedirs(self.dir_out)
        if not os.path.exists(self.dir_out_intermediate):
            os.makedirs(self.dir_out_intermediate)
        if not os.path.exists(self.dir_out_intermediate_ind_split):
            os.makedirs(self.dir_out_intermediate_ind_split)
        if not os.path.exists(self.dir_out_intermediate_ind_categ):
            os.makedirs(self.dir_out_intermediate_ind_categ)
            
    ###-------------------------
    # 1. First create intermediate file individual patient maf files
    ###-------------------------    
    def run_split_patient(self): # not using multiprocessing...afrain file read/write will confilct
        # get the list of splitted maf files
        lmaf_split = os.listdir(self.dir_maf_split)
        for maf in tqdm(lmaf_split):
            split_patient(maf,dir_maf = self.dir_maf_split, dir_out = self.dir_out_intermediate_ind_split)
        print('Finish making individual patient maf files...')
    
    ###-------------------------
    # 2. Assign mutation category to individual patients
    ###-------------------------    
    def run_assign_categ(self):
        ### Get the all patient lists
        self.patient_list = pickle.load(open(os.path.join(self.dir_ref,self.fname_patient_list), 'rb'))
        print('Start assigning categ to individual patients...')
        processes = []
        
        ### Multiprocessing
        for patients in self.patient_list:
            p = mp.Process()
            function1 = partial(categ_assign,dir_ind = self.dir_out_intermediate_ind_split,\
                           dir_categ_out = self.dir_out_intermediate_ind_categ)
            p = mp.Process(target=function1, args=(patients,))
            processes.append(p)
        [x.start() for x in processes]

        
        ### Not multiprocessing
        # for patients in self.patient_list:
        #     categ_assign(patients,dir_ind = self.dir_out_intermediate_ind_split,\
        #                    dir_categ_out = self.dir_out_intermediate_ind_categ )

        print('Finish assigning categ to individual patients...')
    
    ###-------------------------
    # 3. Merge individual mutation file to histology mutation files
    ###-------------------------    
    def load_histology_info(self):
        print('Loading histology cohort & gene reference data')
        self.histology_df = pd.read_csv(os.path.join(self.dir_ref,self.histology_dfname))
        self.histology_nohype_df = pd.read_csv(os.path.join(self.dir_ref,self.histology_nohype_dfname))
        self.gene_name_list = pickle.load(open(os.path.join(self.dir_ref, self.fname_lgene),'rb'))
        print('Finish Loading histology cohort & gene reference data')
    
    # merge patients
    def merge_maf(self, feature, hypermutator = False):
        if hypermutator:
            df_sample = self.histology_nohype_df
            self.dir_out_merged = self.dir_out+'_nohypermutator'
            if not os.path.exists(self.dir_out_merged):
                os.makedirs(self.dir_out_merged)
        else: 
            df_sample = self.histology_df
            self.dir_out_merged = self.dir_out

        if os.path.exists(os.path.join(self.dir_out_merged,feature+'.csv')):
            print(f'exists{feature}')
            return

        # Initialize list for append
        ldf = []

        # Get patient list for histology
        df_histology = df_sample[df_sample['histology'] == feature]
        lp = df_histology['tumor_aliquot_id'].unique()

        #read patient file
        for p in tqdm(lp):
            df_maf_ind = pd.read_csv(os.path.join(self.dir_out_intermediate_ind_categ,p+'.to_merge.categ.csv'), index_col = 0)
            df_maf_ind = df_maf_ind[df_maf_ind['Hugo_Symbol'].isin(self.gene_name_list)] ## Filter genes
            ldf.append(df_maf_ind)

        df_maf_merged = pd.concat(ldf, axis = 0)
        df_maf_merged.to_csv(os.path.join(self.dir_out_merged,feature+'.csv'),sep='\t', index = False)
        print(f'Finish Merging...')
        

In [21]:
maf_params = {'dir_maf_split': '../maf_raw/maf_split',
              'dir_out_intermediate': '../data/maf/intermediate',
              'dir_out': '../data/maf/histology',
              'parallelize_core':6} 

In [22]:
res =  maf_process(maf_params)

In [None]:
### Split patient file, don't run if you already run
res.run_split_patient()

In [23]:
### Assign categ to individual patient files, don't run if you already run
res.run_assign_categ()

Start assigning categ to individual patients...
../data/maf/intermediate/individual/categ/93ff786e-0165-4b02-8d27-806d422e93fc.to_merge.categ.csv
../data/maf/intermediate/individual/categ/14c5b81d-da49-4db1-9834-77711c2b1d38.to_merge.categ.csv
../data/maf/intermediate/individual/categ/c8e961b4-e324-40a2-89f6-736ec3845bc9.to_merge.categ.csv
../data/maf/intermediate/individual/categ/2df02f2b-9f1c-4249-b3b4-b03079cd97d9.to_merge.categ.csv
../data/maf/intermediate/individual/categ/978ae91e-6ebe-4efa-97ff-cfad511ae7b3.to_merge.categ.csv
../data/maf/intermediate/individual/categ/98e8f23c-5970-4fce-9551-4b11a772fe1b.to_merge.categ.csv
../data/maf/intermediate/individual/categ/60413de1-6cd2-4f74-8180-3bdd394d6d16.to_merge.categ.csv


  code = process_obj._bootstrap()
  code = process_obj._bootstrap()
  0%|          | 215/508812 [00:03<1:49:42, 77.26it/s]

Finish assigning categ to individual patients...


  code = process_obj._bootstrap()
  code = process_obj._bootstrap()
  mask |= (ar1 == a)
  code = process_obj._bootstrap()
  code = process_obj._bootstrap()
  code = process_obj._bootstrap()
  mask |= (ar1 == a)
  3%|▎         | 28182/875500 [11:35<7:33:42, 31.13it/s]s]

In [None]:
df_cohort = pd.read_csv(os.path.join('../data/proc_refs/histology.csv'))
lfeat = df_cohort['histology'].unique()
print(lfeat)
res = cov_process(cov_params)
res.load_histology_info()

for histologies in lfeat:
    res.merge_maf(histologies, False)

In [None]:
for histologies in lfeat:
    res.merge_maf(histologies, True)