***This notebook calculates patient coverage in 4 steps***
1. Get patient 0 position
2. Calculate coverage -- output individual coverage file
3. Merge coverage files
4. Filter the genes in coverage files by gene names

In [1]:
import itertools
import os
import pickle
import time
from utils import *  # kept ahead of the explicit imports below so they take precedence over same-named utils exports

import multiprocessing as mp
import re
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load IPython's autoreload extension and set it to mode 2, so imported modules
# (e.g. the local `utils` module) are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2

In [3]:
### Define cov_process
class cov_process:
    """Patient coverage pipeline.

    Stages (each is a separate method so they can be run independently):
      1. ``run_get_zero_position``   - per patient wig file, pickle the genome
         positions whose coverage value is 0.
      2. ``load_refs`` + ``run_coverage_calculation`` - compute coverage per
         (transcript, patient) pair and write one tab-separated file per patient.
      3. ``load_histology_info`` + ``merge_cov`` - merge the per-patient files of
         one histology cohort into a single gzip-compressed table, keeping only
         genes present in the selected gene-name list.

    Parameters
    ----------
    params : dict
        Required keys: ``'dir_wig'`` (input wig/coverage folder), ``'dir_out'``
        (merged output folder), ``'dir_out_intermediate'`` (intermediate folder)
        and ``'parallelize_core'`` (number of worker processes).
    """

    def __init__(self, params):
        """Store paths/settings, create the output folders and list the wig files."""
        self.dir_wig = params['dir_wig']
        self.dir_out = params['dir_out']
        self.dir_out_intermediate = params['dir_out_intermediate']
        self.dir_out_intermediate_ind = os.path.join(params['dir_out_intermediate'], 'individual/')
        self.dir_out_intermediate_pos = os.path.join(params['dir_out_intermediate'], 'position_dict/')

        self.ncore = params['parallelize_core']
        ### reference folder and files
        self.dir_ref = '../data/proc_refs/'
        self.fname_record = 'dict_record_new.pkl'
        self.fname_transcript_info = 'dict_transcript_info_062121.pkl'
        self.fname_gene_name = 'dict_name_forcov_102121.pkl'  # this is transcript- gene name dict
        self.fname_lgene_old = 'gene_name_list_062121.pkl'  # this is gene name list only, the old one has 19225 genes
        self.fname_lgene_new = 'gene_name_list_102121.pkl'  # the new one has 18000ish genes
        self.histology_dfname = 'histology.csv'
        self.histology_nohype_dfname = 'histology_nohypermutator.csv'

        ### output folder and intermediate folders
        for directory in (self.dir_out,
                          self.dir_out_intermediate,
                          self.dir_out_intermediate_ind,
                          self.dir_out_intermediate_pos):
            os.makedirs(directory, exist_ok=True)

        # get the patient wig files -- as file name list and full path list
        self.lwig = os.listdir(self.dir_wig)  # XXX.coverage
        self.lwig_fullpath = [os.path.join(self.dir_wig, i) for i in self.lwig]  # <dir_wig>/XXX.coverage

    @staticmethod
    def _load_pickle(path):
        """Load one pickle file, closing the handle afterwards.

        NOTE(review): pickle can execute arbitrary code on load; only use this
        on trusted, locally generated files.
        """
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    ###-------------------------
    # 1. First create intermediate file of patient 0 positions
    ###-------------------------
    def _get_zero_position(self, coverage_file):
        """Instance-level wrapper: run ``get_zero_position`` for one file,
        writing into this object's position-dict intermediate folder."""
        return cov_process.get_zero_position(coverage_file, self.dir_out_intermediate_pos)

    @staticmethod
    def _parse_fixedstep(line):
        """Return ``(chromosome, start_position)`` from a ``fixedStep`` declaration line.

        The chromosome is taken from the ``chrom=`` field with a leading ``chr``
        prefix removed, so ``chrom=chr1`` and ``chrom=1`` both give ``'1'`` and
        ``chrom=X`` gives ``'X'``. If the named fields are absent, this falls
        back to the first two integers on the line.
        """
        chrom_match = re.search(r'chrom=(\S+)', line)
        start_match = re.search(r'start=(\d+)', line)
        if chrom_match and start_match:
            chrom = chrom_match.group(1)
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            return chrom, int(start_match.group(1))
        numbers = re.findall(r'\d+', line)
        return numbers[0], int(numbers[1])

    @staticmethod
    def get_zero_position(coverage_file, dir_out):
        """Turn one patient wig file into a pickled dict of zero-coverage positions.

        Writes ``<dir_out>/<id_aliquot>.pkl`` containing
        ``{id_aliquot: {chromosome: [position, ...]}}`` where ``id_aliquot`` is
        the file name up to its first dot. Returns ``None``.

        The first line of the file is skipped as a header. Every following line
        that is not a ``fixed...`` declaration advances the position by one.
        NOTE(review): the ``step=`` field is ignored, i.e. step=1 is assumed -
        confirm against the input files.

        Raises
        ------
        ValueError
            If a data line appears before any ``fixedStep`` declaration.
        """
        id_aliquot = os.path.basename(coverage_file).split('.')[0]
        zero_positions = {}
        chr_n = None
        position = None

        # Plain buffered binary iteration: read-only, works for empty files and
        # releases the handle on exit.
        with open(coverage_file, 'rb') as wig_f:
            next(wig_f, None)  # skip the header line (no error on an empty file)
            for raw_line in wig_f:
                line = raw_line.decode("utf-8")
                if line.startswith('fixed'):
                    chr_n, start_pos = cov_process._parse_fixedstep(line)
                    # the first data line after the declaration lands on start_pos
                    position = start_pos - 1
                    zero_positions.setdefault(chr_n, [])
                    continue
                if chr_n is None:
                    raise ValueError(
                        f'{coverage_file}: data line found before any fixedStep declaration')
                position += 1
                # strip() so a final line without newline (or CRLF endings) still counts
                if line.strip() == '0':
                    zero_positions[chr_n].append(position)

        dict_patient = {id_aliquot: zero_positions}
        with open(os.path.join(dir_out, id_aliquot + '.pkl'), 'wb') as out_f:
            pickle.dump(dict_patient, out_f)

    ### This actually calls the function by multiprocessing, output will be saved to intermediate file folder
    def run_get_zero_position(self):
        """Run ``get_zero_position`` on every wig file using ``self.ncore`` processes."""
        print('Start getting patient coverage in genome positions...')
        worker = partial(cov_process.get_zero_position, dir_out=self.dir_out_intermediate_pos)
        pool = Pool(self.ncore)
        try:
            # The iterator must be consumed: that drives the progress bar and
            # re-raises any exception that happened in a worker process.
            for _ in tqdm(pool.imap(worker, self.lwig_fullpath), total=len(self.lwig_fullpath)):
                pass
        finally:
            pool.close()
            pool.join()
        print('Finish getting patient coverage in genome positions...')

    ###-------------------------
    # 2. Now calculate the coverage of patient -- individual coverage file
    ###-------------------------
    # Load reference data
    def load_refs(self):
        """Load the record, transcript-info and transcript->gene-name pickles from ``self.dir_ref``."""
        print('Loading reference data')
        self.record = self._load_pickle(os.path.join(self.dir_ref, self.fname_record))
        self.transcript_info = self._load_pickle(os.path.join(self.dir_ref, self.fname_transcript_info))
        self.gene_name_dict = self._load_pickle(os.path.join(self.dir_ref, self.fname_gene_name))
        print('Finish Loading reference data')

    def _coverage_calculation(self, params, pat_id=None, patient_middlef=None):
        """Instance-level wrapper around ``coverage_calculation`` using the loaded references.

        ``params`` is a ``(transcript, patient)`` pair. ``pat_id`` defaults to
        ``params[1]``; ``patient_middlef`` defaults to the pickled position dict
        ``<position_dict folder>/<pat_id>.pkl``. Requires ``load_refs`` to have
        been called.
        """
        if pat_id is None:
            pat_id = params[1]
        if patient_middlef is None:
            patient_middlef = self._load_pickle(
                os.path.join(self.dir_out_intermediate_pos, pat_id + '.pkl'))
        return cov_process.coverage_calculation(params, self.gene_name_dict,
                                                self.transcript_info, self.record,
                                                pat_id, patient_middlef)

    # Function of calculating coverage
    @staticmethod
    def coverage_calculation(params, dict_name_forcov, dict_transcript_info, dict_record, pat_id, patient_middlef):  # params:[transcript, patient]
        """Compute the coverage table of one transcript for one patient.

        Parameters
        ----------
        params : sequence
            ``(transcript, patient)``; ``params[0]`` keys the transcript dicts,
            ``params[1]`` keys ``patient_middlef``.
        dict_name_forcov : dict
            transcript -> gene name.
        dict_transcript_info : dict
            transcript -> info dict; the ``'strand'``, ``'chr'`` and
            ``'transcript'`` entries are read here.
        dict_record : object
            Passed through to ``get_transcript_sequence`` (from ``utils``).
        pat_id : str
            Name of the single output column.
        patient_middlef : dict
            patient -> per-chromosome zero-position data, as written by
            ``get_zero_position``.

        Returns
        -------
        pandas.DataFrame
            Indexed by (gene, zone, categ) with zones nonsilent/flank/silent and
            categories 1..7; one column named ``pat_id``.
        """
        transcript = params[0]
        info = dict_transcript_info[transcript]

        # get transcript sequence, cds sequence and positions
        strand = info['strand']
        chromosome = info['chr']
        if chromosome.startswith('chr'):
            # drop only the literal prefix (str.strip('chr') strips characters, not a prefix)
            chromosome = chromosome[3:]
        list_cds = get_mrna_position(transcript, dict_transcript_info, strand)
        seq_transcript = get_transcript_sequence(transcript, dict_transcript_info, dict_record, strand)
        seq_cds = get_cdna_sequence(list_cds, strand, seq_transcript)
        transcript_start_pos = info['transcript'][0]
        dict_position_cov, _flag_lp = calculate_coverage(seq_transcript, seq_cds, list_cds, transcript_start_pos)

        # Initialize empty dataframe
        name_gene = dict_name_forcov[transcript]
        zone = ['nonsilent', 'flank', 'silent']
        categ = [1, 2, 3, 4, 5, 6, 7]
        idx = pd.MultiIndex.from_tuples(list(itertools.product([name_gene], zone, categ)))
        df_out = pd.DataFrame(index=idx, columns=[pat_id])

        for keys in dict_position_cov:
            coverage_patient = calculate_patient(dict_position_cov[keys], patient_middlef[params[1]], chromosome)
            for i in range(1, 8):
                df_out.loc[(name_gene, keys, i), :] = coverage_patient[i - 1]

        return df_out

    # call function of calculating coverage, it goes 1 by 1 patient
    def run_coverage_calculation(self, transcript_list, max_pairs=3):
        """Write one tab-separated coverage file per patient wig file.

        Parameters
        ----------
        transcript_list : iterable or 'all'
            Transcripts to process; the string ``'all'`` selects every key of
            the transcript->gene-name dict loaded by ``load_refs``.
        max_pairs : int or None, default 3
            Upper bound on the number of (transcript, patient) pairs processed
            per patient file. The default keeps the original example-sized run;
            pass ``None`` to process every pair.
        """
        # Determine the scope of coverage calculation - if all, calculate for all genes
        if isinstance(transcript_list, str) and transcript_list == 'all':
            transcript_list = self.gene_name_dict.keys()

        for p in self.lwig:
            pat = p.split('.')[0]
            # load patient intermediate file written by get_zero_position
            patient_intermediate = self._load_pickle(
                os.path.join(self.dir_out_intermediate_pos, pat + '.pkl'))
            list_patient = list(patient_intermediate.keys())
            paramlist = list(itertools.product(transcript_list, list_patient))  # parameter list transcript-patient

            # Start processing
            start1 = time.time()
            list_res_cov = [
                cov_process.coverage_calculation(param, self.gene_name_dict,
                                                 self.transcript_info, self.record,
                                                 pat, patient_intermediate)
                for param in paramlist[:max_pairs]
            ]
            df_res_cov = pd.concat(list_res_cov)
            df_res_cov.to_csv(os.path.join(self.dir_out_intermediate_ind, pat + '.csv'), sep='\t')
            end1 = time.time()
            print(f"finish: {pat}")
            print(f'time used: {end1 - start1}')

    ###-------------------------
    # 3. Merge the coverage files by cohort
    ###-------------------------
    def load_histology_info(self, gene_name='old'):
        """Load both histology tables and the selected gene-name list.

        Parameters
        ----------
        gene_name : {'old', 'new'}
            Which gene-name list pickle to load.

        Raises
        ------
        ValueError
            If ``gene_name`` is neither ``'old'`` nor ``'new'``.
        """
        if gene_name not in ('old', 'new'):
            raise ValueError(f"gene_name must be 'old' or 'new', got {gene_name!r}")

        print('Loading histology cohort & gene reference data')
        self.histology_df = pd.read_csv(os.path.join(self.dir_ref, self.histology_dfname))
        self.histology_nohype_df = pd.read_csv(os.path.join(self.dir_ref, self.histology_nohype_dfname))
        fname_lgene = self.fname_lgene_old if gene_name == 'old' else self.fname_lgene_new
        self.gene_name_list = self._load_pickle(os.path.join(self.dir_ref, fname_lgene))
        print('Finish Loading histology cohort & gene reference data')

    def merge_cov(self, feature, hypermutator=False):
        """Merge the per-patient coverage files of one histology into one table.

        Output: ``<output dir>/<feature>.csv.gz`` (tab-separated, gzip), one
        column per patient, indexed by (gene, zone, categ), restricted to genes
        in ``self.gene_name_list``. Requires ``load_histology_info`` first.

        Parameters
        ----------
        feature : str
            Value of the ``histology`` column selecting the cohort.
        hypermutator : bool, default False
            If True, the patient list comes from the no-hypermutator histology
            table and output goes to ``<dir_out>_nohypermutator``; otherwise the
            full histology table and ``dir_out`` are used.
        """
        # use different patient list for hypermutator
        if hypermutator:
            df_sample = self.histology_nohype_df
            self.dir_out_merged = self.dir_out + '_nohypermutator'
            os.makedirs(self.dir_out_merged, exist_ok=True)
        else:
            df_sample = self.histology_df
            self.dir_out_merged = self.dir_out

        # Get patient list for histology
        df_histology = df_sample[df_sample['histology'] == feature]
        lp = df_histology['tumor_aliquot_id'].unique()
        if len(lp) == 0:
            # pd.concat raises on an empty list; report and skip instead
            print(f'No patients found for histology {feature}; nothing to merge.')
            return

        # read patient files one by one and collect the dataframes in a list
        ldf = []
        for p in tqdm(lp):
            df_cov_ind = pd.read_csv(os.path.join(self.dir_out_intermediate_ind, p + '.csv'), sep='\t')
            df_cov_ind.columns = ['gene', 'zone', 'categ', p]
            df_cov_ind = df_cov_ind[df_cov_ind['gene'].isin(self.gene_name_list)]  ## Filter genes
            df_cov_ind = df_cov_ind.set_index(['gene', 'zone', 'categ'])
            ldf.append(df_cov_ind)

        df_cov_merged = pd.concat(ldf, axis=1)
        df_cov_merged.to_csv(os.path.join(self.dir_out_merged, feature + '.csv.gz'), sep='\t',
                             chunksize=100000, compression='gzip', encoding='utf-8')
        print('Finish Merging...')


***Example: calculating coverage for individual patients (only a few genes and 2 patients)***

In [71]:
# Example run of the coverage calculation on a small input set
# (a few genes, 2 patients).
cov_params = dict(
    dir_wig='../data/cov/example/wig_input',
    dir_out_intermediate='../data/cov/example/intermediate',
    dir_out='../data/cov/example/histology',
    parallelize_core=2,
)

# This example stops once the individual coverage files are written:
# merging them needs a cohort, which the example data does not provide.
res = cov_process(cov_params)
res.run_get_zero_position()
res.load_refs()
res.run_coverage_calculation('all')

***Merge coverage into histology cohort***

In [None]:
# With the individual patient files already generated, merge them by
# histology type (this cell works on the real coverage data).

# Feature list: every histology type listed in the cohort table.
df_cohort = pd.read_csv('../data/proc_refs/histology.csv')
lfeat = df_cohort['histology'].unique()
print(lfeat)

# Build the pipeline object and load the cohort / gene reference information.
cov_params = dict(
    dir_wig='../data/cov/example/wig_input',
    dir_out_intermediate='../data/cov/intermediate',
    dir_out='../data/cov/histology_new',
    parallelize_core=2,
)

res = cov_process(cov_params)
res.load_histology_info(gene_name='new')

# One merged output file per histology type.
for histologies in lfeat:
    res.merge_cov(histologies, hypermutator=False)

  0%|          | 0/109 [00:00<?, ?it/s]

['Ovary-AdenoCA' 'CNS-PiloAstro' 'Liver-HCC' 'CNS-Oligo' 'Panc-Endocrine'
 'Kidney-RCC' 'Prost-AdenoCA' 'Thy-AdenoCA' 'ColoRect-AdenoCA'
 'Lymph-BNHL' 'Uterus-AdenoCA' 'Breast-AdenoCA' 'Lung-AdenoCA'
 'Panc-AdenoCA' 'Eso-AdenoCA' 'Head-SCC' 'CNS-Medullo' 'CNS-GBM'
 'SoftTissue-Leiomyo' 'Cervix-SCC' 'Skin-Melanoma' 'Lymph-CLL'
 'SoftTissue-Liposarc' 'Kidney-ChRCC' 'Stomach-AdenoCA' 'Lung-SCC'
 'Bladder-TCC' 'Myeloid-AML' 'Biliary-AdenoCA' 'Breast-LobularCA'
 'Cervix-AdenoCA' 'Bone-Osteosarc' 'Breast-DCIS' 'Myeloid-MPN'
 'Myeloid-MDS' 'Bone-Cart' 'Bone-Osteoblast' 'Bone-Epith' 'Bone-Benign']
Loading histology cohort & gene reference data
Finish Loading histology cohort & gene reference data


100%|██████████| 109/109 [00:32<00:00,  3.38it/s]
  0%|          | 0/89 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 89/89 [00:27<00:00,  3.26it/s]
  0%|          | 0/312 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 312/312 [01:29<00:00,  3.49it/s]
  0%|          | 0/18 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 18/18 [00:11<00:00,  1.60it/s]
  0%|          | 0/81 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 81/81 [00:23<00:00,  3.46it/s]
  0%|          | 0/143 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 143/143 [00:41<00:00,  3.47it/s]
  0%|          | 0/199 [00:00<?, ?it/s]

Finish Merging...


100%|██████████| 199/199 [00:57<00:00,  3.48it/s]
  0%|          | 0/48 [00:00<?, ?it/s]

Finish Merging...


 48%|████▊     | 23/48 [00:06<00:07,  3.48it/s]