In [3]:
import pandas as pd
import numpy as np
import math
import os

In [6]:
def generic_summary_table_fill_in(metaField, summaryField, summaryTable_, patientMetadata_):
    """ fills in a given metadata field in summaryTable_ """
    for i in range(0,len(summaryTable_.index)):
        currCell = summaryTable_['cell'].iloc[i]
        currPlate = currCell.split('_')[1]
    
        index_to_keep = patientMetadata_['plate'] == currPlate
        keepRow = patientMetadata_[index_to_keep]
        try:
            currField = list(keepRow[metaField])[0]
            summaryTable_[summaryField][i] = currField
        except IndexError:
            continue
            #print('ERROR: plate not found') # these are just the plates were NOT 
                                             # including in the analysis


In [7]:
def fusions_fill_in(fusionsDF_, summaryTable_):
    """ takes the existing fusionsDF and populates summaryTable_ with this shit """
    for i in range(0, len(summaryTable_.index)):
        currCell = summaryTable_['cell'].iloc[i]

        for col in fusionsDF_.columns:
            if currCell in list(fusionsDF_[col]):
                summaryTable_['fusions_found'][i] = col


In [8]:
def translated_muts_fill_in(GOI, summaryTable_):
    """ converts 'raw' mutation calls to something that more resembles
        those reported in our clinical cols. general """
    colName = 'mutations_found_' + GOI
    for i in range(0,len(summaryTable_.index)):
        translatedList = []
        currCell = summaryTable_['cell'].iloc[i]
        currMuts = summaryTable_[colName].iloc[i]
        currMuts_split = currMuts.split(',')
        for item in currMuts_split:
            if item != '' and '?' not in item:
                translatedList.append(GOI + ' ' + item)

        summaryTable_['mutations_found_translated'][i] = summaryTable_['mutations_found_translated'][i] + translatedList


In [9]:
def translated_muts_fill_in_egfr(summaryTable_):
    """ converts 'raw' mutation calls to something that more resembles
        those reported in our clinical cols. egfr, specificially """
    for i in range(0,len(summaryTable_.index)):
        translatedList = []
        currCell = summaryTable_['cell'].iloc[i]
        currMuts_egfr = summaryTable_['mutations_found_EGFR'].iloc[i]
        currMuts_egfr_split = currMuts_egfr.split(',')
        for item in currMuts_egfr_split:
            if 'delELR' in item:
                translatedList.append('EGFR del19')
            elif '745_' in item:
                translatedList.append('EGFR del19')
            elif '746_' in item:
                translatedList.append('EGFR del19')
            elif 'ins' in item:
                translatedList.append('EGFR ins20')
            elif item != '':
                translatedList.append('EGFR ' + item)
        
        summaryTable_['mutations_found_translated'][i] = translatedList

In [10]:
def translated_muts_fill_in_fusions(summaryTable_):
    """ converts 'raw' mutation calls to something that more resembles
        those reported in our clinical cols. for fusions """
    for i in range(0,len(summaryTable_.index)):
        currCell = summaryTable_['cell'].iloc[i]
        currFus = summaryTable_['fusions_found'].iloc[i]
        
        if not pd.isnull(currFus):
            if '?' not in currFus and currFus != '':
                currMuts = summaryTable_['mutations_found_translated'][i]
                currMuts = currMuts + ', ' + currFus + ' fusion'
                
                summaryTable_['mutations_found_translated'][i] = currMuts

In [11]:
def convert_to_string(summaryTable_):
    """ converting mutations_found_translated col from list to str. """
    for i in range(0,len(summaryTable_.index)):
        currStr = str(summaryTable_['mutations_found_translated'][i])
        currStr = currStr.replace("'", "")
        currStr = currStr.replace("]", "")
        currStr = currStr.replace("[", "")
        summaryTable_['mutations_found_translated'][i] = currStr

In [12]:
def clin_mut_found_fill_in(summaryTable_):
    """ fills in clin_mut_found_bool col: 1 if clin mut found, 0 if else """
    for i in range(0,len(summaryTable_.index)):
        currCell = summaryTable_['cell'][i]
        currMuts = summaryTable_['mutations_found_translated'][i]
        currClinGene = summaryTable_['clinical_driver_gene'][i]
        currClinMut = summaryTable_['clinical_mutation'][i]
        currClinMut_str = str(currClinGene) + ' ' + str(currClinMut)
    
        if currClinMut_str in currMuts:
            summaryTable_['clin_mut_found_bool'][i] = 1
        else:
            summaryTable_['clin_mut_found_bool'][i] = 0

In [13]:
def clin_mut_found_fill_in_fus(summaryTable_):
    """ fills in clin_mut_found_bool col: 1 if clin mut found, 0 if else
        but for fusions """
    for i in range(0,len(summaryTable_.index)):
        currCell = summaryTable_['cell'][i]
        currFus = summaryTable_['fusions_found'][i]

        if not pd.isnull(currFus):
            currFus = currFus.split('--')[0]
            summaryTable_['clin_mut_found_bool'][i] = 0
            currClinGene = summaryTable_['clinical_driver_gene'][i]

            if currClinGene == currFus:
                summaryTable_['clin_mut_found_bool'][i] = 1

In [14]:
def tumor_cell_bool_fill_in(summaryTable_, cwd_):
    """ 1 if we're calling the cell TUMOR in our seurat obj,
        0 if else

        Reads '<cwd_>/metadataSeurat.csv', treats its first column as the
        cell name and marks as tumor every cell whose 'inferCNV_annotation'
        is 'perturbed'. Every row of summaryTable_ then gets
        'tumorCell_bool' set to 1 (cell is in that tumor set) or 0.

        summaryTable_ -- DataFrame with 'cell' and 'tumorCell_bool' columns;
                         modified in place
        cwd_          -- directory containing metadataSeurat.csv (with or
                         without a trailing path separator)

        Returns None. """
    # read in Seurat metadata; os.path.join copes with a missing trailing '/'
    metaPATH = os.path.join(cwd_, 'metadataSeurat.csv')
    metadataSeurat = pd.read_csv(metaPATH)

    # the first column carries the cell names; give it a usable header
    myCols = list(metadataSeurat.columns)
    myCols[0] = 'cell'
    metadataSeurat.columns = myCols

    isPerturbed = metadataSeurat['inferCNV_annotation'] == 'perturbed'
    # a set makes the per-row membership test O(1) instead of a list scan
    tumorCells = set(metadataSeurat.loc[isPerturbed, 'cell'])

    # now fill in 'tumorCell_bool' for summaryTable_
    boolCol = summaryTable_.columns.get_loc('tumorCell_bool')
    for rowPos, currCell in enumerate(list(summaryTable_['cell'])):
        # positional single-step write instead of chained df[col][i] = v
        summaryTable_.iat[rowPos, boolCol] = 1 if currCell in tumorCells else 0

In [15]:
def get_non_zero_cov_ROI(gene, mut, cwd_):
    """ loads '<cwd_>coverage/<gene>_<mut>_coverageByCell.csv' and returns
        only the rows whose 'depth_gvcf' is non-zero (zero-depth rows are
        dropped; the original row index is kept) """
    csvName = gene + '_' + mut + '_coverageByCell.csv'
    coverageTable = pd.read_csv(cwd_ + 'coverage/' + csvName)

    hasCoverage = coverageTable['depth_gvcf'].ne(0)
    return coverageTable.loc[hasCoverage]

In [16]:
def validation_table_metadata_fill_in(metaField, validationField, validationTable_, patientMetadata_):
    """ fills in metadata field for validationTable_ """
    for i in range(0, len(validationTable_.index)):
        currSample = validationTable_['sample'][i]
        try:
            rowToKeep = patientMetadata_['sample_name'] == currSample
            patientRows = patientMetadata_[rowToKeep] # will return MULTIPLE rows
            patientRows = patientRows.reset_index(drop=True)

            fillField = patientRows[metaField][0]
       
            validationTable_[validationField][i] = fillField
        except:
            continue
            #print('ERROR')

In [17]:
def validation_table_dict_muts(validationTable_, summaryTable_):
    """ returns dict that holds vals for all the muts to a
        given sample

        Keys are the values of validationTable_['sample']. Each value is a
        string listing every distinct mutation seen for that sample across
        all rows of summaryTable_, in first-seen order, each followed by
        ', ' (e.g. 'A, B, C, '; '' when nothing was found).

        Mutations come from the comma-separated 'mutations_found' column of
        summaryTable_; surrounding whitespace is stripped, and empty items
        and missing values ('nan' / 'None') are ignored.

        Raises KeyError if summaryTable_ contains a 'sample_name' that is
        not listed in validationTable_['sample']. """
    # keep the mutations of each sample in a list so that (a) every new
    # item is accumulated -- the previous string version overwrote the
    # entry and kept only the last new item of a row -- and (b) duplicates
    # are detected by exact match rather than by substring
    mutsBySample = {}
    for sample in validationTable_['sample']:
        mutsBySample[sample] = []

    rows = zip(summaryTable_['sample_name'], summaryTable_['mutations_found'])
    for currSample, currMuts in rows:
        sampleMuts = mutsBySample[currSample]

        for item in str(currMuts).split(','):
            item = item.strip()
            if item in ('', 'nan', 'None'):
                continue
            if item not in sampleMuts:
                sampleMuts.append(item)

    d = {}
    for sample, sampleMuts in mutsBySample.items():
        d[sample] = ''.join(mut + ', ' for mut in sampleMuts)

    return(d)

In [18]:
def validation_table_dict_generic(validationTable_, summaryTable_, field):
    """ returns dict that holds values for num cells that are tumor OR
        have coverage to a given ROI

        Keys are the values of validationTable_['sample']; each value is the
        number of summaryTable_ rows for that sample whose `field` is
        neither missing (NaN/None) nor 0.

        validationTable_ -- DataFrame with a 'sample' column
        summaryTable_    -- DataFrame with 'sample_name' and `field` columns
        field            -- name of the flag column to count

        Raises KeyError if summaryTable_ contains a 'sample_name' that is
        not listed in validationTable_['sample']. """
    d = {}
    for sample in validationTable_['sample']:
        d[sample] = 0

    # iterate the values themselves: looking rows up with df[col][i] uses
    # index LABELS and breaks on frames without a default 0..n-1 index
    for currSample, currBool in zip(summaryTable_['sample_name'], summaryTable_[field]):
        currCount = d[currSample]

        # pd.isnull also copes with None, where math.isnan would raise
        if not pd.isnull(currBool) and currBool != 0:
            d[currSample] = currCount + 1

    return(d)

In [None]:
#///////////////////////////////////////////////////////////////////////////////////
#
# non module code starts here
#
#///////////////////////////////////////////////////////////////////////////////////

In [130]:
# working directory holding the per-gene search results (machine specific)
cwd = '/Users/lincoln.harris/code/cerebra/cerebra/wrkdir/'
muts_path = cwd + 'geneSearch_tumorExome/'

# genes of interest; one '<gene>_mut' column is created for each of them
genesList = [
    'AKT1', 'ALK', 'BAP1', 'BRAF', 'DDR2', 'DROSHA', 'EGFR', 'ERBB2',
    'ERBB4', 'FGFR2', 'GRIN2A', 'HIF1a', 'KDR', 'KEAP1', 'KRAS', 'MAP2K1',
    'MAP2K2', 'MYCL', 'NFE2L2', 'NKX2-1', 'NOTCH1', 'PIK3CB', 'PTPN13',
    'PTPRT', 'RAD21', 'RB1', 'RBM10', 'SMARCA4', 'SOX2', 'STK11', 'TP63',
]

# empty patient-by-gene table: 'patient' followed by the per-gene columns,
# in genesList order
mutationsDF = pd.DataFrame(columns=['patient'] + [name + '_mut' for name in genesList])


In [131]:
# Seed mutationsDF from the EGFR calls. mutationsDF starts out with columns
# but no rows, so the first column assignment below is what gives it rows;
# the second assignment is aligned on the same index.
EGFR_path = muts_path + 'EGFR_tumorExome_AA.csv'
# the CSV has no header row: column 0 -> 'patient', column 1 -> 'mutations'
EGFR_df = pd.read_csv(EGFR_path, header=None, names=['patient', 'mutations'])
mutationsDF['patient'] = EGFR_df['patient']
mutationsDF['EGFR_mut'] = EGFR_df['mutations'] 
mutationsDF

Unnamed: 0,patient,AKT1_mut,ALK_mut,BAP1_mut,BRAF_mut,DDR2_mut,DROSHA_mut,EGFR_mut,ERBB2_mut,ERBB4_mut,...,PIK3CB_mut,PTPN13_mut,PTPRT_mut,RAD21_mut,RB1_mut,RBM10_mut,SMARCA4_mut,SOX2_mut,STK11_mut,TP63_mut
0,TH171_E3.vcf,,,,,,,[],,,...,,,,,,,,,,
1,TH116_E2.vcf,,,,,,,['L858R'],,,...,,,,,,,,,,
2,TH179_E1.vcf,,,,,,,[],,,...,,,,,,,,,,
3,TH231_E1.vcf,,,,,,,[],,,...,,,,,,,,,,
4,TH226_E3.vcf,,,,,,,"['E746_A750delELREA', 'E746_T751delELREAT']",,,...,,,,,,,,,,
5,TH187_E3.vcf,,,,,,,[],,,...,,,,,,,,,,
6,TH155_E5.vcf,,,,,,,['K754E'],,,...,,,,,,,,,,
7,TH172_E3.vcf,,,,,,,[],,,...,,,,,,,,,,
8,TH169_E4.vcf,,,,,,,"['E746_A750delELREA', 'E746_T751delELREAT']",,,...,,,,,,,,,,
9,TH220_E2.vcf,,,,,,,[],,,...,,,,,,,,,,


In [132]:
# Drop the '.vcf' file extension from the patient ids.
# str.strip('.vcf') is NOT a suffix removal: it strips any run of the
# characters '.', 'v', 'c', 'f' from both ends, so an id that itself starts
# or ends with one of those letters would be truncated. Anchor on the suffix.
mutationsDF['patient'] = mutationsDF['patient'].str.replace(r'\.vcf$', '', regex=True)
mutationsDF

Unnamed: 0,patient,AKT1_mut,ALK_mut,BAP1_mut,BRAF_mut,DDR2_mut,DROSHA_mut,EGFR_mut,ERBB2_mut,ERBB4_mut,...,PIK3CB_mut,PTPN13_mut,PTPRT_mut,RAD21_mut,RB1_mut,RBM10_mut,SMARCA4_mut,SOX2_mut,STK11_mut,TP63_mut
0,TH171_E3,,,,,,,[],,,...,,,,,,,,,,
1,TH116_E2,,,,,,,['L858R'],,,...,,,,,,,,,,
2,TH179_E1,,,,,,,[],,,...,,,,,,,,,,
3,TH231_E1,,,,,,,[],,,...,,,,,,,,,,
4,TH226_E3,,,,,,,"['E746_A750delELREA', 'E746_T751delELREAT']",,,...,,,,,,,,,,
5,TH187_E3,,,,,,,[],,,...,,,,,,,,,,
6,TH155_E5,,,,,,,['K754E'],,,...,,,,,,,,,,
7,TH172_E3,,,,,,,[],,,...,,,,,,,,,,
8,TH169_E4,,,,,,,"['E746_A750delELREA', 'E746_T751delELREAT']",,,...,,,,,,,,,,
9,TH220_E2,,,,,,,[],,,...,,,,,,,,,,


In [166]:
def mutations_df_fill_in(GOI, GOI_df, mutationsDF_):
    """ fills the '<GOI>_mut' column of mutationsDF_ with the mutation calls
        found in GOI_df for each patient

        For every row of mutationsDF_, the rows of GOI_df with the same
        'patient' are looked up and their 'mutations' values are split on
        whitespace; the resulting list of tokens is stored in '<GOI>_mut'.
        The stored value for "['A', 'B']" is therefore ["['A',", "'B']"],
        the raw-token format this function has always produced.
        Patients without a match, and missing mutation values, give [].

        GOI          -- gene name; the target column is GOI + '_mut'
        GOI_df       -- DataFrame with 'patient' and 'mutations' columns
        mutationsDF_ -- DataFrame with a 'patient' column; modified in place

        Returns None. """
    mutName = GOI + '_mut'

    # read the values directly rather than parsing the printed form of a
    # pandas Series, which also contains index labels and a 'Name:' footer
    tokensByPatient = {}
    for patient, rawMuts in zip(GOI_df['patient'], GOI_df['mutations']):
        if pd.isnull(patient):
            continue
        tokens = [] if pd.isnull(rawMuts) else str(rawMuts).split()
        tokensByPatient.setdefault(patient, []).extend(tokens)

    filled = [list(tokensByPatient.get(currPatient, []))
              for currPatient in mutationsDF_['patient']]

    # one column-level write instead of chained df[col][i] = v;
    # dtype=object keeps each cell a python list
    mutationsDF_[mutName] = pd.Series(filled, index=mutationsDF_.index, dtype=object)

In [190]:
def remove_extra_characters_mutations_df(GOI, mutationsDF_):
    """ converting df cols from lists to strings

        Returns a NEW DataFrame in which every cell of mutationsDF_ has been
        passed through str() (so missing values become the text 'nan') and,
        in the '<GOI>_mut' column only, the characters [ ] ' " and space
        have been removed. mutationsDF_ itself is not modified.

        GOI          -- gene name; the cleaned column is GOI + '_mut'
        mutationsDF_ -- DataFrame containing that column """
    mutName = GOI + '_mut'

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
    # old name is deprecated; use whichever this pandas provides
    if hasattr(pd.DataFrame, 'map'):
        t = mutationsDF_.map(str)
    else:
        t = mutationsDF_.applymap(str)

    # one explicit regex character class; the default of `regex` differs
    # between pandas versions, so it is always passed explicitly
    unwanted = r"""[\[\]'" ]"""
    t[mutName] = t[mutName].str.replace(unwanted, "", regex=True)

    return(t)

In [191]:
# now fill in everything else
for gene in genesList:
    gene_path = muts_path + gene + '_tumorExome_AA.csv'
    # no header row in these CSVs: column 0 -> 'patient', column 1 -> 'mutations'
    gene_df = pd.read_csv(gene_path, header=None, names=['patient', 'mutations'])
    # remove only the '.vcf' extension; str.strip('.vcf') would strip any of
    # the characters '.', 'v', 'c', 'f' from both ends of the id
    gene_df['patient'] = gene_df['patient'].str.replace(r'\.vcf$', '', regex=True)
    mutations_df_fill_in(gene, gene_df, mutationsDF)
    # returns a new, all-string frame, so rebind the name on every pass
    mutationsDF = remove_extra_characters_mutations_df(gene, mutationsDF)
mutationsDF

Unnamed: 0,patient,AKT1_mut,ALK_mut,BAP1_mut,BRAF_mut,DDR2_mut,DROSHA_mut,EGFR_mut,ERBB2_mut,ERBB4_mut,...,PIK3CB_mut,PTPN13_mut,PTPRT_mut,RAD21_mut,RB1_mut,RBM10_mut,SMARCA4_mut,SOX2_mut,STK11_mut,TP63_mut
0,TH171_E3,,,,,,,,,,...,,,,,,,,,,
1,TH116_E2,,,,,,,L858R,,,...,,,,,,,,,,
2,TH179_E1,,,,V600E,,,,,,...,,,,,,,,,,
3,TH231_E1,,,,,,,,,,...,,,,,,,,,,
4,TH226_E3,,,,,,,"E746_A750delELREA,,E746_T751delELREAT",,,...,,,,,,,,,,
5,TH187_E3,,,,,,,,,,...,,,,,,,,,,
6,TH155_E5,,,,,,,K754E,,,...,,,,,,,,,,
7,TH172_E3,,,,,,,,,,...,,,,,,,,,,
8,TH169_E4,,,,,,,"E746_A750delELREA,,E746_T751delELREAT",,,...,,,,,,,,,,
9,TH220_E2,,,,,,,,,,...,,,,,,,,,,
