In [2]:
### In this notebook we do whole genome sequencing analysis 
import pandas as pd
from os.path import join
# Directory that holds the annotated SNP tables and the reference CSV files read below
data_path = '../data/sequencing/WGS/03.Result_X202SC24094080-Z01-F001_cancer/result/Mutation/SNP/Annotation'
# Directory where this notebook writes (and later re-reads) its CSV outputs
output_path = 'Analysis_output'
# Prefix placed in front of every output file name
f_prefix = 'WGS_analysis'

In [2]:
# Function to calculate the allele frequency of heterozygous mutations
def calc_allele_freq(df_, colname, min_gq=60):
    """Return the alternate-allele fraction of high-quality heterozygous calls.

    Every entry of ``df_[colname]`` is split on ':' into exactly five fields,
    which are labelled Genotype, Allele Count, Read Depth, GQ and PL (in that
    order).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Table with one row per variant.
    colname : str
        Name of the column holding the colon-separated call string.
    min_gq : int or float, default 60
        A call is kept only when its GQ is strictly greater than this value.

    Returns
    -------
    pandas.Series
        Second comma-separated value of 'Allele Count' divided by 'Read Depth',
        for the rows whose genotype is '0/1' and whose GQ exceeds ``min_gq``.
        The Series keeps the index labels of those rows.

    Raises
    ------
    ValueError
        If splitting on ':' does not yield exactly five columns.
    """
    # Process the allele frequency info of the dataframe
    split_df = df_[colname].str.split(':', expand=True)
    split_df.columns = ['Genotype', 'Allele Count', 'Read Depth', 'GQ', 'PL']

    # Non-numeric GQ values (e.g. the '.' placeholder) become NaN and therefore
    # fail the comparison below instead of aborting the whole conversion.
    gq = pd.to_numeric(split_df['GQ'], errors='coerce')
    split_df = split_df[gq > min_gq]  # Filter out low quality variants

    # Operate only on heterozygous mutations
    # NOTE(review): phased heterozygous genotypes such as '0|1' are not matched - confirm this is intended
    het_calls = split_df[split_df['Genotype'] == '0/1']
    allele_count = pd.to_numeric(het_calls['Allele Count'].str.split(',').str[1],
                                 errors='coerce')
    allele_depth = pd.to_numeric(het_calls['Read Depth'], errors='coerce')
    return allele_count.div(allele_depth)

In [None]:
# This part is for PSMB5 mutations only
annot_cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'GeneName',
              'Func', 'Gene', 'ExonicFunc', 'AAChange', 'INFO']
psmb5_frames = []  # one PSMB5-only annotation table per sample

for file_name in ['C2_S1','C2_S2','C2_E1','C2_E2']: # Loop through the file names
    # Define the fields that we need so that we can locate the mutations
    print(f'Start processing {file_name}...')

    fields_e1 = annot_cols + ['FORMAT', file_name]
    df = pd.read_csv(join(data_path, f'{file_name}.GATK.snp.annovar.hg38_multianno.xls'),
                     usecols=fields_e1, sep='\t')
    # Variant key: CHROM_POS_REF_ALT
    df['name'] = df['CHROM'].astype(str) +'_'+ df['POS'].astype(str)+'_'+df['REF']+'_'+df['ALT']
    df = df.set_index('name')
    df = df[df['GeneName'] == 'PSMB5']

    psmb5_frames.append(df[annot_cols])

# Stack the per-sample tables row-wise and keep the first row seen for each
# variant key. Joining them side by side (axis=1) and then discarding the
# repeated column names would keep only the first sample's columns, leaving
# variants that are absent from that sample with all-NaN annotations.
df_PSMB5 = pd.concat(psmb5_frames)
df_PSMB5 = df_PSMB5[~df_PSMB5.index.duplicated(keep='first')].copy()
# df_PSMB5.to_csv(join(output_path, f_prefix+'_analysis_out_PSMB5.csv'))

In [3]:
merge_df = pd.DataFrame()  # placeholder; later cells assign the real table
annot_cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'GeneName',
              'Func', 'Gene', 'ExonicFunc', 'AAChange', 'INFO']
annot_frames = []  # per-sample annotation columns
af_columns = []    # per-sample allele-frequency Series

for file_name in ['C2_S1','C2_S2','C2_E1','C2_E2']: # Loop through the file names
    # Define the fields that we need so that we can locate the mutations
    print(f'Start processing {file_name}...')

    fields_e1 = annot_cols + ['FORMAT', file_name]
    df = pd.read_csv(join(data_path, f'{file_name}.GATK.snp.annovar.hg38_multianno.xls'),
                     usecols=fields_e1, sep='\t')
    # Variant key: CHROM_POS_REF_ALT
    df['name'] = df['CHROM'].astype(str) +'_'+ df['POS'].astype(str)+'_'+df['REF']+'_'+df['ALT']
    df = df.set_index('name')
    # The column-wise join below needs one row per variant key in every sample
    assert df.index.is_unique, f'duplicated variant keys in {file_name}'

    mut_freq = calc_allele_freq(df, file_name)
    # The helper returns only the filtered heterozygous calls, so it can never
    # contain more rows than the table it was computed from
    assert len(mut_freq) <= len(df)
    # Index-aligned assignment: variants without a returned value get NaN
    df[f'Allele Freq {file_name}'] = mut_freq

    annot_frames.append(df[annot_cols])
    af_columns.append(df[f'Allele Freq {file_name}'])

# Build the annotation block from the union of all samples (first occurrence
# of each variant key wins). Joining the annotation columns side by side and
# later dropping the repeated column names would keep only the first sample's
# copy, so variants missing from that sample would lose their annotation.
annotations = pd.concat(annot_frames)
annotations = annotations[~annotations.index.duplicated(keep='first')]
# One annotation block followed by one allele-frequency column per sample
merge_df_AF = pd.concat([annotations] + af_columns, axis=1)

Start processing C2_S1...
Start processing C2_S2...
Start processing C2_E1...
Start processing C2_E2...


In [36]:
# Keep only the first column carrying each column name in merge_df_AF, then
# discard every row that still has a missing value in any remaining column.
# NOTE(review): dropna() looks at all columns, not just the 'Allele Freq ...'
# ones - confirm that rows with a missing annotation value should also go.
merge_df = (
    merge_df_AF
    .loc[:, ~merge_df_AF.columns.duplicated()]
    .copy()
    .dropna()
)
# merge_df.to_csv(join(output_path, f_prefix+'_analysis_out.csv'))

In [38]:
# Row count of merge_df as it currently stands in the kernel
len(merge_df)

1388345

In [4]:
# Column de-duplication only (first column of each name wins); rows with
# missing values are retained and the result is written to disk.
merge_df = merge_df_AF.loc[:, ~merge_df_AF.columns.duplicated(keep='first')].copy()
merge_df.to_csv(path_or_buf=join(output_path, f_prefix+'_analysis_out_keepna.csv'))

In [5]:
# Reload the '_analysis_out_keepna.csv' table. No index_col is passed, so the
# first CSV column is read back as an ordinary column instead of the index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that comes with them.
merge_df = pd.read_csv(join(output_path, f_prefix+'_analysis_out_keepna.csv'),
                       low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


***Filter the dataframe***

In [9]:
# Collect the variant keys (CHROM_POS_REF_ALT) found in the K562 CBEd table
file_name = 'K562_CBEd'
fields = ['CHROM', 'POS', 'ID', 'REF', 'ALT',
          'GeneName', 'Func', 'Gene', 'ExonicFunc', 'AAChange',
          'INFO', 'FORMAT', file_name]
df = (
    pd.read_csv(
        join(data_path, f'{file_name}.GATK.snp.annovar.hg38_multianno.xls'),
        sep='\t',
        usecols=fields,
    )
    # Build the key column, then promote it to the index
    .assign(name=lambda t: t['CHROM'].astype(str) + '_' + t['POS'].astype(str)
            + '_' + t['REF'] + '_' + t['ALT'])
    .set_index('name')
)
CBEd_mut = df.index

In [14]:
# Display the index object stored in CBEd_mut
CBEd_mut

Index(['chr1_16298_C_T', 'chr1_30548_T_G', 'chr1_49298_T_C', 'chr1_51479_T_A',
       'chr1_51803_T_C', 'chr1_52238_T_G', 'chr1_54366_A_G', 'chr1_54490_G_A',
       'chr1_54844_G_A', 'chr1_55164_C_A',
       ...
       'chrM_14905_G_A', 'chrM_15326_A_G', 'chrM_15452_C_A', 'chrM_15607_A_G',
       'chrM_15928_G_A', 'chrM_16126_T_C', 'chrM_16294_C_T', 'chrM_16296_C_T',
       'chrM_16324_T_C', 'chrM_16519_T_C'],
      dtype='object', name='name', length=3534070)

In [17]:
# For every sample, count the variants that have an allele frequency in that
# sample but whose key is absent from CBEd_mut
for file_name in ['C2_S1','C2_S2','C2_E1','C2_E2']: # Loop through the file names
    # Rows where this sample's allele frequency is not missing
    index  = merge_df[f'Allele Freq {file_name}'].dropna().index
    print(file_name)
    # Vectorised membership test instead of one Python-level lookup per name
    print(int((~merge_df.loc[index, 'name'].isin(CBEd_mut)).sum()))

C2_S1
87965
C2_S2
95432
C2_E1
85251
C2_E2
87288


In [49]:
# Load the 'K562 mutations.csv' table and key it the same way as the sample
# tables (Chromosome_Position_Ref_Alt)
# NOTE(review): these keys only match the CHROM_POS_REF_ALT keys if both
# sources use the same chromosome naming and coordinates - confirm
df_mut = (
    pd.read_csv(join(data_path, 'K562 mutations.csv'))
    .assign(name=lambda t: t['Chromosome'].astype(str) + '_' + t['Position'].astype(str)
            + '_' + t['Ref Allele'] + '_' + t['Alt Allele'])
    .set_index('name')
)
depmap_mut = df_mut.index

In [None]:
# Reload '_analysis_out.csv'; index_col=0 turns the first CSV column back into the index.
# NOTE(review): the only to_csv call visible in this notebook for this file is
# commented out, so the file must already exist on disk - confirm before a fresh run.
merge_df = pd.read_csv(join(output_path, f_prefix+'_analysis_out.csv'), index_col = 0)

In [53]:
# Row count of merge_df as it currently stands in the kernel
len(merge_df)

1388345

In [62]:
# Keep the variant keys of merge_df that appear neither in CBEd_mut nor in
# depmap_mut. Index.isin performs the membership tests in bulk instead of one
# Python-level lookup per key; order and list type of the result are unchanged.
_known_variant = merge_df.index.isin(CBEd_mut) | merge_df.index.isin(depmap_mut)
mut_idx = merge_df.index[~_known_variant].tolist()

In [63]:
len(mut_idx) # Number of entries in mut_idx

25424

In [64]:
# Write only the rows of merge_df whose index label is listed in mut_idx
merge_df.loc[mut_idx,:].to_csv(join(output_path, f_prefix+'_analysis_out_filtered.csv'))

In [73]:
# We want to keep the mutation information
# Load every annotation column of one sample table and keep the filtered variants
file_name = 'C2_S1'
print(f'Start processing {file_name}...')
df = pd.read_csv(join(data_path, f'{file_name}.GATK.snp.annovar.hg38_multianno.xls'), sep = '\t')
# Variant key: CHROM_POS_REF_ALT
df['name'] = df['CHROM'].astype(str) +'_'+ df['POS'].astype(str)+'_'+df['REF']+'_'+df['ALT']
df = df.set_index('name')
df = df.loc[mut_idx,:]

# Restrict merge_df to the same filtered variants before joining. pd.concat
# defaults to an outer join, so passing the full merge_df would carry every
# one of its rows into the "filtered" output.
df = pd.concat([merge_df.loc[mut_idx, :], df], axis = 1)
# Columns present in both inputs: keep the first copy (the one from merge_df)
df = df.loc[:,~df.columns.duplicated()].copy()
df.to_csv(join(output_path, f_prefix+'_filtered_mutation_info.csv'))

Start processing C2_S1...


In [83]:
### Only keep Essential genes
common_ess = pd.read_csv(join(data_path, 'AchillesCommonEssentialControls.csv'))
# Split every 'Gene' entry at '('; column 0 is the text in front of the first '('
comm_gene = common_ess['Gene'].str.rsplit('(', expand = True)
# Trim the spaces around that leading text to get the gene names to match
ess_genes = [label.strip(' ') for label in comm_gene[0]]
# Export the rows whose GeneName is exactly one of those names
df.loc[df['GeneName'].isin(ess_genes)].to_csv(
    join(output_path, f_prefix+'_filtered_mutation_info_ess.csv')
)

In [3]:
# Reload '_filtered_mutation_info_ess.csv'; no index_col is passed, so the
# first CSV column is read back as an ordinary column
df = pd.read_csv(join(output_path, f_prefix+'_filtered_mutation_info_ess.csv'))

In [None]:
# Show the 'gnomad_genome_AF' column (the original expression was not valid Python)
df['gnomad_genome_AF']

In [12]:
# Show all column names of df as a plain list
df.columns.tolist()

['name',
 'CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'GeneName',
 'Func',
 'Gene',
 'ExonicFunc',
 'AAChange',
 'INFO',
 'Allele Freq C2_S1',
 'Allele Freq C2_S2',
 'Allele Freq C2_E1',
 'Allele Freq C2_E2',
 'Priority',
 'QUAL',
 'FILTER',
 'Description',
 'GeneDetail',
 'Gencode',
 'cpgIslandExt',
 'cytoBand',
 'wgRna',
 'genomicSuperDups',
 'Repeat',
 'avsnp',
 'CLNALLELEID',
 'CLNDN',
 'CLNDISDB',
 'CLNREVSTAT',
 'CLNSIG',
 'cosmic',
 'gwasCatalog',
 '1000g_SAS',
 '1000g_EUR',
 '1000g_AFR',
 '1000g_AMR',
 '1000g_EAS',
 '1000g_ALL',
 'esp6500siv2_all',
 'gnomad_exome_AF',
 'gnomad_exome_AF_raw',
 'gnomad_exome_AF_afr',
 'gnomad_exome_AF_sas',
 'gnomad_exome_AF_amr',
 'gnomad_exome_AF_eas',
 'gnomad_exome_AF_nfe',
 'gnomad_exome_AF_fin',
 'gnomad_exome_AF_asj',
 'gnomad_exome_AF_oth',
 'gnomad_genome_AF',
 'gnomad_genome_AF_raw',
 'gnomad_genome_AF_afr',
 'gnomad_genome_AF_sas',
 'gnomad_genome_AF_amr',
 'gnomad_genome_AF_eas',
 'gnomad_genome_AF_nfe',
 'gnomad_genome_AF_fin',
 'gnomad

In [22]:
# Export the rows whose 'Func' value is exactly 'exonic'.
# NOTE(review): unlike the other outputs there is no '_' between f_prefix and
# the suffix, so the file is named 'WGS_analysiscandidate.csv' - confirm this is intended.
df[df['Func']=='exonic'].to_csv(join(output_path, f_prefix+'candidate.csv'))