##### this version adds more annotation to the summary file including amino acid change in order to make the lollipop plots

In [2]:
import re
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import matplotlib.pyplot as plt
from matplotlib import style, colors
from matplotlib import gridspec
import random
import string
import math

# matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12, 6)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### what does multiple means?
* multi_hit means a gene hit by two or more mutations. 

* use the new strelka results

use all calls from strelka: this is prefered since this takes into consideration of all soamtic calls:snvs and indels, high/moderate/low/modifier impacts.

in most papers, they only report mutation in coding regions including splicing site, excluding non-coding regions, i will get these numbers as well. synonymous/non-synonymous mutation are only in coding regions.

a good post about match

https://stackoverflow.com/questions/24958358/use-regular-expression-in-python-to-find-two-strings-in-line

a table details the equivalence of ontology an classic effect description

http://snpeff.sourceforge.net/SnpEff_manual.html

# 1. parsing strelka vcf files

In [64]:
def parse_strelka_vcf(vcf):
    df = pd.read_csv(vcf, comment='#', sep='\t', header=None, low_memory=False)
    patient = vcf.split('/')[4]
    df = df[[0,1,3,4,7]]
    df.columns = ['chr', 'pos', 'ref', 'alt', 'effects']
    df = df[(df['effects'].str.contains("HIGH"))|(df['effects'].str.contains("MODERATE"))|(df['effects'].str.contains("LOW"))]
#     print(df.head(2))
    if not df.empty:
        df['annotations'] = df['effects'].apply(lambda x: get_annotations(x))      
        df['patient'] = patient

    df = df.drop('effects', axis=1)
    return df

In [69]:
def wide_to_long(df):
    s = df.annotations.str.split('$', expand=True).stack()
    idx = s.index.get_level_values(0)
    print('length of s and idx', len(s), len(idx))
    dfn = df.loc[idx]
    dfn['effect'] = list(s)
    dfn.drop('annotations', inplace=True, axis=1)
    dfn['impact'], dfn['impact_type'], dfn['type'], dfn['AA_change'], dfn['gene'], dfn['transcript'] = dfn['effect'].apply(lambda x: x.split('@')).str
    dfn.drop('effect', axis=1, inplace=True)
    return dfn

In [70]:
# keep it easy for now pick HIGH, MODERATE and then LOW and the first transcript. could pull out all transcripts if needed
def get_annotations(line):
    effs = line.split('EFF=')[1].split(',')
    #     extract impact, impact_type, amino acid change, and gene
    effs = ['@'.join(list(np.array(re.split('\(|\|',ef))[[0,1,2,4,6,9]])) for ef in effs if ('HIGH' in  ef) or ('MODERATE' in ef) or ('LOW' in ef)]
    effs = list(set(effs))
    high = [ef for ef in effs if 'HIGH' in ef]
    moderate = [ef for ef in effs if 'MODERATE' in ef]
    low = [ef for ef in effs if 'LOW' in ef]
    if high:
        anno = '$'.join(high)
    elif moderate:
        anno = '$'.join(moderate)
    elif low:
        anno = '$'.join(low)
    else:
        print('ERROR!')
#     make sure the genes have the same name
#     genes = [ef.split('@')[2] for ef in effs]
#     print(anno)
    # anno= list of annotations
    return anno


In [71]:
# # example run
ft = '/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-03-06-02020/hg19a/GSH/A37247_A56468/strelka/23223/bwa/results/passed.somatic.snvs.eff.dbSNP_v137.cosmic_v64.annotations.vcf'
ft = '/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-03-06-02020/hg19a/GSH/A37247_A56468/strelka/59869/bwa/results/passed.somatic.snvs.eff.dbSNP_v149.cosmic_v82.clinvar_20170801.annotations.classic.vcf'
# assert count_coding_mutations(ft) == 266
df = parse_strelka_vcf(ft)
dfn = wide_to_long(df)

df.head(2)
dfn.head(2)

length of s and idx 745 745


Unnamed: 0,chr,pos,ref,alt,annotations,patient
6,1,908292,G,A,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@D352N@...,HTMCP-03-06-02020
7,1,935319,G,A,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@S12L@H...,HTMCP-03-06-02020


Unnamed: 0,chr,pos,ref,alt,patient,impact,impact_type,type,AA_change,gene,transcript
6,1,908292,G,A,HTMCP-03-06-02020,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,D352N,PLEKHN1,ENST00000379410
6,1,908292,G,A,HTMCP-03-06-02020,NON_SYNONYMOUS_CODING,MODERATE,MISSENSE,D404N,PLEKHN1,ENST00000379409


# 2. Scope: genes and cohort size

In [72]:
# 118 patients
f = '/projects/trans_scratch/validations/workspace/szong/Cervical/mutsig2cv/118_patients.txt'
with open(f) as file:
    patients = [line.strip() for line in file]
assert len(patients) == 118

In [73]:
# 12 SMGS
f2 = '/projects/trans_scratch/validations/workspace/szong/Cervical/mutsig2cv/118_patients/smgs_reviewed.txt'
f2 = '/projects/trans_scratch/validations/workspace/szong/Cervical/mutsig2cv/12_sig_genes.txt'
with open (f2) as file:
    genes = [line.strip() for line in file]
assert len(genes) == 13

# 3. Strelka vcf files

In [74]:
# f1 = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/124_patients_bam_vcf.txt'
f1 = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/124_patients_bam_vcf_20190115.csv'
df = pd.read_csv(f1, index_col='patient')
df.head(2)

Unnamed: 0_level_0,HIV_status,DNA_lib,DNA_single_vcf,strelka_indel_vcf,strelka_snv_vcf,source,status,RNA_lib,ribodepleted_lib,DNA_bam,...,other_vcf,DNA_tc,RNA_tc,cnv,bbt_transcriptome,bbt_genome,bbt_transcriptome_other_bacterial,bbt_genome_other_bacterial,bbt_transcriptome_other_viral,bbt_genome_other_viral
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HTMCP-03-06-02001,Positive,A37234,/projects/analysis31/A37234/vcall21989_merge13...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,HTMCP_124,Malignant,A37700,A56295,/projects/analysis/analysis22/A37234/merge_bwa...,...,,55.0,55.0,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...
HTMCP-03-06-02002,Negative,A37235,/projects/analysis31/A37235/vcall21991_merge13...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,HTMCP_125,Malignant,A37701,A56296,/projects/analysis/analysis22/A37235/merge_bwa...,...,,70.0,70.0,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...,/projects/NCI_validation2_assembly/NCI_SAIC_HI...


In [75]:
ndf = df[['strelka_snv_vcf', 'strelka_indel_vcf']].dropna()
ndf.head(2)

Unnamed: 0_level_0,strelka_snv_vcf,strelka_indel_vcf
patient,Unnamed: 1_level_1,Unnamed: 2_level_1
HTMCP-03-06-02001,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...
HTMCP-03-06-02002,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...,/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-...


In [76]:
ndf = ndf.reindex(patients)

In [77]:
ndf.shape

(118, 2)

In [78]:
for i in ndf['strelka_indel_vcf'].head(2):
    print(i)

/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-03-06-02001/hg19a/GSH/A37234_A37159/strelka/59844/bwa/results/passed.somatic.indels.eff.dbSNP_v149.cosmic_v82.clinvar_20170801.annotations.classic.vcf
/projects/somatic/NCI_SAIC_HIV_Cervical/HTMCP-03-06-02002/hg19a/GSH/A37235_A37160/strelka/59848/bwa/results/passed.somatic.indels.eff.dbSNP_v149.cosmic_v82.clinvar_20170801.annotations.classic.vcf


# 4. concatenate snvs and indels 

In [79]:
dfmg = pd.DataFrame()
for ix, row in ndf.iterrows():
    patient = ix
    snv_vcf = row['strelka_snv_vcf']
    indel_vcf = row['strelka_indel_vcf']
    snv_df = parse_strelka_vcf(snv_vcf)
    indel_df = parse_strelka_vcf(indel_vcf)
    mdf = pd.concat([snv_df, indel_df], sort=False)
    mdf = mdf.drop_duplicates(keep='first')

    if dfmg.empty:
        dfmg = mdf
    else:
        dfmg = pd.concat([dfmg, mdf], sort=False)


In [80]:
dfmg.head()
dfmg.shape

Unnamed: 0,chr,pos,ref,alt,annotations,patient
39,1,2706109,C,G,SYNONYMOUS_CODING@LOW@SILENT@R16@TTC34@ENST000...,HTMCP-03-06-02001
58,1,4771973,C,T,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@R15C@A...,HTMCP-03-06-02001
78,1,6146051,G,C,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@G104A@...,HTMCP-03-06-02001
83,1,6579534,C,T,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@R13Q@P...,HTMCP-03-06-02001
145,1,12170207,C,G,NON_SYNONYMOUS_CODING@MODERATE@MISSENSE@L208V@...,HTMCP-03-06-02001


(36739, 6)

In [81]:
dfmgn = dfmg.reset_index(drop=True)

In [82]:
dfn = wide_to_long(dfmgn)
# wide_to_long(dfmg.iloc[46:416,:].reset_index(drop=True))


length of s and idx 102013 102013


In [20]:
dfn.shape
dfn.head()

(102013, 10)

Unnamed: 0,chr,pos,ref,alt,patient,impact,impact_type,AA_change,gene,transcript
0,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,R16,TTC34,ENST00000401095
0,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,R16,TTC34,ENST00000574621
1,1,4771973,C,T,HTMCP-03-06-02001,NON_SYNONYMOUS_CODING,MODERATE,R15C,AJAP1,ENST00000378191
1,1,4771973,C,T,HTMCP-03-06-02001,NON_SYNONYMOUS_CODING,MODERATE,R15C,AJAP1,ENST00000378190
2,1,6146051,G,C,HTMCP-03-06-02001,NON_SYNONYMOUS_CODING,MODERATE,G104A,KCNAB2,ENST00000462676


In [83]:
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_20190527.txt'
# dfn.to_csv(of, index=False, sep='\t')

### make supplementary table, gene by patient matrix to indicate if a gene is mutated in each patient

In [41]:
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_20190527.txt'
df = pd.read_csv(of, sep='\t')
df.head(2)
df.shape

Unnamed: 0,chr,pos,ref,alt,patient,impact,impact_type,type,AA_change,gene,transcript
0,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,SILENT,R16,TTC34,ENST00000401095
1,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,SILENT,R16,TTC34,ENST00000574621


(102013, 11)

In [42]:
df1 = pd.pivot_table(df[['gene', 'patient', 'type']], index='gene', columns='patient', values='type', aggfunc=lambda x: 'mutated' if not x.empty else 'WT')

In [43]:
df1.head(2)
df1['number_patients_mutated'] = df1.notnull().sum(axis=1)
df1['percent_patients_mutated'] = df1['number_patients_mutated']/118.0
df2 = df1[['number_patients_mutated', 'percent_patients_mutated']]

df2.head()
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/somatic_mutation_frequency_summary_20190529.tsv'
df2.to_csv(of, sep='\t')

patient,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02417,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7SK,,,,,,,,,,,...,,,,,,mutated,,,,
A1BG,,,,,,,,,,,...,,,,,,,,,,


patient,number_patients_mutated,percent_patients_mutated
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
7SK,1,0.008475
A1BG,2,0.016949
A1CF,2,0.016949
A2LD1,1,0.008475
A2M,6,0.050847


In [20]:
(df1 == 'WT').any().any()

False

In [29]:
df1 = df1.fillna('wt')

In [30]:
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/somatic_mutation_gene_by_patient_summary_20190529.tsv'
df1.to_csv(of, sep='\t')

In [26]:
df1.head(2)
df1.loc['PSPC1', :].dropna()

patient,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02417,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7SK,,,,,,,,,,,...,,,,,,mutated,,,,
A1BG,,,,,,,,,,,...,,,,,,,,,,


patient
HTMCP-03-06-02002    mutated
HTMCP-03-06-02120    mutated
HTMCP-03-06-02260    mutated
HTMCP-03-06-02320    mutated
HTMCP-03-06-02326    mutated
HTMCP-03-06-02441    mutated
Name: PSPC1, dtype: object

In [84]:
dfn[dfn.gene=='PIK3CA'].shape

(51, 11)

In [32]:
# subset 12 genes of interest
dfs = dfn[dfn.gene.isin(genes)&dfn.patient.isin(patients)]
# dfs = dfmg[dfmg.patient.isin(patients)]

dfs['end'] = dfs.pos
dfs.shape
dfs.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


(305, 11)

Unnamed: 0,chr,pos,ref,alt,patient,impact,impact_type,AA_change,gene,transcript,end
71,3,178936091,G,A,HTMCP-03-06-02001,NON_SYNONYMOUS_CODING,MODERATE,E545K,PIK3CA,ENST00000263967,178936091
72,3,178952117,A,T,HTMCP-03-06-02001,NON_SYNONYMOUS_CODING,MODERATE,I1058F,PIK3CA,ENST00000263967,178952117


In [85]:
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_sig_genes_20190527.txt'
dfs.to_csv(of, index=False, sep='\t')

### make input files for cbioportal

In [87]:
dfn.head(2)

Unnamed: 0,chr,pos,ref,alt,patient,impact,impact_type,type,AA_change,gene,transcript
0,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,SILENT,R16,TTC34,ENST00000401095
0,1,2706109,C,G,HTMCP-03-06-02001,SYNONYMOUS_CODING,LOW,SILENT,R16,TTC34,ENST00000574621


In [92]:
headers = ['Chromosome', 'Start_Position', 'Reference_Allele', 'Variant_Allele', 'Sample_ID',  'type', 'impact_type', 'Mutation_Type','Protein_Change', 'Hugo_Symbol', 'transcript', 'End_Position' ]
for gene in genes:
    dfs = dfn[(dfn.gene == gene)&(dfn.patient.isin(patients))]
    dfs['end'] = dfs.pos   
    hf_trans = dfs.groupby('transcript')['transcript'].count().sort_values(ascending=False).index[0]
    dfs = dfs[dfs.transcript == hf_trans]
    dfs.columns = headers
    of = f'/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_{gene}_{hf_trans}_20190527.txt'
    print(f'process {gene}: {of}')
    dfs.to_csv(of, index=False, sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


process PIK3CA: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_PIK3CA_ENST00000263967_20190527.txt
process MAPK1: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_MAPK1_ENST00000544786_20190527.txt
process CASP8: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_CASP8_ENST00000432109_20190527.txt
process FBXW7: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_FBXW7_ENST00000281708_20190527.txt
process MLL2: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_MLL2_ENST00000301067_20190527.txt
process ZC3H6: /projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients_new/strelka_HML_summary_ZC3H6_ENST00000409871_20190527.txt
process PCDHGA12: /pro

IndexError: index 0 is out of bounds for axis 0 with size 0

In [61]:
def format_impact(impact):
    impact = impact.upper()
    if '+' in impact:
        impact = impact.split('+')[0]
    return impact

In [62]:
# These are not reasonable, it will remove multi-hits
dfs['impact_new'] = dfs['impact'].apply(format_impact)
dfs.drop('impact', axis=1, inplace=True)
dfs.shape
dfs[dfs.duplicated()]

# dfs.drop_duplicates(keep='first', inplace=True)
# dfs.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(171, 8)

Unnamed: 0,chr,pos,ref,alt,impact_type,gene,patient,impact_new


In [63]:
dfs['impact_new'].unique()

array(['NON_SYNONYMOUS_CODING', 'FRAME_SHIFT', 'STOP_GAINED',
       'SYNONYMOUS_CODING', 'SPLICE_SITE_ACCEPTOR', 'SPLICE_SITE_REGION',
       'START_GAINED', 'CODON_DELETION',
       'CODON_CHANGE_PLUS_CODON_DELETION'], dtype=object)

In [64]:
dfs.drop_duplicates().shape

(171, 8)

In [65]:
dfs = dfs.drop_duplicates().groupby(['gene', 'patient']).agg({'impact_new': ','.join}).reset_index()
dfs.head()

Unnamed: 0,gene,patient,impact_new
0,CASP8,HTMCP-03-06-02001,FRAME_SHIFT
1,CASP8,HTMCP-03-06-02012,STOP_GAINED
2,CASP8,HTMCP-03-06-02036,NON_SYNONYMOUS_CODING
3,CASP8,HTMCP-03-06-02239,NON_SYNONYMOUS_CODING
4,CASP8,HTMCP-03-06-02260,FRAME_SHIFT


In [66]:
def impact_type(x):
#     print(x)
    xsplit = x.split(',')
#     print(x)
#     type = xsplit[0].upper()
    if len(xsplit) > 1:
        type = 'Multi_hit'
    elif len(xsplit) == 1:
        type = xsplit[0]
        if type == 'MISSENSE_VARIANT' or type == 'NON_SYNONYMOUS_CODING':
           type = 'Non-synonymous'
        elif type == 'SYNONYMOUS_VARIANT' or type == 'SYNONYMOUS_CODING':
           type = 'Synonymous'
        elif  type == 'SPLICE_SITE_ACCEPTOR' or  type == 'SPLICE_SITE_DONOR' or type == 'SPLICE_SITE_REGION':
           type = 'Splice site'
        elif type == 'STOP_LOST':
           type = 'Stop lost'
        elif type == 'STOP_GAINED':
           type = 'Stop gained'
        elif  type == 'START_LOST':
           type = 'Start lost'
        elif type == 'START_GAINED':
           type = 'Start gained'
        elif type == 'FRAME_SHIFT' or type == 'FRAMESHIFT_VARIANT':
           type = 'Frameshift'
        elif type == 'INFRAME_DELETION' or type == 'CODON_INSERTION' or type == 'CODON_DELETION' or type == 'CODON_CHANGE_PLUS_CODON_INSERTION' or type == 'CODON_CHANGE_PLUS_CODON_DELETION' or type.lower() == 'disruptive_inframe_deletion':
           type = 'Codon indel'                
        else: exit(1);print('ERROR');print(x)
    return type

In [67]:
dfs['impact'] = dfs['impact_new'].apply(lambda x: impact_type(x))
dfs.shape
dfs = dfs[['gene', 'patient', 'impact']].drop_duplicates()
dfs.shape
dfs = dfs.set_index(['gene', 'patient'])['impact'].unstack()
dfs.head(2)
dfs.shape

(139, 4)

(139, 3)

patient,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02006,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02040,HTMCP-03-06-02042,HTMCP-03-06-02047,...,HTMCP-03-06-02344,HTMCP-03-06-02346,HTMCP-03-06-02354,HTMCP-03-06-02392,HTMCP-03-06-02411,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02435,HTMCP-03-06-02441,HTMCP-03-06-02447
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CASP8,Frameshift,,,Stop gained,,,Non-synonymous,,,,...,,,,,,,,,,
FAT1,,,,,,,Stop gained,Synonymous,,,...,,,Stop gained,,,Frameshift,Multi_hit,Non-synonymous,,


(12, 80)

# 5. add in covariates

In [99]:
# get meta track for example histology
f3 ='/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/mutation_load_clinic.txt'
ddf = pd.read_csv(f3, sep='\t')
ddf = ddf[['patient', 'HIV_status', 'Putative_histology']]
edf = ddf.set_index('patient').T
edf.loc['Putative_histology',].unique()
edf = edf[patients]
edf.head(2)
edf.shape

array(['Squamous', 'Adenosquamous', 'Adeno', 'Neuroendocrine'],
      dtype=object)

patient,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02417,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447
HIV_status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Positive,Negative,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive
Putative_histology,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous


(2, 118)

In [97]:
s = (edf.loc['Putative_histology',:].isin(['Adenosquamous', 'Neuroendocrine']))
s[s]
edf[s[s].index]

patient
HTMCP-03-06-02047    True
HTMCP-03-06-02110    True
HTMCP-03-06-02176    True
HTMCP-03-06-02203    True
HTMCP-03-06-02216    True
HTMCP-03-06-02235    True
Name: Putative_histology, dtype: bool

patient,HTMCP-03-06-02047,HTMCP-03-06-02110,HTMCP-03-06-02176,HTMCP-03-06-02203,HTMCP-03-06-02216,HTMCP-03-06-02235
HIV_status,Negative,Positive,Negative,Negative,Positive,Positive
Putative_histology,Neuroendocrine,Adenosquamous,Adenosquamous,Adenosquamous,Adenosquamous,Neuroendocrine


In [98]:
cases other than squamous or adeno
# patient	HIV_status	Histology	APOBEC_signature	Clinical_stage	Age_at_diagnosis
# HTMCP-03-06-02235	Positive	Neuroendocrine	weak	Stage IV	<45
# HTMCP-03-06-02110	Positive	Adenosquamous	weak	Stage IV	<45
# HTMCP-03-06-02216	Positive	Adenosquamous	weak	Stage III	45-65
# HTMCP-03-06-02203	Negative	Adenosquamous	moderate	Stage II	45-65
# HTMCP-03-06-02047	Negative	Neuroendocrine	weak	Stage IV	45-65
# HTMCP-03-06-02176	Negative	Adenosquamous	moderate	Stage III	45-65

In [100]:
dfs = pd.concat([edf, dfs], sort=False)

In [101]:
dfs.shape

(14, 118)

In [102]:
dfs = dfs[patients]

# 6. add in mutation rate (fraction of patients)

In [104]:
dfs['occurrence'] = (dfs.notnull().sum(axis=1)/(dfs.shape[1] -2))
dfs['percentage'] = ['{0}({1}%)'.format(i[0], int(round(i[1]*100))) for i in zip(dfs.index.tolist(), dfs.occurrence)]
# dfs.loc['HIV_status', 'percentage'] = 'HIV_Status'
# dfs.loc['Putative_histology', 'percentage' ] = 'Puatative_histology'
sdf = dfs.drop('occurrence', axis=1).set_index('percentage', drop=True)

In [105]:
sdf

Unnamed: 0_level_0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02417,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447
percentage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HIV_status(102%),Positive,Negative,Positive,Negative,Negative,Negative,Negative,Positive,Negative,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive
Putative_histology(102%),Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous
CASP8(7%),Frameshift,,,,,Stop gained,,,Non-synonymous,,...,,,,,,,,,,
FAT1(19%),,,,,,,,,Stop gained,,...,,,Frameshift,Multi_hit,,Non-synonymous,,,,
FBXW7(10%),,,,Non-synonymous,,,,,,,...,,,,Non-synonymous,,Frameshift,,,,
MAPK1(5%),,,,,,,,,,,...,,,,Non-synonymous,,,,,,
MLL2(15%),,Stop gained,,,,,,,,,...,,,,,,Frameshift,,,,
PCDHA9(3%),,,,,,,,,,,...,,,,,,,,,,
PCDHGA12(5%),,,,Non-synonymous,,,,,,,...,,,,,,Non-synonymous,,,,
PIK3CA(35%),Multi_hit,Non-synonymous,,,,Multi_hit,Non-synonymous,,Non-synonymous,,...,,,,Non-synonymous,,,,,,Non-synonymous


In [106]:
of = '/projects/trans_scratch/validations/workspace/szong/Cervical/mutsig2cv/118_patients/smgs_reviewed_details_20190326.txt'
# sdf.to_csv(of, sep='\t')

In [108]:
sdf[:2]

Unnamed: 0_level_0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02417,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447
percentage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HIV_status(102%),Positive,Negative,Positive,Negative,Negative,Negative,Negative,Positive,Negative,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive
Putative_histology(102%),Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous


In [127]:
pd.unique(sdf.values.ravel())

array(['Positive', 'Negative', 'Squamous', 'Neuroendocrine', 'Adeno',
       'Adenosquamous', 'Frameshift', nan, 'Stop gained',
       'Non-synonymous', 'Synonymous', 'Multi_hit', 'Start gained'],
      dtype=object)

In [109]:
df = sdf.copy()

In [110]:
dftmp = pd.DataFrame([list(np.array(re.split('\(|\)', i))[[0,1]]) for i in df.index.tolist()], columns=['gene', 'mfreq'])

df['gene'] = dftmp.gene.tolist()
df['mfreq'] = dftmp.mfreq.tolist()

df['mfreq'] = df['mfreq'].apply(lambda x: int(x.replace('%', '')))

In [112]:
df = df.sort_values('mfreq', ascending=False)

df.set_index('gene', drop=True, inplace=True)

df.head()

Unnamed: 0_level_0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02036,HTMCP-03-06-02037,...,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,mfreq
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HIV_status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Positive,Negative,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,102
Putative_histology,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,102
PIK3CA,Multi_hit,Non-synonymous,,,,Multi_hit,Non-synonymous,,Non-synonymous,,...,,,Non-synonymous,,,,,,Non-synonymous,35
FAT1,,,,,,,,,Stop gained,,...,,Frameshift,Multi_hit,,Non-synonymous,,,,,19
MLL2,,Stop gained,,,,,,,,,...,,,,,Frameshift,,,,,15


In [114]:
dfa = (df.loc['HIV_status',] == 'Negative') 
dfa = df[dfa[dfa].index]
df['hiv_neg_mfreq'] = (dfa.notnull().sum(axis=1)/dfa.shape[1]*100).astype(int)
dfb = (df.loc['HIV_status',] == 'Positive') 
dfb = df[dfb[dfb].index]
df['hiv_pos_mfreq'] = (dfb.notnull().sum(axis=1)/dfb.shape[1]*100).astype(int)

In [None]:
df = df.sort_values(df.index.tolist(), axis=1)

In [157]:
df['percentage'] = ['{0}({1}%)'.format(i[0], i[1]) for i in [(i[0], i[1]) for i in zip(df.index, df.mfreq)] ]

In [159]:
df.set_index('percentage', drop=True, inplace=True)

In [162]:
f2 = '/projects/trans_scratch/validations/workspace/szong/Cervical/mutsig2cv/118_patients/smgs_reviewed_details_with_frequency_20190326.txt'
df.to_csv(f2)