In [1]:
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib import style, colors
from matplotlib import gridspec

matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (8, 6)

# make oncoprint for 124 patients

In [2]:
f1 ='/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/HIGH_MODERATE_INDEL_summary_with_normal_strelka_only.tsv'
indf = pd.read_csv(f1, sep='\t')
indf.head(2)

Unnamed: 0,gene,num_patients_gene_level,num_INDELs_gene_level,chromosome,position,ref_base,alt_base,num_patients_INDEL_level,patient_ID,snp_ID,...,pileup_AltC,pileup_AF,strelka_n_Cov,strelka_n_RefC,strelka_n_AltC,strelka_n_AF,strelka_t_Cov,strelka_t_RefC,strelka_t_AltC,strelka_t_AF
0,ABCA9,2,3,17,66992092,C,CA,1,HTMCP-03-06-02007,novel_snp,...,na,na,54,51,0,0.0,78,41,29,0.41
1,ABCB5,1,1,7,20767995,TATA,T,1,HTMCP-03-06-02202,novel_snp,...,na,na,44,45,0,0.0,66,42,18,0.3


In [3]:
indf['impact_tmp'] = indf['snpeff_details'].apply(lambda x: x.split('(')[0])
indf = indf[['gene', 'patient_ID','impact_tmp']]
indf.head()

Unnamed: 0,gene,patient_ID,impact_tmp
0,ABCA9,HTMCP-03-06-02007,FRAME_SHIFT
1,ABCB5,HTMCP-03-06-02202,inframe_deletion
2,ABHD14A-ACY1,HTMCP-03-06-02441,FRAME_SHIFT
3,ABHD16A,HTMCP-03-06-02435,FRAME_SHIFT
4,AC006035.2,HTMCP-03-06-02007,FRAME_SHIFT


In [4]:
# get gene list
f2 = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/genes.txt'
cdf = pd.read_csv(f2)
genes = cdf.genes.tolist()
print(genes)

['YAP1', 'AJUBA', 'ARID1A', 'ATR', 'ATRX', 'B2M', 'CASP8', 'CBFB', 'ELF3', 'EP300', 'ERBB2', 'ERBB3', 'FAT1', 'FAT2', 'FBXW7', 'HLA-A', 'HLA-B', 'HLA-C', 'KRAS', 'LRP1B', 'MAGEC1', 'MAPK1', 'MLL', 'MLL2', 'MLL3', 'NFE2L2', 'PIK3CA', 'PTEN', 'RB1', 'SF3B1', 'SHKBP1', 'STK11', 'TGFBR2', 'TP53', 'TPR', 'U2AF1', 'ZRSR2']


In [5]:
# get meta track for example histology
f3 ='/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/mutation_load_clinic.txt'
ddf = pd.read_csv(f3, sep='\t')
ddf = ddf[['patient', 'reanne_HIV_status', 'reanne_Putative_histology']]
edf = ddf.set_index('patient').T
edf.head(2)

patient,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02007,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02026,...,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,HTMCP-03-06-02448
reanne_HIV_status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Negative,Positive,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive
reanne_Putative_histology,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Squamous,Squamous,Adeno,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Adeno


In [6]:
# get snvs
f4 = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/124_patients_selected_columns_new_filtered.txt'
pdf = pd.read_csv(f4, sep='\t')
pdf.head(2)
pdf['impact_tmp'] = pdf['snpeff_details'].apply(lambda x: x.split('(')[0])
mdf = pdf[['gene', 'patient_ID','impact_tmp']]

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# concat indel and snvs
rdf = pd.concat([mdf,indf])
rdf.head(2)

Unnamed: 0,gene,patient_ID,impact_tmp
0,SAMD11,HTMCP-03-06-02007,NON_SYNONYMOUS_CODING
1,SAMD11,HTMCP-03-06-02214,NON_SYNONYMOUS_CODING


In [8]:
rdf = rdf.drop_duplicates().groupby(['gene', 'patient_ID']).agg({'impact_tmp': ','.join}).reset_index()
rdf.head(2)

Unnamed: 0,gene,patient_ID,impact_tmp
0,A1BG,HTMCP-03-06-02179,NON_SYNONYMOUS_CODING
1,A1CF,HTMCP-03-06-02020,NON_SYNONYMOUS_CODING


In [9]:
def impact_type(x):
#     print(x)
    xsplit = list(set(x.split(',')))
    type = xsplit[0].upper()
    if len(xsplit) > 1:
        impact = 'Multiple'
    elif len(xsplit) == 1:
         if '+' in xsplit[0]:
             impact = 'Multiple'
         elif type == 'MISSENSE_VARIANT' or type == 'NON_SYNONYMOUS_CODING':
             impact = 'NON_SYNONYMOUS_CODING'
         elif  type == 'SPLICE_SITE_ACCEPTOR' or  type == 'SPLICE_SITE_DONOR':
             impact = 'SPLICE_ACCEPTOR_DONOR'
         elif type == 'STOP_LOST':
             impact = 'STOP_LOST'
         elif type == 'STOP_GAINED':
             impact = 'STOP_GAINED'
         elif  type == 'START_LOST':
             impact = 'START_LOST'
         elif type == 'START_GAINED':
             impact = 'START_GAINED'
         elif type == 'FRAME_SHIFT' or type == 'FRAMESHIFT_VARIANT':
             impact = 'FRAME_SHIFT'
         elif type == 'INFRAME_DELETION' or type == 'CODON_INSERTION' or type == 'CODON_DELETION' or type == 'CODON_CHANGE_PLUS_CODON_INSERTION' or type == 'CODON_CHANGE_PLUS_CODON_DELETION' or type.lower() == 'disruptive_inframe_deletion':
             impact = 'CODON_INSERTION_DELETION'                
         else: exit(1);print('ERROR')
    return impact

In [10]:
rdf['impact'] = rdf['impact_tmp'].apply(lambda x: impact_type(x))
rdf = rdf[['gene', 'patient_ID', 'impact']].drop_duplicates()
rdf = rdf.set_index(['gene', 'patient_ID'])['impact'].unstack()
rdf.head(2)

patient_ID,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02007,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02026,...,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,HTMCP-03-06-02448
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,,,,,,,,,,,...,,,,,,,,,,
A1CF,,,,,,,,,NON_SYNONYMOUS_CODING,,...,,,,,,,,,,


In [11]:
# of3 = '/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/107_patients/mutations_in_oncoprint.txt'
# rdf.fillna('NA').to_csv(of3, sep='\t')

In [12]:
rdf = pd.concat([rdf, edf])

In [13]:
rdf.index

Index(['A1BG', 'A1CF', 'A2LD1', 'A2M', 'A2ML1', 'A4GALT', 'A4GNT', 'AAAS',
       'AACS', 'AADAC',
       ...
       'ZXDC', 'ZYG11A', 'ZYG11B', 'ZZEF1', 'ZZZ3', 'hsa-mir-3187',
       'hsa-mir-4763', 'yR211F11.2', 'reanne_HIV_status',
       'reanne_Putative_histology'],
      dtype='object', length=14575)

In [14]:
genes = [ 'reanne_HIV_status', 'reanne_Putative_histology'] + genes

In [15]:
rdf = rdf.reindex(genes)#.loc['PIK3CA'].value_counts()
rdf.head(3)
pd.Series(rdf.values.ravel()).unique()

array(['Positive', 'Negative', nan, 'Squamous', 'Adeno', None,
       'NON_SYNONYMOUS_CODING', 'FRAME_SHIFT', 'Multiple', 'STOP_GAINED',
       'SPLICE_ACCEPTOR_DONOR', 'CODON_INSERTION_DELETION'], dtype=object)

In [16]:
rdf['occurrence'] = (rdf.notnull().sum(axis=1)/(rdf.shape[1] -2))

In [17]:
genes = rdf['occurrence'].sort_values(ascending=False).index.tolist()
# genes

In [18]:
rdf = rdf.reindex(genes)
rdf.head(2)

Unnamed: 0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02007,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02026,...,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,HTMCP-03-06-02448,occurrence
reanne_HIV_status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Negative,Positive,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,1.008197
reanne_Putative_histology,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Squamous,Squamous,Adeno,...,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Adeno,1.008197


In [19]:
rdf['percentage'] = ['{0}({1}%)'.format(i[0], int(round(i[1]*100))) for i in zip(rdf.index.tolist(), rdf.occurrence)]

In [20]:
rdf.loc['reanne_HIV_status', 'percentage'] = 'HIV_Status'
rdf.loc['reanne_Putative_histology', 'percentage' ] = 'Puatative_Histology'

In [21]:
rdf.head(3)

Unnamed: 0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02007,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02026,...,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,HTMCP-03-06-02448,occurrence,percentage
reanne_HIV_status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Negative,Positive,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,1.008197,HIV_Status
reanne_Putative_histology,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Squamous,Squamous,Adeno,...,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Adeno,1.008197,Puatative_Histology
PIK3CA,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING,,,Multiple,,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING,,STOP_GAINED,...,NON_SYNONYMOUS_CODING,,,,,,NON_SYNONYMOUS_CODING,,0.360656,PIK3CA(36%)


In [22]:
sdf = rdf.drop('occurrence', axis=1).set_index('percentage', drop=True)

In [23]:
# sdf = sdf.fillna(0)

In [24]:
sdf.head(3)

Unnamed: 0_level_0,HTMCP-03-06-02001,HTMCP-03-06-02002,HTMCP-03-06-02003,HTMCP-03-06-02006,HTMCP-03-06-02007,HTMCP-03-06-02008,HTMCP-03-06-02012,HTMCP-03-06-02013,HTMCP-03-06-02020,HTMCP-03-06-02026,...,HTMCP-03-06-02424,HTMCP-03-06-02427,HTMCP-03-06-02428,HTMCP-03-06-02434,HTMCP-03-06-02435,HTMCP-03-06-02437,HTMCP-03-06-02441,HTMCP-03-06-02442,HTMCP-03-06-02447,HTMCP-03-06-02448
percentage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HIV_Status,Positive,Negative,Positive,Negative,Negative,Negative,Negative,Negative,Positive,Negative,...,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive,Positive
Puatative_Histology,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Squamous,Squamous,Adeno,...,Squamous,Squamous,Squamous,Squamous,Squamous,Squamous,Adeno,Squamous,Squamous,Adeno
PIK3CA(36%),NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING,,,Multiple,,NON_SYNONYMOUS_CODING,NON_SYNONYMOUS_CODING,,STOP_GAINED,...,,,NON_SYNONYMOUS_CODING,,,,,,NON_SYNONYMOUS_CODING,


In [26]:
sdf.to_csv('/projects/trans_scratch/validations/workspace/szong/Cervical/variant_bwamem/124_patients/mutations_for_oncoprint_124_patients.txt')