# Export preprocessed SNV file to NBS matrix

In [1]:
##STEP 1.1: Import required python libraries 

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### 15Feb2022: 
#### STEP 1: After Fisher's exact test was successfully run on the 198 drivers, they were reduced to 47 drivers --> export the table to NBS (Network-Based Stratification)

In [2]:
# Read the cleaned SNV table (tab-separated; the first row holds the column names)
df1 = pd.read_csv("./output/6.2_LUAD_snv_mc3_OUT_cleaned_cSIF.tsv", header=0, sep='\t')
df2 = df1.copy()

# Annotation columns that are not needed downstream; dropping them leaves
# 'gene' and 'sample_id'
unused_columns = [
    'chr', 'start', 'end', 'reference', 'alt', 'effect', 'Amino_Acid_Change', 'DNA_VAF',
    'SIFT', 'PolyPhen', 'tumor_stage', 'SIFT2', 'PolyPhen2',
]
df3 = df2.drop(columns=unused_columns)

# Index the rows by gene name, sorted in ascending order
df4 = df3.set_index('gene').sort_index(ascending=True)

# Truncated preview of the result
with pd.option_context('display.max_rows', 2):
    display(df4)  # 101757 rows × 1 columns

Unnamed: 0_level_0,sample_id
gene,Unnamed: 1_level_1
A1BG,TCGA-38-4631-01
...,...
ZZZ3,TCGA-86-6851-01


In [3]:
# Read the driver-gene lists (tab-separated; the first row holds the column names)
df5 = pd.read_csv("../01_Gene_lists/SuppTable1_All_Drivers_GeneList.tsv", sep='\t', header=0)

# Non-missing, de-duplicated entries of the 'LUAD-Early_47' column
drivlist47 = df5['LUAD-Early_47'].dropna().unique().tolist()

# A bare `len(drivlist47)` in the middle of a cell is never displayed (only the
# last expression of a cell is), so check the expected count explicitly.
assert len(drivlist47) == 47, "expected 47 driver genes, found {}".format(len(drivlist47))

print(drivlist47)

['AFF2', 'AMER1', 'ARID1A', 'ASXL3', 'ATM', 'BAZ2B', 'BRAF', 'CDH12', 'CDK12', 'CDKN2A', 'COL1A1', 'CREBBP', 'CTNNB1', 'DNMT3A', 'EGFR', 'EIF4G1', 'EPHA4', 'FAT1', 'FN1', 'KDR', 'KEAP1', 'KMT2A', 'KMT2C', 'KRAS', 'LPHN2', 'MAP3K4', 'MET', 'MGA', 'MMP16', 'MMP2', 'NCAM1', 'NF1', 'NTRK2', 'NUP98', 'PIK3CA', 'PRKCB', 'PTPRD', 'RB1', 'RBM10', 'RYR2', 'SETD2', 'SMAD4', 'SMARCA4', 'SORCS3', 'STK11', 'SVEP1', 'TP53']


In [10]:
# Keep only the rows whose gene index label is one of the drivers listed in "drivlist47"
driver_rows = df4.loc[drivlist47]
df6 = driver_rows[["sample_id"]]

# Truncated preview of the result
with pd.option_context('display.max_rows', 2):
    display(df6)  # 2379 rows (mutations) × 1 columns

Unnamed: 0_level_0,sample_id
gene,Unnamed: 1_level_1
AFF2,TCGA-55-8302-01
...,...
TP53,TCGA-55-7570-01


In [12]:
# Move the 'gene' index back into a regular column
df7 = df6.reset_index(drop=False)

# Number of distinct genes among the selected rows (47 genes expected).
# This is the cell's last expression, so its value is displayed.
df7['gene'].nunique()

47

In [13]:
# NBS is fed a matrix file (feeding it the list file gave an error), so reshape
# the long mutation list into a table with one row per sample and one column per gene.
df7['COUNT'] = 1  # marker value carried by every mutation row

# No aggfunc is given, so pivot_table applies its default aggregation (mean);
# sample/gene pairs with no rows come out as NaN and are filled with 0.
mutation_matrix = pd.pivot_table(df7, values='COUNT', index='sample_id', columns='gene')
df8 = mutation_matrix.fillna(0)

# Truncated preview of the result
with pd.option_context('display.max_rows', 2):
    display(df8)  # -->  505 reduced to 477 samples and 47 genes

gene,AFF2,AMER1,ARID1A,ASXL3,ATM,BAZ2B,BRAF,CDH12,CDK12,CDKN2A,...,RB1,RBM10,RYR2,SETD2,SMAD4,SMARCA4,SORCS3,STK11,SVEP1,TP53
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4244-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-O1-A52J-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# Write the pivot table as a tab-separated matrix file (index column included) for input to NBS
nbs_matrix_path = './output/6.6_OUT_cSIF_Driv47_477LUAD_toNBS_matrix_15Feb.tsv'
df8.to_csv(nbs_matrix_path, index=True, sep="\t")