In [55]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [13]:
# Load the tab-separated mutation table; the first row of the file supplies
# the column names.
mutations_path = "Dataset/chol_tcga_pan_can_atlas_2018/data_mutations.txt"
mutations_df = pd.read_csv(mutations_path, sep="\t", header=0)

# Preview the first rows.
mutations_df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,SYMBOL_SOURCE,TREMBL,TSL,UNIPARC,VARIANT_CLASS,all_effects,cDNA_position,n_depth,t_depth,Annotation_Status
0,TRUB1,142940,.,GRCh37,10,116734973,116734973,+,stop_gained,Nonsense_Mutation,...,HGNC,"B4DZ90_HUMAN,B3KWQ1_HUMAN",.,UPI000006DEBE,SNV,"TRUB1,stop_gained,p.Trp295Ter,ENST00000298746,;",946,50,58,SUCCESS
1,ASCC1,51008,.,GRCh37,10,73956736,73956736,+,stop_gained,Nonsense_Mutation,...,HGNC,E9PJM2_HUMAN,.,UPI000006F7E9,SNV,"ASCC1,stop_gained,p.Gln108Ter,ENST00000524829,...",508,71,97,SUCCESS
2,RRM1,6240,.,GRCh37,11,4148284,4148284,+,missense_variant,Missense_Mutation,...,HGNC,"F5H861_HUMAN,E9PL69_HUMAN,E9PD78_HUMAN,B4DNN4_...",.,UPI0000000C7C,SNV,"RRM1,missense_variant,p.Arg400His,ENST00000423...",1694,50,82,SUCCESS
3,MAP3K11,4296,.,GRCh37,11,65373425,65373425,+,missense_variant,Missense_Mutation,...,HGNC,"E9PLB1_HUMAN,E9PID4_HUMAN,B4DS76_HUMAN",.,UPI0000049BF7,SNV,"MAP3K11,missense_variant,p.Gln577His,ENST00000...",2217,28,59,SUCCESS
4,LMO3,55885,.,GRCh37,12,16757898,16757898,+,5_prime_UTR_variant,5'UTR,...,HGNC,.,.,UPI00017A6FD2,SNV,"LMO3,5_prime_UTR_variant,,ENST00000261169,;LMO...",.,26,35,SUCCESS


In [16]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
mutations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Columns: 114 entries, Hugo_Symbol to Annotation_Status
dtypes: float64(1), int64(11), object(102)
memory usage: 3.9+ MB
None


In [18]:
# Last expression of the cell, so the notebook displays the column Index
# directly without an explicit print().
mutations_df.columns

Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome',
       'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification',
       ...
       'SYMBOL_SOURCE', 'TREMBL', 'TSL', 'UNIPARC', 'VARIANT_CLASS',
       'all_effects', 'cDNA_position', 'n_depth', 't_depth',
       'Annotation_Status'],
      dtype='object', length=114)


### Transforming the data into one row per sample (`Tumor_Sample_Barcode`), with all of that sample's mutated genes joined into a single comma-separated string

In [58]:
# Keep only the two columns needed for the per-sample summary.
mutations_df1 = mutations_df[["Hugo_Symbol", "Tumor_Sample_Barcode"]]

# One group per Tumor_Sample_Barcode value.
grouped = mutations_df1.groupby('Tumor_Sample_Barcode')

# Collapse each group into one row:
#   Mutation_Count - number of non-null Hugo_Symbol entries in the group
#   Mutations      - those same entries joined into one comma-separated string
# Missing symbols are dropped and the remaining values cast to str before
# joining: 'count' already ignores nulls, and str.join raises TypeError on any
# non-string element, so this keeps the string consistent with the count and
# stops a single missing symbol from aborting the whole aggregation.
mutations_df2 = grouped.agg(
    Mutation_Count=('Hugo_Symbol', 'count'),
    Mutations=('Hugo_Symbol', lambda x: ', '.join(x.dropna().astype(str))),
).reset_index()

mutations_df2.head()

Unnamed: 0,Tumor_Sample_Barcode,Mutation_Count,Mutations
0,TCGA-3X-AAV9-01,75,"TRUB1, ASCC1, RRM1, MAP3K11, LMO3, ABCC9, ANKR..."
1,TCGA-3X-AAVA-01,60,"ASAH2, ROBO4, ARFGAP2, TMEM132C, KRT3, DCLK1, ..."
2,TCGA-3X-AAVB-01,48,"WDFY4, A1CF, RAN, KRAS, STAT6, TUBGCP3, ISLR2,..."
3,TCGA-3X-AAVC-01,84,"BEND7, MUC6, ANO3, TSPAN18, UBE2L6, OR9I1, OR1..."
4,TCGA-3X-AAVE-01,68,"NET1, BIRC2, OR5D16, POC1B, ASB7, EXD1, ALPK3,..."


In [61]:
# Write the per-sample summary to disk. pandas does not create missing
# directories, so make sure the output folder exists first; otherwise this
# cell fails with an OSError when the folder has not been created by hand.
summary_path = Path('Dataset/mydata/chol_tcga_pan_can_atlas_2018/mutations_summary.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
mutations_df2.to_csv(summary_path, index=False)

### Using Gene2Vec-pretrained + GIT-finetuned gene embeddings to get a feature representation of each sample