In [14]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import sklearn.linear_model
import statsmodels.api as sm
from scipy import stats
from scipy.stats import zscore

from collections import Counter

In [3]:
cancer_genes = pd.read_csv("../../../COSMIC/Census_allTue Nov 10 13_38_57 2020.csv")
cancer_genes = cancer_genes["Gene Symbol"].tolist()

In [4]:
celllines_mapped = pd.read_csv("../../MCF7_removal/Celllines_mapping_manual_noMCF7.csv")
celllines_mutations = pd.read_csv("../../../GEMICCL/Mutation.csv", sep = "\t")

  celllines_mutations = pd.read_csv("../../../GEMICCL/Mutation.csv", sep = "\t")


In [5]:
# Get only cell lines with ID
celllines_mapped = celllines_mapped.drop_duplicates(subset = ["dsIdx"])
celllines_mapped = celllines_mapped.dropna(subset = ["ID"])
sample_data_tissue = celllines_mapped["DI"].tolist()

In [7]:
tissue_origin = pd.read_excel("../../../Cellosaurus_data/Cellline_tissue_origin.xlsx", names = ["Cancer", "Origin"])
tissue_origin_dict = tissue_origin.set_index("Cancer")["Origin"].to_dict()

In [8]:
tissue_type = []
for item in sample_data_tissue:
    if pd.isnull(item):
        tissue_type.append("Unknown") 
    elif item == "[]":
        tissue_type.append("Unknown")
    else:
        item = item.split(";")[2].split("'")[0].strip()
        tissue_type.append(item)
        
cell_origin = [tissue_origin_dict.get(item) for item in tissue_type]
celllines_mapped["Tissue"] = cell_origin

In [9]:
celllines_list = celllines_mapped["ID"].unique().tolist()

In [10]:
celllines_notinmutationset = set(celllines_list) - set(celllines_mutations['CellLineName_Cellosaurus'])
celllines_mapped_mutations = celllines_mapped[~celllines_mapped["ID"].isin(celllines_notinmutationset)]

In [11]:
tissue_labelled = pd.get_dummies(celllines_mapped_mutations, columns=["Tissue"])

In [12]:
metabolomics_data = pd.read_csv("../../MCF7_removal/Metabolomics_data_noMCF7.csv")
metabolomics_data = metabolomics_data.set_index("ionIdx")

In [69]:
gene_array = []

count = 1
#iterate through genes

cancer_genes = ["KRAS"]
for gene in cancer_genes:
    print("Calculating for " + gene + " %s/%s" % (str(count), str(len(cancer_genes))))
    count+=1
    ## Figure out which cell lines have mutations in the gene
    gene_of_interest = celllines_mutations[celllines_mutations["HGNC"] == gene]
    gene_of_interest = gene_of_interest[gene_of_interest["MutationType"] != "Silent"]
    mutationsingene = gene_of_interest["AA_Mutation"].unique().tolist()
    
    
    for mutation in mutationsingene:
        print(mutation)
        gene_of_interest_mutation = gene_of_interest[gene_of_interest["AA_Mutation"] == mutation]["CellLineName_Cellosaurus"]
        gene_of_interest_celllines = set(gene_of_interest["CellLineName_Cellosaurus"])
    
    ## Get the cell lines with mutations
        cellines_with_mutation = celllines_mapped_mutations[celllines_mapped_mutations["ID"].isin(gene_of_interest_mutation)]["dsIdx"].tolist()
        cellines_without_mutation = celllines_mapped_mutations[~celllines_mapped_mutations["ID"].isin(gene_of_interest_celllines)]["dsIdx"].tolist()
        cellines_with_mutation = [str(x) for x in cellines_with_mutation]
        cellines_without_mutation = [str(x) for x in cellines_without_mutation]
    
        ## Get the metabolomics data
        mutated_celllines = metabolomics_data[cellines_with_mutation].transpose()
        nonmutated_celllines = metabolomics_data[cellines_without_mutation].transpose()
    
        # If there are no mutations found then ignore this gene
        if mutated_celllines.empty:
            print("No mutations found for " + gene)
            continue
    
        ## Generate an array with the cell line metabolomics plus a tag for mutated for gene of interest
        mutated_celllines["Mutation"] = 1
        nonmutated_celllines["Mutation"] = 0
    
        celllines_labeled = pd.concat([mutated_celllines, nonmutated_celllines])
        celllines_labeled.index = celllines_labeled.index.map(str)
    
        ## Calculate tissue labels
        tissue_labels = []
        tissue_labels.append("dsIdx")
        for item in tissue_labelled.columns:
            if item.startswith("Tissue"):
                tissue_labels.append(item)
            
        tissue_labelled_tissue = tissue_labelled[tissue_labels].set_index("dsIdx")
        tissue_labelled_tissue.index = tissue_labelled_tissue.index.rename("") 
        tissue_labelled_tissue.index = tissue_labelled_tissue.index.map(str)
    
    ## Add tissue labels to regression model
        celllines_labeled = celllines_labeled.join(tissue_labelled_tissue)
    
    ## Set up for the logistic regression -> X is all metabolomics, Y is the mutated vs non mutated
        X = (celllines_labeled.drop(["Mutation"] + tissue_labels[1:], axis =1))
        Y = celllines_labeled["Mutation"]
    ## Zscore the metabolites across the cell line axis
        X= X.apply(zscore, axis =0)
    
        df_array = []
        Xsubset = X.copy()
    
    ## Generate a logistic regression model for every metabolite
        for item in range(1,X.shape[1]+1):
            Xsubset2 = Xsubset[item]
            X2 = sm.add_constant(Xsubset2)
            X2 = X2.join(tissue_labelled_tissue)
            est = sm.OLS(Y, X2)
            est2 = est.fit(disp=0)
            results_as_html = est2.summary().tables[1].as_html()
            summary = pd.read_html(results_as_html, header=0, index_col=0)[0]
            df_array.append(summary)
        
    ## Combine all of the resultant model data    
        array_concat = pd.concat(df_array)
        combined_array = array_concat[~array_concat.index.duplicated(keep='first')]
#    combined_array = combined_array.iloc[combined_array["t"].abs().argsort()]
    
    ## Get only the T values, and then append them to the master array for concatenation later
        tvalue_only = pd.DataFrame(combined_array["t"])
        t_name = "T_" + gene + "_" + mutation
        tvalue_only.columns = ([t_name])
        tvalue_only = tvalue_only.drop(tissue_labels[1:])
    
        gene_array.append(tvalue_only)
    
allgenes_scores = pd.concat(gene_array, axis =1)
allgenes_scores = allgenes_scores.drop("const", axis =0)


Calculating for KRAS 1/1
nan
No mutations found for KRAS
p.K185fs
No mutations found for KRAS
p.C180*
No mutations found for KRAS
p.I171M
No mutations found for KRAS
p.R164*
No mutations found for KRAS
p.V160A
No mutations found for KRAS
p.A146V
No mutations found for KRAS
p.A146T
No mutations found for KRAS
p.P140H
No mutations found for KRAS
p.G138V
No mutations found for KRAS
p.P121H
p.D119N
No mutations found for KRAS
p.K117N
No mutations found for KRAS
p.P110H
No mutations found for KRAS
p.R97I
No mutations found for KRAS
p.T74P
No mutations found for KRAS
p.Q61H
p.Q61L
No mutations found for KRAS
p.Q61R
No mutations found for KRAS
p.Q61K
p.A59G
No mutations found for KRAS
p.A59T
No mutations found for KRAS
p.L23R
No mutations found for KRAS
p.L19F
p.A18D
No mutations found for KRAS
p.V14L
No mutations found for KRAS
p.V14I
No mutations found for KRAS
p.G13D
p.G13C
p.G12V
p.G12F
p.G12A
p.G12D
p.G12C
p.G12R
No mutations found for KRAS
p.G12S
p.10_11insG
No mutations found for KRAS


In [71]:
allgenes_scores.to_csv("KRAS_mutations_scores.csv")

In [81]:
metabolite_mapping = pd.read_csv("../../../AZ_data/Metabolite_reference_table.csv")

In [82]:
metabolite_mapping

Unnamed: 0,ionIdx,ionMz,average intensity,mz difference,id,score,formula,ion,name
0,1,57.034024,7869.672329,0.000560,HMDB01659,100,C3H6O,-H(+),Acetone
1,1,57.034024,7869.672329,0.000560,HMDB03366,100,C3H6O,-H(+),Propanal
2,2,58.029217,2950.367707,0.000617,HMDB01122,100,C2H5NO,-H(+),N-Methylformamide
3,2,58.029217,2950.367707,0.000617,HMDB03656,100,C2H5NO,-H(+),Acetaldehyde oxime
4,3,59.013330,54409.188430,0.000519,HMDB00042,100,C2H4O2,-H(+),Acetic acid
...,...,...,...,...,...,...,...,...,...
3469,1098,993.451012,1604.664817,0.000199,HMDB10073,100,C43H81O19P3,-H(+),PIP2(18:1(11Z)/16:1(9Z))
3470,1098,993.451012,1604.664817,0.000199,HMDB10086,100,C43H81O19P3,-H(+),PIP2(18:1(9Z)/16:1(9Z))
3471,1098,993.451012,1604.664817,0.000199,HMDB10098,100,C43H81O19P3,-H(+),"PIP2(18:2(9Z,12Z)/16:0)"
3472,1099,997.479165,1700.739140,0.003345,HMDB10035,45,C43H85O19P3,-H(+),PIP2(16:0/18:0)


In [97]:
mutation = ["p.G12D"]

allgenes_scores_high = allgenes_scores[allgenes_scores["T_KRAS_" + mutation[0]] >= 5]
allgenes_scores_low = allgenes_scores[allgenes_scores["T_KRAS_" + mutation[0]] <= -5]

highlist = allgenes_scores_high.index.tolist()
lowlist = allgenes_scores_low.index.tolist()

siglist = highlist+lowlist
siglist = [int(i) for i in siglist]
for item in metabolite_mapping[metabolite_mapping["ionIdx"].isin(siglist)]["id"].tolist():
    print(item)

HMDB00191
HMDB06483
HMDB11753
HMDB00301
HMDB02730
HMDB00098
HMDB00283
HMDB00366
HMDB00621
HMDB00646
HMDB00751
HMDB01644
HMDB03371
HMDB12194
HMDB12325
HMDB00539
HMDB00867
HMDB00126
HMDB02520
HMDB02916
HMDB06200
HMDB00127
HMDB02545
HMDB02704
HMDB03363
HMDB03402
HMDB06334
HMDB11731
HMDB11732
HMDB00086
HMDB03869
HMDB04207
HMDB13134
HMDB04808
HMDB06101
HMDB00230
HMDB00773
HMDB06824
HMDB11610
HMDB06806
HMDB01533
HMDB06486
HMDB01178
HMDB12276
HMDB01176
HMDB07032
HMDB07033
HMDB07060
HMDB07114
HMDB07141
HMDB07142
HMDB07193
HMDB07222
HMDB07249
HMDB07250
HMDB07277
HMDB07306
HMDB07333
HMDB07334
HMDB07505
HMDB07534
HMDB07562
HMDB07676
HMDB07704
HMDB07733
HMDB01081
HMDB06581
HMDB06584
HMDB06537
HMDB06566
HMDB06567
HMDB06699
HMDB01379
HMDB07887
HMDB07919
HMDB07978
HMDB08010
HMDB08037
HMDB08038
HMDB08069
HMDB08102
HMDB08266
HMDB08298
HMDB08526
HMDB08558
HMDB08915
HMDB09747
HMDB08392
HMDB08393
HMDB08424
HMDB08425
HMDB08456
HMDB08489
HMDB08652
HMDB11226
HMDB11253
HMDB11254
HMDB11284
HMDB11285
HMDB11317


In [103]:
mutation = ["p.G13D"]

allgenes_scores_high = allgenes_scores[allgenes_scores["T_KRAS_" + mutation[0]] >= 5]
allgenes_scores_low = allgenes_scores[allgenes_scores["T_KRAS_" + mutation[0]] <= -5]

highlist = allgenes_scores_high.index.tolist()
lowlist = allgenes_scores_low.index.tolist()

siglist = highlist+lowlist
siglist = [int(i) for i in siglist]
for item in metabolite_mapping[metabolite_mapping["ionIdx"].isin(siglist)]["id"].tolist():
    print(item)

HMDB00056
HMDB00161
HMDB00271
HMDB01310
HMDB00562
HMDB01870
HMDB11718
HMDB00408
HMDB00491
HMDB00695
HMDB01864
HMDB06024
HMDB10717
HMDB12882
HMDB01232
HMDB02658
HMDB13188
HMDB00730
HMDB00808
HMDB01263
HMDB03681
HMDB12131
HMDB12151
HMDB03464
HMDB01218
HMDB02359
HMDB00292
HMDB00786
HMDB01182
HMDB03352
HMDB11623
HMDB00798
HMDB00847
HMDB02038
HMDB06009
HMDB01431
HMDB12289
HMDB00511
HMDB00078
HMDB01024
HMDB00590
HMDB01138
HMDB02062
HMDB00124
HMDB00213
HMDB00645
HMDB00994
HMDB01076
HMDB01078
HMDB01313
HMDB01401
HMDB01586
HMDB02985
HMDB03498
HMDB03971
HMDB06328
HMDB06330
HMDB06797
HMDB06800
HMDB06814
HMDB06873
HMDB02248
HMDB04985
HMDB04987
HMDB02384
HMDB00299
HMDB00788
HMDB05765
HMDB01016
HMDB00125
HMDB00417
HMDB00553
HMDB00801
HMDB02212
HMDB02088
HMDB13078
HMDB13645
HMDB00489
HMDB00944
HMDB11143
HMDB01939
HMDB10213
HMDB13620
HMDB13621
HMDB07003
HMDB06806
HMDB00569
HMDB07004
HMDB07850
HMDB07854
HMDB00585
HMDB10382
HMDB11481
HMDB11511
HMDB12108
HMDB10384
HMDB11128
HMDB11493
HMDB11523
HMDB11491
