In [1]:
import pandas as pd
import numpy as np

import sklearn.linear_model
import statsmodels.api as sm
from scipy import stats
from scipy.stats import zscore

from collections import Counter

In [2]:
# Read the census table stored under ../../COSMIC and keep only its
# "Gene Symbol" column, as a plain Python list.
census_table = pd.read_csv("../../COSMIC/Census_allTue Nov 10 13_38_57 2020.csv")
cancer_genes = census_table["Gene Symbol"].tolist()

In [3]:
# Cell-line mapping table (default comma delimiter).
celllines_mapped = pd.read_csv(
    "../MCF7_removal/Celllines_mapping_manual_noMCF7.csv"
)

# Mutation table: despite the .csv extension it is parsed as tab-delimited.
celllines_mutations = pd.read_csv(
    "../../GEMICCL/Mutation.csv",
    sep="\t",
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Keep a single row per dsIdx, then discard every row that has no ID.
celllines_mapped = (
    celllines_mapped
    .drop_duplicates(subset=["dsIdx"])
    .dropna(subset=["ID"])
)

# Raw "DI" entries of the surviving rows, in row order.
sample_data_tissue = celllines_mapped["DI"].tolist()

In [5]:
# Load the Cancer -> Origin lookup table.
# `read_excel` has no `columns` parameter: old pandas silently ignored it and
# pandas >= 1.0 raises TypeError. `usecols` is the supported way to restrict
# the sheet to the two columns used below.
tissue_origin = pd.read_excel(
    "../../Cellosaurus_data/Cellline_tissue_origin.xlsx",
    usecols=["Cancer", "Origin"],
)
# Plain dict keyed by the "Cancer" value; if a key repeats, the last row wins.
tissue_origin_dict = tissue_origin.set_index("Cancer")["Origin"].to_dict()

In [6]:
def _label_from_annotation(annotation):
    """Return the label held in one "DI" entry, or "Unknown" when it is missing.

    Missing values and the literal string "[]" map to "Unknown". Otherwise the
    label is the third ';'-separated field, cut at the first apostrophe and
    stripped of surrounding whitespace.
    """
    if pd.isnull(annotation) or annotation == "[]":
        return "Unknown"
    third_field = annotation.split(";")[2]
    return third_field.split("'")[0].strip()


tissue_type = [_label_from_annotation(entry) for entry in sample_data_tissue]

# Translate each label through the lookup dict; labels without an entry
# (including "Unknown", unless the dict defines it) become None.
cell_origin = [tissue_origin_dict.get(label) for label in tissue_type]
celllines_mapped["Tissue"] = cell_origin

In [7]:
# Distinct cell-line IDs, in order of first appearance.
celllines_list = celllines_mapped["ID"].drop_duplicates().tolist()

In [8]:
# IDs that appear in the mapping table but nowhere in the mutation table.
mutation_table_ids = set(celllines_mutations['CellLineName_Cellosaurus'])
celllines_notinmutationset = set(celllines_list).difference(mutation_table_ids)

# Keep only the mapping rows whose ID is not in that set.
lacks_mutation_data = celllines_mapped["ID"].isin(celllines_notinmutationset)
celllines_mapped_mutations = celllines_mapped.loc[~lacks_mutation_data]

In [9]:
# One-hot encode "Tissue" into Tissue_<value> indicator columns.
# The dtype is pinned to uint8 (the default before pandas 2.0). From pandas 2.0
# on the default is bool, and bool indicators joined with float regressors turn
# the design matrix into object dtype, which statsmodels OLS refuses.
tissue_labelled = pd.get_dummies(
    celllines_mapped_mutations,
    columns=["Tissue"],
    dtype="uint8",
)

In [10]:
# Metabolomics table with "ionIdx" as the row index; the remaining columns are
# later selected by str(dsIdx).
metabolomics_data = (
    pd.read_csv("../MCF7_removal/Metabolomics_data_noMCF7.csv")
    .set_index("ionIdx")
)

In [11]:
def _tissue_covariates(dummies):
    """Return the one-hot ``Tissue*`` columns of ``dummies`` indexed by ``str(dsIdx)``."""
    tissue_cols = [col for col in dummies.columns if col.startswith("Tissue")]
    covariates = dummies[["dsIdx"] + tissue_cols].set_index("dsIdx")
    covariates.index = covariates.index.rename("").map(str)
    return covariates


def _gene_tstats(gene, mutations, cellline_ids, metabolites_z, tissue_covariates):
    """Tissue-controlled OLS t-statistics of every metabolite for one gene.

    For each metabolite column an ordinary least squares model (not a logistic
    one) is fitted:

        Mutation (0/1) ~ const + z-scored metabolite + tissue indicators

    Parameters
    ----------
    gene : str
        Value matched against the "HGNC" column of ``mutations``.
    mutations : DataFrame
        Must provide "HGNC", "MutationType" and "CellLineName_Cellosaurus".
    cellline_ids : Series
        Cell-line ID of every row of ``metabolites_z``, in the same row order.
    metabolites_z : DataFrame
        Cell lines (rows, indexed by str(dsIdx)) x z-scored metabolites.
    tissue_covariates : DataFrame
        Tissue indicator columns indexed by str(dsIdx).

    Returns
    -------
    DataFrame or None
        A single column ``T_<gene>`` whose rows are "const" (intercept t-value
        of the first model) followed by one row per metabolite, labelled with
        ``str(column label)``. None when no cell line carries a non-silent
        mutation in ``gene``.
    """
    ## Figure out which cell lines have non-silent mutations in the gene
    hits = mutations[(mutations["HGNC"] == gene) & (mutations["MutationType"] != "Silent")]
    mutated_ids = set(hits["CellLineName_Cellosaurus"])
    is_mutated = cellline_ids.isin(mutated_ids).values
    if not is_mutated.any():
        return None

    ## Response: 1 for mutated cell lines, 0 otherwise, aligned with the regressors
    Y = pd.Series(is_mutated.astype(int), index=metabolites_z.index, name="Mutation")

    tstats = {}
    ## One OLS model per metabolite. Iterate over the real column labels
    ## instead of assuming they are exactly 1..N.
    for metabolite in metabolites_z.columns:
        design = sm.add_constant(metabolites_z[metabolite]).join(tissue_covariates)
        fit = sm.OLS(Y, design).fit()
        # Read the t-values straight from the fit. Going through the HTML
        # summary was slow and rounded every value to three decimals.
        named_t = dict(zip(map(str, fit.model.exog_names), list(fit.tvalues)))
        if "const" in named_t:
            # Keep the intercept of the first model only; the row is retained
            # so the later drop("const") calls keep working.
            tstats.setdefault("const", named_t["const"])
        tstats.setdefault(str(metabolite), named_t[str(metabolite)])

    return pd.DataFrame({"T_" + gene: pd.Series(tstats)})


## Gene-independent inputs. Every gene uses the same set of cell lines (mutated
## plus non-mutated is always the full table), so the tissue covariates and the
## z-scored metabolite matrix are built once instead of once per gene.
tissue_labelled_tissue = _tissue_covariates(tissue_labelled)

sample_ids = [str(x) for x in celllines_mapped_mutations["dsIdx"].tolist()]
X = metabolomics_data[sample_ids].transpose()
X.index = X.index.map(str)
## Zscore the metabolites across the cell line axis
X = X.apply(zscore, axis=0)

gene_array = []

#iterate through genes
for count, gene in enumerate(cancer_genes, start=1):
    print("Calculating for " + gene + " %s/%s" % (str(count), str(len(cancer_genes))))
    tvalue_only = _gene_tstats(
        gene,
        celllines_mutations,
        celllines_mapped_mutations["ID"],
        X,
        tissue_labelled_tissue,
    )
    # If there are no mutations found then ignore this gene
    if tvalue_only is None:
        print("No mutations found for " + gene)
        continue
    gene_array.append(tvalue_only)

## One column per gene, one row per metabolite; the intercept row is not a metabolite
allgenes_scores = pd.concat(gene_array, axis=1)
allgenes_scores = allgenes_scores.drop("const", axis=0)
allgenes_scores.to_csv("OLS_tissue_controlled_regressionTstats_noMCF7.csv")

Calculating for A1CF 1/723
Calculating for ABI1 2/723
Calculating for ABL1 3/723
Calculating for ABL2 4/723
Calculating for ACKR3 5/723
No mutations found for ACKR3
Calculating for ACSL3 6/723
Calculating for ACSL6 7/723
Calculating for ACVR1 8/723
Calculating for ACVR2A 9/723
Calculating for AFDN 10/723
No mutations found for AFDN
Calculating for AFF1 11/723
Calculating for AFF3 12/723
Calculating for AFF4 13/723
Calculating for AKAP9 14/723
Calculating for AKT1 15/723
Calculating for AKT2 16/723
No mutations found for AKT2
Calculating for AKT3 17/723
Calculating for ALDH2 18/723
Calculating for ALK 19/723
Calculating for AMER1 20/723
Calculating for ANK1 21/723
Calculating for APC 22/723
Calculating for APOBEC3B 23/723
Calculating for AR 24/723
Calculating for ARAF 25/723
Calculating for ARHGAP26 26/723
Calculating for ARHGAP5 27/723
Calculating for ARHGEF10 28/723
Calculating for ARHGEF10L 29/723
Calculating for ARHGEF12 30/723
Calculating for ARID1A 31/723
Calculating for ARID1B 32

Calculating for FOXO1 264/723
Calculating for FOXO3 265/723
Calculating for FOXO4 266/723
Calculating for FOXP1 267/723
Calculating for FOXR1 268/723
Calculating for FSTL3 269/723
No mutations found for FSTL3
Calculating for FUBP1 270/723
No mutations found for FUBP1
Calculating for FUS 271/723
Calculating for GAS7 272/723
Calculating for GATA1 273/723
Calculating for GATA2 274/723
Calculating for GATA3 275/723
Calculating for GLI1 276/723
Calculating for GMPS 277/723
Calculating for GNA11 278/723
Calculating for GNAQ 279/723
Calculating for GNAS 280/723
Calculating for GOLGA5 281/723
Calculating for GOPC 282/723
Calculating for GPC3 283/723
Calculating for GPC5 284/723
Calculating for GPHN 285/723
Calculating for GRIN2A 286/723
Calculating for GRM3 287/723
Calculating for H3F3A 288/723
Calculating for H3F3B 289/723
Calculating for HERPUD1 290/723
Calculating for HEY1 291/723
Calculating for HIF1A 292/723
Calculating for HIP1 293/723
Calculating for HIST1H3B 294/723
Calculating for HIS

Calculating for PPFIBP1 514/723
Calculating for PPM1D 515/723
Calculating for PPP2R1A 516/723
Calculating for PPP6C 517/723
Calculating for PRCC 518/723
Calculating for PRDM1 519/723
Calculating for PRDM16 520/723
Calculating for PRDM2 521/723
Calculating for PREX2 522/723
Calculating for PRF1 523/723
Calculating for PRKACA 524/723
Calculating for PRKAR1A 525/723
Calculating for PRKCB 526/723
Calculating for PRPF40B 527/723
Calculating for PRRX1 528/723
Calculating for PSIP1 529/723
Calculating for PTCH1 530/723
Calculating for PTEN 531/723
Calculating for PTK6 532/723
Calculating for PTPN11 533/723
Calculating for PTPN13 534/723
Calculating for PTPN6 535/723
Calculating for PTPRB 536/723
Calculating for PTPRC 537/723
Calculating for PTPRD 538/723
Calculating for PTPRK 539/723
Calculating for PTPRT 540/723
Calculating for PWWP2A 541/723
Calculating for QKI 542/723
Calculating for RABEP1 543/723
Calculating for RAC1 544/723
Calculating for RAD17 545/723
No mutations found for RAD17
Calc

In [12]:
## Run the same tissue-controlled OLS scan for two extra genes and append the
## results to the gene_array built by the census loop above.
## NOTE(review): re-running this cell appends the same columns again, and if a
## gene was already handled by the census loop its T_<gene> column will appear
## twice in the concatenated table. Confirm whether that is intended.
extra_genes = ["STK11", "KEAP1"]

## Gene-independent inputs: tissue indicator columns indexed by str(dsIdx)
tissue_labels = ["dsIdx"] + [col for col in tissue_labelled.columns if col.startswith("Tissue")]
tissue_labelled_tissue = tissue_labelled[tissue_labels].set_index("dsIdx")
tissue_labelled_tissue.index = tissue_labelled_tissue.index.rename("").map(str)

## Gene-independent inputs: cell lines x metabolites. Every gene uses the same
## set of cell lines, so the matrix is built and z-scored once.
sample_ids = [str(x) for x in celllines_mapped_mutations["dsIdx"].tolist()]
X = metabolomics_data[sample_ids].transpose()
X.index = X.index.map(str)
## Zscore the metabolites across the cell line axis
X = X.apply(zscore, axis=0)

#iterate through genes
for count, gene in enumerate(extra_genes, start=1):
    # The progress denominator is the list being iterated, not len(cancer_genes).
    print("Calculating for " + gene + " %s/%s" % (str(count), str(len(extra_genes))))

    ## Figure out which cell lines have non-silent mutations in the gene
    gene_of_interest = celllines_mutations[
        (celllines_mutations["HGNC"] == gene)
        & (celllines_mutations["MutationType"] != "Silent")
    ]
    gene_of_interest_celllines = set(gene_of_interest["CellLineName_Cellosaurus"])
    is_mutated = celllines_mapped_mutations["ID"].isin(gene_of_interest_celllines).values

    # If there are no mutations found then ignore this gene
    if not is_mutated.any():
        print("No mutations found for " + gene)
        continue

    ## Response for the OLS (not logistic) models: 1 = mutated, 0 = not mutated
    Y = pd.Series(is_mutated.astype(int), index=X.index, name="Mutation")

    tstats = {}
    ## One model per metabolite: Mutation ~ const + metabolite + tissue indicators.
    ## Iterate over the real column labels instead of assuming they are 1..N.
    for metabolite in X.columns:
        X2 = sm.add_constant(X[metabolite]).join(tissue_labelled_tissue)
        est2 = sm.OLS(Y, X2).fit()
        # Read the t-values straight from the fit. Going through the HTML
        # summary was slow and rounded every value to three decimals.
        named_t = dict(zip(map(str, est2.model.exog_names), list(est2.tvalues)))
        if "const" in named_t:
            # Intercept of the first model only; the row is kept so the later
            # drop("const") keeps working.
            tstats.setdefault("const", named_t["const"])
        tstats.setdefault(str(metabolite), named_t[str(metabolite)])

    ## Single column T_<gene>, rows "const" then one per metabolite (string labels)
    tvalue_only = pd.DataFrame({"T_" + gene: pd.Series(tstats)})
    gene_array.append(tvalue_only)

Calculating for STK11 1/723
Calculating for KEAP1 2/723


In [13]:
# Put the per-gene t-statistic columns side by side (aligned on row label),
# then remove the intercept row so only metabolite rows remain.
scores_with_intercept = pd.concat(gene_array, axis=1)
allgenes_scores = scores_with_intercept.drop(index="const")

In [14]:
# Display the combined table: one row per metabolite label, one T_<gene> column per gene.
allgenes_scores

Unnamed: 0,T_A1CF,T_ABI1,T_ABL1,T_ABL2,T_ACSL3,T_ACSL6,T_ACVR1,T_ACVR2A,T_AFF1,T_AFF3,...,T_ZMYM3,T_ZNF331,T_ZNF384,T_ZNF429,T_ZNF479,T_ZNF521,T_ZNRF3,T_ZRSR2,T_STK11,T_KEAP1
1,0.512,-1.411,0.820,0.965,1.062,0.819,1.636,0.406,2.722,-0.033,...,0.709,0.146,-0.380,-0.068,2.859,-1.630,1.854,0.363,-0.696,-0.119
2,-0.717,0.131,-0.478,-0.526,-1.938,-1.302,0.695,0.664,-2.435,1.229,...,-0.598,-1.078,0.378,-0.007,0.488,0.858,-0.352,-0.813,0.320,-0.878
3,4.089,-1.856,2.502,2.380,0.433,0.437,1.715,0.913,1.878,0.053,...,1.354,-1.655,0.865,0.899,1.003,3.796,3.286,2.629,-0.280,2.450
4,2.491,-1.549,1.566,0.412,1.760,-0.144,0.205,0.515,3.425,0.119,...,0.903,-0.927,0.194,0.948,1.234,0.376,1.619,1.253,-0.132,0.753
5,1.690,-1.875,-1.367,-0.445,0.498,-3.144,3.060,-1.079,2.092,-0.496,...,0.426,-0.249,-0.382,0.765,0.269,-1.725,0.790,-0.955,0.464,1.899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,-1.478,-0.983,2.628,-0.130,2.124,3.727,-1.970,-0.691,-2.864,-1.769,...,-1.567,1.667,0.374,4.238,1.901,-2.139,-0.182,-0.500,5.101,-1.182
1096,3.084,-3.491,-0.026,0.319,-1.155,2.897,-3.322,-3.216,-2.406,-2.742,...,-1.524,4.066,-0.121,-0.538,-1.456,-2.254,-0.281,0.011,-1.999,-1.700
1097,-1.422,-1.013,-1.836,8.034,-2.382,-3.233,2.700,0.218,1.093,-0.351,...,-2.770,7.673,-2.782,-0.686,1.912,-1.992,-1.927,-1.477,6.480,7.699
1098,0.493,-0.783,1.186,1.395,2.422,2.337,-3.228,-2.723,-1.300,-0.236,...,-2.038,2.777,1.432,7.701,-0.390,3.943,2.689,-1.609,1.319,0.862


In [15]:
# Write the combined table to disk. This replaces the file of the same name
# written at the end of the census loop, which predates the extra genes.
output_csv = "OLS_tissue_controlled_regressionTstats_noMCF7.csv"
allgenes_scores.to_csv(output_csv)