In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV
from sklearn.linear_model import MultiTaskLassoCV
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import AlphaSelection
from yellowbrick.regressor.alphas import alphas





In [2]:
md = pd.read_excel(io='CCLE metabolomics dataset.xlsx',sheet_name="All")
mt = md.drop(['Tissue', 'Medium','Culture'], axis=1)
hm = pd.read_csv('GCP_proteomics_remapped.csv')
merge_tb = mt.merge(hm,how='inner',left_on='CCL', right_on='Cell Line')
MET = merge_tb.iloc[:,1:226]
GCP = merge_tb.iloc[:,227:269]

GCP = np.nan_to_num(GCP, nan=0)
MET = np.nan_to_num(MET, nan=0)
Xtrain, Xtest, Ytrain, Ytest =train_test_split(GCP, MET, test_size=0.3, random_state=0)
# df_Xtrain = pd.DataFrame(Xtrain)
# df_Ytrain = pd.DataFrame(Ytrain)



In [3]:
## GCP_to_MET

# Tuning the alpha
alphas = [10**-6, 10**-5,10**-4,10**-3,10**-2,10**-1,1,2,3,4,5,6,7,8,9,10]
mdl = list()
error = list()
for i,a in enumerate(alphas):
    x=linear_model.Lasso(alpha=a).fit(Xtrain, Ytrain)
#     print(x)
    mdl.append(x)
    ypred = mdl[i].predict(Xtest)
#     print(ypred)
    error.append(mean_squared_error(ypred,Ytest))
print(error)
minmse_index = error.index(min(error))
best_alpha = alphas[minmse_index]
print(best_alpha)

[0.15690549625435365, 0.1568768009563384, 0.1565983665118773, 0.15455604403226686, 0.15037133539544123, 0.15243451117552742, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466, 0.15288557559058466]
0.01


In [4]:
GCP2MET_models = []
for i in range(Ytrain.shape[1]):
    mdl_G2M =linear_model.Lasso(alpha=best_alpha).fit(Xtrain, Ytrain[:, i])
    GCP2MET_models.append(mdl_G2M)
print(len(GCP2MET_models))


MET2GCP_models = []

for j in range(Xtrain.shape[1]):
    mdl_M2G = linear_model.Lasso(alpha=best_alpha).fit(Ytrain, Xtrain[:,j])
    MET2GCP_models.append(mdl_M2G)
print(len(MET2GCP_models))

225
42


In [5]:
def evaluate_models(models, Xtest, Ytest):
    """
    evaluate_models returns results from the model predictions, including the pearson
    correlation coefficient, p-Values, and MSE.

    :param models:         A list of scikit-learn model objects.
    :param Xtest:          A numpy array or pandas dataframe containing validation set input data.
    :param Ytest:          A numpy array or pandas dataframe containing validation set output data.
    :return pred_resul:    A dictionary containing the final MSE, pValue, or rValue.
    
    """
    
    predictions = []
    rValue = list()
    pValue = list()
    MSE = list()

    for i in range(len(models)):
        mdl = models[i]
        Ypred = mdl.predict(Xtest)
        predictions.append(Ypred)   
        r, pvalue = pearsonr(Ypred, Ytest[:, i])
        rValue.append(r)
        pValue.append(pvalue)
        mse = mean_squared_error(Ypred, Ytest[:, i])
        MSE.append(mse)
#     print(Ypred)
#     print(Xtest)
    df_MSE = pd.DataFrame(MSE)
    df_pValue = pd.DataFrame(pValue)
    df_rValue = pd.DataFrame(rValue)
    
    return df_MSE, df_pValue, df_rValue

mse1,p1,pearson1 = evaluate_models(GCP2MET_models, Xtest, Ytest)
mse1.columns = ["GCP2MET_models"]
print(mse1)

     GCP2MET_models
0          0.101491
1          0.104198
2          0.212115
3          0.117747
4          0.096323
..              ...
220        0.155878
221        0.169268
222        0.257659
223        0.326099
224        0.258227

[225 rows x 1 columns]


In [6]:
kf = KFold(n_splits=3)

KFold(n_splits=3, random_state=None, shuffle=True)
mse_result = pd.DataFrame()
pvalue_result = pd.DataFrame()
rvalue_result = pd.DataFrame()
for train_index, test_index in kf.split(Xtrain, Ytrain):
#     print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = Xtrain[train_index], Xtrain[test_index]
    y_train, y_test = Ytrain[train_index], Ytrain[test_index]
    v_GCP2MET_models = []
    final_metrics = []
    for i in range(y_train.shape[1]):
        mdl_G2M = linear_model.Lasso(alpha=best_alpha).fit(X_train, y_train[:, i])
        v_GCP2MET_models.append(mdl_G2M)
        df_MSE, df_pValue, df_rValue = evaluate_models(v_GCP2MET_models, X_test, y_test)
    mse_result = pd.concat([mse_result,df_MSE],axis = 1)
    pvalue_result = pd.concat([pvalue_result,df_pValue],axis = 1)
    rvalue_result = pd.concat([rvalue_result,df_rValue],axis = 1)
mse_result.columns = ["mse1","mse2","mse3"]
mse_result = mse_result.T
print(mse_result)   

           0         1         2         3         4         5         6    \
mse1  0.089575  0.094861  0.296596  0.100531  0.090906  0.129799  0.042273   
mse2  0.106462  0.099635  0.302412  0.120697  0.094360  0.142247  0.036579   
mse3  0.093020  0.096180  0.252817  0.121182  0.085328  0.131163  0.037862   

           7         8         9    ...       215       216       217  \
mse1  0.071508  0.218492  0.097096  ...  0.134649  0.193515  0.180864   
mse2  0.065053  0.267308  0.107793  ...  0.096789  0.184076  0.178244   
mse3  0.064426  0.180017  0.092881  ...  0.117294  0.195196  0.173290   

           218       219       220       221       222       223       224  
mse1  0.205080  0.156296  0.124726  0.163350  0.278675  0.328726  0.269641  
mse2  0.231610  0.180965  0.152861  0.181191  0.285273  0.363526  0.272564  
mse3  0.239074  0.188212  0.158302  0.190638  0.272416  0.310359  0.237598  

[3 rows x 225 columns]


In [7]:
minMSE = mse_result.idxmin()
# print(minMSE)

mse_min = pd.DataFrame()
MSE_min = list()
for index in mse_result:
    v = mse_result[index][minMSE[index]]
    MSE_min.append(v)
#     print(v)
mse_min = pd.DataFrame(MSE_min)
mse_min.columns = ["v_GCP2MET_models"]
# print(mse_min)
raw = mse1.join(mse_min)
print(raw)

     GCP2MET_models  v_GCP2MET_models
0          0.101491          0.089575
1          0.104198          0.094861
2          0.212115          0.252817
3          0.117747          0.100531
4          0.096323          0.085328
..              ...               ...
220        0.155878          0.124726
221        0.169268          0.163350
222        0.257659          0.272416
223        0.326099          0.310359
224        0.258227          0.237598

[225 rows x 2 columns]


In [8]:
new_model = list()
for index, row in raw.iterrows():
    if row['GCP2MET_models'] < row['v_GCP2MET_models']:
        
        new_model.append(GCP2MET_models[index])
    else:
        new_model.append(v_GCP2MET_models[index])
        
    
print(new_model)

[Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha

In [12]:
## MET TO GCP
mse2,p2,pearson2 = evaluate_models(MET2GCP_models, Ytest, Xtest)
mse2.columns = ["MET2GCP_models"]

kf = KFold(n_splits=3)
KFold(n_splits=3, random_state=None, shuffle=True)
mse2_result = pd.DataFrame()
# pvalue_result = pd.DataFrame()
# rvalue_result = pd.DataFrame()
for train_index, test_index in kf.split(Xtrain, Ytrain):
    X_train, X_test = Xtrain[train_index], Xtrain[test_index]
    y_train, y_test = Ytrain[train_index], Ytrain[test_index]
    v_MET2GCP_models = []
    metrics = []
    for i in range(X_train.shape[1]):
        mdl_M2G = linear_model.Lasso(alpha=best_alpha).fit(y_train, X_train[:, i])
        v_MET2GCP_models.append(mdl_M2G)
        df_MSE, df_pValue, df_rValue = evaluate_models(v_MET2GCP_models, y_test, X_test)
    mse2_result = pd.concat([mse2_result,df_MSE],axis = 1)
#     pvalue_result = pd.concat([pvalue_result,df_pValue],axis = 1)
#     rvalue_result = pd.concat([rvalue_result,df_rValue],axis = 1)
mse2_result.columns = ["mse1","mse2","mse3"]
mse2_result = mse2_result.T
# print(mse2_result) 

            0         1         2         3         4         5         6   \
mse1  0.175227  0.380132  0.588737  1.051244  0.255749  0.202275  0.181518   
mse2  0.084202  0.258606  0.237459  0.978266  0.177839  0.107370  0.138614   
mse3  0.091701  0.185900  0.319462  1.054341  0.194316  0.108792  0.159261   

            7         8         9   ...        32        33        34  \
mse1  0.315831  0.487928  0.406070  ...  0.837352  0.870477  0.683326   
mse2  0.226162  0.393376  0.260588  ...  1.080634  1.303954  0.837324   
mse3  0.236521  0.418744  0.288605  ...  0.933157  1.076697  0.992893   

            35        36        37        38        39        40        41  
mse1  0.912046  0.376208  0.041114  1.344137  0.088397  0.240640  0.542718  
mse2  0.920801  0.435253  0.042624  2.027280  0.134786  0.118106  0.557401  
mse3  1.177404  0.395815  0.044395  1.224501  0.101151  0.195334  0.645212  

[3 rows x 42 columns]


In [14]:
minMSE2 = mse2_result.idxmin()
# print(minMSE)

mse2_min = pd.DataFrame()
MSE2_min = list()
for index in mse2_result:
    v = mse2_result[index][minMSE2[index]]
    MSE2_min.append(v)
#     print(v)
mse2_min = pd.DataFrame(MSE2_min)
mse2_min.columns = ["v_GCP2MET_models"]
# print(mse2_min)
raw2 = mse2.join(mse2_min)
# print(raw2)

new_MET2GCP = list()
for index, row in raw.iterrows():
    if row['GCP2MET_models'] < row['v_GCP2MET_models']:
        
        new_MET2GCP.append(GCP2MET_models[index])
    else:
        new_MET2GCP.append(v_GCP2MET_models[index])
        
    
print(new_MET2GCP)

[Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha=0.01), Lasso(alpha