In [None]:
import os
import pickle
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
import re 

import xgboost as xgb
from sklearn import ensemble
from sklearn import dummy
from sklearn import linear_model
from sklearn import svm
from sklearn import neural_network
from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.utils.fixes import loguniform
import scipy
import argparse

from misc import save_model, load_model, regression_results, grid_search_cv, supervised_learning_steps, calculate_regression_metrics
# Default font family for every figure created in this notebook.
# NOTE(review): a later plt.style.use(...) call may override this setting - confirm.
plt.rcParams.update({"font.family": "Arial"})

In [None]:
# Candidate input files. The three lists are indexed in parallel by
# `input_option`: index 0 -> LS drug features, index 1 -> MFP drug features.
# NOTE(review): the trailing ".." entries have no matching entry in
# data_type_options, so they look like unused placeholders - confirm.
train_options = [
    "../Data/dose_response_with_full_data_inflammasome_with_ls_train.pkl",
    "../Data/dose_response_with_full_data_inflammasome_with_mfp_train.pkl",
    "..",
]
test_options = [
    "../Data/dose_response_with_full_data_inflammasome_with_ls_test.pkl",
    "../Data/dose_response_with_full_data_inflammasome_with_mfp_test.pkl",
    "..",
]
# Short tag used in model / result file names for each option.
data_type_options = [
    "LS_LS",
    "MFP_LS",
]

In [None]:
#Choose the options
input_option = 0                                                  #Choose 0 for LS for Drug and LS for Cell Line , 1 for MFP for Drug and LS for Cell Line 
classification_task = False
data_type = data_type_options[input_option]

#Get the data for your choice: LS or MFP
big_train_df = pd.read_pickle(train_options[input_option],compression="zip")
big_test_df = pd.read_pickle(test_options[input_option],compression="zip")
#Report success only after both pickles were actually read
print("Loaded training file")

#Split each frame into identifying metadata, the feature matrix and the regression target.
#Features are every column from position 16 up to the last column of the training frame.
#NOTE(review): the test frame is sliced with the training column count - assumes both
#frames share the same column layout; confirm.
total_length = len(big_train_df.columns)
metadata_X_train = big_train_df.loc[:,['ARXSPAN_ID','DRUG_NAME']]
X_train = big_train_df.iloc[:, range(16,total_length)]
Y_train = big_train_df["y_ic50"].to_numpy().flatten()
metadata_X_test = big_test_df.loc[:,['ARXSPAN_ID','DRUG_NAME']]
X_test = big_test_df.iloc[:,range(16,total_length)]
Y_test = big_test_df["y_ic50"].to_numpy().flatten()

In [None]:
#Base MLP regressor; the searched hyper-parameters below override the matching defaults.
#NOTE(review): activation='identity' makes the stacked hidden layers compose to a linear map - confirm this is intended.
#NOTE(review): per the scikit-learn docs validation_fraction is only used when early_stopping=True - confirm.
model = neural_network.MLPRegressor(
    activation='identity',
    solver='adam',
    alpha=1e-5,
    batch_size=2056,
    max_iter=1000,
    tol=1e-4,
    n_iter_no_change=50,
    beta_2=0.999,
    epsilon=1e-08,
    early_stopping=False,
    validation_fraction=0.2,
    shuffle=False,
    warm_start=False,
    verbose=False,
    random_state=42,
)

#Search space: lists are sampled uniformly, loguniform entries are sampled as distributions
params_nn = dict(
    hidden_layer_sizes=[(256,64), (512, 128, 32), (256, 128, 64)],
    alpha=loguniform(1e-8,1e-2),
    learning_rate_init=loguniform(1e-4,1e-3),
    beta_1=[0.7,0.8,0.9],
)

#Standardise the training features; the fitted scaler is saved below for reuse on the test set
scaler = preprocessing.StandardScaler()
X_train_copy = scaler.fit_transform(X_train)

#n_iter (100) candidate settings with n_splits=5 are handed to the search helper
n_iter = 100
nn_gs = supervised_learning_steps(
    "nn", "r2", data_type, classification_task, model, params_nn,
    X_train_copy, Y_train, n_iter=n_iter, n_splits=5,
)

#Show the cross-validation results and persist the fitted scaler
print(nn_gs.cv_results_)
save_model(scaler, "%s_models/%s_%s_scaling_gs.pk" % ("nn","nn",data_type))

In [None]:
#Test the neural network model on separate test set
nn_gs = load_model("nn_models/nn_"+data_type+"_regressor_gs.pk")
scaler = load_model("nn_models/nn_"+data_type+"_scaling_gs.pk")
#Print the best mean cross-validated score explicitly: a bare expression in the
#middle of a cell is evaluated and then silently discarded
print("Best mean CV score:", np.max(nn_gs.cv_results_["mean_test_score"]))
nn_best = nn_gs.best_estimator_
#Apply the scaler that was fitted on the training features before predicting
y_pred_nn=nn_best.predict(scaler.transform(X_test))
test_metrics = calculate_regression_metrics(Y_test,y_pred_nn)
print(test_metrics)

#Write the prediction of NN model
metadata_X_test['predictions']=y_pred_nn
metadata_X_test['labels']=Y_test
#Make sure the output directory exists so the export does not fail on a fresh checkout
os.makedirs("../results", exist_ok=True)
metadata_X_test.to_csv("../results/NN_"+data_type+"_supervised_test_predictions.csv",index=False)
print("Finished writing predictions")

#Scatter of test labels vs. predictions with a fitted regression line
fig = plt.figure()
plt.style.use('classic')
fig.set_size_inches(2.5,2.5)
fig.set_dpi(300)
fig.set_facecolor("white")

ax = sn.regplot(x="labels", y="predictions", data=metadata_X_test, scatter_kws={"color": "lightblue",'alpha':0.5}, 
                line_kws={"color": "red"})
#Derive the title from the selected data type ("LS_LS" -> "LS + LS", "MFP_LS" -> "MFP + LS")
#so the figure is labelled correctly for either input option
ax.axes.set_title("NN Predictions (" + data_type.replace("_", " + ") + ")",fontsize=10)
ax.set_xlim(-5, 5)
ax.set_ylim(-4, 4)
#Axis labels are intentionally left blank here
ax.set_xlabel("",fontsize=10)
ax.set_ylabel("",fontsize=10)
ax.tick_params(labelsize=10, color="black")
#NOTE(review): assumes calculate_regression_metrics returns MAE at index 0 and Pearson r at index 3 - confirm
plt.text(-4, 3, 'Pearson r =' +str(test_metrics[3]), fontsize = 10)
plt.text(-4, 2, 'MAE ='+str(test_metrics[0]),fontsize=10)
outfilename = "../results/NN_"+data_type+"_supervised_test_prediction.pdf"
plt.savefig(outfilename, bbox_inches="tight")

In [None]:
#Training loss of the best estimator, one value per iteration.
#Use a dedicated figure so the curve is never drawn onto a previously active figure.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(nn_gs.best_estimator_.loss_curve_)
loss_ax.set_xlabel("Iteration")
loss_ax.set_ylabel("Training loss")
loss_ax.set_title("NN training loss curve");