In [1]:
from IPython.core.pylabtools import figsize
import math
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn import linear_model, metrics
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LogisticRegression, Perceptron
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, classification_report, mean_squared_error, r2_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVR
import time
import warnings

In [2]:
def token_simple(data):
  model_checkpoint = "facebook/esm2_t12_35M_UR50D"
  from transformers import AutoTokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  y=data.iloc[:, -1]
  print("labels:", y.shape)
# before alignment
  fingerprint = []
  sequence = []
  for i in range(len(data)):
    compound, protein, interaction = data.iloc[i, :]

    mol = Chem.AddHs(Chem.MolFromSmiles(compound))
    fingerprint.append(np.array([int(i) for i in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024, useChirality=True).ToBitString()]))
    sequence.append( np.array(tokenizer(protein,padding='max_length')['input_ids']))
# alignments
  sequence =pd.DataFrame(sequence)
  sequence= sequence.fillna(0)
# after alignments, combine different parts of data
  fingerprint=pd.DataFrame(fingerprint)
  X = pd.concat([fingerprint, sequence], axis=1)
  print("dataset:", X.shape)
  print("Number of positive/negative labels:", y.value_counts())
  return X, y

def token_poly(data):
  y=data.iloc[:, -1]
  print("labels:", y.shape)
  fingerprint = []
  for i in range(len(data)):
    compound, protein, interaction = data.iloc[i, :]
    mol = Chem.AddHs(Chem.MolFromSmiles(compound))
    fingerprint.append(np.array([int(i) for i in AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=64, useChirality=True).ToBitString()]))
# after alignments
  X=pd.DataFrame(fingerprint)
  print("dataset:", X.shape)
  return X, y

### Future Work

In [3]:
# apply different protein tokenization for different models
ESM_models = ['facebook/esm2_t48_15B_UR50D', 'facebook/esm2_t36_3B_UR50D', 'facebook/esm2_t33_650M_UR50D', 'facebook/esm2_t30_150M_UR50D', 'facebook/esm2_t12_35M_UR50D', 'facebook/esm2_t6_8M_UR50D']

# apply different fingerprint algorithms for the compounds 
fpgen1 = AllChem.GetRDKitFPGenerator()
fpgen2 = AllChem.GetAtomPairGenerator()
fpgen3 = AllChem.GetMorganGenerator(radius=2)

# fairness: with different ratios of positive and negative samples for CPIs prediction.
#dataset_0 = dataset.['interaction' = 0][0:1000]
#dataset_1 = dataset.['interaction' = 1][0:1000]

## Supervised Models Comparison

In [4]:
# change the directory to that of the CSV files.

current = os.getcwd()
os.chdir('../data/csv/')

print(os.getcwd())

C:\Users\rabin badree\OneDrive\Documents\badreeRoziena\machineLearning\project\github\data\csv


### Regression models

In [5]:
# data processing
data = pd.read_csv('test_Kd.csv', header=None)
data = data.sample(frac=1)
X, y = token_simple(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Ridge
reg = linear_model.Ridge(alpha=.1)
reg = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
test_rmse_ridge=math.sqrt(mean_squared_error(y_test, y_pred))
test_r2_ridge = r2_score(y_test, y_pred)

# Lasso
clf = linear_model.Lasso(alpha=.1)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_rmse_lasso=math.sqrt(mean_squared_error(y_test, y_pred))
test_r2_lasso = r2_score(y_test, y_pred)

# SVM
svm = SVR(kernel='rbf',C=1000, gamma=0.1)
svm.fit(X_train, y_train)
y_test_predicted_svm = svm.predict(X_test)
test_rmse_svm = math.sqrt(mean_squared_error(y_test, y_test_predicted_svm))
test_r2_svm = r2_score(y_test, y_test_predicted_svm)

# Random Forest
rnd_forest_reg = RandomForestRegressor(n_estimators=500,  criterion='squared_error',verbose=1, max_depth=20, n_jobs=-1)
rnd_forest_reg.fit(X_train, y_train)
y_pred = rnd_forest_reg.predict(X_test)
test_rmse_rf=math.sqrt(mean_squared_error(y_test, y_pred))
test_r2_rf = r2_score(y_test, y_pred)

# MLP regressor
MLP_reg = MLPRegressor(random_state=1, max_iter=500)
MLP_reg.fit(X_train, y_train)
y_pred = MLP_reg.predict(X_test)
test_rmse_MLP=math.sqrt(mean_squared_error(y_test, y_pred))
test_r2_MLP = r2_score(y_test, y_pred)

data = [
        ["Ridge Linear Regression", test_rmse_ridge, test_r2_ridge], 
        ["Lasso Linear Regression", test_rmse_lasso, test_r2_lasso],
        ["Support Vector Machine (Gaussian RBF)", test_rmse_svm, test_r2_svm],
        ["Random Forest", test_rmse_rf, test_r2_rf],
        ["MLP",test_rmse_MLP, test_r2_MLP]
       ]


pd.DataFrame(data, columns=["Model", "RMSE", "R2 Score"])

labels: (3811,)
dataset: (3811, 2510)
Number of positive/negative labels: 5.920819    31
6.000000    31
5.698970    29
5.958607    28
5.886057    27
            ..
5.075905     1
6.990911     1
8.489455     1
9.494850     1
6.417937     1
Name: 2, Length: 1477, dtype: int64


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.0s finished


Unnamed: 0,Model,RMSE,R2 Score
0,Ridge Linear Regression,2.285426,-1.132574
1,Lasso Linear Regression,1.367706,0.236242
2,Support Vector Machine (Gaussian RBF),1.421241,0.175282
3,Random Forest,1.184066,0.427571
4,MLP,1.259279,0.352538


In [6]:
# %%time
warnings.filterwarnings('ignore')

# The param_grid tells Scikit-Learn to evaluate all combinations of the hyperparameter values
param_grid = {'alpha': [0.1, 0.01, 0.001], 'learning_rate': ["constant", "optimal", "invscaling"], 
              'l1_ratio': [1, 0.2, 0], 'max_iter':[100, 400, 1000],'eta0': [0.01, 0.001, 0.0001], 'penalty':['l2', 'l1', 'elasticnet']}

sgd = SGDRegressor()
sgd_cv = GridSearchCV(sgd, param_grid, cv=2, n_jobs=-1)
sgd_cv.fit(X_train[:1000], y_train[:1000])

params_optimal_sgd = sgd_cv.best_params_

print("Best Score in SGD: %f" % sgd_cv.best_score_)
print("Optimal Hyperparameter Values in SGD: ", params_optimal_sgd)
print("\n")

# test the model
lin_reg_sgd = SGDRegressor(**params_optimal_sgd)
lin_reg_sgd.fit(X_train, y_train)
y_test_predicted = lin_reg_sgd.predict(X_test)

print("Root Mean squared error in SGD: %.2f" %math.sqrt(mean_squared_error(y_test, y_test_predicted)))
print("Coefficient of determination r^2 variance score [1 is perfect prediction] in SGD: %.2f"  % r2_score(y_test, y_test_predicted))

Best Score in SGD: -653568935819211520.000000
Optimal Hyperparameter Values in SGD:  {'alpha': 0.1, 'eta0': 0.0001, 'l1_ratio': 0.2, 'learning_rate': 'invscaling', 'max_iter': 400, 'penalty': 'l2'}


Root Mean squared error in SGD: 421269225.50
Coefficient of determination r^2 variance score [1 is perfect prediction] in SGD: -72458490740040784.00


### Polynomial degree: used to find out the coefficiency between binding affinity and drugs, which means a particular characteristic among drugs may play a significant role with regards to binding affinity no matter which protein is involved.

In [8]:
# data processing
data = pd.read_csv('test_Kd.csv', header=None)
X, y = token_poly(data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# SGD Regression
lin_reg_sgd = SGDRegressor(max_iter=1000, eta0=0.01, penalty="elasticnet", l1_ratio=0.0, alpha=0.01)
lin_reg_sgd.fit(X_train.iloc[:,0:1024], y_train)
y_train_predicted = lin_reg_sgd.predict(X_train.iloc[:,:1024])
print("Root Mean squared error for the coefficiency between binding affinity and drugs in SGD: %.2f" %math.sqrt(mean_squared_error(y_train, y_train_predicted)))

# Add polynomial terms with the feature vector using the sklearn PolynomialFeatures class
poly_features = PolynomialFeatures(2)
X_train_poly = poly_features.fit_transform(X_train.iloc[:,:1024])
print("No. of Original Features: ", X_train.iloc[:,:1024].shape[1])
print("No. of Augmented Features: ", X_train_poly.shape[1])
lin_reg_sgd.fit(X_train_poly, y_train)
y_train_predicted = lin_reg_sgd.predict(X_train_poly)
print("Root Mean squared error for the coefficiency between binding affinity and drugs in polynomial solution: %.2f" %math.sqrt(mean_squared_error(y_train, y_train_predicted)))

labels: (3811,)
dataset: (3811, 64)
Root Mean squared error for the coefficiency between binding affinity and drugs in SGD: 1.45
No. of Original Features:  64
No. of Augmented Features:  2145
Root Mean squared error for the coefficiency between binding affinity and drugs in polynomial solution: 50.23
