# **Bioinformatics Project - Computational Drug Discovery [Part 4] Regression Models with Random Forest**


## **1. Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

## **2. Load the data set**

In [None]:
! wget https://github.com/powhenagbo/UELPROJECT/raw/main/Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv

--2024-04-04 23:33:38--  https://github.com/powhenagbo/UELPROJECT/raw/main/Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/powhenagbo/UELPROJECT/main/Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv [following]
--2024-04-04 23:33:38--  https://raw.githubusercontent.com/powhenagbo/UELPROJECT/main/Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4207630 (4.0M) [text/plain]
Saving to: ‘Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv’


2024-04-04 23:33:38 (45.2 M

In [None]:
# Load the downloaded CSV into a DataFrame; the cells below split it into the
# 'pIC50' target column and the remaining feature columns.
df = pd.read_csv('Diphtheria_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

## **3. Input and output features**
The ***Diphtheria*** data set contains 881 input features and 1 output variable (pIC50 values).

### **3.1. Input features**

In [None]:
# Feature matrix: every column of the table except the pIC50 target.
X = df.drop(columns='pIC50')
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2354,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2355,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2356,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2357,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### **3.2. Output features**

In [None]:
# Regression target: the pIC50 column, kept as a pandas Series.
Y = df['pIC50']
Y

0       7.698970
1       7.522879
2       7.455932
3       7.823909
4       7.301030
          ...   
2354    9.309804
2355    7.481486
2356    8.562249
2357    8.337242
2358    6.705534
Name: pIC50, Length: 2359, dtype: float64

### **3.3. Let's examine the data dimension**

In [None]:
# (rows, columns) of the feature matrix before any feature selection.
X.shape

(2359, 881)

In [None]:
# Length of the target vector; it should equal the number of rows in X.
Y.shape

(2359,)

### **3.4. Remove low variance features**

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop near-constant columns. The cutoff 0.8 * (1 - 0.8) is the variance p(1 - p) of a
# 0/1 feature at p = 0.8, so a binary column is removed when it holds the same value in
# more than roughly 80% of the rows.
# Note: X is replaced by the array the selector returns, so column labels are not kept.
selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))
selection.fit(X)
X = selection.transform(X)

In [None]:
# Shape after low-variance filtering: same number of rows, fewer columns.
X.shape

(2359, 127)

## **4. Data split (80/20 ratio)**

In [None]:
# Hold out 20% of the rows for testing. random_state is fixed so the partition (and
# every score computed from it) is reproducible after Restart & Run All; 42 matches
# the seed used by the later train_test_split calls in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Sizes of the training portion (the rows not held out by test_size=0.2).
X_train.shape, Y_train.shape

((1887, 127), (1887,))

In [None]:
# Sizes of the held-out test portion (test_size=0.2 of the rows).
X_test.shape, Y_test.shape

((472, 127), (472,))

## **5. Building a Regression Model using Random Forest**

In [None]:
# Fit a 100-tree random forest on the training split and report R^2 on the held-out
# split. random_state is fixed because the forest's bootstrap and feature sampling are
# random: without a seed the score changes on every run. 42 matches the seed the later
# random-forest cells use.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
r2

0.5182727965854872

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


In [None]:
# Re-split with a fixed seed (random_state=42) so every model in the cells below is
# trained and scored on the same 80/20 partition. This replaces the earlier split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

# k-nearest-neighbours baseline (k = 5, scikit-learn's default), trained on the
# X_train / Y_train split produced above.
knn_regressor = KNeighborsRegressor(n_neighbors=5).fit(X_train, Y_train)

# Coefficient of determination (R^2) on the held-out split.
knn_r2 = r2_score(Y_test, knn_regressor.predict(X_test))
print(f'KNN Regressor R-squared: {knn_r2:.3f}')


KNN Regressor R-squared: 0.504


In [None]:
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# Support-vector regression with an RBF kernel (C=1.0, epsilon=0.1), trained on the
# X_train / Y_train split produced above.
svr_regressor = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X_train, Y_train)

# Coefficient of determination (R^2) on the held-out split.
svr_r2 = r2_score(Y_test, svr_regressor.predict(X_test))
print(f'SVR R-squared: {svr_r2:.3f}')


SVR R-squared: 0.565


In [None]:
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np

# Classification-style view of the SVR regressor: continuous pIC50 values (true and
# predicted) are converted to 0/1 labels at a threshold and then compared.
#
# svr_regressor is the model fitted in the previous cell (kernel='rbf', C=1.0,
# epsilon=0.1, same X_train / Y_train). Refitting it here with identical settings
# would only repeat the same work, so the fitted model is reused.
y_train_pred = svr_regressor.predict(X_train)
y_test_pred = svr_regressor.predict(X_test)

# Cut point: the median of the training targets. This is an example choice; a
# domain-specific activity cutoff may be more appropriate.
threshold = np.median(Y_train)

# Label 1 where the value is >= threshold, else 0. The same cut is applied to the
# predictions and to the true values of both splits.
y_train_pred_binary = np.where(y_train_pred >= threshold, 1, 0)
y_train_binary = np.where(Y_train >= threshold, 1, 0)
y_test_pred_binary = np.where(y_test_pred >= threshold, 1, 0)
y_test_binary = np.where(Y_test >= threshold, 1, 0)

# Confusion matrices (rows: true label, columns: predicted label)
cm_train = confusion_matrix(y_train_binary, y_train_pred_binary)
cm_test = confusion_matrix(y_test_binary, y_test_pred_binary)

# Fraction of correctly labelled rows in each split
accuracy_train = accuracy_score(y_train_binary, y_train_pred_binary)
accuracy_test = accuracy_score(y_test_binary, y_test_pred_binary)

# Per-class precision / recall / F1 tables
report_train = classification_report(y_train_binary, y_train_pred_binary)
report_test = classification_report(y_test_binary, y_test_pred_binary)

# Output the confusion matrices, accuracies, and classification reports
print("Training Confusion Matrix:")
print(cm_train)
print(f"Training Accuracy: {accuracy_train:.3f}\n")

print("Testing Confusion Matrix:")
print(cm_test)
print(f"Testing Accuracy: {accuracy_test:.3f}\n")

print("Training Classification Report:")
print(report_train)
print("Testing Classification Report:")
print(report_test)


Training Confusion Matrix:
[[807 135]
 [178 767]]
Training Accuracy: 0.834

Testing Confusion Matrix:
[[196  43]
 [ 50 183]]
Testing Accuracy: 0.803

Training Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       942
           1       0.85      0.81      0.83       945

    accuracy                           0.83      1887
   macro avg       0.83      0.83      0.83      1887
weighted avg       0.83      0.83      0.83      1887

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       239
           1       0.81      0.79      0.80       233

    accuracy                           0.80       472
   macro avg       0.80      0.80      0.80       472
weighted avg       0.80      0.80      0.80       472



In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Single decision tree (seeded for reproducibility), trained on the X_train / Y_train
# split produced above.
dt_regressor = DecisionTreeRegressor(random_state=42).fit(X_train, Y_train)

# Coefficient of determination (R^2) on the held-out split.
dt_r2 = r2_score(Y_test, dt_regressor.predict(X_test))
print(f'Decision Tree Regressor R-squared: {dt_r2:.3f}')



Decision Tree Regressor R-squared: 0.176


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Ordinary least-squares baseline, trained on the X_train / Y_train split produced above.
# NOTE(review): the recorded run reports R^2 of about -2.1e21 on the test split, i.e.
# wildly off-scale predictions. This is presumably numerical instability from
# redundant/collinear fingerprint columns; a regularised linear model may be a fairer
# baseline. Confirm before drawing conclusions from this number.
lr_regressor = LinearRegression().fit(X_train, Y_train)

# Coefficient of determination (R^2) on the held-out split.
lr_r2 = r2_score(Y_test, lr_regressor.predict(X_test))
print(f'Linear Regression R-squared: {lr_r2:.3f}')



Linear Regression R-squared: -2095070178731885592576.000


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Random forest of 100 trees (seeded for reproducibility), trained on the
# X_train / Y_train split produced above.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, Y_train)

# Coefficient of determination (R^2) on the held-out split.
rf_r2 = r2_score(Y_test, rf_regressor.predict(X_test))
print(f'RandomForestRegressor R-squared: {rf_r2:.3f}')


RandomForestRegressor R-squared: 0.534


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
import numpy as np

# Confusion matrix for the random forest on the test split, treating the regression
# as a two-class problem.
#
# rf_regressor is the forest fitted in the previous cell (n_estimators=100,
# random_state=42, same training split). Refitting with identical settings and seed
# reproduces the same model, so the fitted one is reused, as the AUC and
# precision/recall cells further down already do.
y_pred = rf_regressor.predict(X_test)

# Cut point: median training pIC50 (example choice; pick a domain-specific cutoff if
# one applies). Values >= threshold are labelled 1, the rest 0. The same cut is
# applied to the predictions and to the true test values.
threshold = np.median(Y_train)
y_pred_binary = np.where(y_pred >= threshold, 1, 0)
y_test_binary = np.where(Y_test >= threshold, 1, 0)

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test_binary, y_pred_binary)

print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[194  45]
 [ 57 176]]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

# Train-vs-test confusion matrices and accuracies for the random forest, treating the
# regression as a two-class problem.
#
# rf_regressor is the forest already fitted above (n_estimators=100, random_state=42,
# same training split). Refitting with identical settings and seed reproduces the same
# model, so the fitted one is reused, as the AUC and precision/recall cells further
# down already do.
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# Cut point: median training pIC50 (example choice; adjust as needed).
threshold = np.median(Y_train)

# Label 1 where the value is >= threshold, else 0. The same cut is applied to the
# predictions and to the true values of both splits.
y_train_pred_binary = np.where(y_train_pred >= threshold, 1, 0)
y_train_binary = np.where(Y_train >= threshold, 1, 0)
y_test_pred_binary = np.where(y_test_pred >= threshold, 1, 0)
y_test_binary = np.where(Y_test >= threshold, 1, 0)

# Confusion matrices (rows: true label, columns: predicted label)
cm_train = confusion_matrix(y_train_binary, y_train_pred_binary)
cm_test = confusion_matrix(y_test_binary, y_test_pred_binary)

# Fraction of correctly labelled rows in each split
accuracy_train = accuracy_score(y_train_binary, y_train_pred_binary)
accuracy_test = accuracy_score(y_test_binary, y_test_pred_binary)

# Output the confusion matrices and accuracies
print("Training Confusion Matrix:")
print(cm_train)
print(f"Training Accuracy: {accuracy_train:.3f}\n")

print("Testing Confusion Matrix:")
print(cm_test)
print(f"Testing Accuracy: {accuracy_test:.3f}")


Training Confusion Matrix:
[[864  78]
 [122 823]]
Training Accuracy: 0.894

Testing Confusion Matrix:
[[194  45]
 [ 57 176]]
Testing Accuracy: 0.784


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
import numpy as np

# Per-class precision / recall / F1 tables for the random forest on both splits,
# treating the regression as a two-class problem.
#
# rf_regressor is the forest already fitted above (n_estimators=100, random_state=42,
# same training split). Refitting with identical settings and seed reproduces the same
# model, so the fitted one is reused, as the AUC and precision/recall cells further
# down already do.
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# Cut point: median training pIC50 (example choice; adjust for your application).
threshold = np.median(Y_train)

# Label 1 where the value is >= threshold, else 0. The same cut is applied to the
# predictions and to the true values of both splits.
y_train_pred_binary = np.where(y_train_pred >= threshold, 1, 0)
y_train_binary = np.where(Y_train >= threshold, 1, 0)
y_test_pred_binary = np.where(y_test_pred >= threshold, 1, 0)
y_test_binary = np.where(Y_test >= threshold, 1, 0)

# Generate classification reports for both training and testing sets
report_train = classification_report(y_train_binary, y_train_pred_binary)
report_test = classification_report(y_test_binary, y_test_pred_binary)

# Output the classification reports
print("Training Classification Report:")
print(report_train)
print("Testing Classification Report:")
print(report_test)


Training Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90       942
           1       0.91      0.87      0.89       945

    accuracy                           0.89      1887
   macro avg       0.89      0.89      0.89      1887
weighted avg       0.89      0.89      0.89      1887

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       239
           1       0.80      0.76      0.78       233

    accuracy                           0.78       472
   macro avg       0.78      0.78      0.78       472
weighted avg       0.78      0.78      0.78       472



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, roc_curve
import numpy as np

# ROC AUC for the random forest used as a ranking score.
# Relies on rf_regressor having been fitted in an earlier cell
# (RandomForestRegressor(n_estimators=100, random_state=42) on X_train / Y_train).

# Cut point that defines the two classes: median training pIC50. This is an example;
# any domain-specific value can be substituted.
threshold = np.median(Y_train)

# Continuous predictions for the test split. These are regression outputs, not
# probabilities; AUC only needs a score that ranks the samples.
y_pred = rf_regressor.predict(X_test)

# 0/1 versions of the predictions and of the true test values (1 where >= threshold).
# The binarised predictions are not needed for the AUC itself.
y_pred_binary = (y_pred >= threshold).astype(int)
y_test_binary = (np.asarray(Y_test) >= threshold).astype(int)

# AUC compares the true binary labels against the continuous scores.
auc_score = roc_auc_score(y_test_binary, y_pred)

print(f'AUC Score: {auc_score:.3f}')


AUC Score: 0.860


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Precision, recall and F1 of the random forest on the test split after binarisation.
# Relies on rf_regressor having been fitted in an earlier cell
# (RandomForestRegressor(n_estimators=100, random_state=42) on X_train / Y_train).

# Cut point: median training pIC50 (example; choose one appropriate for the task).
threshold = np.median(Y_train)

# Continuous test-split predictions, then 0/1 labels (1 where value >= threshold) for
# both the predictions and the true values.
y_pred = rf_regressor.predict(X_test)
y_pred_binary = (y_pred >= threshold).astype(int)
y_test_binary = (np.asarray(Y_test) >= threshold).astype(int)

# Scores for the positive class (label 1).
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)

print(
    f'Precision: {precision:.3f}',
    f'Recall: {recall:.3f}',
    f'F1-Score: {f1:.3f}',
    sep='\n',
)


Precision: 0.796
Recall: 0.755
F1-Score: 0.775


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [None]:
# Same call and seed (random_state=42) as the earlier seeded split, so this reproduces
# the same 80/20 partition of X and Y.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
# Candidate classifiers keyed by display name. Estimators with a random component get
# random_state=42, the seed used elsewhere in this notebook, so results are
# reproducible (for SVC the seed only matters because probability=True is set).
# NOTE(review): these are classifiers, so they need discrete class labels (e.g. pIC50
# binarised at a threshold, as in the cells above) rather than the continuous Y_train.
models = {
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True, random_state=42),
    "DT": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(),
    "RF": RandomForestClassifier(random_state=42)
}


In [None]:
# Predicted pIC50 for the held-out rows, used by the scatter plot below.
# Use rf_regressor, the random forest fitted on the X_train that belongs to this
# X_test (seeded split). The generic name `model` is unsafe here: it referred to a
# different or unfitted estimator when this cell was last run, and on a clean run it is
# the forest trained on the earlier split, whose training rows can overlap this X_test.
Y_pred = rf_regressor.predict(X_test)

AttributeError: 'KNeighborsClassifier' object has no attribute 'n_samples_fit_'

## **6. Scatter Plot of Experimental vs Predicted pIC50 Values**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Inputs from the cells above: Y_test (experimental values of the held-out rows) and
# Y_pred (model predictions for the same rows).

# Plain white seaborn theme with colour shorthand codes enabled.
sns.set(color_codes=True)
sns.set_style("white")

# Scatter of experimental vs predicted values with a fitted regression line; points
# are semi-transparent so dense regions stay readable.
ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha':0.4})

# Square 5 x 5 inch canvas with the same 0-12 range on both axes.
ax.figure.set_size_inches(5, 5)
ax.set(xlim=(0, 12), ylim=(0, 12))

ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')

plt.show()


NameError: name 'Y_pred' is not defined