## **Importing Libraries**

In [1]:
%%capture
!pip install mordred
!pip install rdkit


In [2]:
# Importing Libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [None]:
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv('3DRDKit.csv')

## **Building Machine Learning Model**

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF,values
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883,1
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527,1
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375,1
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047,1
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157,1


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [8]:
# Target vector: the 'values' column of the loaded frame.
y = df.loc[:, "values"]

In [9]:
# Display the frame; pandas abbreviates the middle rows of long output.
df

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF,values
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883,1
1,-0.256620,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527,1
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375,1
3,-0.174179,-0.106656,-0.306930,-0.348407,-0.117882,-0.080889,-0.945960,-1.007047,1
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.164230,0.281570,1
...,...,...,...,...,...,...,...,...,...
21995,1.071132,0.478336,0.040879,-0.258905,1.487316,-0.138609,-0.321212,1.233134,1
21996,-0.150497,-0.017155,-0.627898,0.652159,0.290115,-0.078479,0.016143,0.319011,1
21997,-0.363654,-0.219584,-0.261513,0.059847,-0.981859,0.103982,0.002396,-0.808583,1
21998,-0.321579,-0.064096,-1.168117,1.338770,0.239160,0.044634,-0.110649,0.358026,1


In [10]:
# Feature matrix: every column of the frame except the target.
X = df.drop(columns=["values"])

In [11]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157


In [None]:
# Preview the first entries of the target vector.
y.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Tally how many rows fall into each class of the target column.
class_counts = df["values"].value_counts()
print(class_counts)

# Bar chart of the same tallies, one bar per class.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=df["values"], palette="coolwarm", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


In [14]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=45,
)

In [None]:
# Dimensions of the test feature matrix.
X_test.shape

In [None]:
# Dimensions of the training feature matrix.
X_train.shape

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Number of test labels.
y_test.shape

# **XGBoost with GridSearchCV, stratification, and best features with SMOTE**

In [None]:
pip install xgboost


In [22]:
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import f1_score


In [None]:
# Hyperparameter grid searched exhaustively by GridSearchCV
# (3 * 3 * 3 * 3 * 2 * 2 = 324 combinations, each evaluated on every CV fold).
param_grid = {
    "n_estimators": [100, 200, 300],      # Number of boosting rounds
    "learning_rate": [0.01, 0.1, 0.2],    # Step size shrinkage
    "max_depth": [3, 5, 7],               # Maximum tree depth
    "min_child_weight": [1, 3, 5],        # Minimum sum of instance weight
    "subsample": [0.8, 1.0],              # Fraction of samples used per tree
    "colsample_bytree": [0.8, 1.0]        # Fraction of features used per tree
}

# Base XGBoost classifier for the binary target.
# `use_label_encoder` is deliberately not passed: the argument is deprecated
# and current XGBoost releases ignore it while warning about an unused
# parameter on every single fit of the search.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", eval_metric="logloss", random_state=42)

# Shuffled, seeded 5-fold splitter that keeps the class ratio in every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by accuracy, using all available CPU cores.
grid_search = GridSearchCV(xgb_model, param_grid, cv=skf, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)


In [None]:
# Refit XGBoost on the whole training split with the best hyperparameters
# found by the grid search.
# `use_label_encoder` is deliberately not passed: the argument is deprecated
# and current XGBoost releases ignore it while warning about an unused parameter.
best_xgb = xgb.XGBClassifier(**grid_search.best_params_, objective="binary:logistic", eval_metric="logloss", random_state=42)
best_xgb.fit(X_train, y_train)

# Hard class predictions for the held-out test set.
y_pred = best_xgb.predict(X_test)
# Probability of the second class (column 1); used for the ROC curve / AUC.
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]


In [None]:
from sklearn.metrics import f1_score

# F1 on the held-out test set for whatever predictions `y_pred` currently holds.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

In [None]:
# Cross-validate the tuned XGBoost model on the training split using the splitter in `skf`.
cv_scores = cross_val_score(best_xgb, X_train, y_train, cv=skf, scoring='accuracy') # accuracy, one score per fold

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

In [None]:
# Overall accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Confusion matrix rendered as an annotated heatmap (rows = actual, columns = predicted).
cm = confusion_matrix(y_test, y_pred)
class_labels = ["Class 0", "Class 1"]
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_labels, yticklabels=class_labels)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# False/true positive rates over all thresholds, plus the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Draw the ROC curve against the chance-level diagonal.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")  # chance-level reference
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()


In [None]:
import joblib

# Persist the fitted XGBoost model to a file in the current working directory.
joblib.dump(value=best_xgb, filename='3-3D_xgb_model.pkl')

# **Random Forest with SMOTE, GridSearchCV, stratification, and best features**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.tree import plot_tree

# Hyperparameter grid searched exhaustively by GridSearchCV
# (3 * 3 * 3 * 3 * 2 = 162 combinations, each evaluated on every CV fold).
param_grid = {
    "n_estimators": [50, 100, 200],        # Number of trees
    "max_depth": [10, 20, None],           # Max depth of trees
    "min_samples_split": [2, 5, 10],       # Min samples to split
    "min_samples_leaf": [1, 2, 5],         # Min samples per leaf
    "criterion": ["gini", "entropy"]       # Splitting criterion
}

# Initialize Random Forest Classifier (seeded so the forest is reproducible)
rf = RandomForestClassifier(random_state=42)

# Use the same shuffled, seeded stratified 5-fold splitter as the XGBoost
# search, so both models are tuned under one cross-validation protocol.
# A plain integer `cv=5` would leave the folds unshuffled.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search scored by accuracy, using all available CPU cores
grid_search = GridSearchCV(rf, param_grid, cv=skf, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Train Random Forest on the whole training split with the best hyperparameters
best_rf = RandomForestClassifier(**grid_search.best_params_, random_state=42)
best_rf.fit(X_train, y_train)

# Hard class predictions for the held-out test set
y_pred = best_rf.predict(X_test)
# Probability of the second class (column 1); used for the ROC curve / AUC
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Class 0", "Class 1"], yticklabels=["Class 0", "Class 1"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Compute ROC curve and the area under it
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal line (chance level)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Shuffled, seeded 5-fold splitter that keeps the class ratio in every fold.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Accuracy of the tuned Random Forest on each training fold.
cv_scores = cross_val_score(estimator=best_rf, X=X_train, y=y_train, scoring='accuracy', cv=skf)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


In [None]:
from sklearn.metrics import f1_score

# F1 on the held-out test set for whatever predictions `y_pred` currently holds.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

In [None]:
import joblib

# Persist the fitted Random Forest model to a file in the current working directory.
joblib.dump(value=best_rf, filename='3-3D_RF_model.pkl')