## **Importing Libraries**

In [1]:
%%capture
!pip install mordred
!pip install rdkit


In [2]:
# Importing Libraries
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools, AllChem

import mordred
from mordred import Calculator, descriptors


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


In [None]:
# Load the descriptor table from the working directory.
# NOTE(review): the same file is read again into `df_final` further down and
# `df` is not referenced by any other visible cell — confirm this load is still needed.
df = pd.read_csv('3DRDKit.csv')

## **Building Machine Learning Model**

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd

In [None]:
# Load the modelling table. The preview below shows an `Unnamed: 0` column,
# i.e. a row index that was saved into the CSV and is read back as a regular column.
df_final = pd.read_csv('3DRDKit.csv')

In [8]:
# Preview the first rows to check the column layout.
df_final.head()

Unnamed: 0,PMI1,PMI2,NPR1,NPR2,RadiusOfGyration,InertialShapeFactor,SpherocityIndex,PBF,values
0,-0.416267,-0.260116,1.345511,-0.602951,-1.931896,0.585006,0.157481,-1.340883,1
1,-0.25662,-0.175355,0.052397,-0.147606,-0.548179,-0.046893,0.233452,-0.067527,1
2,0.885918,0.014085,2.078184,-1.046947,0.087002,-0.138292,2.132503,2.278375,1
3,-0.174179,-0.106656,-0.30693,-0.348407,-0.117882,-0.080889,-0.94596,-1.007047,1
4,-0.082943,-0.000443,-0.475669,0.449613,0.191616,-0.093591,-0.16423,0.28157,1


In [None]:
# (rows, columns) of the loaded table.
df_final.shape

In [10]:
# Target vector: the `values` column, used as the class label by the cells that follow.
y = df_final['values']

In [11]:
# Feature matrix: every column except the target.
# `Unnamed: 0` (visible in the head() preview) is the row index that was written
# into the CSV, not a descriptor. Leaving it in X lets the models train on row
# position, so it is removed too. errors='ignore' keeps this cell working if the
# CSV is later re-saved without that column.
X = df_final.drop(columns=['values']).drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# prompt: count each class

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Tally the number of rows per class label and show the raw numbers.
class_counts = df_final['values'].value_counts()
print(class_counts)

# Draw the same tally as a bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x=df_final["values"], palette="coolwarm", ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


In [13]:
# 80/20 split that preserves the class proportions of y in both parts;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=45,
)

In [None]:
# Shape of the held-out feature split.
X_test.shape

In [None]:
# Shape of the training feature split.
X_train.shape

In [None]:
# Number of training labels; should match the row count of X_train.
y_train.shape

In [None]:
# Number of held-out labels; should match the row count of X_test.
y_test.shape

# **Data Dimensionality (t-SNE Visualization)**

In [19]:
from sklearn.manifold import TSNE

# Project the full feature matrix onto two t-SNE components for plotting;
# the seed keeps the embedding repeatable.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)


In [None]:
# Scatter the two t-SNE components, coloured by class label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=y,
    palette="coolwarm",
    alpha=0.7,
    ax=ax,
)
ax.set_title("t-SNE Visualization of Data")
ax.set_xlabel("t-SNE Component 1")
ax.set_ylabel("t-SNE Component 2")
ax.legend(title="Class")
plt.show()

# **Linearity and non-linearity of data**

In [None]:
# prompt: train and test logistic model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Linear baseline: fit on the training split, predict the held-out split.
logistic_model = LogisticRegression().fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)

# Share of held-out rows classified correctly.
accuracy = accuracy_score(y_test, logistic_predictions)
print(f"Logistic Regression Accuracy: {accuracy}")


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fit one SVM per kernel on the same training split so that the two
# held-out accuracies can be compared directly.
svm_linear = SVC(kernel="linear").fit(X_train, y_train)
svm_rbf = SVC(kernel="rbf").fit(X_train, y_train)

linear_acc = accuracy_score(y_test, svm_linear.predict(X_test))
rbf_acc = accuracy_score(y_test, svm_rbf.predict(X_test))

print("Linear SVM Accuracy:", linear_acc)
print("RBF SVM Accuracy:", rbf_acc)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Cubic polynomial-kernel SVM, fitted on the training split.
svm_model2 = SVC(kernel='poly', degree=3, C=1.0, gamma='scale').fit(X_train, y_train)

# Score on the held-out split. `y_pred` is reassigned by later prediction cells.
y_pred = svm_model2.predict(X_test)
poly_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {poly_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Degree-6 polynomial-kernel SVM; replaces the `svm_model2` fitted in the previous cell.
svm_model2 = SVC(kernel='poly', degree=6, C=1.0, gamma='scale').fit(X_train, y_train)

# Score on the held-out split. `y_pred` is reassigned by later prediction cells.
y_pred = svm_model2.predict(X_test)
poly_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {poly_accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# **Non-linear, high-dimensional data: select the best features and optimize an SVM with an RBF kernel**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate settings for the RBF-kernel SVM: 3 values of C x 4 values of gamma.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto', 0.1, 1],
    kernel=['rbf'],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print(f"Best Parameters: {grid.best_params_}")


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 📌 Train SVM with RBF Kernel
# NOTE(review): C and gamma are hard-coded rather than read from
# grid.best_params_ — confirm they match the grid-search result.
# probability=True enables predict_proba (used by the ROC cell). The probability
# calibration shuffles the data internally, so random_state is fixed to make the
# predicted probabilities reproducible between runs; class predictions are unaffected.
svm_model = SVC(kernel='rbf', C=10, gamma=1, probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# 📌 Predictions & Evaluation
# `y_pred` from this cell is what the later metric cells consume.
y_pred = svm_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Report accuracy and the per-class report for the current `y_pred`
# (whichever prediction cell was executed most recently assigned it).
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nAccuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(report)

In [29]:
from sklearn.model_selection import StratifiedKFold

# Five shuffled folds that each keep the class proportions; seeded for repeatability.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Refit the RBF SVM on each stratified fold of the training split and
# record the accuracy of every fold.
cv_scores = cross_val_score(
    svm_model,
    X_train,
    y_train,
    cv=skf,
    scoring='accuracy',
)
mean_cv_score = cv_scores.mean()

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_cv_score)

In [None]:
# prompt: confusion matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Rows are the true labels, columns the predicted labels, for the current y_test / y_pred.
cm = confusion_matrix(y_test, y_pred)

# NOTE(review): the tick labels are hard-coded for two classes named 0 and 1 —
# confirm they match the label values in the data.
predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [None]:
from sklearn.metrics import f1_score

# F1 of the current predictions, using f1_score's default averaging and positive label.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# Column 1 of predict_proba is the probability assigned to the second entry of svm_model.classes_.
y_prob = svm_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC Score: {roc_auc:.4f}")

# Points of the ROC curve; the area is recomputed from those points for the legend.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [34]:
import joblib

# Persist the fitted RBF SVM to the working directory; joblib.dump returns
# the list of written file names, which the cell displays.
model_path = '3-3DRdkit_svm_model.pkl'
joblib.dump(svm_model, model_path)

['3-3DRdkit_svm_model.pkl']