# Build an SVM-based classification model to predict bioactivity.

In [None]:
#Install rdkit.
!pip install rdkit

In [None]:
# Install required packages
!pip install -q chembl_webresource_client pandas

# Import libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client
from google.colab import files

#  Assign known ChEMBL Target ID
selected_target = "CHEMBL1293224"  # MAPT, can be replaced with other target as well.
print(f"Selected target: {selected_target}")

# Fetch IC50 activity data
activity = new_client.activity
results = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(results)

# A target without IC50 records yields a frame with no columns; stop here with
# a clear message instead of failing on the column lookups below.
if df.empty:
    raise ValueError(f"No IC50 activity records found for target {selected_target}")

#  Filter by standard_relation = '='
df = df[df['standard_relation'] == '=']  # Keep only '=' relations

# The activity thresholds applied further down compare raw numbers, so every
# value must share one unit. Keep nM records only.
# NOTE(review): assumes the 1000 / 10000 cut-offs are meant in nM -- confirm.
df = df[df['standard_units'] == 'nM']

#  Clean the data: drop rows without a value, then make the values numeric
df = df[df.standard_value.notna()].copy()
df["standard_value"] = df["standard_value"].astype(float)

# Fetch smiles
def fetch_smiles(chembl_id):
    """Return the canonical SMILES for a ChEMBL molecule ID, or None.

    The molecule record is looked up through ``new_client.molecule``. None is
    returned when the record is empty, when it carries no
    ``molecule_structures`` entry, or when that entry has no
    ``canonical_smiles`` key.
    """
    molecule = new_client.molecule.get(chembl_id)
    structures = molecule.get('molecule_structures') if molecule else None
    return structures.get('canonical_smiles') if structures else None

# Look up each distinct molecule ID once and map the result back onto the rows,
# so a compound that appears in several activity records is not fetched again
# for every one of them.
smiles_by_id = {
    chembl_id: fetch_smiles(chembl_id)
    for chembl_id in df["molecule_chembl_id"].dropna().unique()
}
df["smiles"] = df["molecule_chembl_id"].map(smiles_by_id)

#  Binary Classification (1 = Active, 0 = Inactive)
def binary_class(value, active_max=1000, inactive_min=10000):
    """Label a potency value as active (1), inactive (0) or intermediate (None).

    Args:
        value: Numeric potency measurement. The notebook passes
            ``standard_value``; presumably IC50 in nM -- confirm upstream.
        active_max: Largest value still labelled active. Defaults to 1000.
        inactive_min: Smallest value labelled inactive. Defaults to 10000.

    Returns:
        1 if ``value <= active_max``, 0 if ``value >= inactive_min``, and None
        for anything in between. NaN also yields None because both comparisons
        are false. If the two thresholds overlap, the active test wins.
    """
    if value <= active_max:
        return 1  # Active
    if value >= inactive_min:
        return 0  # Inactive
    return None  # Intermediate (excluded)

df["activity"] = df["standard_value"].apply(binary_class)

#  Drop intermediates (NaN) and convert to int.
#  The explicit copy makes the filtered frame independent, so the column
#  assignment below does not write into a slice (SettingWithCopyWarning).
df = df[df["activity"].notna()].copy()
df["activity"] = df["activity"].astype(int)

#  Select relevant columns (copied for the same reason as above)
columns_to_keep = ["molecule_chembl_id", "smiles", "standard_type", "standard_value", "standard_units", "activity"]
df = df[columns_to_keep].copy()

#  Show counts of active (1) vs inactive (0) compounds
binary_counts = df["activity"].value_counts()
print("Counts of Active (1) vs Inactive (0):")
print(binary_counts)

#  Show first few rows
df.head(5)


In [None]:
# Display the full table of labelled activity records
df

In [None]:
# Convert each SMILES string into an RDKit molecule object

from rdkit import Chem
# Remove missing SMILES
df = df[df.smiles.notnull()].copy()

# Chem.MolFromSmiles returns None for a string it cannot parse. Such entries
# would break fingerprint generation later, so drop them here -- from df as
# well, so that the labels taken from df stay aligned with the molecules.
parsed_mols = [Chem.MolFromSmiles(smi) for smi in df.smiles]
parse_ok = [mol is not None for mol in parsed_mols]
if not all(parse_ok):
    print(f"Dropping {parse_ok.count(False)} compound(s) with unparsable SMILES")
    df = df.loc[parse_ok].copy()
mol_list = [mol for mol in parsed_mols if mol is not None]

In [None]:
# Take a look at the RDKit object corresponding to the first SMILES structure
# (index 0 exists whenever at least one molecule was parsed)
mol_list[0]

In [None]:
# Check the length of the molecule list
len(mol_list)

In [None]:
# Wrap the molecule list in a single-column pandas DataFrame

mol_column = ['object']
df_ml = pd.DataFrame(data=mol_list, columns=mol_column)

In [None]:
# Display the molecule table (one entry of mol_list per row)
df_ml

In [None]:
from rdkit.Chem import rdFingerprintGenerator
import pandas as pd

# Morgan fingerprint (ECFP6) settings: radius 3, 1024-bit vector
radius, nBits = 3, 1024

# Build the Morgan generator once and reuse it for every molecule
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

# Fingerprint each molecule in df_ml['object'], keeping row order
ECFP6 = list(map(fpg.GetFingerprint, df_ml['object']))

# Store the fingerprint objects next to their molecules
df_ml['ECFP6'] = ECFP6

# One row per fingerprint, one '0'/'1' character per bit, for inspection
bit_rows = [list(fp.ToBitString()) for fp in ECFP6]
fingerprint_df = pd.DataFrame(bit_rows)

# Show the molecule table with its new ECFP6 column
df_ml


In [None]:
import pandas as pd
import numpy as np

# Expand every fingerprint into a row of integer bits (0s and 1s)
bit_strings = [fp.ToBitString() for fp in ECFP6]
ECFP6_array = np.array([list(bits) for bits in bit_strings], dtype=int)

# Name the bit columns FP0, FP1, ... and wrap the matrix as the feature table
bit_columns = [f'FP{i}' for i in range(nBits)]
X = pd.DataFrame(ECFP6_array, columns=bit_columns)

# 'X' is the feature set used for model training
X


In [None]:
# The dependent variable is the binary activity label of each molecule in the input data

y = df["activity"]
y

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% training, 20% testing).
# Stratify on the label so both splits keep the active/inactive ratio; a plain
# random split of an imbalanced label can leave the test set skewed or missing
# a class. Stratification needs at least two members per class, so fall back
# to the plain split when that does not hold.
class_sizes = y.value_counts()
can_stratify = len(class_sizes) > 1 and class_sizes.min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y if can_stratify else None,
)


In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel
svclassifier = SVC(kernel='linear')

# Fit the classifier on the training split
svclassifier.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out test split
y_pred = svclassifier.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the accuracy, then each remaining result under its own heading
print(f"Accuracy: {accuracy}")
for heading, body in (
    ("Confusion Matrix:", conf_matrix),
    ("Classification Report:", class_report),
):
    print(heading)
    print(body)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heat map
class_names = ['Class 0', 'Class 1']
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve and AUC computed from the classifier's decision-function scores
decision_scores = svclassifier.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, decision_scores)
roc_auc = auc(fpr, tpr)

# Plot the curve together with the diagonal reference line
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
import joblib

# Persist the trained SVM classifier to a file
model_path = 'svm_model.pkl'
joblib.dump(svclassifier, model_path)


In [None]:
# Use the SVM classification model built above to predict the tau inhibition
# activity of some known AChE inhibitors.
# The SMILES structures of known AChE inhibitors can be downloaded from the ChEMBL database.

import pandas as pd
ache_csv_url = 'https://raw.githubusercontent.com/Rajnishphe/AIDD-2022/main/ML%20Based%20QSAR/ache_activity.csv'
act_ache = pd.read_csv(ache_csv_url)

In [None]:
# Take a look at what is inside the downloaded ache_activity.csv table

act_ache

In [None]:
from rdkit import Chem

# Keep only rows whose SMILES is present and parses. Chem.MolFromSmiles returns
# None for a string it cannot parse, which would break fingerprint generation.
# The same rows are removed from act_ache so that the predictions added later
# still line up with it one-to-one.
n_rows_before = len(act_ache)
act_ache = act_ache[act_ache['Smiles'].notna()]
parsed_ache = [Chem.MolFromSmiles(smi) for smi in act_ache['Smiles']]
parsed_ok = [mol is not None for mol in parsed_ache]
act_ache = act_ache.loc[parsed_ok].reset_index(drop=True)
if len(act_ache) < n_rows_before:
    print(f"Dropped {n_rows_before - len(act_ache)} AChE row(s) with missing or unparsable SMILES")

mol_list = [mol for mol in parsed_ache if mol is not None]
df_ache = pd.DataFrame(mol_list, columns=['object'])
df_ache

In [None]:
from rdkit.Chem import rdFingerprintGenerator

# Same Morgan settings as for the training fingerprints: radius 3, 1024 bits
radius, nBits = 3, 1024
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

# Fingerprint every AChE molecule, keeping row order
ECFP6_ache = list(map(fpg.GetFingerprint, df_ache['object']))

# One row per molecule, one integer column per fingerprint bit
ache_bit_rows = [list(fp.ToBitString()) for fp in ECFP6_ache]
X_ache = pd.DataFrame(ache_bit_rows).astype(int)
X_ache

In [None]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import pandas as pd
import numpy as np


# Rebuild the AChE feature table with FP0 ... FP1023 column names
ache_columns = [f'FP{i}' for i in range(1024)]
ache_bit_strings = [fp.ToBitString() for fp in ECFP6_ache]
ECFP6_array_ache = np.array([list(bits) for bits in ache_bit_strings], dtype=int)
X_ache = pd.DataFrame(ECFP6_array_ache, columns=ache_columns)

# Load the saved model from disk
import joblib
svm_model = joblib.load('svm_model.pkl')

# Predict a label for every AChE compound and attach it as a new column
predicted_labels = svm_model.predict(X_ache)
act_ache['Predicted_Tau_Activity'] = predicted_labels
act_ache.head(25)


In [None]:
# Write the AChE table to CSV, leaving out the row index
predictions_csv = 'ache_activity_predictions.csv'
act_ache.to_csv(predictions_csv, index=False)

In [None]:
ls

In [None]:
!pip install lazypredict


In [None]:
import pandas as pd
import numpy as np
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Set up LazyClassifier: quiet output, warnings ignored, no custom metric
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)

# fit() takes both splits; its two return values are kept as models and predictions
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the first of the two returned objects
print(models)
