<a href="https://colab.research.google.com/github/quickbrainlab/Project_2_Protein_Sequence_Classifier_with_ML/blob/main/Toxicity_Prediction_of_Drug_like_Compounds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Prepare the Dataset**

In [2]:
# Tiny hand-written demo dataset: one compound per row, with its SMILES string
# and a binary label (the prediction cell prints these as 1 = Toxic, 0 = Non-toxic).
data = """Compound,SMILES,Label
Aspirin,CC(=O)OC1=CC=CC=C1C(=O)O,0
Caffeine,CN1C=NC2=C1C(=O)N(C(=O)N2C)C,0
Nicotine,CN1CCCC1C2=CN=CC=C2,1
Paracetamol,CC(=O)NC1=CC=C(C=C1)O,0
Benzene,C1=CC=CC=C1,1
"""

# Persist the CSV text so the descriptor step can load it back with pandas.
with open("toxicity_data.csv", mode="w") as csv_file:
    csv_file.write(data)

**Step 2: Install Required Libraries**

In [3]:
!pip install rdkit-pypi pandas scikit-learn

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


**Step 3: Generate Molecular Descriptors**

In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

# Read the CSV written in the dataset step into a DataFrame
# (columns: Compound, SMILES, Label).
df = pd.read_csv(filepath_or_buffer="toxicity_data.csv")

# Function to calculate descriptors
def calc_descriptors(smiles):
    """Compute four RDKit descriptors for a molecule given as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        ``[MolWt, NumRotatableBonds, TPSA, MolLogP]``, in that order.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None rather than
    # raising; fail here with a clear message instead of letting the
    # descriptor calls below choke on a None molecule.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    return [
        Descriptors.MolWt(mol),
        Descriptors.NumRotatableBonds(mol),
        Descriptors.TPSA(mol),
        Descriptors.MolLogP(mol)
    ]

# Compute the descriptors for every SMILES string and attach them to the frame
# as four new columns (one per descriptor, in calc_descriptors' return order).
descriptor_columns = ['MolWt', 'RotBonds', 'TPSA', 'LogP']


def _descriptor_row(smiles_string):
    """Wrap one molecule's descriptor list in a Series so apply() expands it to columns."""
    return pd.Series(calc_descriptors(smiles_string))


df[descriptor_columns] = df['SMILES'].apply(_descriptor_row)

**Step 4: Train Machine Learning Model**

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare features and label
X = df[['MolWt', 'RotBonds', 'TPSA', 'LogP']]
y = df['Label']

# Split data (fixed seed so the same rows land in train/test on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model. The forest is seeded too: without random_state its bootstrap
# samples and feature choices change between runs, so the fitted model (and
# the saved pickle) would not be reproducible under Restart & Run All.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict & evaluate.
# NOTE: with the five-row demo CSV, test_size=0.2 leaves a single test sample,
# so this accuracy can only be 0.0 or 1.0 and is not a meaningful estimate.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0


**Step 5: Save Your Model**

In [5]:
import joblib
# Serialize the fitted classifier to disk. The call stays the cell's last
# expression so its return value (the list of written file names) is displayed.
joblib.dump(value=model, filename="tox_model.pkl")

['tox_model.pkl']

**Step 6: Use the Model for a New Prediction**

In [6]:
# Example compound
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin
features = calc_descriptors(smiles)

# The model was fitted on a DataFrame, so it recorded the feature column names.
# Predicting on a bare list makes scikit-learn warn that X has no valid feature
# names; pass a one-row DataFrame with the same columns, in training order.
features_df = pd.DataFrame([features], columns=['MolWt', 'RotBonds', 'TPSA', 'LogP'])
prediction = model.predict(features_df)
print("Prediction (1 = Toxic, 0 = Non-toxic):", prediction[0])

Prediction (1 = Toxic, 0 = Non-toxic): 0


