<a href="https://colab.research.google.com/github/quickbrainlab/Project_2_Protein_Sequence_Classifier_with_ML/blob/main/Drug_Activity_Prediction_using_Python_(ML)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Choose Your Drug Compounds

Start with a focused goal. Example task:

> “Predict if a compound is active or inactive against breast cancer cells (MCF-7).”

### ✅ Example Compounds (to get started):

Use common compounds from PubChem like:

* Tamoxifen (CID: 2733526)
* Doxorubicin (CID: 31703)
* Paclitaxel (CID: 36314)
* Cisplatin (CID: 441203)
* Fluorouracil (CID: 3385)


Step 2: Download Data from PubChem
Use Python (PubChemPy)

In [6]:
!pip install pubchempy
import pubchempy as pcp

compound = pcp.get_compounds(['Tamoxifen','Doxorubicin','Palcitaxel','Cisplatin','flourouracil'], 'name')
print("SMILES:", compound)

SMILES: []


Step 3: Generate Molecular Descriptors (Features)

Install and use *RDKit* to convert SMILES to features.

In [7]:
!pip install rdkit-pypi
from rdkit import Chem
from rdkit.Chem import Descriptors

smiles = 'CC/C(=C(/c1ccc(O)cc1)\c1ccc(N(c2ccccc2)S(=O)(=O)C)cc1)C(=O)O'
mol = Chem.MolFromSmiles(smiles)
features = {
    'MolWt': Descriptors.MolWt(mol),
    'TPSA': Descriptors.TPSA(mol),
    'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
    'LogP': Descriptors.MolLogP(mol),
}
print(features)

{'MolWt': 437.5170000000001, 'TPSA': 94.91000000000001, 'NumRotatableBonds': 7, 'LogP': 4.786400000000005}


Code to Generate CSV in Colab

In [8]:
!pip install rdkit-pypi

from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# List of compounds with their names and SMILES
compounds = [
    {'Name': 'Tamoxifen', 'SMILES': 'CC/C(=C(/c1ccc(O)cc1)\\c1ccc(N(c2ccccc2)S(=O)(=O)C)cc1)C(=O)O', 'Active': 1},
    {'Name': 'Cisplatin', 'SMILES': 'Cl[Pt](Cl)(N)N', 'Active': 0},
    {'Name': 'Doxorubicin', 'SMILES': 'CC1=C(C(=O)C2=C(C1=O)C=CC(=C2O)O)O', 'Active': 1},
    {'Name': 'Paclitaxel', 'SMILES': 'CC1=C2C(=CC(=O)OC2=C(C=C1)O)O', 'Active': 1},
    {'Name': 'Fluorouracil', 'SMILES': 'C1=C(C(=O)NC(=O)N1)F', 'Active': 0}
]

# Store descriptors in a list
data = []

for compound in compounds:
    mol = Chem.MolFromSmiles(compound['SMILES'])
    if mol:
        features = {
            'Name': compound['Name'],
            'MolWt': Descriptors.MolWt(mol),
            'TPSA': Descriptors.TPSA(mol),
            'NumRotBonds': Descriptors.NumRotatableBonds(mol),
            'LogP': Descriptors.MolLogP(mol),
            'Active': compound['Active']
        }
        data.append(features)

# Create DataFrame
df = pd.DataFrame(data)

# Save as CSV
csv_path = "/content/drug_data.csv"
df.to_csv(csv_path, index=False)

# Show saved file path
print(f"CSV file saved to: {csv_path}")
df.head()


CSV file saved to: /content/drug_data.csv


Unnamed: 0,Name,MolWt,TPSA,NumRotBonds,LogP,Active
0,Tamoxifen,437.517,94.91,7,4.7864,1
1,Cisplatin,298.03,52.04,0,0.1953,0
2,Doxorubicin,220.18,94.83,0,1.3088,1
3,Paclitaxel,192.17,70.67,0,1.51262,1
4,Fluorouracil,130.078,65.72,0,-0.7977,0


Load the Data in Google Colab

In [10]:
import pandas as pd

# Load the CSV
df = pd.read_csv("/content/drug_data.csv")

# Display first few rows
display(df.head())

Unnamed: 0,Name,MolWt,TPSA,NumRotBonds,LogP,Active
0,Tamoxifen,437.517,94.91,7,4.7864,1
1,Cisplatin,298.03,52.04,0,0.1953,0
2,Doxorubicin,220.18,94.83,0,1.3088,1
3,Paclitaxel,192.17,70.67,0,1.51262,1
4,Fluorouracil,130.078,65.72,0,-0.7977,0


Convert Labels and Split Data

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode 'Active' to 1/0
df["Active"] = LabelEncoder().fit_transform(df["Active"])

# Features & label
X = df[["MolWt", "LogP", "TPSA", "NumRotBonds"]]
y = df["Active"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Train Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
from pubchempy import get_compounds
compound = get_compounds("Aspirin", 'name')[0]
print(compound.to_dict())

{'atom_stereo_count': 0, 'atoms': [{'aid': 1, 'number': 8, 'element': 'O', 'y': -0.06, 'x': 3.7321}, {'aid': 2, 'number': 8, 'element': 'O', 'y': 1.44, 'x': 6.3301}, {'aid': 3, 'number': 8, 'element': 'O', 'y': 1.44, 'x': 4.5981}, {'aid': 4, 'number': 8, 'element': 'O', 'y': -1.56, 'x': 2.866}, {'aid': 5, 'number': 6, 'element': 'C', 'y': -0.56, 'x': 4.5981}, {'aid': 6, 'number': 6, 'element': 'C', 'y': -0.06, 'x': 5.4641}, {'aid': 7, 'number': 6, 'element': 'C', 'y': -1.56, 'x': 4.5981}, {'aid': 8, 'number': 6, 'element': 'C', 'y': -0.56, 'x': 6.3301}, {'aid': 9, 'number': 6, 'element': 'C', 'y': -2.06, 'x': 5.4641}, {'aid': 10, 'number': 6, 'element': 'C', 'y': -1.56, 'x': 6.3301}, {'aid': 11, 'number': 6, 'element': 'C', 'y': 0.94, 'x': 5.4641}, {'aid': 12, 'number': 6, 'element': 'C', 'y': -0.56, 'x': 2.866}, {'aid': 13, 'number': 6, 'element': 'C', 'y': -0.06, 'x': 2}, {'aid': 14, 'number': 1, 'element': 'H', 'y': -1.87, 'x': 4.0611}, {'aid': 15, 'number': 1, 'element': 'H', 'y': 

In [18]:
pip install gradio



In [20]:
import gradio as gr
import joblib
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# Save the trained model (assuming 'model' is your trained model from a previous step)
joblib.dump(model, "drug_model.pkl")

model = joblib.load("drug_model.pkl")

def predict_activity(smiles):
    # convert smiles to features
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        features = {
            'MolWt': Descriptors.MolWt(mol),
            'TPSA': Descriptors.TPSA(mol),
            'NumRotBonds': Descriptors.NumRotatableBonds(mol),
            'LogP': Descriptors.MolLogP(mol),
        }
        # Convert features to a DataFrame row for prediction
        features_df = pd.DataFrame([features])
        return model.predict(features_df)[0]
    else:
        return "Invalid SMILES string"


gr.Interface(fn=predict_activity, inputs="text", outputs="text").launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1d94a436c679524479.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [21]:
!pip install streamlit
import streamlit as st

st.title("Drug Activity Predictor")

molwt = st.number_input("Molecular Weight")
tpsa = st.number_input("TPSA")
rot_bonds = st.number_input("Number of Rotatable Bonds")
logp = st.number_input("LogP")

if st.button("Predict Activity"):
    result = model.predict([[molwt, tpsa, rot_bonds, logp]])
    if result[0] == 1:
        st.success("Prediction: ACTIVE")
    else:
        st.warning("Prediction: INACTIVE")

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hI

2025-07-08 12:59:45.419 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-07-08 12:59:45.430 Session state does not function when running a script without `streamlit run`
