<a href="https://colab.research.google.com/github/quantam665/-AI-Powered-HIV-Drug-Discovery-System/blob/main/Drug_Discovery_and_Development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ STEP 1: Install everything needed
!pip install -q rdkit-pypi gradio scikit-learn tensorflow pandas matplotlib seaborn

# ✅ STEP 2: Import libraries
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import gradio as gr
import matplotlib.pyplot as plt
from rdkit.Chem import Draw
import io
import base64

# ✅ STEP 3: Read the HIV dataset from the CSV already uploaded to the runtime
dataset_path = "/content/HIV.csv"
df = pd.read_csv(dataset_path)
# Report (rows, columns) so a wrong or truncated upload is noticed immediately.
print("🧪 Dataset Loaded:", df.shape)

# ✅ STEP 4: Convert SMILES to ECFP4 Fingerprints
def smiles_to_ecfp(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into a Morgan fingerprint array.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.
        radius: Morgan radius passed to the fingerprint call (default 2).
        n_bits: Length of the bit vector / returned array (default 2048).

    Returns:
        A NumPy array built from the Morgan bit vector of the parsed
        molecule, or ``np.zeros(n_bits)`` when parsing yields a falsy result.
    """
    molecule = Chem.MolFromSmiles(smiles)

    # Unparseable input: fall back to an all-zero vector of the same length.
    if not molecule:
        return np.zeros(n_bits)

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=n_bits)
    return np.array(bit_vector)

# Discard rows with any missing value before featurising.
df = df.dropna()
# One fingerprint per SMILES string, stacked into a single 2-D array.
X = np.array(list(map(smiles_to_ecfp, df['smiles'])))
# Target labels, taken from the 'HIV_active' column.
y = df['HIV_active'].values

# ✅ STEP 5: Train-test split and scaling
# The labels are imbalanced (class weights are computed for that reason in
# the next step), so stratify on y to keep the active/inactive ratio the same
# in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then reuse its statistics
# for the test partition so no test information leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ✅ STEP 6: Handle class imbalance using class weights
# compute_class_weight returns one weight per entry of `classes`, in the same
# order, so pair them up instead of assuming positions 0 and 1.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# Plain Python int/float keep the mapping (and its printout) free of NumPy
# scalar wrappers before it is handed to model.fit(class_weight=...).
cw = {int(label): float(weight) for label, weight in zip(classes, class_weights)}
print("⚖️ Class Weights:", cw)

# ✅ STEP 7: Build the deep learning model
# Feed-forward binary classifier: two ReLU hidden layers with dropout and a
# single sigmoid output unit.
model = Sequential([
    # Declare the input with an explicit Input object; passing `input_shape`
    # to the first Dense layer makes Keras emit a warning.
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# ✅ STEP 8: Fit the network on the scaled training data
# The last 10% of the training arrays is held out for per-epoch validation,
# and the class-weight mapping is applied to the loss.
model.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=64,
    epochs=30,
    verbose=1,
    validation_split=0.1,
    class_weight=cw,
)

# ✅ STEP 9: Evaluate model on test data
from sklearn.metrics import roc_auc_score  # only this step needs it

# Predicted probabilities, flattened from (n_samples, 1) to (n_samples,).
test_preds = model.predict(X_test_scaled).flatten()
# Hard labels at the 0.5 decision threshold.
test_classes = (test_preds > 0.5).astype(int)
acc = accuracy_score(y_test, test_classes)
print(f"🧪 Test Accuracy: {acc:.4f}")

# With imbalanced labels, accuracy can look high even for a model that rarely
# predicts the minority class, so also report the threshold-independent
# ROC-AUC computed on the raw probabilities.
auc = roc_auc_score(y_test, test_preds)
print(f"🧪 Test ROC-AUC: {auc:.4f}")

# ✅ STEP 10: Define Gradio Dashboard
def predict_from_smiles(smiles_input):
    """Predict HIV activity for one SMILES string and render the molecule.

    Args:
        smiles_input: SMILES text typed by the user. ``None``, non-string
            and blank values are treated as invalid input.

    Returns:
        A 3-tuple ``(prediction, score, html)``:
        * ``prediction`` - ``"HIV Active"`` when the model output is above
          0.5, otherwise ``"Not Active"``;
        * ``score`` - the model output formatted with two decimals;
        * ``html`` - an ``<img>`` tag embedding a base64 PNG of the molecule.
        For invalid input the tuple is
        ``("Invalid SMILES", "-", <red error message HTML>)``.
    """
    invalid_result = (
        "Invalid SMILES",
        "-",
        "<b style='color:red;'>Invalid SMILES structure</b>",
    )

    # Reject missing or blank input up front: None would raise inside the
    # parser, and a blank string would otherwise be scored like a molecule.
    if not isinstance(smiles_input, str) or not smiles_input.strip():
        return invalid_result
    smiles = smiles_input.strip()

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return invalid_result

    fp = smiles_to_ecfp(smiles)
    # The scaler expects a 2-D batch, hence the single-row list.
    fp_scaled = scaler.transform([fp])
    # One input row and one output unit: take the scalar explicitly rather
    # than calling float() on a 1-element array (deprecated in NumPy).
    prob = float(model.predict(fp_scaled)[0][0])
    pred = "HIV Active" if prob > 0.5 else "Not Active"

    # Render the structure to PNG in memory and inline it as a data URI.
    img = Draw.MolToImage(mol, size=(250, 250))
    buffer = io.BytesIO()
    img.save(buffer, format='PNG')
    img_b64 = base64.b64encode(buffer.getvalue()).decode()
    html_img = f'<img src="data:image/png;base64,{img_b64}"/>'

    return pred, f"{prob:.2f}", html_img

# ✅ STEP 11: Assemble the Gradio UI and start it
# Single text input for the SMILES string.
smiles_box = gr.Textbox(label="Enter SMILES String")

# One output widget per element of the tuple returned by predict_from_smiles.
result_widgets = [
    gr.Label(label="Prediction"),
    gr.Label(label="Confidence Score"),
    gr.HTML(label="Molecule Structure"),
]

iface = gr.Interface(
    fn=predict_from_smiles,
    inputs=smiles_box,
    outputs=result_widgets,
    title="🧠 AI-Powered HIV Activity Prediction",
    description="Enter a molecule’s SMILES string to predict if it is HIV Active. Built using Deep Learning + RDKit + ECFP4.",
)

# share=True requests a shareable link in addition to the local server.
iface.launch(share=True)


🧪 Dataset Loaded: (41127, 3)




⚖️ Class Weights: {0: np.float64(0.5180444024563061), 1: np.float64(14.354712041884817)}



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Epoch 1/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 13s 24ms/step - accuracy: 0.6175 - loss: 0.8558 - val_accuracy: 0.8095 - val_loss: 0.5582
Epoch 2/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 20s 23ms/step - accuracy: 0.7902 - loss: 0.4391 - val_accuracy: 0.7086 - val_loss: 0.5702
Epoch 3/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 20s 23ms/step - accuracy: 0.8377 - loss: 0.3161 - val_accuracy: 0.8833 - val_loss: 0.3527
Epoch 4/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 10s 22ms/step - accuracy: 0.9014 - loss: 0.2430 - val_accuracy: 0.9031 - val_loss: 0.2797
Epoch 5/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 11s 24ms/step - accuracy: 0.9269 - loss: 0.1677 - val_accuracy: 0.8508 - val_loss: 0.3997
Epoch 6/30
463/463 ━━━━━━━━━━━━━━━━━━━━ 20s 24ms/step - accuracy: 0.9286 - loss: 0.1597 - val_accuracy: 0.9012 - val_loss: 0.2925
Epoch 7/30
[1m4

