<a href="https://colab.research.google.com/github/robinrb7/MolGen/blob/main/MolGen1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
file_path = "/content/drive/MyDrive/155.csv"  # Replace with actual path
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as exc:
    # Same exception type as before, but with a hint about the usual cause.
    raise FileNotFoundError(
        f"Dataset not found at '{file_path}'. If running on Colab, mount Google Drive "
        "first, or point file_path at the actual location of the CSV."
    ) from exc

# Drop the identifier columns and Ligand_Mol_Weight; none of them appear in the
# feature or target lists below.
df = df.drop(columns=["PDB_ID", "Ligand_Name", "Ligand_Mol_Weight"])

# Define site features (input X) and ligand features (output Y)
site_features = [
    "Score", "Druggability_Score", "Alpha_Spheres", "Total_SASA", "Polar_SASA",
    "Apolar_SASA", "Volume", "Hydrophobicity_Density", "Alpha_Sphere_Radius",
    "Solvent_Access", "Apolar_Proportion", "Hydrophobicity_Score", "Volume_Score",
    "Polarity_Score", "Charge_Score", "Polar_Atom_Proportion", "Alpha_Sphere_Density",
    "COM_Max_Dist", "Flexibility"
]

ligand_features = [
    "Ligand_TPSA", "Ligand_LogP", "Ligand_Num_HDonors",
    "Ligand_Num_HAcceptors", "Ligand_Heavy_Atom", "Ligand_Frac_CSP3",
    "Ligand_Num_Rotatable_Bonds", "Ligand_Ring_Count", "Ligand_Num_Saturated_Rings",
    "Ligand_Num_Aliphatic_Rings", "Ligand_Mol_Refractivity", "Ligand_Num_Atom_StereoCenters",
    "Ligand_BalabanJ", "Ligand_Bertz_Complexity", "Ligand_Hall_Kier_Alpha",
    "Ligand_Kappa1", "Ligand_Kappa2"
]

X = df[site_features]
Y = df[ligand_features]

# Split BEFORE scaling so the scalers never see the held-out rows. Fitting the
# scalers on the full dataset would leak test-set statistics into training and
# into the reported metrics.
X_train_raw, X_test_raw, Y_train_raw, Y_test_raw = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Normalize features using StandardScaler (fitted on the training split only)
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)
Y_train = scaler_Y.fit_transform(Y_train_raw)
Y_test = scaler_Y.transform(Y_test_raw)

# Full-dataset arrays kept under their original names for any later cell that
# uses them; they are produced with the train-fitted scalers.
X_scaled = scaler_X.transform(X)
Y_scaled = scaler_Y.transform(Y)

# Define the XGBoost model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Hyperparameter tuning (5 * 5 * 3 * 2 * 2 = 300 candidates, 3 folds each)
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [5, 7, 9],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

grid_search = GridSearchCV(xgb_model, param_grid, scoring="neg_mean_squared_error", cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Get best model
best_xgb = grid_search.best_estimator_

# Make predictions (on scaled data)
Y_pred_scaled = best_xgb.predict(X_test)

# Inverse transform predictions to original scale; the true test targets are
# taken directly from the unscaled split.
Y_pred = scaler_Y.inverse_transform(Y_pred_scaled)
Y_test_original = Y_test_raw.to_numpy()

# Evaluate model. The aggregate numbers average over all targets, so the RMSE
# is dominated by targets with large numeric ranges; see the per-target table.
mse = mean_squared_error(Y_test_original, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test_original, Y_pred)

print(f"R² Score: {r2:.4f}")
print(f"Optimized RMSE: {rmse}")

# Per-target metrics in each target's own units
per_target_metrics = pd.DataFrame(
    {
        "RMSE": np.sqrt(mean_squared_error(Y_test_original, Y_pred, multioutput="raw_values")),
        "R2": r2_score(Y_test_original, Y_pred, multioutput="raw_values"),
    },
    index=ligand_features,
)
print(per_target_metrics.to_string(float_format="{:.4f}".format))

# Feature importance visualization. Pass the axes explicitly: without `ax`,
# plot_importance draws on a new figure, leaving the sized figure empty.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(best_xgb, ax=ax, max_num_features=15)

# The model was trained on a NumPy array, so XGBoost labels features f0, f1, ...
# Map those back to the site feature names when the labels have that form.
tick_labels = [tick.get_text() for tick in ax.get_yticklabels()]
if tick_labels and all(lbl.startswith("f") and lbl[1:].isdigit() for lbl in tick_labels):
    ax.set_yticklabels([site_features[int(lbl[1:])] for lbl in tick_labels])

ax.set_title("Top 15 Important Features")
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/155.csv'

In [None]:
import pandas as pd
import xgboost as xgb

# Load the trained model (requires the training cell to have been run in this
# kernel: it provides grid_search, scaler_X, scaler_Y and the feature lists).
best_xgb = grid_search.best_estimator_

# site_features / ligand_features are reused from the training cell rather than
# redefined here, so the column order always matches what the scalers and the
# model were fitted on.

# Load new site feature data
input_csv = "/content/drive/MyDrive/example.csv"
df = pd.read_csv(input_csv)

# Ensure correct columns (raises KeyError if any site feature is missing)
df = df[site_features]

# The model was trained on standardized inputs, so apply the same fitted
# scaler before predicting. Feeding raw values would give meaningless output.
X_new_scaled = scaler_X.transform(df)

# Predict ligand features (model output is in standardized target space)
predicted_ligands_scaled = best_xgb.predict(X_new_scaled)

# Convert predictions back to the original ligand-descriptor units
predicted_ligands = scaler_Y.inverse_transform(predicted_ligands_scaled)

# Convert predictions to DataFrame
pred_df = pd.DataFrame(predicted_ligands, columns=ligand_features)

# Save predictions to CSV
output_csv = "/content/drive/MyDrive/predicted_ligands.csv"
pred_df.to_csv(output_csv, index=False)

print(f"Predictions saved to {output_csv}")
pred_df.head()