Since we have more active cpompounds than inactives, we can balance them using 
1. Oversampling using SMOTE (Syntehtic Minority over sampling technique)
2. Undersampling

In [1]:
import pandas as pd
df= pd.read_csv('dna_gyrase_bioactivity_data_class_pIC50.csv')
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL187677,C[C@H]1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)...,active,360.385,2.3071,2.0,5.0,8.301030
1,1,CHEMBL363449,CC1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,active,360.385,2.3071,2.0,5.0,8.000000
2,2,CHEMBL8,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,active,331.347,1.5833,2.0,5.0,8.301030
3,3,CHEMBL192226,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C\F)c3c2F)CC1,active,367.327,2.1274,1.0,5.0,7.141463
4,4,CHEMBL371124,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C/F)c3c2F)CC1,active,367.327,2.1274,1.0,5.0,6.177832
...,...,...,...,...,...,...,...,...,...
663,663,,,active,,,,,
664,664,,,active,,,,,
665,665,,,active,,,,,
666,666,,,active,,,,,


In [2]:
print(df['class'].unique())

['active' 'intermediate' 'inactive' nan]


In [3]:
# Keep only active and inactive
df = df[df['class'].isin(['active', 'inactive'])]

# Convert to numeric: active = 1, inactive = 0
df['class'] = df['class'].map({'active': 1, 'inactive': 0})

# Confirm
print(df['class'].value_counts())


class
1    284
0    138
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class'] = df['class'].map({'active': 1, 'inactive': 0})


In [4]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Step 1: Prepare features and labels
# Before splitting the data
df = df.dropna()
X = df.drop(columns=['class'])  # Drop target
X = X.select_dtypes(include=[np.number])    # Keep numeric only
y = df['class']

# Step 2: Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Original class distribution (Train):", Counter(y_train))
print("Original class distribution (Test):", Counter(y_test))

# Step 3: Apply SMOTE to training set only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE (Train):", Counter(y_train_smote))

# Step 4: Train Random Forest
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Step 5: Predict and evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Inactive (0)', 'Active (1)']))



Original class distribution (Train): Counter({1: 163, 0: 94})
Original class distribution (Test): Counter({1: 41, 0: 24})

Class distribution after SMOTE (Train): Counter({1: 163, 0: 163})





Classification Report:
              precision    recall  f1-score   support

Inactive (0)       0.79      0.79      0.79        24
  Active (1)       0.88      0.88      0.88        41

    accuracy                           0.85        65
   macro avg       0.83      0.83      0.83        65
weighted avg       0.85      0.85      0.85        65



In [5]:
#Undersampling
import pandas as pd
import numpy as np
from sklearn.utils import resample
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Step 1: Filter only numeric features
X = df.drop(columns=['Unnamed: 0', 'molecule_chembl_id', 'canonical_smiles', 'class'])
X = X.select_dtypes(include=[np.number])
y = df['class'].astype(int)  # Ensure target is numeric

# Step 2: Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Original class distribution (Train):", Counter(y_train))
print("Original class distribution (Test):", Counter(y_test))

# Step 3: Convert training set to DataFrame for resampling
df_train = X_train.copy()
df_train['target'] = y_train.values

# Step 4: Separate majority and minority classes
majority = df_train[df_train['target'] == 1]
minority = df_train[df_train['target'] == 0]

# Step 5: Undersample the majority class to match minority class
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=len(minority),
    random_state=42
)

# Step 6: Combine to get balanced dataset
df_balanced = pd.concat([majority_downsampled, minority])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 7: Separate features and target again
X_balanced = df_balanced.drop('target', axis=1).values
y_balanced = df_balanced['target'].values

print("Balanced class distribution (Train):", Counter(y_balanced))

# Step 8: Train and evaluate model
model = RandomForestClassifier(random_state=42)
model.fit(X_balanced, y_balanced)

y_pred = model.predict(X_test)

print("\nClassification Report (on original test set):")
print(classification_report(y_test, y_pred, target_names=['Inactive (0)', 'Active (1)']))


Original class distribution (Train): Counter({1: 163, 0: 94})
Original class distribution (Test): Counter({1: 41, 0: 24})
Balanced class distribution (Train): Counter({0: 94, 1: 94})

Classification Report (on original test set):
              precision    recall  f1-score   support

Inactive (0)       0.59      0.67      0.63        24
  Active (1)       0.79      0.73      0.76        41

    accuracy                           0.71        65
   macro avg       0.69      0.70      0.69        65
weighted avg       0.72      0.71      0.71        65





Since both SMOTE and undersampling did not result in satisfactory outputs (though the recall is high but the precision is low). 
we will try to handle class imbalnce using decoys generated from DUDE-z

In [6]:
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL187677,C[C@H]1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)...,1,360.385,2.30710,2.0,5.0,8.301030
1,1,CHEMBL363449,CC1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,1,360.385,2.30710,2.0,5.0,8.000000
2,2,CHEMBL8,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,1,331.347,1.58330,2.0,5.0,8.301030
3,3,CHEMBL192226,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C\F)c3c2F)CC1,1,367.327,2.12740,1.0,5.0,7.141463
4,4,CHEMBL371124,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C/F)c3c2F)CC1,1,367.327,2.12740,1.0,5.0,6.177832
...,...,...,...,...,...,...,...,...,...
419,419,CHEMBL4866508,Cc1[nH]c(C(=O)N(c2nc3ccc(C(=O)O)cc3s2)C(C)C)c(...,0,412.298,4.99302,2.0,4.0,6.000000
420,420,CHEMBL4862321,Cc1[nH]c(C(=O)Nc2nc3cc(O)c(C(=O)O)cc3s2)c(Cl)c1Cl,0,386.216,3.89572,4.0,5.0,7.214670
421,421,CHEMBL4847572,O=C(O)c1cc(O)c2nc(NC(=O)c3cc(Br)c(Br)[nH]3)sc2c1,1,461.091,3.80550,4.0,5.0,6.769551
422,422,CHEMBL4847190,Cc1[nH]c(C(=O)Nc2nc3c(O)cc(C(=O)O)cc3s2)c(Cl)c1Cl,1,386.216,3.89572,4.0,5.0,7.657577


In [7]:
df.columns

Index(['Unnamed: 0', 'molecule_chembl_id', 'canonical_smiles', 'class', 'MW',
       'LogP', 'NumHDonors', 'NumHAcceptors', 'pIC50'],
      dtype='object')

In [8]:
import pandas as pd

# Drop rows with missing SMILES
df = df.dropna(subset=['canonical_smiles'])

# Create .smi lines: SMILES<TAB>CompoundName (using molecule_chembl_id)
smi_lines = df.apply(lambda x: f"{x['canonical_smiles']}\t{x['molecule_chembl_id']}", axis=1)

# Save to .smi file
smi_lines.to_csv('dna_gyrase_inhibitors.smi', index=False, header=False)

print("SMILES file saved as 'smiles.smi'")

SMILES file saved as 'smiles.smi'


In [9]:
import pandas as pd
# Load the dataset
df_actives = pd.read_csv('dna_gyrase_bioactivity_data_class_pIC50.csv')
df_actives['label']=1 
print(f"Actives: {len(df_actives)}")

decoy_file= r"C:\Users\saman\Desktop\DNAGyrase\decoys_20377\smi\dna_gyrase_inhibitors.smi"
decoys_file = r"C:\Users\saman\Desktop\Gyrase\decoys.smi"
with open(decoys_file, 'r') as f:
    decoys_smiles = [line.strip() for line in f if line.strip()]
print(f"Decoys: {len(decoys_smiles)}")


# Create DataFrame for decoys with label 0
df_decoys = pd.DataFrame({'smiles': decoys_smiles, 'label': 0})
print(f"Decoys (DataFrame): {len(df_decoys)}")


# Combine actives and decoys
df_combined = pd.concat([df_actives, df_decoys], ignore_index=True)

# Shuffle the combined dataset
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)




Actives: 668
Decoys: 6909
Decoys (DataFrame): 6909
