Since we have more active than inactive compounds, weu can balance the classes using 

1) Oversampling (SMOTE – Synthetic Minority Oversampling Technique), which generates synthetic inactive samples to balance the dataset.
2) Undersampling

In [8]:
import pandas as pd
df= pd.read_csv(r"C:\Users\saman\Desktop\Gyrase\dna_gyrase_bioactivity_data_class_pIC50.csv")
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL187677,C[C@H]1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)...,active,360.385,2.3071,2,5,8.301030
1,1,CHEMBL363449,CC1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,active,360.385,2.3071,2,5,8.000000
2,2,CHEMBL8,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,active,331.347,1.5833,2,5,8.301030
3,3,CHEMBL192226,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C\F)c3c2F)CC1,active,367.327,2.1274,1,5,7.141463
4,4,CHEMBL371124,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C/F)c3c2F)CC1,active,367.327,2.1274,1,5,6.177832
...,...,...,...,...,...,...,...,...,...
541,541,CHEMBL5431772,COc1ccc2nccc(OC[C@H]3CC[C@H](NC(=O)c4cc(F)c(Br...,inactive,506.347,5.0467,1,5,4.000000
542,542,CHEMBL5429864,COc1ccc2nccc(OC[C@H]3CC[C@H](NC(=O)c4ccc(I)c(F...,inactive,535.357,4.7497,1,5,4.000000
543,543,CHEMBL5394250,COc1ccc2nccc(OC[C@@H]3CC[C@@H](NC(=O)c4cc(F)c(...,active,463.868,3.9264,1,6,6.351640
544,544,CHEMBL5406927,COc1ccc2nccc(OC[C@@H]3CC[C@@H](NC(=O)c4cc(F)c(...,active,508.319,4.0355,1,6,6.732828


In [5]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Generate an imbalanced dataset (for demonstration)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    weights=[0.95, 0.05],  # 95% Class 0, 5% Class 1
    random_state=42
)

# Split into train and test sets (keeping imbalance in test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check original class distribution
print("Original class distribution (Train):", Counter(y_train))
print("Original class distribution (Test):", Counter(y_test))

# Apply SMOTE to balance the training set only (test set remains imbalanced)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE (Train):", Counter(y_train_smote))

# Train a classifier (Random Forest) on the balanced data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict on the original (imbalanced) test set
y_pred = model.predict(X_test)

# Generate classification report
print("\nClassification Report (on original test set):")
print(classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1']))

Original class distribution (Train): Counter({0: 758, 1: 42})
Original class distribution (Test): Counter({0: 189, 1: 11})

Class distribution after SMOTE (Train): Counter({0: 758, 1: 758})





Classification Report (on original test set):
              precision    recall  f1-score   support

     Class 0       0.96      0.98      0.97       189
     Class 1       0.57      0.36      0.44        11

    accuracy                           0.95       200
   macro avg       0.77      0.67      0.71       200
weighted avg       0.94      0.95      0.94       200



Undersampling


In [3]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Generate synthetic imbalanced dataset (or load your own)
# Let's assume: 95% active (1), 5% inactive (0)
np.random.seed(42)
X = np.random.rand(1000, 10)  # 1000 samples, 10 features
y = np.array([1] * 950 + [0] * 50)  # 950 actives, 50 inactives

# Check class distribution
print("Original class distribution:", Counter(y))

# Split into train and test (stratified to preserve imbalance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Undersample the majority class (active = 1) to match minority class
# Step 1: Separate majority and minority classes
df_train = pd.DataFrame(X_train)
df_train['target'] = y_train

majority = df_train[df_train['target'] == 1]
minority = df_train[df_train['target'] == 0]

# Step 2: Randomly undersample majority class
majority_downsampled = resample(
    majority,
    replace=False,  # Sample without replacement
    n_samples=len(minority),  # Match minority class size
    random_state=42
)

# Step 3: Combine minority and downsampled majority
df_balanced = pd.concat([majority_downsampled, minority])

# Step 4: Shuffle the dataset to avoid ordering bias
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract X and y
X_balanced = df_balanced.drop('target', axis=1).values
y_balanced = df_balanced['target'].values

# Check new class distribution
print("Balanced class distribution:", Counter(y_balanced))

# Train a model (e.g., Random Forest) and evaluate
model = RandomForestClassifier(random_state=42)
model.fit(X_balanced, y_balanced)
y_pred = model.predict(X_test)

# Evaluate
print("\nClassification Report (on original test set):")
print(classification_report(y_test, y_pred))

Original class distribution: Counter({1: 950, 0: 50})
Balanced class distribution: Counter({1: 40, 0: 40})

Classification Report (on original test set):
              precision    recall  f1-score   support

           0       0.06      0.60      0.10        10
           1       0.96      0.46      0.62       190

    accuracy                           0.47       200
   macro avg       0.51      0.53      0.36       200
weighted avg       0.91      0.47      0.59       200



Since both SMOTE and undersampling did not result in satisfactory outputs (though the recall is high but the precision is low). 
we will try to handle class imbalnce using decoys generated from DUDE-z

In [10]:
df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL187677,C[C@H]1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)...,active,360.385,2.3071,2,5,8.301030
1,1,CHEMBL363449,CC1CCc2c(N3CCC(O)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,active,360.385,2.3071,2,5,8.000000
2,2,CHEMBL8,O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O,active,331.347,1.5833,2,5,8.301030
3,3,CHEMBL192226,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C\F)c3c2F)CC1,active,367.327,2.1274,1,5,7.141463
4,4,CHEMBL371124,CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(/C=C/F)c3c2F)CC1,active,367.327,2.1274,1,5,6.177832
...,...,...,...,...,...,...,...,...,...
541,541,CHEMBL5431772,COc1ccc2nccc(OC[C@H]3CC[C@H](NC(=O)c4cc(F)c(Br...,inactive,506.347,5.0467,1,5,4.000000
542,542,CHEMBL5429864,COc1ccc2nccc(OC[C@H]3CC[C@H](NC(=O)c4ccc(I)c(F...,inactive,535.357,4.7497,1,5,4.000000
543,543,CHEMBL5394250,COc1ccc2nccc(OC[C@@H]3CC[C@@H](NC(=O)c4cc(F)c(...,active,463.868,3.9264,1,6,6.351640
544,544,CHEMBL5406927,COc1ccc2nccc(OC[C@@H]3CC[C@@H](NC(=O)c4cc(F)c(...,active,508.319,4.0355,1,6,6.732828


In [11]:
df.columns

Index(['Unnamed: 0', 'molecule_chembl_id', 'canonical_smiles', 'class', 'MW',
       'LogP', 'NumHDonors', 'NumHAcceptors', 'pIC50'],
      dtype='object')

In [17]:
import pandas as pd

# Drop rows with missing SMILES
df = df.dropna(subset=['canonical_smiles'])

# Create .smi lines: SMILES<TAB>CompoundName (using molecule_chembl_id)
smi_lines = df.apply(lambda x: f"{x['canonical_smiles']}\t{x['molecule_chembl_id']}", axis=1)

# Save to .smi file
smi_lines.to_csv('dna_gyrase_inhibitors.smi', index=False, header=False)

print("SMILES file saved as 'smiles.smi'")



SMILES file saved as 'smiles.smi'


using the decoys.smi files generated by dude-z we will combine them to form a new dataset

In [35]:
import pandas as pd

# Load actives
df_actives = pd.read_csv(r"C:\Users\saman\Desktop\Gyrase\dna_gyrase_bioactivity_data_class_pIC50.csv")
df_actives['label'] = 1  # Assign label 1 to actives
print(f"Actives: {len(df_actives)}")

# Load decoys from SMILES file
decoys_file = r"C:\Users\saman\Desktop\Gyrase\decoys.smi"
with open(decoys_file, 'r') as f:
    decoys_smiles = [line.strip() for line in f if line.strip()]
print(f"Decoys: {len(decoys_smiles)}")

# Create DataFrame for decoys with label 0
df_decoys = pd.DataFrame({'smiles': decoys_smiles, 'label': 0})
print(f"Decoys (DataFrame): {len(df_decoys)}")

# Combine actives and decoys
df_combined = pd.concat([df_actives, df_decoys], ignore_index=True)

# Shuffle the combined dataset
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)




Actives: 546
Decoys: 7000
Decoys (DataFrame): 7000


In [36]:
df_combined


Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50,label,smiles
0,,,,,,,,,,0,O=c1cc(O)n(-c2ccc(Br)cc2)[n-]1 ZINC000000039383
1,393.0,CHEMBL4756100,COc1ccc2nccc(CC[C@H]3OC[C@H](NC(=O)c4ccc(Cl)c(...,active,461.345,4.6543,1.0,5.0,6.229148,1,
2,,,,,,,,,,0,O=C1c2ccccc2S(=O)(=O)c2cc(-c3nn[n-]n3)ccc21 ZI...
3,,,,,,,,,,0,CN(C)c1ccc(/C=N/NS(=O)(=O)c2ccc(Cl)cc2)cc1 ZIN...
4,23.0,CHEMBL5315124,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,active,361.373,1.5440,1.0,6.0,8.031050,1,
...,...,...,...,...,...,...,...,...,...,...,...
7541,,,,,,,,,,0,FC(F)(F)c1ccc2nsnc2c1 ZINC000000071474
7542,,,,,,,,,,0,Cc1ccc([C@@H](Nc2cc(C)cc[nH+]2)c2c(C)[nH]c3ccc...
7543,,,,,,,,,,0,O=S(=O)(C=C([O-])c1c(F)cccc1Cl)c1ccc(Cl)cc1 ZI...
7544,,,,,,,,,,0,Cc1ccc(/N=c2\sccn2C)c(C)c1 ZINC000000001237


In [37]:
df_combined.to_csv(r"C:\Users\saman\Desktop\Gyrase\combined_dataset.csv", index=False)