In [6]:
from rdkit import Chem
import pandas as pd
from rdkit.Chem import rdFingerprintGenerator


# Path to your CSV file
file_path = 'NEW_smilesforcompounds.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path, header=None)  # Use header=None if there are no headers

# Convert the DataFrame column to a list
smiles_list = data[0].tolist()

# Function to generate Morgan (ECFP) fingerprints using MorganGenerator
def generate_morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fingerprints = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            fingerprint = generator.GetFingerprint(mol)
            # print(fingerprint)
            # Convert the fingerprint to a list
            fingerprints.append(list(fingerprint))
        else:
            fingerprints.append([None] * n_bits)
    return fingerprints

# Generate fingerprints with the desired settings
radius = 2  # ECFP4 corresponds to a radius of 2
n_bits = 2048  # Size of the fingerprint
fingerprints = generate_morgan_fingerprints(smiles_list, radius=radius, n_bits=n_bits)

# Convert to DataFrame for easier viewing
fingerprints_df = pd.DataFrame(fingerprints)

# Display the first few rows of the DataFrame
print(fingerprints_df.head())

# Optionally, save the DataFrame to a CSV file
fingerprints_df.to_csv("morgan_fingerprints.csv", index=False)

   0     1     2     3     4     5     6     7     8     9     ...  2038  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     1     0     0  ...     0   
2     0     0     0     0     0     0     0     1     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     1     0     0  ...     0   

   2039  2040  2041  2042  2043  2044  2045  2046  2047  
0     0     0     0     0     0     0     0     0     0  
1     0     0     0     0     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 2048 columns]


In [7]:
# Assuming 'labels' is a list of bioactivity labels corresponding to the SMILES strings
labels = [1 for i in range(112)]
labels.extend([0 for i in range(112)])
fingerprints_df['Bioactivity'] = labels

In [8]:
import sklearn
from sklearn.model_selection import train_test_split

X = fingerprints_df.drop(columns=['Bioactivity'])  # Features
y = fingerprints_df['Bioactivity']  # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  from scipy.sparse import csr_matrix, issparse


TypeError: 'type' object is not subscriptable

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

y_pred = model.predict(X_test)

# For classification
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'AUC-ROC: {auc}')

Accuracy: 0.8571428571428571
F1 Score: 0.5
AUC-ROC: 0.6666666666666666


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(grid_search.best_params_)

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
import numpy as np

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
for i in range(10):
    print(f"{i + 1}. Feature {indices[i]} ({importances[indices[i]]})")

1. Feature 389 (0.050696493556499585)
2. Feature 1844 (0.03762001188240858)
3. Feature 1564 (0.03128755562934798)
4. Feature 1737 (0.02320693937758288)
5. Feature 1607 (0.015043383689187726)
6. Feature 79 (0.014392875843760058)
7. Feature 410 (0.009541105499347006)
8. Feature 140 (0.009250646936479956)
9. Feature 857 (0.009163118669558754)
10. Feature 519 (0.009151081704750472)


In [None]:
print(grid_search.best_params_)

{'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming best_model is already defined from previous GridSearchCV
# and X_test, y_test are your test data and labels.

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Set Accuracy: 0.7857

Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88        22
           1       0.00      0.00      0.00         6

    accuracy                           0.79        28
   macro avg       0.39      0.50      0.44        28
weighted avg       0.62      0.79      0.69        28


Confusion Matrix:
[[22  0]
 [ 6  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
