# **Real-time inference**

In [27]:
!pip install rdkit-pypi




In [28]:
!pip uninstall -y numpy
!pip install numpy==1.24

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.24
  Downloading numpy-1.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m117.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.24.0 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.0 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.0 which is incompatible.
pymc 5.23.0 requires numpy>=1.25.0, but you have numpy 1.24.0 which is incom

In [29]:
!pip uninstall -y pandas
!pip install pandas

Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.0 which is incompatible.
pymc 5.23.0 requires numpy>=1.25.0, but you have numpy 1.24.0 which is incompatible.
seaborn 0.13.2 requires numpy!=1.24

In [10]:
import torch
import torch.nn as nn

# Define the model class exactly as before
class ImprovedMolecularNN(nn.Module):
    """Feed-forward binary classifier over a fixed-length feature vector.

    Four hidden stages (512 -> 256 -> 128 -> 64), each Linear -> BatchNorm1d ->
    LeakyReLU(0.1). Dropout (p=0.4) follows the first three stages only. A final
    Linear(64, 1) followed by a sigmoid produces one value between 0 and 1 per sample.

    Layer attribute names and their creation order are deliberately left untouched:
    PyTorch derives checkpoint keys from the attribute names.
    """

    def __init__(self, input_dim):
        """Create the layers for inputs with ``input_dim`` features per sample."""
        super().__init__()

        # Hidden stages: linear projection + batch normalisation.
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)

        # Output head: a single logit per sample.
        self.fc5 = nn.Linear(64, 1)

        # Parameter-free modules shared by all stages.
        self.leaky_relu = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(0.4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the network on ``x`` and return the sigmoid of the final linear layer."""
        stages_with_dropout = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in stages_with_dropout:
            x = self.dropout(self.leaky_relu(norm(linear(x))))

        # The last hidden stage is not followed by dropout.
        x = self.leaky_relu(self.bn4(self.fc4(x)))
        return self.sigmoid(self.fc5(x))

# Number of descriptor features the network expects per molecule.
input_dim = 140
MODEL_PATH = "1-2D rdki best nn_model.pth"

# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python objects, which
# can execute code. Only load checkpoint files that come from a trusted source.
checkpoint = torch.load(MODEL_PATH, map_location=torch.device('cpu'), weights_only=False)

if isinstance(checkpoint, nn.Module):
    # The file holds a fully pickled model (ImprovedMolecularNN must already be defined
    # so it can be unpickled). Use it directly rather than building a throwaway instance.
    model = checkpoint
else:
    # Otherwise treat the file as a state_dict and load it into a freshly built network.
    model = ImprovedMolecularNN(input_dim)
    model.load_state_dict(checkpoint)

# Fail fast if the loaded network disagrees with the configured input_dim.
loaded_in_features = getattr(getattr(model, "fc1", None), "in_features", input_dim)
assert loaded_in_features == input_dim, (
    f"Model expects {loaded_in_features} input features, but input_dim is {input_dim}"
)

# Switch to inference mode (affects BatchNorm and Dropout); as the last expression of
# the cell this also displays the model summary.
model.eval()

ImprovedMolecularNN(
  (fc1): Linear(in_features=140, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc5): Linear(in_features=64, out_features=1, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.1)
  (dropout): Dropout(p=0.4, inplace=False)
  (sigmoid): Sigmoid()
)

In [12]:
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np

# RDKit descriptor names handed to the descriptor calculator, in feature order.
# The list length is asserted against input_dim right after this list.
# NOTE(review): names and order presumably mirror the feature columns the model was
# trained on — confirm against the training pipeline before editing this list.
descriptor_names = [
    'MaxEStateIndex', 'MinEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt',
    'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'FpDensityMorgan1',
    'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_MRHI',
    'BCUT2D_MRLOW', 'BalabanJ', 'HallKierAlpha', 'Kappa3',
    'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
    'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7',
    'PEOE_VSA8', 'PEOE_VSA9',
    'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA6', 'SMR_VSA7',
    'SMR_VSA8', 'SMR_VSA9',
    'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA12', 'SlogP_VSA3', 'SlogP_VSA4',
    'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9',
    'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4',
    'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8',
    'VSA_EState10', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState9',
    'FractionCSP3', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles',
    'NumAliphaticRings', 'NumAromaticHeterocycles', 'MolLogP',
    'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_NH',
    'fr_Ar_OH', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2',
    'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_SH', 'fr_aldehyde',
    'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amidine',
    'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur',
    'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine',
    'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen',
    'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan',
    'fr_isothiocyan', 'fr_ketone', 'fr_lactam', 'fr_lactone', 'fr_methoxy',
    'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom_nonortho',
    'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation',
    'fr_phos_acid', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd',
    'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone',
    'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan',
    'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea'
]



# Ensure the number of descriptors matches the model's input dimension
assert len(descriptor_names) == input_dim, f"Mismatch in descriptor count. Expected {input_dim}, got {len(descriptor_names)}"

# Fail fast on misspelled or unavailable descriptor names.
# NOTE(review): MolecularDescriptorCalculator appears to substitute a placeholder value
# for names it cannot resolve instead of raising, which would silently corrupt a
# feature — confirm against the installed RDKit version.
unknown_descriptors = [name for name in descriptor_names if not hasattr(Descriptors, name)]
assert not unknown_descriptors, f"Unknown RDKit descriptor names: {unknown_descriptors}"

# Calculator that evaluates the descriptors in the order given by descriptor_names.
descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

def smiles_to_descriptors(smiles: str) -> np.ndarray:
    """Convert one SMILES string into the descriptor feature vector for the model.

    Args:
        smiles: SMILES representation of a single molecule.

    Returns:
        1-D float32 array with one value per entry of ``descriptor_names``, in that order.

    Raises:
        ValueError: if ``smiles`` is not a non-empty string, cannot be parsed by RDKit,
            or yields at least one descriptor that is NaN or infinite.

    NOTE(review): the values returned here are raw RDKit outputs. If the model was
    trained on scaled features, the same fitted scaler has to be applied before
    prediction — confirm against the training pipeline.
    """
    # Reject empty / non-string input up front instead of computing descriptors
    # for an empty molecule or failing inside RDKit with an unrelated error type.
    if not isinstance(smiles, str) or not smiles.strip():
        raise ValueError(f"Invalid SMILES: {smiles!r}")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")

    features = np.array(descriptor_calculator.CalcDescriptors(mol), dtype=np.float32)

    # A NaN/inf feature would propagate through the network and produce a
    # meaningless score, so surface it as an error naming the offending descriptors.
    non_finite = ~np.isfinite(features)
    if non_finite.any():
        bad_names = [name for name, bad in zip(descriptor_names, non_finite) if bad]
        raise ValueError(f"Non-finite descriptor values for SMILES {smiles!r}: {bad_names}")

    return features

In [13]:
import numpy as np

def predict(input_features):
    """Run the global ``model`` on one feature vector or on a batch of them.

    Args:
        input_features: array-like of numbers. A 1-D input is treated as a single
            sample and gets a batch dimension added; any other rank is passed to
            the model unchanged.

    Returns:
        float when the model output holds exactly one value (the single-sample case),
        otherwise a flat list of floats with one entry per output value.
    """
    model.eval()
    with torch.no_grad():
        # as_tensor avoids the extra copy (and the warning) torch.tensor() incurs
        # when the caller already passes a tensor.
        input_tensor = torch.as_tensor(input_features, dtype=torch.float32)

        # If input is 1D (single sample), add batch dimension
        if input_tensor.ndim == 1:
            input_tensor = input_tensor.unsqueeze(0)

        scores = model(input_tensor).reshape(-1)

        # .item() only works for one-element tensors; batches are returned as a list
        # instead of raising.
        if scores.numel() == 1:
            return scores.item()
        return scores.tolist()

In [14]:
def predict_from_smiles(smiles: str) -> float:
    """Return the model score for a single molecule given as a SMILES string.

    The SMILES is converted with ``smiles_to_descriptors`` and the resulting
    feature vector is passed to ``predict``; errors from either call propagate.
    """
    return predict(smiles_to_descriptors(smiles))

In [17]:
# Single-molecule example: score aspirin and show the raw model output.
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # Aspirin
score = predict_from_smiles(smiles)
label = int(score > 0.5)  # 1 when the score exceeds 0.5, else 0 (not printed in this cell)
print("Predicted probability of activity:", score)

Predicted probability of activity: 0.9997808337211609


In [24]:
smiles = "CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)Cc3ccccc3)C(=O)O)C"  # Benzylpenicillin (penicillin G), not aspirin
score = predict_from_smiles(smiles)
label = 1 if score > 0.5 else 0  # 0.5 cut-off on the model score
print("Predicted class label:", label)


Predicted class label: 1


In [26]:
# SMILES strings to score in one pass; extend the list as needed.
smiles_list = [
    "CCO",          # Ethanol
    "C",            # Methane
    "CC(=O)N",      # Acetamide
    "C1=CC=CC=C1",  # Benzene
]

# One output line per molecule: score plus the label from the 0.5 cut-off.
for s in smiles_list:
    score = predict_from_smiles(s)
    label = int(score > 0.5)
    print(f"SMILES: {s} | Score: {score:.4f} | Predicted label: {label}")

SMILES: CCO | Score: 0.9974 | Predicted label: 1
SMILES: C | Score: 0.9796 | Predicted label: 1
SMILES: CC(=O)N | Score: 0.9952 | Predicted label: 1
SMILES: C1=CC=CC=C1 | Score: 0.9997 | Predicted label: 1


# **balancing**

In [29]:
# Baseline: RBF-kernel SVM trained and evaluated on RDkit-2D_scaled_data.csv

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Read the feature table from disk.
RDkit_2D_scaled_data = pd.read_csv('RDkit-2D_scaled_data.csv')

# The 'values' column is the classification target; every other column is a feature.
y = RDkit_2D_scaled_data['values']
X = RDkit_2D_scaled_data.drop(columns=['values'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Build and fit the classifier in one step (fit returns the estimator itself).
svm_rbf = SVC(kernel='rbf', class_weight='balanced', random_state=42).fit(X_train, y_train)

# Predict on the held-out split.
y_pred = svm_rbf.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("SVM RBF Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


SVM RBF Classifier Performance:
Accuracy: 0.622588424437299
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.84      0.69      1244
           1       0.71      0.41      0.52      1244

    accuracy                           0.62      2488
   macro avg       0.65      0.62      0.60      2488
weighted avg       0.65      0.62      0.60      2488



In [32]:
# prompt: make real time testing using this model

def real_time_testing(smiles_list, threshold=0.5):
    """
    Score each SMILES string with predict_from_smiles and print one line per molecule.

    A molecule is labelled 1 when its score is strictly greater than ``threshold``,
    otherwise 0. A failure on one molecule is reported and does not stop the loop.

    Args:
        smiles_list (list): SMILES strings to predict on. A single bare string is
            treated as a one-element list rather than iterated character by character.
        threshold (float): Decision cut-off applied to the score. Defaults to 0.5,
            the value that was previously hard-coded.

    Returns:
        None. Results and errors are printed.
    """
    # Iterating a plain string would silently score each character as its own SMILES.
    if isinstance(smiles_list, str):
        smiles_list = [smiles_list]

    print("\n--- Real-Time Testing ---")
    for smiles in smiles_list:
        try:
            score = predict_from_smiles(smiles)
            label = 1 if score > threshold else 0
            print(f"SMILES: {smiles} | Score: {score:.4f} | Predicted label: {label}")
        except ValueError as e:
            # Raised for input the pipeline rejects, e.g. a SMILES RDKit cannot parse.
            print(f"Error processing SMILES '{smiles}': {e}")
        except Exception as e:
            # Best-effort batch run: report anything else and move on to the next molecule.
            print(f"An unexpected error occurred for SMILES '{smiles}': {e}")

# Example usage of the real_time_testing function, including one entry that is
# expected to fail so the error path is exercised.
test_smiles = [
    "CCC",         # Propane
    "O=C(C)Oc1ccccc1C(=O)O", # Aspirin, written as a different SMILES string than earlier
    "invalid_smiles", # Not a valid SMILES string; exercises the ValueError branch
    "C1=CC=NC=C1" # Pyridine
]

real_time_testing(test_smiles)



--- Real-Time Testing ---
SMILES: CCC | Score: 0.9773 | Predicted label: 1
SMILES: O=C(C)Oc1ccccc1C(=O)O | Score: 0.9998 | Predicted label: 1
Error processing SMILES 'invalid_smiles': Invalid SMILES: invalid_smiles
SMILES: C1=CC=NC=C1 | Score: 0.9998 | Predicted label: 1


[14:05:15] SMILES Parse Error: syntax error while parsing: invalid_smiles
[14:05:15] SMILES Parse Error: Failed parsing SMILES 'invalid_smiles' for input: 'invalid_smiles'


In [34]:
# Score ethanol, methane, acetamide and benzene through real_time_testing.
test_smiles = ["CCO", "C", "CC(=O)N", "C1=CC=CC=C1"]
real_time_testing(test_smiles)


--- Real-Time Testing ---
SMILES: CCO | Score: 0.9974 | Predicted label: 1
SMILES: C | Score: 0.9796 | Predicted label: 1
SMILES: CC(=O)N | Score: 0.9952 | Predicted label: 1
SMILES: C1=CC=CC=C1 | Score: 0.9997 | Predicted label: 1


In [35]:
# Baseline: random forest, reusing the X_train / X_test / y_train / y_test split

from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed and balanced class weights; n_estimators, max_depth,
# etc. can be tuned.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X_train, y_train)

# Predict on the held-out split.
y_pred_rf = rf_clf.predict(X_test)

# Report accuracy and the per-class precision/recall/F1 table.
print("\nRandom Forest Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))



Random Forest Classifier Performance:
Accuracy: 0.5711414790996785
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.66      0.61      1244
           1       0.59      0.48      0.53      1244

    accuracy                           0.57      2488
   macro avg       0.57      0.57      0.57      2488
weighted avg       0.57      0.57      0.57      2488



In [36]:
# Hyper-parameter tuning: random forest with a cross-validated grid search

from sklearn.model_selection import GridSearchCV

# Candidate values; 3 * 4 * 3 * 3 * 2 = 216 combinations in total.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    scoring='accuracy',  # switch to 'f1', 'roc_auc', ... to optimise another metric
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # use all available cores
    verbose=1,           # print progress
)

# Search on the training split only; the test split stays untouched until evaluation.
grid_search.fit(X_train, y_train)

print("\nBest parameters found by Grid Search:")
print(grid_search.best_params_)

# Estimator fitted with the best parameter combination.
best_rf_clf = grid_search.best_estimator_

# Predict on the held-out split with the tuned model.
y_pred_best_rf = best_rf_clf.predict(X_test)

print("\nRandom Forest Classifier Performance after Grid Search:")
print("Accuracy:", accuracy_score(y_test, y_pred_best_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_best_rf))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters found by Grid Search:
{'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

Random Forest Classifier Performance after Grid Search:
Accuracy: 0.6117363344051447
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.86      0.69      1244
           1       0.72      0.36      0.48      1244

    accuracy                           0.61      2488
   macro avg       0.65      0.61      0.59      2488
weighted avg       0.65      0.61      0.59      2488



In [37]:
# Score a further batch of molecules with the existing real_time_testing function.
# NOTE: real_time_testing goes through predict_from_smiles -> predict, i.e. it scores
# with the neural network held in `model`, not with the random-forest estimators
# (rf_clf / best_rf_clf) trained in the preceding cells.
print("\n--- Performing real-time testing on new SMILES list ---")
new_smiles_for_testing = [
    "C1=CC=C(C=C1)N", # Aniline
    "CC(C)(C)O",      # tert-Butanol
    "O=C1CCCCC1",    # Cyclohexanone
    "CCOc1ccccc1"    # Phenetole
]

real_time_testing(new_smiles_for_testing)



--- Performing real-time testing on new SMILES list ---

--- Real-Time Testing ---
SMILES: C1=CC=C(C=C1)N | Score: 0.9999 | Predicted label: 1
SMILES: CC(C)(C)O | Score: 1.0000 | Predicted label: 1
SMILES: O=C1CCCCC1 | Score: 0.9967 | Predicted label: 1
SMILES: CCOc1ccccc1 | Score: 0.9999 | Predicted label: 1
