In [48]:
!pip install rdkit
!pip install scikit-learn



In [49]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import requests

In [50]:
def get_smiles_from_pubchem(drug_name, timeout=10):
    """Resolve a compound name to its canonical SMILES string via PubChem PUG REST.

    Parameters
    ----------
    drug_name : str
        Compound name to look up. Missing values (``None``, ``NaN``) and blank
        strings are treated as "nothing to look up".
    timeout : float, optional
        Seconds to wait for PubChem before giving up (default 10). Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The ``CanonicalSMILES`` value of the first matching record, or ``None``
        when the name is missing, the request fails, or the response does not
        have the expected structure. Failures are reported with ``print`` and
        never raised, so the function is safe to use with ``Series.apply``.
    """
    # Local import keeps the function self-contained without touching the
    # notebook's import cell.
    from urllib.parse import quote

    # pandas passes NaN (a float) for empty cells; do not query PubChem for "nan".
    if not isinstance(drug_name, str) or not drug_name.strip():
        return None

    # Percent-encode the name so characters such as '/', '?' or '#' cannot
    # change the structure of the request URL.
    encoded_name = quote(drug_name.strip(), safe='')
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/property/CanonicalSMILES/JSON"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        smiles = data['PropertyTable']['Properties'][0]['CanonicalSMILES']
        return smiles
    except requests.exceptions.HTTPError as err:
        print(f"HTTP error occurred: {err}")  # Handle errors in calling the API
    except Exception as err:
        # Covers timeouts, connection failures, invalid JSON and missing keys.
        print(f"Other error occurred: {err}")  # Handle other possible errors
    return None

In [51]:
# Smoke-test the lookup helper. NOTE: 'ligandName' is a placeholder rather than
# a real compound name, so PubChem answers 404 (see the recorded output below)
# and the function returns None.
get_smiles_from_pubchem('ligandName')

HTTP error occurred: 404 Client Error: PUGREST.NotFound for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/ligandName/property/CanonicalSMILES/JSON


In [52]:
def get_molecular_descriptors(smiles):
    """Compute three RDKit descriptors for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Missing values (``None``,
        ``NaN``) and blank strings are accepted and treated as unparseable.

    Returns
    -------
    list
        ``[MolWt, LogP, TPSA]`` as computed by ``Descriptors.MolWt``,
        ``Descriptors.MolLogP`` and ``Descriptors.TPSA``; or
        ``[None, None, None]`` when the input is missing or RDKit cannot
        parse it.
    """
    # Guard before calling RDKit: MolFromSmiles expects a string, and pandas
    # hands over NaN (a float) for empty cells.
    if not isinstance(smiles, str) or not smiles.strip():
        return [None, None, None]

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an unparseable SMILES by returning None.
        return [None, None, None]

    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol),
    ]

# Apply the function to the SMILES data
# data[['MolWt', 'LogP', 'TPSA']] = data['SMILES'].apply(lambda x: pd.Series(get_molecular_descriptors(x)))

In [53]:
# data = pd.read_csv('compounds.csv')
# print(data.head())

In [54]:
# pandas is already imported as `pd` in the notebook's import cell.

# Load the initial data with drug names (the file is semicolon-delimited)
data = pd.read_csv('compounds.csv', delimiter=';')

# Retrieve SMILES strings for each drug (one PubChem request per row)
data['SMILES'] = data['Agonist Name'].apply(get_smiles_from_pubchem)

# Filter out rows where SMILES could not be retrieved *before* featurizing, so
# the descriptor function only ever sees real strings. `.copy()` avoids
# assigning new columns onto a filtered view.
data = data.dropna(subset=['SMILES']).copy()

# Compute the descriptors in one pass and attach them by index.
descriptor_columns = ['MolWt', 'LogP', 'TPSA']
descriptors = pd.DataFrame(
    [get_molecular_descriptors(smiles) for smiles in data['SMILES']],
    columns=descriptor_columns,
    index=data.index,
).astype(float)
data[descriptor_columns] = descriptors

# Drop rows whose SMILES RDKit could not parse: their descriptors are NaN and
# must not reach the model as features.
data = data.dropna(subset=descriptor_columns)

# Now you can split the data, train your model, and make predictions as outlined previously

HTTP error occurred: 404 Client Error: PUGREST.NotFound for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/AFMK%20(N-acetyl-N-formyl-5-metoxykynurenamine)/property/CanonicalSMILES/JSON


In [55]:
feature_columns = ['MolWt', 'LogP', 'TPSA']
target_column = 'Activity'  # expected to hold binary labels (0 or 1)

# Fail early with an actionable message when the CSV does not provide the
# expected columns (a bare KeyError does not say what *is* available).
missing_columns = [
    column for column in feature_columns + [target_column]
    if column not in data.columns
]
if missing_columns:
    raise KeyError(
        f"Column(s) {missing_columns} not found in data; "
        f"available columns: {list(data.columns)}"
    )

# Keep only rows with a complete feature vector and a label, so no NaN values
# are handed to the estimator.
complete_rows = data.dropna(subset=feature_columns + [target_column])
X = complete_rows[feature_columns].values
y = complete_rows[target_column].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: 'Activity'

In [None]:
# Build a 100-tree random forest seeded with random_state=42, then fit it on
# the training split.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and report the fraction of correct labels.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Persist the fitted model to disk so it can be reloaded without retraining.
joblib.dump(value=model, filename='bioactivity_model.pkl')

In [None]:
# Load the compounds to score; the file must provide a 'SMILES' column.
new_data = pd.read_csv('new_compounds.csv')

# Featurize every row first. Non-string cells (e.g. NaN) are mapped to an
# all-missing descriptor row instead of being passed to RDKit.
prediction_features = ['MolWt', 'LogP', 'TPSA']
new_descriptors = pd.DataFrame(
    [
        get_molecular_descriptors(smiles) if isinstance(smiles, str) else [None, None, None]
        for smiles in new_data['SMILES']
    ],
    columns=prediction_features,
    index=new_data.index,
).astype(float)

# Only rows with a complete descriptor vector can be scored; the rest keep a
# missing prediction instead of crashing the whole cell.
scorable = new_descriptors.notna().all(axis=1)
new_data['Predictions'] = pd.Series([None] * len(new_data), index=new_data.index, dtype=object)
if scorable.any():
    # One batched predict call instead of one call per row. A plain array is
    # passed because the model was fitted on a NumPy array.
    new_data.loc[scorable, 'Predictions'] = model.predict(new_descriptors.loc[scorable].to_numpy())

print(new_data[['SMILES', 'Predictions']])