## load dataset


In [None]:
!pip install pytdc
!pip install pandas

In [3]:
import pandas as pd

data = pd.read_csv("Lipophilicity.csv")
data.head()


Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


## extract the features from smiles

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw,Descriptors,rdmolops
from IPython.display import display
smiles=input("enter a smiles string pls")
mol=Chem.MolFromSmiles(smiles)
# Show molecular info
def featurize(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return [
        Descriptors.MolWt(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol),
        Descriptors.MolLogP(mol),
        Descriptors.TPSA(mol)
    ]




display(Draw.MolToImage(mol))

## build the dataset

In [5]:
X = []
y = []

for i, row in data.iterrows():
    feats = featurize(row["smiles"])
    if feats is not None:
        X.append(feats)
        y.append(1 if row["exp"] >1 else 0)  # or 'target' depending on dataset


In [6]:
print("First row of X:", X[0])
print("First label y:", y[0])
print("Total X:", len(X), "| Total y:", len(y))


First row of X: [340.85800000000006, 0, 4, 3.5489000000000024, 24.3]
First label y: 1
Total X: 4200 | Total y: 4200


## spliting dataset so that we can train and test


In [7]:
!pip install scikit-learn




In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## model preparation


In [9]:
!pip install numpy



In [10]:
import numpy as np

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()




In [11]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)


0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


## FINAL RESULT

## model testing with decision tree-80%

In [12]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(" Accuracy:\n", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Accuracy:
 0.8047619047619048

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.44      0.46       158
           1       0.87      0.89      0.88       682

    accuracy                           0.80       840
   macro avg       0.68      0.66      0.67       840
weighted avg       0.80      0.80      0.80       840


 Confusion Matrix:
 [[ 69  89]
 [ 75 607]]


In [None]:
!pip install pandas

In [14]:
import pandas as pd
pd.set_option("display.width", 1000)
pd.set_option("display.max_columns", None)


In [15]:
import pandas as pd

# 1. Create a DataFrame from X_test (test features)
X_test_df = pd.DataFrame(
    X_test,
    columns=["MolWt", "HDonors", "HAcceptors", "LogP", "TPSA"]
)

# 2. Add true labels and predicted labels
X_test_df["True Label"] = y_test
X_test_df["Predicted"] = y_pred
# 3. Add a column to check if prediction is correct
X_test_df["Correct?"] = X_test_df["True Label"] == X_test_df["Predicted"]
print(X_test_df.head(10))



     MolWt  HDonors  HAcceptors     LogP    TPSA  True Label  Predicted  Correct?
0  366.804      1.0         3.0  4.11690   62.55           1          0     False
1  341.451      2.0         4.0  3.24140   58.56           1          1      True
2  492.428      3.0         5.0  5.09260   91.32           0          1     False
3  281.381      1.0         5.0  1.29180   54.46           1          1      True
4  386.779      3.0         4.0  2.05610   95.50           1          1      True
5  514.629      1.0         5.0  7.26442   72.94           1          1      True
6  458.400      3.0         6.0  3.66580  125.79           0          0      True
7  359.794      2.0         5.0  2.29450   87.72           1          1      True
8  145.165      1.0         3.0  1.21200   51.80           1          1      True
9  253.240      1.0         5.0  1.98928   80.00           1          1      True


## accuracy testing by svm-81%

In [16]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)


0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(" Accuracy:\n", accuracy_score(y_test, y_pred))

 Accuracy:
 0.8119047619047619


## accuracy test by XGBoost-83%

In [19]:
!pip install xgboost
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(" Accuracy:\n", accuracy_score(y_test, y_pred))

 Accuracy:
 0.8345238095238096


## acuracy test by LightGBM-84.64%

In [None]:
!pip install lightgbm
from lightgbm import LGBMClassifier
model = LGBMClassifier()
model.fit(X_train, y_train)


In [37]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(" Accuracy:\n", accuracy_score(y_test, y_pred))

 Accuracy:
 0.8464285714285714




In [None]:
!pip install plotly

In [50]:
import plotly.express as px

fig = px.scatter(
    X_test_df,  # Use the actual DataFrame name
    x='MolWt',
    y='LogP',
    color='Predicted',
    hover_data=['TPSA', 'HDonors', 'HAcceptors'],
    title='Toxicity Prediction by Molecular Properties'
)

fig.show()


## code for toxicity of your input

In [60]:
from rdkit import Chem
from rdkit.Chem import Descriptors

# Your featurization function (make sure it's same as used in training)
def featurize(mol):
    if mol is not None:
        return [
            Descriptors.MolWt(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.MolLogP(mol),
            Descriptors.TPSA(mol)
        ]
    else:
        return None

# Input one SMILES from user
smiles = input("Enter a SMILES string to check toxicity: ")

# Convert to molecule and featurize
mol = Chem.MolFromSmiles(smiles)
features = featurize(mol)
molwt = Descriptors.MolWt(mol)
h_donors = Descriptors.NumHDonors(mol)
h_acceptors = Descriptors.NumHAcceptors(mol)
logp = Descriptors.MolLogP(mol)
tpsa = Descriptors.TPSA(mol)

print("\n🔍 Molecular Properties:")
print(f"• Molecular Weight     : {molwt:.2f}")
print(f"• H-Bond Donors        : {h_donors}")
print(f"• H-Bond Acceptors     : {h_acceptors}")
print(f"• LogP (Lipophilicity) : {logp:.2f}")
print(f"• TPSA (Polar Surface) : {tpsa:.2f}")


# Predict toxicity
if features:
    y_pred_single = model.predict([features])[0]
    if y_pred_single == 1:
        print("🔴 This molecule is predicted to be TOXIC.")

    else:
        print("🟢 This molecule is predicted to be NON-TOXIC.")
else:
    print("❌ Invalid SMILES string. Please try again.")

X_test_df.groupby("Predicted")[["MolWt", "LogP", "HDonors", "HAcceptors", "TPSA"]].describe()




🔍 Molecular Properties:
• Molecular Weight     : 46.07
• H-Bond Donors        : 1
• H-Bond Acceptors     : 1
• LogP (Lipophilicity) : -0.00
• TPSA (Polar Surface) : 20.23
🟢 This molecule is predicted to be NON-TOXIC.



X does not have valid feature names, but LGBMClassifier was fitted with feature names



Unnamed: 0_level_0,MolWt,MolWt,MolWt,MolWt,MolWt,MolWt,MolWt,MolWt,LogP,LogP,LogP,LogP,LogP,LogP,LogP,LogP,HDonors,HDonors,HDonors,HDonors,HDonors,HDonors,HDonors,HDonors,HAcceptors,HAcceptors,HAcceptors,HAcceptors,HAcceptors,HAcceptors,HAcceptors,HAcceptors,TPSA,TPSA,TPSA,TPSA,TPSA,TPSA,TPSA,TPSA
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Predicted,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2
0,144.0,360.715403,100.744063,157.257,289.35575,349.4105,443.625,601.535,144.0,2.709067,1.525995,-1.9714,1.5923,2.82155,3.819655,6.2538,144.0,1.944444,1.169411,0.0,1.0,2.0,3.0,6.0,144.0,4.625,1.899116,1.0,3.0,4.0,6.0,10.0,144.0,82.236597,28.255002,23.47,62.525,79.515,102.58,148.77
1,696.0,399.683605,102.220428,113.16,337.12625,402.754,470.10525,837.058,696.0,3.489927,1.252209,-0.0176,2.633755,3.47131,4.28562,8.4212,696.0,1.573276,1.067683,0.0,1.0,1.0,2.0,6.0,696.0,5.372126,2.160009,0.0,4.0,5.0,7.0,17.0,696.0,79.701034,29.987821,0.0,57.69,80.04,98.235,216.89
