## preprocessing dataset

In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
%pip install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

Note: you may need to restart the kernel to use updated packages.


In [36]:
# also cleaned the dataset manually, some values were wrong
import pandas as pd
df = pd.read_csv("/home/rizanb/Documents/hob_pred/data/interim/drug_properties_hob.csv")
df.head()

Unnamed: 0,drug_name,smiles,hacc,hdon,mw,logp,mr,tpsa,nrot,type,hob_category
0,adrenaline,CNCC(C1=CC(=C(C=C1)O)O)O,4,4,183.207,0.3506,48.6581,72.72,3,B,1
1,alprenolol,CC(C)NCC(COC1=CC=CC=C1CC=C)O,3,2,249.354,2.1528,74.9835,41.49,8,B,1
2,clomethiazole,CC1=C(SC=N1)CCCl,2,0,161.657,2.23282,41.275,12.89,2,N,1
3,coumarin,C1=CC=C2C(=C1)C=CC(=O)O2,2,0,146.145,1.793,42.484,30.21,0,N,1
4,dobutamine,CC(CCC1=CC=C(C=C1)O)NCCC2=CC(=C(C=C2)O)O,4,4,301.386,2.9568,87.3681,72.72,7,B,1


In [37]:
df.columns = df.columns.str.strip()

In [38]:
df["type"].unique()

array(['B', 'N', 'A'], dtype=object)

In [39]:
len(df["drug_name"].unique()) == len(df["drug_name"])

True

In [40]:
print(df["hacc"].dtype, df["hdon"].dtype, df["mw"].dtype, df["logp"].dtype, df["mr"].dtype, df["tpsa"].dtype, df["nrot"].dtype )

int64 int64 float64 float64 float64 float64 int64


In [41]:
df.isna().sum()

drug_name       0
smiles          0
hacc            0
hdon            0
mw              0
logp            0
mr              0
tpsa            0
nrot            0
type            0
hob_category    0
dtype: int64

In [42]:
df.isnull().sum()

drug_name       0
smiles          0
hacc            0
hdon            0
mw              0
logp            0
mr              0
tpsa            0
nrot            0
type            0
hob_category    0
dtype: int64

## remove unnecessary features - drug name, smiles

In [43]:
df =  df.drop(columns=["drug_name","smiles"],axis=1)
df.head()

Unnamed: 0,hacc,hdon,mw,logp,mr,tpsa,nrot,type,hob_category
0,4,4,183.207,0.3506,48.6581,72.72,3,B,1
1,3,2,249.354,2.1528,74.9835,41.49,8,B,1
2,2,0,161.657,2.23282,41.275,12.89,2,N,1
3,2,0,146.145,1.793,42.484,30.21,0,N,1
4,4,4,301.386,2.9568,87.3681,72.72,7,B,1


## one hot encoding for type (acid/base/neutral)

In [44]:
df_enc = pd.get_dummies(df, columns=["type"], dtype=int)

In [45]:
df_enc.head()

Unnamed: 0,hacc,hdon,mw,logp,mr,tpsa,nrot,hob_category,type_A,type_B,type_N
0,4,4,183.207,0.3506,48.6581,72.72,3,1,0,1,0
1,3,2,249.354,2.1528,74.9835,41.49,8,1,0,1,0
2,2,0,161.657,2.23282,41.275,12.89,2,1,0,0,1
3,2,0,146.145,1.793,42.484,30.21,0,1,0,0,1
4,4,4,301.386,2.9568,87.3681,72.72,7,1,0,1,0


## extract features and target

In [46]:
X = df_enc.drop(columns=["hob_category"])
X.head()

Unnamed: 0,hacc,hdon,mw,logp,mr,tpsa,nrot,type_A,type_B,type_N
0,4,4,183.207,0.3506,48.6581,72.72,3,0,1,0
1,3,2,249.354,2.1528,74.9835,41.49,8,0,1,0
2,2,0,161.657,2.23282,41.275,12.89,2,0,0,1
3,2,0,146.145,1.793,42.484,30.21,0,0,0,1
4,4,4,301.386,2.9568,87.3681,72.72,7,0,1,0


In [47]:
y = df_enc["hob_category"]


In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(181, 10) (46, 10) (181,) (46,)


## fit & transform scaler on training data, only transform on test data 

In [50]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [51]:
import joblib

joblib.dump(X_train_scaled, '/home/rizanb/Documents/hob_pred/data/processed/X_train_scaled.joblib')
joblib.dump(X_test_scaled, '/home/rizanb/Documents/hob_pred/data/processed/X_test_scaled.joblib')
joblib.dump(y_train, '/home/rizanb/Documents/hob_pred/data/processed/y_train.joblib')
joblib.dump(y_test, '/home/rizanb/Documents/hob_pred/data/processed/y_test.joblib')


['/home/rizanb/Documents/hob_pred/data/processed/y_test.joblib']

## evaluate model predictions

In [61]:
models_path = "/home/rizanb/Documents/hob_pred/models/"
reports_path = "/home/rizanb/Documents/hob_pred/reports/"

accuracy_report = ""

models = [
    ('logr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC()),
    ('knn', KNeighborsClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('xgb', XGBClassifier(eval_metric="mlogloss", enable_categorical=True))
]

for name, model in models:
    if name == "xgb":
        model.fit(X_train_scaled, y_train - 1)
        accuracy = model.score(X_test_scaled, y_test - 1) 
        print(f"accuracy of {name}: {accuracy:.3f}")
        
        with open("/home/rizanb/Documents/hob_pred/reports/accuracy_report.txt", "a") as f:
            f.write(f"{name}: {accuracy:.3f} \n")
        
        joblib.dump(name, f"{models_path}{name}_{accuracy:.3f}.pkl")
        break
    model.fit(X_train_scaled, y_train)
    accuracy = model.score(X_test_scaled, y_test)
    print(f"accuracy of {name}: {accuracy:.3f}")

    with open("/home/rizanb/Documents/hob_pred/reports/accuracy_report.txt", "a") as f:
            f.write(f"{name}: {accuracy:.3f} \n")
        
    joblib.dump(name, f"{models_path}{name}_{accuracy:.3f}.pkl")


accuracy of logr: 0.522
accuracy of rf: 0.522
accuracy of svc: 0.543
accuracy of knn: 0.587
accuracy of gb: 0.500
accuracy of xgb: 0.543
