## Loading and Preprocessing of Data

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xlrd

# Load the data.
# The file carries an .xls extension but is parsed here with the CSV reader
# (an earlier attempt used pd.read_excel with the xlrd engine instead).
data = pd.read_csv("../dataset/drug200.xls")
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [17]:
# Summarise column dtypes and non-null counts to spot missing values early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [19]:
# Count distinct values per column; low counts indicate categorical features.
data.nunique()

Age             57
Sex              2
BP               3
Cholesterol      2
Na_to_K        198
Drug             5
dtype: int64

In [89]:
# Show the distinct labels of the target and of each categorical feature.
# A plain loop is used instead of a list comprehension: the comprehension
# only existed for its print() side effect and made the cell echo a
# throwaway list of None values as its result.
for column in ["Drug", "Sex", "BP", "Cholesterol"]:
    print(data[column].unique())


['DrugY' 'drugC' 'drugX' 'drugA' 'drugB']
['F' 'M']
['HIGH' 'LOW' 'NORMAL']
['HIGH' 'NORMAL']


[None, None, None, None]

In [94]:
# Print the class frequencies of the target and of each categorical feature.
# A loop (rather than a side-effect list comprehension) keeps the cell from
# echoing a list of None values after the printed tables.
for column in ["Drug", "Sex", "BP", "Cholesterol"]:
    print(data[column].value_counts())

Drug
DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64
Sex
M    104
F     96
Name: count, dtype: int64
BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64
Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64


[None, None, None, None]

In [25]:
# Number of missing entries per column.
data.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [27]:
from sklearn.model_selection import train_test_split
# Separate the target column ("Drug") from the predictor columns.
y = data['Drug']
X = data.drop(columns='Drug')

In [29]:
# Preview the feature matrix (every column except the target).
X.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,F,HIGH,HIGH,25.355
1,47,M,LOW,HIGH,13.093
2,47,M,LOW,HIGH,10.114
3,28,F,NORMAL,HIGH,7.798
4,61,F,LOW,HIGH,18.043


In [32]:
# Preview the target labels.
y.head()

0    DrugY
1    drugC
2    drugC
3    drugX
4    DrugY
Name: Drug, dtype: object

In [34]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Machine Learning Pipelines

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# Positional indices into the feature frame X, whose columns are
# Age (0), Sex (1), BP (2), Cholesterol (3), Na_to_K (4).
cat_col = [1, 2, 3]
num_col = [0, 4]

# Numeric features must be imputed and THEN scaled. ColumnTransformer applies
# its transformers in parallel and concatenates their outputs, so listing the
# imputer and the scaler as separate entries on the same columns would emit
# the numeric columns twice (imputed-but-unscaled and scaled-but-unimputed)
# instead of chaining the two steps. A nested Pipeline chains them correctly.
num_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("num_scaler", StandardScaler()),
    ]
)

transformer = ColumnTransformer(
    [
        ("encoder", OrdinalEncoder(), cat_col),
        ("num", num_pipeline, num_col),
    ]
)

# Full model: preprocessing followed by a seeded random forest classifier.
pipe = Pipeline(
    steps=[
        ("preprocessor", transformer),
        ("model", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)
pipe.fit(X_train, y_train)

## Model Evaluation

In [42]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Score the fitted pipeline on the held-out test split.
predictions = pipe.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
# Weighted F1 averages per-class F1 by class support.
f1 = f1_score(y_test, predictions, average='weighted')
classification_rep = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
# Start the multi-line report on its own line so its header row lines up with
# the table columns (matches how the confusion matrix is printed below).
print(f"Classification Report:\n{classification_rep}")
print(f"Confusion Matrix: \n{conf_matrix}")

Accuracy: 0.975
F1 Score: 0.9741545893719806
Classification Report:               precision    recall  f1-score   support

       DrugY       1.00      1.00      1.00        15
       drugA       1.00      1.00      1.00         6
       drugB       1.00      1.00      1.00         3
       drugC       1.00      0.80      0.89         5
       drugX       0.92      1.00      0.96        11

    accuracy                           0.97        40
   macro avg       0.98      0.96      0.97        40
weighted avg       0.98      0.97      0.97        40

Confusion Matrix: 
[[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  4  1]
 [ 0  0  0  0 11]]


In [43]:
import joblib

# Persist the fitted pipeline (preprocessing + model) with joblib; the call
# returns the list of file names it wrote.
# NOTE(review): presumably the ../model directory must already exist — confirm.
joblib.dump(pipe, "../model/drug200_model.pkl")

['../model/drug200_model.pkl']

In [45]:
import skops.io as sio

# Also save the fitted pipeline in the skops format, alongside the joblib file.
sio.dump(pipe, "../model/drug_pipeline.skops")

In [68]:
# List the types stored in the skops file that are not trusted by default,
# print them for inspection, then load the pipeline.
# NOTE(review): passing the inspected list straight back as `trusted` accepts
# whatever the file contains, which defeats the purpose of the audit for files
# from an untrusted source. Consider an explicit, reviewed allow-list instead.
# NOTE(review): the loaded pipeline is not assigned to a name, so it is only
# displayed as the cell result and cannot be reused by later cells.
untrusted_types = sio.get_untrusted_types(file="../model/drug_pipeline.skops")
print(untrusted_types)
sio.load("../model/drug_pipeline.skops", trusted=untrusted_types)

['numpy.dtype']


Prediction: ['drugX']


