In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

In [45]:
df = pd.read_csv('drug200.csv')


In [46]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [47]:
print(df.isnull().sum())
#looking for missing values

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64


In [48]:
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['BP'] = label_encoder.fit_transform(df['BP'])
df['Cholesterol'] = label_encoder.fit_transform(df['Cholesterol'])

In [49]:
X = df.drop('Drug', axis=1)
y = df['Drug']

In [50]:
#spliting data into training and testing models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
#training gaussian NB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

In [52]:
#training categorical NB
cnb = CategoricalNB()
cnb.fit(X_train, y_train)
y_pred_cnb = cnb.predict(X_test)

In [53]:
#training decision tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

In [54]:
#Printing accuracy of models
print("Gaussian NB accuracy: ", accuracy_score(y_test, y_pred_gnb))
print("Categorical NB accuracy: ", accuracy_score(y_test, y_pred_cnb))
print("Decision tree accuracy: ", accuracy_score(y_test, y_pred_dt))

Gaussian NB accuracy:  0.925
Categorical NB accuracy:  0.8
Decision tree accuracy:  1.0


In [55]:
#printing classification report of models
print("Gaussian NB\n", classification_report(y_test, y_pred_gnb))
print("Categorical NB\n", classification_report(y_test, y_pred_cnb))
print("Decision tree\n", classification_report(y_test, y_pred_dt))

Gaussian NB
               precision    recall  f1-score   support

       drugA       0.86      1.00      0.92         6
       drugB       0.75      1.00      0.86         3
       drugC       0.83      1.00      0.91         5
       drugX       1.00      1.00      1.00        11
       drugY       1.00      0.80      0.89        15

    accuracy                           0.93        40
   macro avg       0.89      0.96      0.92        40
weighted avg       0.94      0.93      0.92        40

Categorical NB
               precision    recall  f1-score   support

       drugA       0.71      0.83      0.77         6
       drugB       0.50      0.33      0.40         3
       drugC       1.00      0.20      0.33         5
       drugX       0.69      1.00      0.81        11
       drugY       1.00      0.93      0.97        15

    accuracy                           0.80        40
   macro avg       0.78      0.66      0.66        40
weighted avg       0.83      0.80      0.77     

In [56]:
#printing parameters of models
print("Gaussian NB parameters: ", gnb.get_params())
print("Categorical NB parameters: ", cnb.get_params())
print("Decision tree parameters: ", dt.get_params())

Gaussian NB parameters:  {'priors': None, 'var_smoothing': 1e-09}
Categorical NB parameters:  {'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn', 'min_categories': None}
Decision tree parameters:  {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
