In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [2]:
dataset = pd.read_csv('CKD.csv')

In [3]:
categorical_cols=dataset.select_dtypes('object','category').columns #to find the columns that have categorical data in it.
categorical_cols

Index(['sg', 'rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe',
       'ane', 'classification'],
      dtype='object')

In [4]:
for col in categorical_cols:
    print(dataset[col].value_counts())
    print("-"*30)

sg
a    152
c     84
b     81
d     75
e      7
Name: count, dtype: int64
------------------------------
rbc
normal      352
abnormal     47
Name: count, dtype: int64
------------------------------
pc
normal      323
abnormal     76
Name: count, dtype: int64
------------------------------
pcc
notpresent    357
present        42
Name: count, dtype: int64
------------------------------
ba
notpresent    377
present        22
Name: count, dtype: int64
------------------------------
htn
no     253
yes    146
Name: count, dtype: int64
------------------------------
dm
no     263
yes    136
Name: count, dtype: int64
------------------------------
cad
no     365
yes     34
Name: count, dtype: int64
------------------------------
appet
yes     316
poor     83
Name: count, dtype: int64
------------------------------
pe
poor    322
yes      77
Name: count, dtype: int64
------------------------------
ane
no     339
yes     60
Name: count, dtype: int64
------------------------------
classification


In [5]:
dataset.columns

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hrmo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [6]:
indep=dataset[['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu','sc', 'sod', 'pot', 'hrmo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane']]

In [7]:
dep=dataset[['classification']]

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(indep,dep,test_size=0.30,random_state=0)

In [9]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
x_train = enc.fit_transform(x_train)
x_test = enc.transform(x_test)

In [10]:
x_train=np.where(x_train < 0,0,x_train)
x_test=np.where(x_test < 0,0,x_test)

In [11]:
from sklearn.naive_bayes import CategoricalNB

classifier = CategoricalNB()

from sklearn.model_selection import GridSearchCV
catnb_param_grid = {
       'alpha': [0.001, 0.01, 0.1],    
       'fit_prior': [True, False]
}
grid=GridSearchCV(classifier,catnb_param_grid,refit=True,verbose=3,n_jobs=-1,scoring='f1_weighted')
grid.fit(x_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  y = column_or_1d(y, warn=True)


In [12]:
print("The best parameter:",grid.best_params_)

The best parameter: {'alpha': 0.001, 'fit_prior': True}


In [13]:
y_pred = grid.predict(x_test)

In [14]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
from sklearn.metrics import classification_report
clf_report = classification_report(y_test, y_pred)              
print(clf_report)
print(cm)

              precision    recall  f1-score   support

          no       0.98      0.96      0.97        45
         yes       0.97      0.99      0.98        75

    accuracy                           0.97       120
   macro avg       0.98      0.97      0.97       120
weighted avg       0.98      0.97      0.97       120

[[43  2]
 [ 1 74]]


In [15]:
from sklearn.metrics import f1_score
f1_wght_average=f1_score(y_test,y_pred,average='weighted')
print("The F1 score final:",f1_wght_average)

The F1 score final: 0.9749423320187514


In [16]:
from sklearn.metrics import roc_auc_score
scr=roc_auc_score(y_test,grid.predict_proba(x_test)[:,1])
print("The roc_auc_score is:",scr)

The roc_auc_score is: 0.9961481481481482
