In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import make_column_selector, make_column_transformer

import warnings
warnings.filterwarnings('ignore')

In [11]:
cancer_df = pd.read_csv(r'C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\Cases\Cancer\Cancer.csv', index_col=0)
cancer_df.head()

Unnamed: 0_level_0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
subjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,40-49,premeno,15 to 19,0 to 2,yes,three,right,left_up,no,recurrence-events
2,50-59,ge40,15 to 19,0 to 2,no,one,right,central,no,no-recurrence-events
3,50-59,ge40,35 to 39,0 to 2,no,two,left,left_low,no,recurrence-events
4,40-49,premeno,35 to 39,0 to 2,yes,three,right,left_low,yes,no-recurrence-events
5,40-49,premeno,30 to 34,3 to 5,yes,two,left,right_up,no,recurrence-events


In [12]:
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 286 entries, 1 to 286
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumor-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   Class        286 non-null    object
dtypes: object(10)
memory usage: 24.6+ KB


In [15]:
X = cancer_df.drop('Class', axis=1)
y = cancer_df['Class']

In [53]:
import gc


ohe = OneHotEncoder(
    sparse_output=False, 
    drop='first', 
    handle_unknown='ignore'
).set_output(transform='pandas')

ct = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=object)),
    (ohe, make_column_selector(dtype_include=object))
)

nb = BernoulliNB()

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

pipe = Pipeline([
    ('CT', ct), 
    ('NB', nb)
])



gcv = GridSearchCV(
    estimator=pipe, 
    cv=kfold,
    scoring='roc_auc',
    param_grid={ 
        'NB__alpha': np.linspace(0.001, 3, 10),
    }, 

)

gcv.fit(X_train, y_train)
print('ROC AUC Score: ',gcv.best_score_)
y_pred = gcv.best_estimator_.predict(X_test)
y_pred_proba = gcv.best_estimator_.predict_proba(X_test)


print(classification_report(y_test, y_pred))


print(roc_auc_score(y_test, y_pred_proba[:, 1]))

 

ROC AUC Score:  0.7352055352055352
                      precision    recall  f1-score   support

no-recurrence-events       0.84      0.83      0.83        69
   recurrence-events       0.33      0.35      0.34        17

            accuracy                           0.73        86
           macro avg       0.59      0.59      0.59        86
        weighted avg       0.74      0.73      0.74        86

0.6483375959079283
