# 1 Libraries

In [2]:
# to use dataframes
import pandas as pd
import seaborn as sns

# for plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# to allow inline plotting on notebook cells
%matplotlib inline
plt.switch_backend('agg')

# for numerical computing
import numpy as np

# for data transformation
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# to create chain of multiple data processing
from sklearn.pipeline import Pipeline

# for univariate anomaly detection
from outliers import smirnov_grubbs as grubbs

# for localoutlierfactor for clustering
from sklearn.neighbors import LocalOutlierFactor

# to implement a train test split strategy
from sklearn.model_selection import train_test_split

# for logistic regression class of Sklearn
from sklearn.linear_model import LogisticRegression

# for neural network model MLP Classifier
from sklearn.neural_network import MLPClassifier

# for neural network model Keras
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# for classification metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# for reporting
from sklearn.metrics import classification_report

# to create tables
from tabulate import tabulate

# for feature power transformation functions
from scipy.stats import boxcox
from scipy.special import boxcox1p

# normality tests
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import probplot

# to handle imbalanced data
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

# to ignore warnings
import warnings 
warnings.filterwarnings('ignore')

# for nicer prints
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

print("done")

done


# Data

In [10]:
df = pd.read_csv('archive/diabetes_filtered.csv')
df = df[df["Age"] != 1]

print(df.shape)
tmp_df = df.head(1).T
tmp_df.columns = ['sample']
tmp_df['dtypes'] = df.dtypes
tmp_df

(247980, 22)


Unnamed: 0,sample,dtypes
Diabetes_binary,0,int64
HighBP,1,int64
HighChol,1,int64
CholCheck,1,int64
BMI,40,int64
Smoker,1,int64
Stroke,0,int64
HeartDiseaseorAttack,0,int64
PhysActivity,0,int64
Fruits,0,int64


# Missing & Null Values

In [4]:
# Check for missing values
print(df.isnull().sum())

# Check for null values
df.isnull().values.any()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


False

# Data Cleaning and Transformation

In [11]:
    
categorical_cols = ['Diabetes_binary','HighBP','HighChol','CholCheck','Smoker',
                    'Stroke','HeartDiseaseorAttack','PhysActivity','Fruits', 'Veggies',
                    'HvyAlcoholConsump','AnyHealthcare','NoDocbcCost','DiffWalk','Sex']
for col in categorical_cols:
    df[col] = df[col].astype("category")
    
education_mapping = {
    1: 'Never attended school or only kindergarten',
    2: 'Grades 1 through 8 (Elementary)',
    3: 'Grades 9 through 11 (Some high school)',
    4: 'Grade 12 or GED (High school graduate)',
    5: 'College 1 year to 3 years (Some college or technical school)',
    6: 'College 4 years or more (College graduate)'
}

health_mapping = {
    1: 'Excellent',
    2: 'Very good',
    3: 'Good',
    4: 'Fair',
    5: 'Poor'
}

education_order = [
    'Never attended school or only kindergarten',
    'Grades 1 through 8 (Elementary)',
    'Grades 9 through 11 (Some high school)',
    'Grade 12 or GED (High school graduate)',
    'College 1 year to 3 years (Some college or technical school)',
    'College 4 years or more (College graduate)'
]

health_order = [
    'Excellent',    # Best health (numeric code 1)
    'Very good',    # Numeric code 2
    'Good',         # Numeric code 3
    'Fair',         # Numeric code 4
    'Poor'          # Worst health (numeric code 5)
]

if df['Education'].dtype in ['int64', 'int32', 'float64']:
    df['Education'] = df['Education'].map(education_mapping)

if df['GenHlth'].dtype in ['int64', 'int32', 'float64']:
    df['GenHlth'] = df['GenHlth'].map(health_mapping)

df['Education'] = pd.Categorical(
    df['Education'],
    categories=education_order,
    ordered=True
)

df['GenHlth'] = pd.Categorical(
    df['GenHlth'],
    categories=health_order,
    ordered=True
)

tmp_df = df.head(1).T
tmp_df.columns = ['sample']
tmp_df['dtypes'] = df.dtypes
tmp_df

Unnamed: 0,sample,dtypes
Diabetes_binary,0,category
HighBP,1,category
HighChol,1,category
CholCheck,1,category
BMI,40,int64
Smoker,1,category
Stroke,0,category
HeartDiseaseorAttack,0,category
PhysActivity,0,category
Fruits,0,category


# Data Split (Train/Test)

In [12]:
from sklearn.model_selection import train_test_split

cols_to_drop = ['Diabetes_binary','Sex','Education','Income','MentHlth','PhysHlth','AnyHealthcare','GenHlth'] 
X = df.drop(cols_to_drop, axis=1)
y = df['Diabetes_binary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

print("Train class distribution:")
print(y_train.value_counts(normalize=True))

print("\nTest class distribution:")
print(y_test.value_counts(normalize=True))


Train class distribution:
Diabetes_binary
0    0.857781
1    0.142219
Name: proportion, dtype: float64

Test class distribution:
Diabetes_binary
0    0.857771
1    0.142229
Name: proportion, dtype: float64


# 3 Modeling

## Logistic Regression (Class Weights)

### Training

In [13]:

class_weights = compute_class_weight(class_weight='balanced',classes=np.unique(y_train),y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

lr_pipeline = Pipeline(steps=[('lr', LogisticRegression(solver='liblinear',
        C=0.1, class_weight=class_weight_dict, random_state=123))])

print("Training Logistic Regression Model")
lr_pipeline.fit(X_train, y_train) 


print("Evaluating Train Set")
y_pred_lr_train = lr_pipeline.predict(X_train) 
y_proba_lr_train = lr_pipeline.predict_proba(X_train)[:, 1]

print("Evaluating Test Set")
y_pred_lr_test = lr_pipeline.predict(X_test) 
y_proba_lr_test = lr_pipeline.predict_proba(X_test)[:, 1]

print("Done")

Training Logistic Regression Model
Evaluating Train Set
Evaluating Test Set
Done


### Result

In [14]:
print("\n--- Logistic Regression Evaluation (Class Weight) (Train Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_train, y_proba_lr_train):.4f}")
print(classification_report(y_train, y_pred_lr_train))
print("\nConfusion Matrix (Train):")
print(confusion_matrix(y_train, y_pred_lr_train))

print("\n--- Logistic Regression Evaluation (Class Weight) (Test Set) ---")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_lr_test):.4f}")
print(classification_report(y_test, y_pred_lr_test))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr_test))



--- Logistic Regression Evaluation (Class Weight) (Train Set) ---
ROC AUC Score: 0.7985
              precision    recall  f1-score   support

           0       0.94      0.71      0.81    170170
           1       0.30      0.74      0.42     28214

    accuracy                           0.71    198384
   macro avg       0.62      0.73      0.62    198384
weighted avg       0.85      0.71      0.75    198384


Confusion Matrix (Train):
[[120625  49545]
 [  7238  20976]]

--- Logistic Regression Evaluation (Class Weight) (Test Set) ---
ROC AUC Score: 0.7994
              precision    recall  f1-score   support

           0       0.94      0.71      0.81     42542
           1       0.30      0.75      0.43      7054

    accuracy                           0.71     49596
   macro avg       0.62      0.73      0.62     49596
weighted avg       0.85      0.71      0.75     49596


Confusion Matrix:
[[30051 12491]
 [ 1771  5283]]


In [15]:
import joblib
joblib.dump(lr_pipeline,"lr_model_2.pkl")

['lr_model_2.pkl']

### Training

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

print("Before SMOTE:")
print("Training set class distribution:")
print(y_train.value_counts())
print(f"Imbalance ratio: {y_train.value_counts()[0] / y_train.value_counts()[1]:.2f}")


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=123)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("\nAfter SMOTE:")
print("Resampled training set class distribution:")
print(y_train_resampled.value_counts())

knn_smote = KNeighborsClassifier(n_neighbors=11, metric='manhattan')

print("KNN Training with Stratified Cross-Validation (SMOTE)")

cv_scores = []
cv_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
    
    X_fold_train, X_fold_val = X_train_resampled[train_idx], X_train_resampled[val_idx]
    y_fold_train, y_fold_val = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
    
    knn_smote.fit(X_fold_train, y_fold_train)
    
    y_fold_pred = knn_smote.predict(X_fold_val)
    
    fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
    fold_f1 = f1_score(y_fold_val, y_fold_pred, average='weighted')
    
    cv_scores.append(fold_accuracy)
    cv_f1_scores.append(fold_f1)
    
    print(f"Fold {fold + 1}: Accuracy = {fold_accuracy:.4f}, F1-score = {fold_f1:.4f}")

print(f"\nCross-Validation Results:")
print(f"Mean Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"Mean F1-score: {np.mean(cv_f1_scores):.4f} (+/- {np.std(cv_f1_scores):.4f})")

print("\nTraining final model on entire resampled training set...")
knn_smote.fit(X_train_resampled, y_train_resampled)


print("\nKNN Train Evaluation")
y_pred_smote_train = knn_smote.predict(X_train_scaled)
y_prob_train = knn_smote.predict_proba(X_train_scaled)[:, 1]

print("\nKNN Test Evaluation")
y_pred_smote_test= knn_smote.predict(X_test_scaled)
y_prob_test = knn_smote.predict_proba(X_test_scaled)[:, 1]
print("Done")

Before SMOTE:
Training set class distribution:
Diabetes_binary
0    170170
1     28214
Name: count, dtype: int64
Imbalance ratio: 6.03

After SMOTE:
Resampled training set class distribution:
Diabetes_binary
0    170170
1    170170
Name: count, dtype: int64
KNN Training with Stratified Cross-Validation (SMOTE)
Fold 1: Accuracy = 0.7449, F1-score = 0.7440
Fold 2: Accuracy = 0.7435, F1-score = 0.7426
Fold 3: Accuracy = 0.7419, F1-score = 0.7410
Fold 4: Accuracy = 0.7421, F1-score = 0.7413
Fold 5: Accuracy = 0.7443, F1-score = 0.7434

Cross-Validation Results:
Mean Accuracy: 0.7433 (+/- 0.0012)
Mean F1-score: 0.7425 (+/- 0.0012)

Training final model on entire resampled training set...

KNN Train Evaluation

KNN Test Evaluation
Done


### Result

In [16]:
print("\n--- KNN Evaluation (SMOTE) (Train Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_train, y_prob_train):.4f}")

print("\nClassification Report:")
print(classification_report(y_train, y_pred_smote_train))

print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_pred_smote_train))

print("\n--- KNN Evaluation (SMOTE) (Test Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_test):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_smote_test))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_smote_test))



--- KNN Evaluation (SMOTE) (Train Set) ---
ROC AUC Score: 0.8406

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89    170170
           1       0.41      0.63      0.49     28214

    accuracy                           0.82    198384
   macro avg       0.67      0.74      0.69    198384
weighted avg       0.86      0.82      0.83    198384


Confusion Matrix:
[[143957  26213]
 [ 10338  17876]]

--- KNN Evaluation (SMOTE) (Test Set) ---
ROC AUC Score: 0.7432

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86     42542
           1       0.31      0.50      0.38      7054

    accuracy                           0.77     49596
   macro avg       0.61      0.66      0.62     49596
weighted avg       0.82      0.77      0.79     49596


Confusion Matrix:
[[34734  7808]
 [ 3539  3515]]


### Training

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler


print("Before Undersampling:")
print("Training set class distribution:")
print(y_train.value_counts())
print(f"Imbalance ratio: {y_train.value_counts()[0] / y_train.value_counts()[1]:.2f}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

undersampler = RandomUnderSampler(sampling_strategy=0.5,random_state=123)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

print("\nAfter Undersampling:")
print("Resampled training set class distribution:")
print(y_train_resampled.value_counts())

knn_undersample = KNeighborsClassifier(n_neighbors=7, metric='manhattan')

print("KNN Training with Stratified Cross-Validation (Undersampling)")

cv_scores = []
cv_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
    
    X_fold_train, X_fold_val = X_train_resampled[train_idx], X_train_resampled[val_idx]
    y_fold_train, y_fold_val = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
    
    
    knn_undersample.fit(X_fold_train, y_fold_train)
    
    
    y_fold_pred = knn_undersample.predict(X_fold_val)
    
    
    fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
    fold_f1 = f1_score(y_fold_val, y_fold_pred, average='weighted')
    
    cv_scores.append(fold_accuracy)
    cv_f1_scores.append(fold_f1)
    
    print(f"Fold {fold + 1}: Accuracy = {fold_accuracy:.4f}, F1-score = {fold_f1:.4f}")

print(f"\nCross-Validation Results:")
print(f"Mean Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"Mean F1-score: {np.mean(cv_f1_scores):.4f} (+/- {np.std(cv_f1_scores):.4f})")

print("\nTraining final model on entire resampled training set...")
knn_undersample.fit(X_train_resampled, y_train_resampled)
print("Done")

print("\nKNN Test Evaluation")
y_pred_undersample_train = knn_undersample.predict(X_train_scaled)
y_prob_undersample_train = knn_undersample.predict_proba(X_train_scaled)[:, 1]
print("Done")


print("\nKNN Test Evaluation")
y_pred_undersample_test = knn_undersample.predict(X_test_scaled)
y_prob_undersample_test = knn_undersample.predict_proba(X_test_scaled)[:, 1]
print("Done")


Before Undersampling:
Training set class distribution:
Diabetes_binary
0    170170
1     28214
Name: count, dtype: int64
Imbalance ratio: 6.03

After Undersampling:
Resampled training set class distribution:
Diabetes_binary
0    56428
1    28214
Name: count, dtype: int64
KNN Training with Stratified Cross-Validation (Undersampling)
Fold 1: Accuracy = 0.7217, F1-score = 0.7103
Fold 2: Accuracy = 0.7212, F1-score = 0.7097
Fold 3: Accuracy = 0.7246, F1-score = 0.7125
Fold 4: Accuracy = 0.7221, F1-score = 0.7104
Fold 5: Accuracy = 0.7179, F1-score = 0.7051

Cross-Validation Results:
Mean Accuracy: 0.7215 (+/- 0.0021)
Mean F1-score: 0.7096 (+/- 0.0024)

Training final model on entire resampled training set...
Done

KNN Test Evaluation
Done

KNN Test Evaluation
Done


### Result

In [18]:
print("\n--- KNN Evaluation (Undersampling) (Train Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_train, y_prob_undersample_train):.4f}")

print("\nClassification Report:")
print(classification_report(y_train, y_pred_undersample_train))

print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_pred_undersample_train))

print("\n--- KNN Evaluation (Undersampling) (Test Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_undersample_test):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_undersample_test))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_undersample_test))



--- KNN Evaluation (Undersampling) (Train Set) ---
ROC AUC Score: 0.7852

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.87      0.90    170170
           1       0.41      0.53      0.46     28214

    accuracy                           0.82    198384
   macro avg       0.66      0.70      0.68    198384
weighted avg       0.85      0.82      0.83    198384


Confusion Matrix:
[[148738  21432]
 [ 13363  14851]]

--- KNN Evaluation (Undersampling) (Test Set) ---
ROC AUC Score: 0.7365

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.86      0.88     42542
           1       0.35      0.45      0.39      7054

    accuracy                           0.80     49596
   macro avg       0.63      0.66      0.64     49596
weighted avg       0.83      0.80      0.81     49596


Confusion Matrix:
[[36532  6010]
 [ 3849  3205]]


### Training

In [19]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import StandardScaler

print("Before ADASYN:")
print("Training set class distribution:")
print(y_train.value_counts())
print(f"Imbalance ratio: {y_train.value_counts()[0] / y_train.value_counts()[1]:.2f}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


adasyn = ADASYN(random_state=123, n_neighbors=7)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train_scaled, y_train)

print("\nAfter ADASYN:")
print("Resampled training set class distribution:")
print(y_train_resampled.value_counts())
print(f"New training set size: {len(X_train_resampled)}")


knn_adasyn = KNeighborsClassifier(n_neighbors=11, metric='manhattan')

print("KNN Training with Stratified Cross-Validation (ADASYN)")

cv_scores = []
cv_f1_scores = []
cv_precision_scores = []
cv_recall_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
    
    X_fold_train, X_fold_val = X_train_resampled[train_idx], X_train_resampled[val_idx]
    y_fold_train, y_fold_val = y_train_resampled.iloc[train_idx], y_train_resampled.iloc[val_idx]
    
    
    knn_adasyn.fit(X_fold_train, y_fold_train)
    
    
    y_fold_pred = knn_adasyn.predict(X_fold_val)
    

    fold_accuracy = accuracy_score(y_fold_val, y_fold_pred)
    fold_f1 = f1_score(y_fold_val, y_fold_pred, average='weighted')
    fold_precision = precision_score(y_fold_val, y_fold_pred, average='weighted')
    fold_recall = recall_score(y_fold_val, y_fold_pred, average='weighted')
    
    cv_scores.append(fold_accuracy)
    cv_f1_scores.append(fold_f1)
    cv_precision_scores.append(fold_precision)
    cv_recall_scores.append(fold_recall)
    
    print(f"Fold {fold + 1}: Accuracy = {fold_accuracy:.4f}, F1 = {fold_f1:.4f}, Precision = {fold_precision:.4f}, Recall = {fold_recall:.4f}")

print(f"\nCross-Validation Results:")
print(f"Mean Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"Mean F1-score: {np.mean(cv_f1_scores):.4f} (+/- {np.std(cv_f1_scores):.4f})")
print(f"Mean Precision: {np.mean(cv_precision_scores):.4f} (+/- {np.std(cv_precision_scores):.4f})")
print(f"Mean Recall: {np.mean(cv_recall_scores):.4f} (+/- {np.std(cv_recall_scores):.4f})")


print("\nTraining final model on entire resampled training set...")
knn_adasyn.fit(X_train_resampled, y_train_resampled)
print("Done")


print("\nKNN Train Evaluation with ADASYN")
y_pred_adasyn_train = knn_adasyn.predict(X_train_scaled)
y_prob_adasyn_train = knn_adasyn.predict_proba(X_train_scaled)[:, 1]
print("Done")


print("\nKNN Test Evaluation with ADASYN")
y_pred_adasyn_test = knn_adasyn.predict(X_test_scaled)
y_prob_adasyn_test = knn_adasyn.predict_proba(X_test_scaled)[:, 1]
print("Done")

Before ADASYN:
Training set class distribution:
Diabetes_binary
0    170170
1     28214
Name: count, dtype: int64
Imbalance ratio: 6.03

After ADASYN:
Resampled training set class distribution:
Diabetes_binary
0    170170
1    161419
Name: count, dtype: int64
New training set size: 331589
KNN Training with Stratified Cross-Validation (ADASYN)
Fold 1: Accuracy = 0.7258, F1 = 0.7242, Precision = 0.7286, Recall = 0.7258
Fold 2: Accuracy = 0.7277, F1 = 0.7262, Precision = 0.7303, Recall = 0.7277
Fold 3: Accuracy = 0.7262, F1 = 0.7247, Precision = 0.7286, Recall = 0.7262
Fold 4: Accuracy = 0.7243, F1 = 0.7228, Precision = 0.7268, Recall = 0.7243
Fold 5: Accuracy = 0.7260, F1 = 0.7244, Precision = 0.7288, Recall = 0.7260

Cross-Validation Results:
Mean Accuracy: 0.7260 (+/- 0.0011)
Mean F1-score: 0.7245 (+/- 0.0011)
Mean Precision: 0.7286 (+/- 0.0011)
Mean Recall: 0.7260 (+/- 0.0011)

Training final model on entire resampled training set...
Done

KNN Train Evaluation with ADASYN
Done

KNN Te

### Result

In [20]:
print("\n--- KNN Evaluation (ADASYN) (Train Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_train, y_prob_adasyn_train):.4f}")

print("\nClassification Report:")
print(classification_report(y_train, y_pred_adasyn_train))

print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_pred_adasyn_train))


print("\n--- KNN Evaluation (ADASYN) (Test Set) ---")

print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob_adasyn_test):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_adasyn_test))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_adasyn_test))



--- KNN Evaluation (ADASYN) (Train Set) ---
ROC AUC Score: 0.8390

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88    170170
           1       0.40      0.65      0.49     28214

    accuracy                           0.81    198384
   macro avg       0.67      0.74      0.69    198384
weighted avg       0.86      0.81      0.83    198384


Confusion Matrix:
[[142709  27461]
 [  9973  18241]]

--- KNN Evaluation (ADASYN) (Test Set) ---
ROC AUC Score: 0.7432

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.81      0.86     42542
           1       0.31      0.51      0.38      7054

    accuracy                           0.77     49596
   macro avg       0.61      0.66      0.62     49596
weighted avg       0.82      0.77      0.79     49596


Confusion Matrix:
[[34367  8175]
 [ 3438  3616]]
