In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset
data = pd.read_csv('bank.csv')

# Step 1: Data Preparation
# Drop rows that contain missing values, if any
data = data.dropna()

# Step 2: Feature Encoding and Train/Test Split
# One-hot encode the categorical columns; every other feature column is
# passed through unchanged (remainder='passthrough').
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# sparse_threshold=0 forces a dense array. With the default threshold (0.3)
# the transformer switches to a scipy sparse matrix whenever the stacked
# output is sparse enough, and a StandardScaler that centers its input
# (the default) refuses sparse matrices.
encoder = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)
X = encoder.fit_transform(data.drop(columns=['deposit']))
# Encode the 'deposit' target column as integer class labels
y = LabelEncoder().fit_transform(data['deposit'])

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Preprocessing
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test data stays unseen.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Step 4: Model Selection and Grid Search
# Initialize classifiers
knn_classifier = KNeighborsClassifier()
# MultinomialNB rejects negative feature values at fit time, and X_train has
# been standardized (so it contains negative values). This model therefore
# gets its own rescaling step to [0, 1]. Keeping that step inside a Pipeline
# means it is re-fit on the training folds only during cross-validation;
# clip=True keeps values outside the fitted range within [0, 1] at predict time.
mnb_classifier = Pipeline([
    ('nonneg', MinMaxScaler(clip=True)),
    ('mnb', MultinomialNB()),
])
bnb_classifier = BernoulliNB()
svc_classifier = SVC()
# Fixed seed so the 'random' splitter candidates give repeatable results
dt_classifier = DecisionTreeClassifier(random_state=42)

# Define hyperparameter grids for each classifier
param_grid_knn = {'n_neighbors': [3, 5, 7, 9, 11]}
# The 'mnb__' prefix routes alpha to the MultinomialNB step of the pipeline
param_grid_mnb = {'mnb__alpha': [0.1, 1.0, 10.0]}
param_grid_bnb = {'alpha': [0.1, 1.0, 10.0]}
param_grid_svc = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1.0, 10.0]}
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Perform grid search for each classifier (5-fold CV, selected by accuracy)
knn_grid = GridSearchCV(knn_classifier, param_grid_knn, cv=5, scoring='accuracy')
mnb_grid = GridSearchCV(mnb_classifier, param_grid_mnb, cv=5, scoring='accuracy')
bnb_grid = GridSearchCV(bnb_classifier, param_grid_bnb, cv=5, scoring='accuracy')
svc_grid = GridSearchCV(svc_classifier, param_grid_svc, cv=5, scoring='accuracy')
dt_grid = GridSearchCV(dt_classifier, param_grid_dt, cv=5, scoring='accuracy')

# Fit the grids on the training split
for grid in (knn_grid, mnb_grid, bnb_grid, svc_grid, dt_grid):
    grid.fit(X_train, y_train)

# Step 5: Model Evaluation
# Map each display name to the best estimator found by its grid search
models = {
    label: search.best_estimator_
    for label, search in [
        ('K-Nearest Neighbors', knn_grid),
        ('Multinomial Naive Bayes', mnb_grid),
        ('Bernoulli Naive Bayes', bnb_grid),
        ('Support Vector Classifier', svc_grid),
        ('Decision Tree Classifier', dt_grid),
    ]
}

# Report test-set accuracy and the per-class metrics for every tuned model
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f'{name} - Accuracy: {accuracy:.2f}')
    print(report)

# Generate confusion matrix for Decision Tree Classifier
# (the dict entry is the same object as dt_grid.best_estimator_)
dt_pred = models['Decision Tree Classifier'].predict(X_test)
conf_matrix = confusion_matrix(y_test, dt_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

# Plot ROC curve for Support Vector Classifier
# NOTE: despite the name, y_prob holds decision_function scores, not
# probabilities; roc_curve / roc_auc_score are given these scores directly.
y_prob = svc_grid.best_estimator_.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_value = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=2, label='SVC (AUC = %0.2f)' % auc_value)
# Dashed diagonal drawn from (0, 0) to (1, 1) as a reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()


ModuleNotFoundError: No module named 'sklearn'