In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
df1=pd.read_csv('cleaned_dataset.csv')

In [4]:
# Label Encoding for categorical variables
label_encoder = LabelEncoder()

df1['TypeofContact'] = label_encoder.fit_transform(df1['TypeofContact'])
df1['Occupation'] = label_encoder.fit_transform(df1['Occupation'])
df1['Gender'] = label_encoder.fit_transform(df1['Gender'])
df1['ProductPitched'] = label_encoder.fit_transform(df1['ProductPitched'])
df1['MaritalStatus'] = label_encoder.fit_transform(df1['MaritalStatus'])
df1['Designation'] = label_encoder.fit_transform(df1['Designation'])
print(df1)

      CustomerID  ProdTaken        Age  TypeofContact  CityTier  \
0         200000          1  41.000000              1         3   
1         200001          0  49.000000              0         1   
2         200002          1  37.000000              1         1   
3         200003          0  33.000000              0         1   
4         200004          0  37.622265              1         1   
...          ...        ...        ...            ...       ...   
4883      204883          1  49.000000              1         3   
4884      204884          1  28.000000              0         1   
4885      204885          1  52.000000              1         3   
4886      204886          1  19.000000              1         3   
4887      204887          1  36.000000              1         1   

      DurationOfPitch  Occupation  Gender  NumberOfPersonVisiting  \
0                 6.0           2       0                       3   
1                14.0           2       1                

Feature selection:Random forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
    
# Define the features and target variable
X = df1.drop(columns=['ProdTaken'])
y = df1['ProdTaken']

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Get feature importances
importances = clf.feature_importances_
feature_names = X.columns

# Create a DataFrame for visualization
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Select top 10 features
top_features_rf = feature_importances.head(10)['Feature'].values

# Print top features
print("Top 10 features:", top_features_rf)

# Update the feature set with the selected top features
X_selected_rf = X[top_features_rf]

Top 10 features: ['Age' 'CustomerID' 'MonthlyIncome' 'DurationOfPitch' 'Passport'
 'NumberOfTrips' 'PitchSatisfactionScore' 'MaritalStatus'
 'NumberOfFollowups' 'ProductPitched']


In [6]:

# Split the data into training and testing sets using the common features
X_train, X_test, y_train, y_test = train_test_split(X_selected_rf, y, test_size=0.2, random_state=42)
# Print the shapes of the training and testing sets
print("Shape of X_train_common:", X_train.shape)
print("Shape of X_test_common:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train_common: (3910, 10)
Shape of X_test_common: (978, 10)
Shape of y_train: (3910,)
Shape of y_test: (978,)


In [7]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Print the shapes of the new training sets
print("Shape of X_train_smote:", X_train_smote.shape)
print("Shape of y_train_smote:", y_train_smote.shape)

# Check the value counts after SMote
print("Value counts for y_train_smote:")
print(y_train_smote.value_counts())


Shape of X_train_smote: (6362, 10)
Shape of y_train_smote: (6362,)
Value counts for y_train_smote:
ProdTaken
0    3181
1    3181
Name: count, dtype: int64


In [8]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
X_train_scaled = scaler.fit_transform(X_train_smote)

# Transform the testing data using the fitted scaler
X_test_scaled = scaler.transform(X_test)

# Print the shapes of the scaled training and testing sets
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of X_test_scaled:", X_test_scaled.shape)


Shape of X_train_scaled: (6362, 10)
Shape of X_test_scaled: (978, 10)


Naive Bayes:

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Naive Bayes classifier
nb_clf = GaussianNB()

# Train the classifier
nb_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_nb = nb_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_nb = nb_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_nb = accuracy_score(y_train_smote, y_train_pred_nb)

# Evaluate the classifier on the testing set
accuracy_test_nb = accuracy_score(y_test, y_test_pred_nb)

# Generate classification reports
report_train_nb = classification_report(y_train_smote, y_train_pred_nb)
report_test_nb = classification_report(y_test, y_test_pred_nb)

# Print results
print(f"Naive Bayes Training Accuracy: {accuracy_train_nb}")
print("Naive Bayes Training Classification Report:")
print(report_train_nb)

print(f"Naive Bayes Testing Accuracy: {accuracy_test_nb}")
print("Naive Bayes Testing Classification Report:")
print(report_test_nb)

print("Naive Bayes Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_nb))


Naive Bayes Training Accuracy: 0.6351776171015404
Naive Bayes Training Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.53      0.59      3181
           1       0.61      0.74      0.67      3181

    accuracy                           0.64      6362
   macro avg       0.64      0.64      0.63      6362
weighted avg       0.64      0.64      0.63      6362

Naive Bayes Testing Accuracy: 0.5613496932515337
Naive Bayes Testing Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.52      0.66       787
           1       0.27      0.71      0.39       191

    accuracy                           0.56       978
   macro avg       0.57      0.62      0.52       978
weighted avg       0.76      0.56      0.61       978

Naive Bayes Testing Confusion Matrix:
[[413 374]
 [ 55 136]]


Tuned Model (Naive Bayes)

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Naive Bayes classifier
nb_clf = GaussianNB()

# Set up the parameter grid
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=nb_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train_smote)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_nb_clf = grid_search.best_estimator_

print(f"Best parameters found: {best_params}")

# Train the best estimator
best_nb_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_nb = best_nb_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_nb = best_nb_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_at_nb = accuracy_score(y_train_smote, y_train_pred_nb)

# Evaluate the classifier on the testing set
accuracy_test_at_nb = accuracy_score(y_test, y_test_pred_nb)

# Generate classification reports
report_train_at_nb = classification_report(y_train_smote, y_train_pred_nb)
report_test_at_nb = classification_report(y_test, y_test_pred_nb)

# Print results
print(f"Naive Bayes Training Accuracy: {accuracy_train_at_nb}")
print("Naive Bayes Training Classification Report:")
print(report_train_at_nb)

print(f"Naive Bayes Testing Accuracy: {accuracy_test_at_nb}")
print("Naive Bayes Testing Classification Report:")
print(report_test_at_nb)




Best parameters found: {'var_smoothing': 1e-09}
Naive Bayes Training Accuracy: 0.6351776171015404
Naive Bayes Training Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.53      0.59      3181
           1       0.61      0.74      0.67      3181

    accuracy                           0.64      6362
   macro avg       0.64      0.64      0.63      6362
weighted avg       0.64      0.64      0.63      6362

Naive Bayes Testing Accuracy: 0.5613496932515337
Naive Bayes Testing Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.52      0.66       787
           1       0.27      0.71      0.39       191

    accuracy                           0.56       978
   macro avg       0.57      0.62      0.52       978
weighted avg       0.76      0.56      0.61       978



GBM

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the GBM classifier
gbm_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the classifier on the SMOTE variables
gbm_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_gbm = gbm_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_gbm = gbm_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_gbm = accuracy_score(y_train_smote, y_train_pred_gbm)

# Evaluate the classifier on the testing set
accuracy_test_gbm = accuracy_score(y_test, y_test_pred_gbm)

# Generate classification reports
report_train_gbm = classification_report(y_train_smote, y_train_pred_gbm)
report_test_gbm = classification_report(y_test, y_test_pred_gbm)

# Print results
print(f"GBM Training Accuracy: {accuracy_train_gbm}")
print("GBM Training Classification Report:")
print(report_train_gbm)

print(f"GBM Testing Accuracy: {accuracy_test_gbm}")
print("GBM Testing Classification Report:")
print(report_test_gbm)

print("GBM Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_gbm))


GBM Training Accuracy: 0.8992455202766426
GBM Training Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3181
           1       0.94      0.86      0.89      3181

    accuracy                           0.90      6362
   macro avg       0.90      0.90      0.90      6362
weighted avg       0.90      0.90      0.90      6362

GBM Testing Accuracy: 0.8374233128834356
GBM Testing Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.90       787
           1       0.60      0.51      0.55       191

    accuracy                           0.84       978
   macro avg       0.74      0.71      0.73       978
weighted avg       0.83      0.84      0.83       978

GBM Testing Confusion Matrix:
[[721  66]
 [ 93  98]]


Tunded Model(GBM)

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the GBM classifier with default parameters
gbm_clf = GradientBoostingClassifier(random_state=42)

# Set up the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gbm_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train_smote)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
gbm_clf = grid_search.best_estimator_

print(f"Best parameters found: {best_params}")
best_gbm_clf = GradientBoostingClassifier(**best_params, random_state=42)

# Train the best estimator
best_gbm_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_gbm = best_gbm_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_gbm = best_gbm_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_at_gbm = accuracy_score(y_train_smote, y_train_pred_gbm)

# Evaluate the classifier on the testing set
accuracy_test_at_gbm = accuracy_score(y_test, y_test_pred_gbm)

# Generate classification reports
report_train_at_gbm = classification_report(y_train_smote, y_train_pred_gbm)
report_test_at_gbm = classification_report(y_test, y_test_pred_gbm)

# Print results
print(f"GBM Training Accuracy: {accuracy_train_at_gbm}")
print("GBM Training Classification Report:")
print(report_train_at_gbm)

print(f"GBM Testing Accuracy: {accuracy_test_at_gbm}")
print("GBM Testing Classification Report:")
print(report_test_at_gbm)

print("GBM Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_gbm))


Best parameters found: {'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 300, 'subsample': 0.8}
GBM Training Accuracy: 0.9992140836215027
GBM Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3181
           1       1.00      1.00      1.00      3181

    accuracy                           1.00      6362
   macro avg       1.00      1.00      1.00      6362
weighted avg       1.00      1.00      1.00      6362

GBM Testing Accuracy: 0.8895705521472392
GBM Testing Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       787
           1       0.81      0.57      0.67       191

    accuracy                           0.89       978
   macro avg       0.86      0.77      0.80       978
weighted avg       0.88      0.89      0.88       978

GBM Testing Confusion Matrix:
[[761  26]
 [ 82 109]]


Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the SMOTE variables
rf_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_rf = rf_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_rf = rf_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_rf = accuracy_score(y_train_smote, y_train_pred_rf)

# Evaluate the classifier on the testing set
accuracy_test_rf = accuracy_score(y_test, y_test_pred_rf)

# Generate classification reports
report_train_rf = classification_report(y_train_smote, y_train_pred_rf)
report_test_rf = classification_report(y_test, y_test_pred_rf)

# Print results
print(f"Random Forest Training Accuracy: {accuracy_train_rf}")
print("Random Forest Training Classification Report:")
print(report_train_rf)

print(f"Random Forest Testing Accuracy: {accuracy_test_rf}")
print("Random Forest Testing Classification Report:")
print(report_test_rf)

print("Random Forest Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_rf))


Random Forest Training Accuracy: 1.0
Random Forest Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3181
           1       1.00      1.00      1.00      3181

    accuracy                           1.00      6362
   macro avg       1.00      1.00      1.00      6362
weighted avg       1.00      1.00      1.00      6362

Random Forest Testing Accuracy: 0.878323108384458
Random Forest Testing Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93       787
           1       0.74      0.58      0.65       191

    accuracy                           0.88       978
   macro avg       0.82      0.77      0.79       978
weighted avg       0.87      0.88      0.87       978

Random Forest Testing Confusion Matrix:
[[748  39]
 [ 80 111]]


Tuned model (RF)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train_smote)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_rf_clf = grid_search.best_estimator_

print(f"Best parameters found: {best_params}")

# Train the best estimator
best_rf_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_at_pred_rf = best_rf_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_at_pred_rf = best_rf_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_at_rf = accuracy_score(y_train_smote, y_train_at_pred_rf)

# Evaluate the classifier on the testing set
accuracy_test_at_rf = accuracy_score(y_test, y_test_at_pred_rf)

# Generate classification reports
report_train_at_rf = classification_report(y_train_smote, y_train_at_pred_rf)
report_test_at_rf = classification_report(y_test, y_test_at_pred_rf)

# Print results
print(f"Random Forest Training Accuracy: {accuracy_train_at_rf}")
print("Random Forest Training Classification Report:")
print(report_train_at_rf)

print(f"Random Forest Testing Accuracy: {accuracy_test_at_rf}")
print("Random Forest Testing Classification Report:")
print(report_test_at_rf)

print("Random Forest Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_at_pred_rf))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters found: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest Training Accuracy: 1.0
Random Forest Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3181
           1       1.00      1.00      1.00      3181

    accuracy                           1.00      6362
   macro avg       1.00      1.00      1.00      6362
weighted avg       1.00      1.00      1.00      6362

Random Forest Testing Accuracy: 0.8926380368098159
Random Forest Testing Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       787
           1       0.78      0.63      0.70       191

    accuracy                           0.89       978
   macro avg       0.85      0.79      0.82       978
weighted avg       0.89      

KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the KNN classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (n_neighbors) as needed

# Train the classifier on the SMOTE variables
knn_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_pred_knn = knn_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_pred_knn = knn_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_knn = accuracy_score(y_train_smote, y_train_pred_knn)

# Evaluate the classifier on the testing set
accuracy_test_knn = accuracy_score(y_test, y_test_pred_knn)

# Generate classification reports
report_train_knn = classification_report(y_train_smote, y_train_pred_knn)
report_test_knn = classification_report(y_test, y_test_pred_knn)

# Print results
print(f"K-Nearest Neighbors (KNN) Training Accuracy: {accuracy_train_knn}")
print("K-Nearest Neighbors (KNN) Training Classification Report:")
print(report_train_knn)

print(f"K-Nearest Neighbors (KNN) Testing Accuracy: {accuracy_test_knn}")
print("K-Nearest Neighbors (KNN) Testing Classification Report:")
print(report_test_knn)

print("K-Nearest Neighbors (KNN) Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred_knn))


K-Nearest Neighbors (KNN) Training Accuracy: 0.872052813580635
K-Nearest Neighbors (KNN) Training Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87      3181
           1       0.84      0.92      0.88      3181

    accuracy                           0.87      6362
   macro avg       0.87      0.87      0.87      6362
weighted avg       0.87      0.87      0.87      6362

K-Nearest Neighbors (KNN) Testing Accuracy: 0.7372188139059305
K-Nearest Neighbors (KNN) Testing Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       787
           1       0.40      0.70      0.51       191

    accuracy                           0.74       978
   macro avg       0.66      0.72      0.67       978
weighted avg       0.81      0.74      0.76       978

K-Nearest Neighbors (KNN) Testing Confusion Matrix:
[[587 200]
 [ 57 134]]


Tuned KNN model

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Initialize the KNN classifier
knn_clf = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train_smote)

# Get the best parameters and the best estimator
best_params = grid_search.best_params_
best_knn_clf = grid_search.best_estimator_

print(f"Best parameters found: {best_params}")

# Train the best estimator
best_knn_clf.fit(X_train_scaled, y_train_smote)

# Make predictions on the training set
y_train_at_pred_knn = best_knn_clf.predict(X_train_scaled)

# Make predictions on the testing set
y_test_at_pred_knn = best_knn_clf.predict(X_test_scaled)

# Calculate the training accuracy
accuracy_train_at_knn = accuracy_score(y_train_smote, y_train_at_pred_knn)

# Evaluate the classifier on the testing set
accuracy_test_at_knn = accuracy_score(y_test, y_test_at_pred_knn)

# Generate classification reports
report_train_at_knn = classification_report(y_train_smote, y_train_at_pred_knn)
report_test_at_knn = classification_report(y_test, y_test_at_pred_knn)

# Print results
print(f"K-Nearest Neighbors (KNN) Training Accuracy: {accuracy_train_at_knn}")
print("K-Nearest Neighbors (KNN) Training Classification Report:")
print(report_train_at_knn)

print(f"K-Nearest Neighbors (KNN) Testing Accuracy: {accuracy_test_at_knn}")
print("K-Nearest Neighbors (KNN) Testing Classification Report:")
print(report_test_at_knn)

print("K-Nearest Neighbors (KNN) Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_at_pred_knn))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters found: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
K-Nearest Neighbors (KNN) Training Accuracy: 1.0
K-Nearest Neighbors (KNN) Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3181
           1       1.00      1.00      1.00      3181

    accuracy                           1.00      6362
   macro avg       1.00      1.00      1.00      6362
weighted avg       1.00      1.00      1.00      6362

K-Nearest Neighbors (KNN) Testing Accuracy: 0.7822085889570553
K-Nearest Neighbors (KNN) Testing Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.81      0.86       787
           1       0.46      0.65      0.54       191

    accuracy                           0.78       978
   macro avg       0.68      0.73      0.70       978
weighted avg       0.82 

In [30]:
# Test accuracies before tuning
accuracy_test_before_tuning = {
    'Naive Bayes': accuracy_score(y_test, y_test_pred_nb),
    'Gradient Boosting': accuracy_score(y_test, y_test_pred_gbm),
    'Random Forest': accuracy_score(y_test, y_test_pred_rf),
    'K-Nearest Neighbors': accuracy_score(y_test, y_test_pred_knn)
}

# Test accuracies after tuning
accuracy_test_after_tuning = {
    'Naive Bayes': accuracy_score(y_test, y_test_pred_nb),
    'Gradient Boosting': accuracy_score(y_test, y_test_pred_gbm),
    'Random Forest': accuracy_score(y_test, y_test_pred_rf),
    'K-Nearest Neighbors': accuracy_score(y_test, y_test_at_pred_knn)
}

# Print test accuracies
print("Test Accuracies Before Tuning:")
for clf, accuracy in accuracy_test_before_tuning.items():
    print(f"{clf}: {accuracy}")

print("\nTest Accuracies After Tuning:")
for clf, accuracy in accuracy_test_after_tuning.items():
    print(f"{clf}: {accuracy}")

# Find the classifier with the maximum accuracy after tuning
max_accuracy_clf = max(accuracy_test_after_tuning, key=accuracy_test_after_tuning.get)
max_accuracy = accuracy_test_after_tuning[max_accuracy_clf]

print(f"\nThe classifier with the maximum accuracy after tuning is: {max_accuracy_clf} with accuracy: {max_accuracy}")


Test Accuracies Before Tuning:
Naive Bayes: 0.5613496932515337
Gradient Boosting: 0.8374233128834356
Random Forest: 0.878323108384458
K-Nearest Neighbors: 0.7822085889570553

Test Accuracies After Tuning:
Naive Bayes: 0.5613496932515337
Gradient Boosting: 0.8374233128834356
Random Forest: 0.878323108384458
K-Nearest Neighbors: 0.7822085889570553

The classifier with the maximum accuracy after tuning is: Random Forest with accuracy: 0.878323108384458


In [34]:
import plotly.graph_objects as go

# Define classifier names
classifiers = ['Naive Bayes', 'Gradient Boosting', 'Random Forest', 'KNN']

# Define training and testing accuracies before tuning
accuracy_train_before_tuning = [accuracy_train_at_nb, accuracy_train_at_gbm, accuracy_train_at_rf, accuracy_train_at_knn]
accuracy_test_before_tuning = [accuracy_test_at_nb, accuracy_test_at_gbm, accuracy_test_at_rf, accuracy_test_at_knn]

# Define training and testing accuracies after tuning
accuracy_train_after_tuning = [best_nb_clf.score(X_train_scaled, y_train_smote),
                               best_gbm_clf.score(X_train_scaled, y_train_smote),
                               best_rf_clf.score(X_train_scaled, y_train_smote),
                               best_knn_clf.score(X_train_scaled, y_train_smote)]

accuracy_test_after_tuning = [best_nb_clf.score(X_test_scaled, y_test),
                              best_gbm_clf.score(X_test_scaled, y_test),
                              best_rf_clf.score(X_test_scaled, y_test),
                              best_knn_clf.score(X_test_scaled, y_test)]

# Create traces for before and after tuning
fig = go.Figure()

fig.add_trace(go.Bar(
    x=classifiers,
    y=accuracy_train_before_tuning,
    name='Before Tuning - Training Accuracy',
    marker_color='indianred'
))

fig.add_trace(go.Bar(
    x=classifiers,
    y=accuracy_test_before_tuning,
    name='Before Tuning - Testing Accuracy',
    marker_color='lightsalmon'
))

fig.add_trace(go.Bar(
    x=classifiers,
    y=accuracy_train_after_tuning,
    name='After Tuning - Training Accuracy',
    marker_color='lightblue'
))

fig.add_trace(go.Bar(
    x=classifiers,
    y=accuracy_test_after_tuning,
    name='After Tuning - Testing Accuracy',
    marker_color='skyblue'
))

# Update layout
fig.update_layout(
    title='Classifier Performance Before and After Tuning',
    xaxis_tickfont_size=14,
    yaxis=dict(
        title='Accuracy',
        titlefont_size=16,
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,
    bargroupgap=0.1
)

# Show plot
fig.show()
