In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [2]:
df = pd.read_csv('cleaned_data.csv')
X = df.drop('target',axis=1)
y = df['target']


In [3]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")




Training set size: 60144
Validation set size: 20048
Test set size: 20049


In [4]:
models = {
    'Logistic_Regression': LogisticRegression(max_iter=10000),
    'Decision_Tree': DecisionTreeClassifier(random_state=42),
    'Random_Forest': RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500),
    'K-Nearest_Neighbors': KNeighborsClassifier(n_jobs=-1, n_neighbors=5),
    'Naive_Bayes': GaussianNB(),
    'Gradient_Boosting': GradientBoostingClassifier(random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model_name)
    print('Accuracy:',accuracy_score(y_test,y_pred))
    print('Confusion Matrix:',confusion_matrix(y_test,y_pred))
    print('Classification Report:',classification_report(y_test,y_pred))


Logistic_Regression
Accuracy: 0.5402264452092373
Confusion Matrix: [[8834 1930]
 [7288 1997]]
Classification Report:               precision    recall  f1-score   support

           0       0.55      0.82      0.66     10764
           1       0.51      0.22      0.30      9285

    accuracy                           0.54     20049
   macro avg       0.53      0.52      0.48     20049
weighted avg       0.53      0.54      0.49     20049

Decision_Tree
Accuracy: 0.520474836650207
Confusion Matrix: [[5921 4843]
 [4771 4514]]
Classification Report:               precision    recall  f1-score   support

           0       0.55      0.55      0.55     10764
           1       0.48      0.49      0.48      9285

    accuracy                           0.52     20049
   macro avg       0.52      0.52      0.52     20049
weighted avg       0.52      0.52      0.52     20049

Random_Forest
Accuracy: 0.5493540824978802
Confusion Matrix: [[7012 3752]
 [5283 4002]]
Classification Report:         

[WinError 2] The system cannot find the file specified
  File "C:\Users\admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.12_3.12.1520.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


K-Nearest_Neighbors
Accuracy: 0.5209237368447304
Confusion Matrix: [[6295 4469]
 [5136 4149]]
Classification Report:               precision    recall  f1-score   support

           0       0.55      0.58      0.57     10764
           1       0.48      0.45      0.46      9285

    accuracy                           0.52     20049
   macro avg       0.52      0.52      0.52     20049
weighted avg       0.52      0.52      0.52     20049

Naive_Bayes
Accuracy: 0.5408249788019353
Confusion Matrix: [[9495 1269]
 [7937 1348]]
Classification Report:               precision    recall  f1-score   support

           0       0.54      0.88      0.67     10764
           1       0.52      0.15      0.23      9285

    accuracy                           0.54     20049
   macro avg       0.53      0.51      0.45     20049
weighted avg       0.53      0.54      0.47     20049

Gradient_Boosting
Accuracy: 0.5643174223153274
Confusion Matrix: [[7245 3519]
 [5216 4069]]
Classification Report:      

In [5]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Gradient Boosting

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7]
}

# Initialize GridSearchCV with the GradientBoostingClassifier, the parameter grid, and 5-fold cross-validation

grid_search = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model
print('Accuracy:',accuracy_score(y_test,y_pred))
print('Confusion Matrix:',confusion_matrix(y_test,y_pred))
print('Classification Report:',classification_report(y_test,y_pred))
print(f"Best Parameters: {best_params}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Accuracy: 0.5643174223153274
Confusion Matrix: [[7245 3519]
 [5216 4069]]
Classification Report:               precision    recall  f1-score   support

           0       0.58      0.67      0.62     10764
           1       0.54      0.44      0.48      9285

    accuracy                           0.56     20049
   macro avg       0.56      0.56      0.55     20049
weighted avg       0.56      0.56      0.56     20049

Best Parameters: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 300}


In [8]:


# Predict on the validation set using the best model
y_pred_val = best_model.predict(X_val)

# Evaluate the model's performance on the validation set
print("Validation Set Performance:")
print('Accuracy:', accuracy_score(y_val, y_pred_val))
print('Confusion Matrix:', confusion_matrix(y_val, y_pred_val))
print('Classification Report:', classification_report(y_val, y_pred_val))


Validation Set Performance:
Accuracy: 0.5705806065442937
Confusion Matrix: [[7242 3521]
 [5088 4197]]
Classification Report:               precision    recall  f1-score   support

           0       0.59      0.67      0.63     10763
           1       0.54      0.45      0.49      9285

    accuracy                           0.57     20048
   macro avg       0.57      0.56      0.56     20048
weighted avg       0.57      0.57      0.57     20048



In [10]:
import joblib

# Save the model to a file
joblib.dump(best_model, 'model.pkl')

# To load the model later
# loaded_model = joblib.load('best_model.pkl')

['model.pkl']