In [1]:
import pandas as pd
# import pyarrow
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

In [2]:
X_train = pd.read_parquet('tmp/X_train.parquet')
X_test = pd.read_parquet('tmp/X_test.parquet')
y_train = pd.read_parquet('tmp/y_train.parquet')
y_test = pd.read_parquet('tmp/y_test.parquet')

In [3]:
X_train.head()

Unnamed: 0,Age,Income,Emp_length,Amount,Rate,Percent_income,Cred_length,Home_MORTGAGE,Home_OTHER,Home_OWN,Home_RENT,Intent_DEBTCONSOLIDATION,Intent_EDUCATION,Intent_HOMEIMPROVEMENT,Intent_MEDICAL,Intent_PERSONAL,Intent_VENTURE,Default_N,Default_Y
0,5.712903,-0.323881,-0.67342,-0.757573,0.024165,-0.657458,4.487315,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.273252,-0.646554,-1.156213,-0.172315,1.67828,1.496501,-0.691554,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.745843,-0.243213,-0.432024,1.014021,0.647544,1.3092,-0.691554,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.903374,-0.16248,0.292165,-0.56776,-0.966452,-0.563808,-0.444942,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.588313,0.20853,-0.432024,-0.409582,0.47164,-0.751109,-0.691554,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [4]:
y_train = y_train['y_train']
y_test = y_test['y_test']
y_train

0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
        ... 
26059    0.0
26060    1.0
26061    1.0
26062    0.0
26063    0.0
Name: y_train, Length: 26064, dtype: float64

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26064, 19), (6517, 19), (26064,), (6517,))

In [22]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

In [23]:
# Train models and evaluate performance
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Performance for {model_name}:\n")
    print(classification_report(y_test, y_pred))

Performance for Logistic Regression:

              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90      5072
         1.0       0.72      0.45      0.55      1445

    accuracy                           0.84      6517
   macro avg       0.79      0.70      0.73      6517
weighted avg       0.83      0.84      0.82      6517

Performance for Random Forest:

              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      5072
         1.0       0.94      0.68      0.79      1445

    accuracy                           0.92      6517
   macro avg       0.93      0.83      0.87      6517
weighted avg       0.92      0.92      0.91      6517

Performance for SVM:

              precision    recall  f1-score   support

         0.0       0.89      0.98      0.93      5072
         1.0       0.89      0.57      0.69      1445

    accuracy                           0.89      6517
   macro avg       0.89      0.77   

In [38]:
%time
# Hyperparameter Tuning with GridSearchCV for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'min_samples_split': [0.2, 0.3]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train.ravel())

CPU times: total: 0 ns
Wall time: 0 ns


In [39]:
# Best model and parameters
best_rf_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {grid_search.best_params_}")

print("Test accuracy for RF:", best_rf_model.score(X_test, y_test.ravel()))

# Evaluate the best model on the test set
y_pred_rf_best = best_rf_model.predict(X_test)
print(f"Best Model Performance:\n{classification_report(y_test.ravel(), y_pred_rf_best)}")

Best Hyperparameters: {'max_depth': 5, 'min_samples_split': 0.2, 'n_estimators': 200}
Test accuracy for RF: 0.8487033911308884
Best Model Performance:
              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91      5072
         1.0       0.99      0.32      0.49      1445

    accuracy                           0.85      6517
   macro avg       0.91      0.66      0.70      6517
weighted avg       0.87      0.85      0.82      6517



In [60]:
%time
# Define the Logistic Regression model

# Define the parameter grid
param_grid_log_reg = {
    'C': [0.01, 0.1, 1],
    'solver': ['lbfgs', 'liblinear']  # Different solvers
}

# Initialize GridSearchCV
grid_search_log_reg = GridSearchCV(
    estimator=LogisticRegression(), param_grid=param_grid_log_reg, cv=5, scoring='accuracy', n_jobs=1, verbose=1)

# Fit GridSearchCV
grid_search_log_reg.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 0 ns
Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [61]:

# Best hyperparameters and score
print("Best hyperparameters for Logistic Regression:", grid_search_log_reg.best_params_)
print("Best accuracy for Logistic Regression:", grid_search_log_reg.best_score_)

# Evaluate the best model on the test set
best_log_reg = grid_search_log_reg.best_estimator_
print("Test accuracy for Logistic Regression:", best_log_reg.score(X_test, y_test))

y_pred_log_reg_best = best_log_reg.predict(X_test)
print(f"Best Model Performance:\n{classification_report(y_test, y_pred_log_reg_best)}")

Best hyperparameters for Logistic Regression: {'C': 1, 'solver': 'liblinear'}
Best accuracy for Logistic Regression: 0.8479894705058004
Test accuracy for Logistic Regression: 0.8384225870799448
Best Model Performance:
              precision    recall  f1-score   support

         0.0       0.86      0.95      0.90      5072
         1.0       0.72      0.45      0.55      1445

    accuracy                           0.84      6517
   macro avg       0.79      0.70      0.73      6517
weighted avg       0.83      0.84      0.82      6517



In [64]:
# Define the SVM model
svm = SVC(probability=True)

# Define the parameter grid
param_grid_svm = {
    'C': [0.01, 0.1],
    'kernel': ['linear', 'poly'],
    'gamma': ['scale', 'auto']  # For 'rbf', 'poly', and 'sigmoid' kernels
}

# Initialize GridSearchCV
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=5, scoring='accuracy', n_jobs=1, verbose=2)

# Fit GridSearchCV
grid_search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   4.6s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   4.6s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   4.6s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   4.5s
[CV] END .................C=0.01, gamma=scale, kernel=linear; total time=   4.6s
[CV] END ...................C=0.01, gamma=scale, kernel=poly; total time=   5.0s
[CV] END ...................C=0.01, gamma=scale, kernel=poly; total time=   4.9s
[CV] END ...................C=0.01, gamma=scale, kernel=poly; total time=   5.0s
[CV] END ...................C=0.01, gamma=scale, kernel=poly; total time=   5.0s
[CV] END ...................C=0.01, gamma=scale, kernel=poly; total time=   5.0s
[CV] END ..................C=0.01, gamma=auto, kernel=linear; total time=   4.6s
[CV] END ..................C=0.01, gamma=auto, ke

In [65]:
# Best hyperparameters and score
print("Best hyperparameters for SVM:", grid_search_svm.best_params_)
print("Best accuracy for SVM:", grid_search_svm.best_score_)

# Evaluate the best model on the test set
best_svm = grid_search_svm.best_estimator_
print("Test accuracy for SVM:", best_svm.score(X_test, y_test))
y_pred_svm_best = best_svm.predict(X_test)
print(f"Best Model Performance:\n{classification_report(y_test, y_pred_svm_best)}")

Best hyperparameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
Best accuracy for SVM: 0.8773019558665766
Test accuracy for SVM: 0.8750959030228633
Best Model Performance:
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92      5072
         1.0       0.90      0.49      0.64      1445

    accuracy                           0.88      6517
   macro avg       0.89      0.74      0.78      6517
weighted avg       0.88      0.88      0.86      6517



In [66]:
# save models
import joblib
joblib.dump(best_rf_model, 'best_random_forest_model.pkl')
joblib.dump(best_log_reg, 'best_logistic_regression_model.pkl')
joblib.dump(best_svm, 'best_svm_model.pkl')

['best_svm_model.pkl']

In [None]:
from tpot import TPOTClassifier
# using automl tpot to find best tree classifier with genetic algorithm
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
accuracy = tpot.score(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")
tpot.export('best_tpot_pipeline.py')
print('Model written successfully')

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.926258362300165

Generation 2 - Current best internal CV score: 0.926258362300165

Generation 3 - Current best internal CV score: 0.926258362300165
