# Modeling

In [25]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import PowerTransformer, RobustScaler

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # supress warning 

In [11]:
# Check current working directory
# The CSV files in the next cell are read through relative 'data/...' paths,
# so they are resolved against the directory printed here.
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: C:\Users\jessh\Documents\MS_Applied_Data_Science\ADS599 Capstone Project\Project


#### Read in data from the preprocessing notebook

In [12]:
# Load the feature matrices and labels from the data/ directory.
# Files are grouped below by representation (as indicated by their file-name
# suffix) rather than by split.

# no suffix
x_train = pd.read_csv('data/x_train.csv')
x_val = pd.read_csv('data/x_val.csv')
x_test = pd.read_csv('data/x_test.csv')

# *_scaled
x_train_scaled = pd.read_csv('data/x_train_scaled.csv')
x_val_scaled = pd.read_csv('data/x_val_scaled.csv')
x_test_scaled = pd.read_csv('data/x_test_scaled.csv')

# *_pca
x_train_pca = pd.read_csv('data/x_train_pca.csv')
x_val_pca = pd.read_csv('data/x_val_pca.csv')
x_test_pca = pd.read_csv('data/x_test_pca.csv')

# *_scaled_pca
x_train_scaled_pca = pd.read_csv('data/x_train_scaled_pca.csv')
x_val_scaled_pca = pd.read_csv('data/x_val_scaled_pca.csv')
x_test_scaled_pca = pd.read_csv('data/x_test_scaled_pca.csv')

# labels: each file is read as a dataframe and flattened to a 1-D array
y_train = pd.read_csv('data/y_train.csv').to_numpy().ravel()
y_val = pd.read_csv('data/y_val.csv').to_numpy().ravel()
y_test = pd.read_csv('data/y_test.csv').to_numpy().ravel()

#### Yeo Johnson transformation of data

We wanted to add additional dataframes to see whether they made a difference in modeling performance. The Yeo-Johnson transformation is one of them; another is the transformation followed by scaling.

In [28]:
# transformed data
# create copies so the untransformed dataframes stay available
x_train_transformed = x_train.copy()
x_val_transformed = x_val.copy()
x_test_transformed = x_test.copy()

# get numeric (float) columns
numeric_columns = x_train_transformed.select_dtypes(include=['float']).columns

def yeo_johnson_transform(column, transformer=None):
    """Apply a Yeo-Johnson power transform to a single column.

    Parameters
    ----------
    column : pandas.Series
        Column to transform; its values are reshaped to 2D because
        PowerTransformer expects 2D input.
    transformer : PowerTransformer, optional
        An already-fitted transformer. When given, it is applied to
        ``column`` without refitting, so the parameters learned on another
        split are reused. When omitted, a new transformer is fitted on
        ``column`` itself (the original behaviour of this helper).

    Returns
    -------
    numpy.ndarray
        Flattened 1-D array of transformed values, one per input row.
    """
    # Reshape column for PowerTransformer which expects 2D input
    column_reshaped = column.values.reshape(-1, 1)

    if transformer is None:
        # No fitted transformer supplied: fit and transform on this column
        transformer = PowerTransformer(method='yeo-johnson')
        transformed_col = transformer.fit_transform(column_reshaped)
    else:
        # Reuse the supplied, already-fitted transformer
        transformed_col = transformer.transform(column_reshaped)

    # Flatten the result to match original column shape
    return transformed_col.flatten()

# Apply the Yeo-Johnson transformation to each numeric column.
# The transformer is fitted on the training split only and then reused for the
# validation and test splits, so all three splits share the same mapping and
# no information from the held-out data is used for fitting.
for col in numeric_columns:
    col_transformer = PowerTransformer(method='yeo-johnson')
    col_transformer.fit(x_train_transformed[col].values.reshape(-1, 1))

    x_train_transformed[col] = yeo_johnson_transform(x_train_transformed[col], col_transformer)
    x_val_transformed[col] = yeo_johnson_transform(x_val_transformed[col], col_transformer)
    x_test_transformed[col] = yeo_johnson_transform(x_test_transformed[col], col_transformer)


#### Yeo-Johnson transformed + scaled data

In [41]:
# Start from the Yeo-Johnson transformed frames; copying leaves those frames unchanged
x_test_trans_scaled = x_test_transformed.copy()
x_val_trans_scaled = x_val_transformed.copy()
x_train_trans_scaled = x_train_transformed.copy()

# Fit the scaler on the training split only
scaler = RobustScaler()
scaler.fit(x_train_trans_scaled[numeric_columns])

# Apply the training-fitted scaler to the numeric columns of every split
x_train_trans_scaled[numeric_columns] = scaler.transform(x_train_trans_scaled[numeric_columns])
x_val_trans_scaled[numeric_columns] = scaler.transform(x_val_trans_scaled[numeric_columns])
x_test_trans_scaled[numeric_columns] = scaler.transform(x_test_trans_scaled[numeric_columns])

## Baseline Model Selection - Logistic Regression

We'll first start by deciding on a baseline model for comparison against other models. The confusion matrix will be used to determine which dataframe will be ingested for each machine learning model. We currently have the following dataframes/data to feed into the logistic regression model:

* The preprocessed data - x_train
* The transformed data - x_train_transformed
* The scaled data - x_train_scaled
* The transformed + scaled data - x_train_trans_scaled
* The pca transformed data - x_train_pca
* The scaled data + pca - x_train_scaled_pca

Based on the results of the baseline regression model, we can choose a dataframe to carry through the modeling process.

#### Create and train Logistic Regression Model for unscaled data

This is the first model, trained on the data that has been preprocessed but neither scaled nor transformed for normality. The accuracy was terrible (about 25%): the model predicted class 1 for every sample, so the precision, recall, and F-score for class 0 were all zero.

In [5]:
# logreg model: baseline on the preprocessed (unscaled, untransformed) features
model = LogisticRegression()

# Train the model
model.fit(x_train, y_train)

# Evaluate the model on the validation set
y_val_pred = model.predict(x_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
# zero_division=0: when a class is never predicted its precision is undefined;
# report it as 0 explicitly instead of emitting a warning on every call.
val_classification_report = classification_report(y_val, y_val_pred, zero_division=0)

print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:')
print(val_confusion_matrix)
print('Validation Classification Report:')
print(val_classification_report)


# Evaluate the model on the test set
y_test_pred = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
# Same explicit handling of undefined precision as for the validation report
test_classification_report = classification_report(y_test, y_test_pred, zero_division=0)

print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:')
print(test_confusion_matrix)
print('Test Classification Report:')
print(test_classification_report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.25420515406663136
Validation Confusion Matrix:
[[     0 116831]
 [     0  39822]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    116831
           1       0.25      1.00      0.41     39822

    accuracy                           0.25    156653
   macro avg       0.13      0.50      0.20    156653
weighted avg       0.06      0.25      0.10    156653

Test Accuracy: 0.25420353134934315
Test Confusion Matrix:
[[     0 116832]
 [     0  39822]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    116832
           1       0.25      1.00      0.41     39822

    accuracy                           0.25    156654
   macro avg       0.13      0.50      0.20    156654
weighted avg       0.06      0.25      0.10    156654



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Create and train Logistic Regression Model for the scaled data

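This is the second model, trained on the data that has been preprocessed and scaled, but not transformed for normality. The accuracy was 100% on both the validation and test sets, leading us to believe that the model is overfit; because the held-out sets are also classified perfectly, target leakage (a feature that encodes the label) should be ruled out as well.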

In [14]:
# logreg model on the scaled features
# max_iter is raised above the default: with the default limit the solver
# stopped before converging ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the
# fitted coefficients were not a converged solution.
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(x_train_scaled, y_train)

# Evaluate the model on the validation set
y_val_pred = model.predict(x_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:')
print(val_confusion_matrix)
print('Validation Classification Report:')
print(val_classification_report)


# Evaluate the model on the test set
y_test_pred = model.predict(x_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)

print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:')
print(test_confusion_matrix)
print('Test Classification Report:')
print(test_classification_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 1.0
Validation Confusion Matrix:
[[116831      0]
 [     0  39822]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116831
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156653
   macro avg       1.00      1.00      1.00    156653
weighted avg       1.00      1.00      1.00    156653

Test Accuracy: 1.0
Test Confusion Matrix:
[[116832      0]
 [     0  39822]]
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116832
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156654
   macro avg       1.00      1.00      1.00    156654
weighted avg       1.00      1.00      1.00    156654



#### Create and train Logistic Regression Model for yeo-johnson transformed data

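This is the third model, trained on the data that has been preprocessed and transformed, but not scaled. The accuracy was 100% on both the validation and test sets, leading us to believe that the model is also overfit; a perfect score on held-out data may instead indicate that a feature leaks the target, which should be checked.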

In [31]:
# Fit a default logistic regression on the Yeo-Johnson transformed training features
model = LogisticRegression().fit(x_train_transformed, y_train)

# --- validation split ---
y_val_pred = model.predict(x_val_transformed)
val_accuracy, val_confusion_matrix, val_classification_report = (
    accuracy_score(y_val, y_val_pred),
    confusion_matrix(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
)

print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:', val_confusion_matrix, sep='\n')
print('Validation Classification Report:', val_classification_report, sep='\n')

# --- test split ---
y_test_pred = model.predict(x_test_transformed)
test_accuracy, test_confusion_matrix, test_classification_report = (
    accuracy_score(y_test, y_test_pred),
    confusion_matrix(y_test, y_test_pred),
    classification_report(y_test, y_test_pred),
)

print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:', test_confusion_matrix, sep='\n')
print('Test Classification Report:', test_classification_report, sep='\n')

Validation Accuracy: 1.0
Validation Confusion Matrix:
[[116831      0]
 [     0  39822]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116831
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156653
   macro avg       1.00      1.00      1.00    156653
weighted avg       1.00      1.00      1.00    156653

Test Accuracy: 1.0
Test Confusion Matrix:
[[116832      0]
 [     0  39822]]
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116832
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156654
   macro avg       1.00      1.00      1.00    156654
weighted avg       1.00      1.00      1.00    156654



#### Create and train Logistic Regression Model for yeo-johnson transformed and scaled data

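This is the fourth model, trained on the data that has been preprocessed, transformed for normality, and scaled. The accuracy was 100% on both the validation and test sets, leading us to believe that the model is also overfit; a perfect score on held-out data may instead indicate that a feature leaks the target, which should be checked.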

In [42]:
# Default logistic regression trained on the transformed + scaled training features
model = LogisticRegression()
model.fit(x_train_trans_scaled, y_train)

# Predictions and metrics for the validation split
y_val_pred = model.predict(x_val_trans_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

# Validation results
print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:', val_confusion_matrix, sep='\n')
print('Validation Classification Report:', val_classification_report, sep='\n')

# Predictions and metrics for the test split
y_test_pred = model.predict(x_test_trans_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)

# Test results
print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:', test_confusion_matrix, sep='\n')
print('Test Classification Report:', test_classification_report, sep='\n')

Validation Accuracy: 1.0
Validation Confusion Matrix:
[[116831      0]
 [     0  39822]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116831
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156653
   macro avg       1.00      1.00      1.00    156653
weighted avg       1.00      1.00      1.00    156653

Test Accuracy: 1.0
Test Confusion Matrix:
[[116832      0]
 [     0  39822]]
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    116832
           1       1.00      1.00      1.00     39822

    accuracy                           1.00    156654
   macro avg       1.00      1.00      1.00    156654
weighted avg       1.00      1.00      1.00    156654



#### Create and train Logistic Regression Model for the PCA transformed data (orig)

This is the fifth model, trained on the PCA-transformed version of the preprocessed data (neither scaled nor transformed for normality). The accuracy was about 81%, which is the best credible result so far, setting aside the suspicious 100% scores above.

In [15]:
# Fit a default logistic regression on the PCA training features
model = LogisticRegression().fit(x_train_pca, y_train)

# --- validation split ---
y_val_pred = model.predict(x_val_pca)
val_accuracy, val_confusion_matrix, val_classification_report = (
    accuracy_score(y_val, y_val_pred),
    confusion_matrix(y_val, y_val_pred),
    classification_report(y_val, y_val_pred),
)

print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:', val_confusion_matrix, sep='\n')
print('Validation Classification Report:', val_classification_report, sep='\n')

# --- test split ---
y_test_pred = model.predict(x_test_pca)
test_accuracy, test_confusion_matrix, test_classification_report = (
    accuracy_score(y_test, y_test_pred),
    confusion_matrix(y_test, y_test_pred),
    classification_report(y_test, y_test_pred),
)

print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:', test_confusion_matrix, sep='\n')
print('Test Classification Report:', test_classification_report, sep='\n')

Validation Accuracy: 0.8094706133939343
Validation Confusion Matrix:
[[114853   1978]
 [ 27869  11953]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.89    116831
           1       0.86      0.30      0.44     39822

    accuracy                           0.81    156653
   macro avg       0.83      0.64      0.66    156653
weighted avg       0.82      0.81      0.77    156653

Test Accuracy: 0.8097080189462127
Test Confusion Matrix:
[[114859   1973]
 [ 27837  11985]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.89    116832
           1       0.86      0.30      0.45     39822

    accuracy                           0.81    156654
   macro avg       0.83      0.64      0.67    156654
weighted avg       0.82      0.81      0.77    156654



#### Create and train Logistic Regression Model for the PCA transformed data (scaled)

This is the sixth model, trained on the PCA-transformed version of the scaled data (not transformed for normality). The accuracy was about 82%, which makes it the best model so far, beating the previous model.

In [16]:
# Default logistic regression trained on the scaled + PCA training features
model = LogisticRegression()
model.fit(x_train_scaled_pca, y_train)

# Predictions and metrics for the validation split
y_val_pred = model.predict(x_val_scaled_pca)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

# Validation results
print(f'Validation Accuracy: {val_accuracy}')
print('Validation Confusion Matrix:', val_confusion_matrix, sep='\n')
print('Validation Classification Report:', val_classification_report, sep='\n')

# Predictions and metrics for the test split
y_test_pred = model.predict(x_test_scaled_pca)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)

# Test results
print(f'Test Accuracy: {test_accuracy}')
print('Test Confusion Matrix:', test_confusion_matrix, sep='\n')
print('Test Classification Report:', test_classification_report, sep='\n')

Validation Accuracy: 0.8236739800706019
Validation Confusion Matrix:
[[107864   8967]
 [ 18655  21167]]
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.89    116831
           1       0.70      0.53      0.61     39822

    accuracy                           0.82    156653
   macro avg       0.78      0.73      0.75    156653
weighted avg       0.81      0.82      0.81    156653

Test Accuracy: 0.82516245994357
Test Confusion Matrix:
[[108066   8766]
 [ 18623  21199]]
Test Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.89    116832
           1       0.71      0.53      0.61     39822

    accuracy                           0.83    156654
   macro avg       0.78      0.73      0.75    156654
weighted avg       0.82      0.83      0.82    156654

