# Set up Environment

In [2]:
pip install imbalanced-learn

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
# Import Necessary Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import (confusion_matrix, 
                             roc_auc_score, 
                             average_precision_score)
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, f1_score, cohen_kappa_score, brier_score_loss

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# import xgboost as xgb

import warnings
import time
warnings.filterwarnings("ignore")

from multiprocessing import Pool
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from collections import Counter

# Load Processed Data

In [4]:
# set up table for all loans
df = pd.read_csv('/Users/miracles/Desktop/practicum/Processed Data/df_final.csv')
df

Unnamed: 0,Loan Sequence Number,Monthly Reporting Period,Current Actual UPB,Current Loan Delinquency Status,Loan Age,Remaining Months to Legal Maturity,Current Interest Rate,Delinquency Due to Disaster,Interest Bearing UPB,Original UPB,...,Occupancy Status_S,Property Type_CO,Property Type_CP,Property Type_MH,Property Type_PU,Property Type_SF,Property Valuation Method_1.0,Property Valuation Method_2.0,Property Valuation Method_3.0,Property Valuation Method_9.0
0,F11Q10000044,2011-02-01,135000.00,0,0.0,360.0,4.625,0,135000.00,135000.0,...,False,False,False,False,True,False,False,False,False,True
1,F11Q10000044,2011-03-01,135000.00,0,1.0,359.0,4.625,0,135000.00,135000.0,...,False,False,False,False,True,False,False,False,False,True
2,F11Q10000044,2011-04-01,135000.00,0,2.0,358.0,4.625,0,135000.00,135000.0,...,False,False,False,False,True,False,False,False,False,True
3,F11Q10000044,2011-05-01,134000.00,0,3.0,357.0,4.625,0,134000.00,135000.0,...,False,False,False,False,True,False,False,False,False,True
4,F11Q10000044,2011-06-01,134000.00,0,4.0,356.0,4.625,0,134000.00,135000.0,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014603,F19Q40515256,2022-11-01,115037.14,16,34.0,326.0,4.125,0,115037.14,118000.0,...,False,False,False,False,False,True,False,True,False,False
2014604,F19Q40515256,2022-12-01,115037.14,17,35.0,325.0,4.125,0,115037.14,118000.0,...,False,False,False,False,False,True,False,True,False,False
2014605,F19Q40515256,2023-01-01,115037.14,18,36.0,324.0,4.125,0,115037.14,118000.0,...,False,False,False,False,False,True,False,True,False,False
2014606,F19Q40515256,2023-02-01,115037.14,19,37.0,323.0,4.125,0,115037.14,118000.0,...,False,False,False,False,False,True,False,True,False,False


In [5]:
# Define columns of different types
columns_orig = [
       'Original UPB', 'Mortgage Insurance Percentage (MI %)',
       'Original Loan-to-Value (LTV)', 'Original Interest Rate', 'Super Conforming Flag', 'Credit Score',
       'Original Debt-to-Income (DTI) Ratio',
       'Number of Borrowers', 'Number of Units', 'Property Valuation Method',
       'Valid DTI Ratio', 'Channel_B', 'Channel_C', 'Channel_R', 'Loan Purpose_C',
       'Loan Purpose_N', 'Loan Purpose_P', 'First Time Homebuyer Flag_N',
       'First Time Homebuyer Flag_Y', 'Occupancy Status_I',
       'Occupancy Status_P', 'Occupancy Status_S', 'Property Type_CO',
       'Property Type_CP', 'Property Type_MH', 'Property Type_PU',
       'Property Type_SF']

columns_monthly = [
       'Monthly Reporting Period',
       'Current Actual UPB', 'Current Loan Delinquency Status', 'Loan Age',
       'Remaining Months to Legal Maturity', 
       'Current Interest Rate', 'Delinquency Due to Disaster', 'Interest Bearing UPB',
       'Label']

columns_categorical_final = [
       'Valid DTI Ratio',
       'Super Conforming Flag',
       'Delinquency Due to Disaster',
       'Channel',
       'Loan Purpose',
       'First Time Homebuyer Flag',
       'Occupancy Status',
       'Property Type',
       'Property Valuation Method']

columns_continuous = [
       'Current Actual UPB', 'Current Loan Delinquency Status', 'Loan Age',
       'Remaining Months to Legal Maturity', 'Current Interest Rate',
       'Interest Bearing UPB', 'Original UPB',
       'Mortgage Insurance Percentage (MI %)', 'Original Loan-to-Value (LTV)',
       'Original Interest Rate',
       'Credit Score',
       'Original Debt-to-Income (DTI) Ratio', 'Number of Borrowers',
       'Number of Units','Valid DTI Ratio',
       'Housing Price', 'CPI', 'Unemployment_Rate', 'Recession'
]

## Create shifted label

In [6]:
# Convert 'Monthly Reporting Period' to datetime type for proper sorting and manipulation
df['Monthly Reporting Period'] = pd.to_datetime(df['Monthly Reporting Period'])

# Sort the DataFrame by 'Loan Sequence Number' and 'Monthly Reporting Period' to ensure correct label alignment
df.sort_values(by=['Loan Sequence Number', 'Monthly Reporting Period'], inplace=True)

# Group by 'Loan Sequence Number' and shift 'Label' up by one to create the 'label_target'
df['label_target'] = df.groupby('Loan Sequence Number')['Label'].shift(-1)

# Drop rows where 'label_target' is NaN, indicating no subsequent month data for the same loan sequence number
df = df.dropna(subset=['label_target'])
df = df.drop(columns = {'Label'})
df = df.reset_index(drop = True)

In [7]:
df['target_0&1'] = df['label_target'].apply(lambda x: 0 if x in [0, 1] else 1)

## Create Windowed dataframes for Modeling

In [8]:
def gaussian_weights(window_size, std_dev):
    # Creates a series of weights for a Gaussian distribution
    # The most recent time point (the last in the sequence) gets the highest weight
    return np.exp(-0.5 * ((np.arange(window_size) - window_size + 1) / std_dev) ** 2)

def apply_gaussian_rolling(group, window_size=12, std_dev=3):
    weights = gaussian_weights(window_size, std_dev)
    # Initialize a DataFrame to hold the weighted averages
    weighted_averages = pd.DataFrame(index=group.index)
    # Calculate the weighted average for each feature
    for column in columns_to_process:
        # Prepend zeros for padding if there are not enough past months
        padded_series = np.concatenate((np.zeros(window_size - 1), group[column].to_numpy()))
        # Apply convolution to calculate the weighted average using Gaussian weights
        weighted_avg = np.convolve(padded_series, weights[::-1], mode='valid') / np.convolve(np.ones_like(padded_series), weights[::-1], mode='valid')
        weighted_averages[column] = weighted_avg
    return weighted_averages

# List of columns to process
columns_to_process = ['Current Actual UPB', 'Current Loan Delinquency Status', 'Loan Age',
       'Remaining Months to Legal Maturity', 'Current Interest Rate',
       'Delinquency Due to Disaster', 'Interest Bearing UPB', 'Original UPB',
       'Mortgage Insurance Percentage (MI %)', 'Original Loan-to-Value (LTV)',
       'Original Interest Rate', 'Super Conforming Flag', 'Credit Score',
       'Original Debt-to-Income (DTI) Ratio', 'Number of Borrowers',
       'Number of Units', 'Valid DTI Ratio', 'Housing Price', 'CPI',
       'Unemployment_Rate', 'Recession', 'Channel_B', 'Channel_C', 'Channel_R',
       'Loan Purpose_C', 'Loan Purpose_N', 'Loan Purpose_P',
       'First Time Homebuyer Flag_N', 'First Time Homebuyer Flag_Y',
       'Occupancy Status_I', 'Occupancy Status_P', 'Occupancy Status_S',
       'Property Type_CO', 'Property Type_CP', 'Property Type_MH',
       'Property Type_PU', 'Property Type_SF', 'Property Valuation Method_1.0',
       'Property Valuation Method_2.0', 'Property Valuation Method_3.0',
       'Property Valuation Method_9.0']  

# Apply the function to each group and store the result
window_size = 12  # This includes the current month and the past 11 months
std_dev = 2  # Standard deviation for the Gaussian weights

# We need to make sure the DataFrame is grouped by Loan Sequence Number and sorted by date within each group
df = df.sort_values(['Loan Sequence Number', 'Monthly Reporting Period'])

# Apply the transformation and update the original columns
transformed_df = df.groupby('Loan Sequence Number').apply(lambda x: apply_gaussian_rolling(x, window_size, std_dev)).reset_index(level=0, drop=True)
transformed_df[['Loan Sequence Number', 'Monthly Reporting Period','label_target','target_0&1']] = df[['Loan Sequence Number', 'Monthly Reporting Period','label_target','target_0&1']] 
transformed_df.head()

Unnamed: 0,Current Actual UPB,Current Loan Delinquency Status,Loan Age,Remaining Months to Legal Maturity,Current Interest Rate,Delinquency Due to Disaster,Interest Bearing UPB,Original UPB,Mortgage Insurance Percentage (MI %),Original Loan-to-Value (LTV),...,Property Type_PU,Property Type_SF,Property Valuation Method_1.0,Property Valuation Method_2.0,Property Valuation Method_3.0,Property Valuation Method_9.0,Loan Sequence Number,Monthly Reporting Period,label_target,target_0&1
0,44900.795304,0.0,0.0,119.735454,1.538268,0.0,44900.795304,44900.795304,0.0,18.292917,...,0.332598,0.0,0.0,0.0,0.0,0.332598,F11Q10000044,2011-02-01,0.0,0
1,84525.608084,0.0,0.332598,225.069023,2.895785,0.0,84525.608084,84525.608084,0.0,34.436359,...,0.626116,0.0,0.0,0.0,0.0,0.626116,F11Q10000044,2011-03-01,0.0,0
2,111759.317081,0.0,0.958714,297.066131,3.828791,0.0,111759.317081,111759.317081,0.0,45.531574,...,0.827847,0.0,0.0,0.0,0.0,0.827847,F11Q10000044,2011-04-01,0.0,0
3,126003.872579,0.0,1.786561,335.110695,4.328194,0.0,126003.872579,126336.471063,0.0,51.470414,...,0.935826,0.0,0.0,0.0,0.0,0.935826,F11Q10000044,2011-05-01,0.0,0
4,131787.017298,0.0,2.722387,350.379301,4.536376,0.0,131787.017298,132413.132913,0.0,53.946091,...,0.980838,0.0,0.0,0.0,0.0,0.980838,F11Q10000044,2011-06-01,0.0,0


In [None]:
# Define paths for the CSV files on the desktop
transform_csv_path = "/Users/Miracles/Desktop/df_rolling.csv"

# Output dataframes to CSV
transformed_df.to_csv(transform_csv_path, index=False)

## Standard Scaling the Continuous Data

In [None]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the data and then transform
transformed_df[columns_continuous] = scaler.fit_transform(transformed_df[columns_continuous])

# If you want to transform your DataFrame and store the scaled values directly back into it
transformed_df[columns_continuous] = scaler.fit_transform(transformed_df[columns_continuous].values)
transformed_df

# Logistic Regression

In [None]:
# Step 1: Split the data into features and target
X = transformed_df.drop(['Loan Sequence Number','Monthly Reporting Period','label_target','target_0&1'], axis=1)  
y = transformed_df['target_0&1']

# Step 2: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that first oversamples the minority class and then trains a logistic regression model
pipeline = make_pipeline(
    SMOTE(random_state=42),  # This will oversample the minority class
    StandardScaler(),        # This will standardize the features
    LogisticRegression(class_weight='balanced')  # This sets the class weight to 'balanced'
)

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test data
predictions = pipeline.predict(X_test)

# Evaluation
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print("Accuracy:", accuracy_score(y_test, predictions))

# Precision-Recall Curve
precision, recall, thresholds_pr = precision_recall_curve(y_test, predictions)
plt.figure(figsize=(6, 6))
plt.plot(recall, precision, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

# Calculate the F1 score
f1 = f1_score(y_test, predictions)
print(f'F1 Score: {f1}')

# Calculate Cohen's Kappa
kappa = cohen_kappa_score(y_test, predictions)
print(f'Cohen\'s Kappa: {kappa}')

# ROC Curve and AUC-ROC
fpr, tpr, thresholds_roc = roc_curve(y_test, predictions)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Brier Score
brier_score = brier_score_loss(y_test, predictions)
print(f'Brier Score: {brier_score}')