In [1]:
#import dependancies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
#load csv files
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
#count columns with object data types
sum(test_df.dtypes == object)

8

In [4]:
#check columns with object type
train_cat_columns = train_df.select_dtypes(['object']).columns
train_cat_columns

Index(['home_ownership', 'verification_status', 'loan_status', 'pymnt_plan',
       'initial_list_status', 'application_type', 'hardship_flag',
       'debt_settlement_flag'],
      dtype='object')

In [5]:
# Convert categorical data to numeric and separate target feature for training data
train_dummies_df = pd.get_dummies(train_df, prefix='', prefix_sep='',
                            columns=['home_ownership','verification_status'
                                     ,'loan_status','pymnt_plan',
                                     'initial_list_status','application_type',
                                     'hardship_flag','debt_settlement_flag'])

In [6]:
# check test_cat_columns data types that are object
test_cat_columns = test_df.select_dtypes(['object']).columns
test_cat_columns

Index(['home_ownership', 'verification_status', 'loan_status', 'pymnt_plan',
       'initial_list_status', 'application_type', 'hardship_flag',
       'debt_settlement_flag'],
      dtype='object')

In [7]:
# Convert categorical data to numeric and separate target feature for testing data
test_dummies_df = pd.get_dummies(test_df, prefix='', prefix_sep='',
                            columns=['home_ownership', 'verification_status', 'loan_status',
                                     'pymnt_plan','initial_list_status', 'application_type',
                                     'hardship_flag','debt_settlement_flag'])

In [8]:
num_train = test_dummies_df.columns[test_dummies_df.dtypes!='object']
cat_train = test_dummies_df.columns[test_dummies_df.dtypes=='object']
# test_dummies_df[num_train].isnull().sum().sort_values(ascending=False)

In [9]:
num_test = test_dummies_df.columns[test_dummies_df.dtypes!='object']
cat_test = test_dummies_df.columns[test_dummies_df.dtypes=='object']
# test_dummies_df[num_test].isnull().sum().sort_values(ascending=False)

In [10]:
# add missing dummy variables to testing set
#the following line returns empty list because there are two identical columns in train_dummies_df - 'Y'
c = list(set(train_dummies_df.columns).symmetric_difference(test_dummies_df))

#check if there are any differences in values between two identical columns 'Y'
(train_dummies_df['Y'] != train_dummies_df['Y']).sum()

#since there found to be no differences in columns 'Y', I decided to drop duplicates from dataframes
train_dummies_df = train_dummies_df.loc[:,~train_dummies_df.columns.duplicated()]
test_dummies_df = test_dummies_df.loc[:,~test_dummies_df.columns.duplicated()]
# len(train_dummies_df.columns)

X_train = train_dummies_df.drop(["high_risk","low_risk"], axis=1).values
y_train = train_dummies_df["high_risk"].values

X_test = test_dummies_df.drop(["high_risk","low_risk"], axis=1).values
y_test = test_dummies_df["high_risk"].values

In [11]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Logistic Regression Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Logistic RegressionTesting Data Score: {classifier.score(X_test, y_test)}")

Logistic Regression Training Data Score: 0.6493431855500821
Logistic RegressionTesting Data Score: 0.5250957039557635


In [12]:
# Train a Random Forest Classifier model and print the model score
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(X_train, y_train)
print(f"Random Forest Training Data Score: {rf_clf.score(X_train, y_train)}")
print(f"Random Forest Testing Data Score: {rf_clf.score(X_test, y_test)}")

Random Forest Training Data Score: 1.0
Random Forest Testing Data Score: 0.6225010633772863


In [13]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [14]:
# Train the Logistic Regression model on the scaled data and print the model score
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)
print(f"Logistic Regression Scaled Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Logistic Regression Scaled Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Logistic Regression Scaled Training Data Score: 0.7124794745484401
Logistic Regression Scaled Testing Data Score: 0.6712037430880476


In [15]:
# Train a Random Forest Classifier model on the scaled data and print the model score
rf_clf = RandomForestClassifier(random_state=1)
rf_clf.fit(X_train_scaled, y_train)
print(f"Random Forest Scaled Training Data Score: {rf_clf.score(X_train, y_train)}")
print(f"Random Forest Scaled Testing Data Score: {rf_clf.score(X_test, y_test)}")

Random Forest Scaled Training Data Score: 0.5027093596059113
Random Forest Scaled Testing Data Score: 0.5004253509145045
