In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25200.0,0.1102,548.17,RENT,65000.0,Not Verified,n,42.67,0.0,1.0,...,10.0,0.0,0.0,282008.0,93765.0,57300.0,116320.0,N,N,low_risk
1,14000.0,0.2055,375.22,MORTGAGE,80000.0,Source Verified,n,15.47,0.0,0.0,...,75.0,0.0,0.0,434976.0,137629.0,17800.0,95032.0,N,N,low_risk
2,30000.0,0.1171,992.28,MORTGAGE,200000.0,Not Verified,n,14.14,0.0,0.0,...,100.0,0.0,0.0,99849.0,68769.0,13500.0,86349.0,N,N,low_risk
3,12000.0,0.1033,256.92,MORTGAGE,50000.0,Not Verified,n,21.41,0.0,0.0,...,33.3,0.0,0.0,209700.0,44654.0,13000.0,39700.0,N,N,low_risk
4,10625.0,0.1612,259.06,OWN,29000.0,Not Verified,n,25.87,0.0,0.0,...,0.0,0.0,0.0,35300.0,11893.0,18800.0,8000.0,N,N,low_risk


In [3]:
# Separate the target column (y) from the predictive features (X) in both our train and test dfs
X_train = train_df.drop("target", axis=1).copy()
X_test = test_df.drop("target", axis=1).copy()
y_train = train_df["target"].copy()
y_test = test_df["target"].copy()

In [4]:
# Convert categorical data to numeric and separate target feature for training data
X_train = pd.get_dummies(X_train)
X_train.head()
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,12000.0,0.2055,449.34,70000.0,28.56,0.0,1.0,9.0,1.0,18051.0,...,0,0,1,1,0,1,0,1,0,1
1,36000.0,0.0819,733.23,200000.0,11.38,0.0,0.0,13.0,0.0,35928.0,...,1,0,1,0,1,1,0,1,0,1
2,37225.0,0.1308,848.51,122700.0,16.83,0.0,0.0,11.0,0.0,32279.0,...,0,0,1,0,1,0,1,1,0,1
3,12000.0,0.1102,392.98,64500.0,36.63,0.0,2.0,15.0,0.0,24069.0,...,1,0,1,0,1,1,0,1,0,1
4,25000.0,0.1774,631.31,50000.0,33.82,0.0,0.0,12.0,0.0,22815.0,...,0,0,1,1,0,1,0,1,0,1


In [5]:

cols_to_add = set(X_train.columns) ^ set(X_test.columns)


for col in cols_to_add:
    X_test[col] = 0


X_test = X_test[X_train.columns]


target_encoder = LabelEncoder().fit(y_train)
y_train = target_encoder.transform(y_train)
y_test = target_encoder.transform(y_test)
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [6]:
# Check the shapes to make sure X_train and X_test have the same number of columns (same with y_train/y_test)
# Check that X_train and y_train have the same number of rows (same with X_test/y_test)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(12790, 91)
(8418, 91)
(12790,)
(8418,)


In [7]:
# Train the Logistic Regression model on the unscaled data and print the model train and test score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_test, y_test))
print(lr.score(X_train, y_train))

0.48930862437633643
0.6162627052384676


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Train a Random Forest Classifier model and print the model train and test score
rf = RandomForestClassifier(n_estimators=350, max_depth=3)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(rf.score(X_train, y_train))

0.6777144214777857
0.690226739640344


In [9]:
# Create a scaler based on the X_train data
scaler = StandardScaler().fit(X_train)

# Use the scaler on X_train and X_test
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Train the Logistic Regression model on the scaled data and print the model train/test score
lr.fit(X_train_scaled, y_train)
print(lr.score(X_test_scaled, y_test))
print(lr.score(X_train_scaled, y_train))

0.7198859586600143
0.7007036747458952


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
# Train a Random Forest Classifier model on the scaled data and print the model train/test score
rf.fit(X_train_scaled, y_train)
print(rf.score(X_test_scaled, y_test))
print(rf.score(X_train_scaled, y_train))

rf.feature_importances_

0.6648847707293895
0.6928068803752931


array([1.16653117e-02, 1.42697186e-01, 2.16686523e-02, 1.60554263e-03,
       5.15430565e-04, 5.94850817e-05, 1.18195057e-03, 2.21921550e-04,
       0.00000000e+00, 3.53093279e-03, 6.29475357e-04, 8.39204489e-02,
       9.33818975e-02, 1.30770228e-02, 8.82586092e-03, 4.94664921e-02,
       5.69781662e-02, 9.62559996e-02, 0.00000000e+00, 0.00000000e+00,
       1.83488837e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       8.20161364e-05, 2.84238639e-03, 1.56401736e-03, 9.38158899e-05,
       4.39158445e-04, 2.22944145e-03, 3.54005430e-04, 1.45234956e-04,
       7.23276696e-03, 3.06399536e-03, 5.31309167e-03, 1.77757166e-02,
       1.08222532e-02, 1.88835313e-02, 1.57151302e-03, 1.54396041e-04,
       7.00519164e-03, 1.31305677e-02, 3.41907519e-03, 2.08010090e-02,
       1.29483150e-03, 2.31600203e-05, 0.00000000e+00, 1.32232380e-03,
       1.63446096e-02, 3.29745304e-03, 1.15430758e-03, 6.99057257e-03,
       5.99129809e-03, 3.15186303e-03, 6.20176736e-04, 1.96833723e-04,
      