In [1]:
### Import Dependencies ###
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
### Read in data ###
# The 2019 file becomes the training frame; the 2020 Q1 file becomes the testing frame.
train_path = Path('Resources/2019loans.csv')
test_path = Path('Resources/2020Q1loans.csv')
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [3]:
# Separate the target feature from the predictors, then convert the categorical
# predictors to numeric dummy columns, for both the training and testing data.
#
# NOTE(review): `loan_amnt` looks like a continuous amount, yet it is used as the
# label for the classifiers trained later in this notebook -- confirm that this is
# the intended target column.
TARGET_COLUMN = 'loan_amnt'
# Columns removed from the predictors: two index-like columns plus the target itself.
NON_FEATURE_COLUMNS = ['Unnamed: 0', 'index', TARGET_COLUMN]


def split_features_and_target(loans_df, target_column=TARGET_COLUMN,
                              dropped_columns=NON_FEATURE_COLUMNS):
    """Split one loans DataFrame into predictors and target.

    Parameters
    ----------
    loans_df : pandas.DataFrame
        Frame containing every column named in ``dropped_columns``.
    target_column : str
        Name of the column returned as the target.
    dropped_columns : list of str
        Columns removed from the returned predictors.

    Returns
    -------
    tuple
        ``(features, target)``: ``loans_df`` without ``dropped_columns`` and
        the ``target_column`` Series.
    """
    return loans_df.drop(columns=dropped_columns), loans_df[target_column]


train_df_X, train_df_y = split_features_and_target(train_df)
test_df_X, test_df_y = split_features_and_target(test_df)

# One-hot encode the categorical predictors; numeric columns pass through unchanged.
train_dummies = pd.get_dummies(train_df_X)
test_dummies = pd.get_dummies(test_df_X)

In [4]:
# Add missing dummy variables to the testing set and align it with the training set.
train_col_list = train_dummies.columns.to_list()
test_col_list = test_dummies.columns.to_list()

# Training columns that have no counterpart in the testing set. Membership is
# checked by column NAME: comparing the two column lists position by position
# would flag every column after the first gap and then overwrite real testing
# data with zeros.
test_col_set = set(test_col_list)
coltotest_list = [col for col in train_col_list if col not in test_col_set]

# Add each missing dummy as an all-zero column and put the columns in training
# order. Columns that exist only in the testing set are dropped, so both
# feature matrices end up with exactly the same columns.
test_dummies = test_dummies.reindex(columns=train_col_list, fill_value=0)

In [5]:
# Expose the encoded predictors and the targets under the usual X / y names.
X_train, X_test = train_dummies, test_dummies
y_train, y_test = train_df_y, test_df_y

# Part 1: Prediction
---
I predict that the random forest classifier will have a better score because it chooses the optimal split point for each decision tree. It should provide higher accuracy through cross validation. Furthermore, logistic regression performs better with scaled data, and the models in this part are trained on unscaled data.

In [77]:
# Fit a logistic regression on the unscaled features and print its score on each split.
# NOTE(review): the output recorded for this cell reports that the solver hit its
# iteration limit; consider raising max_iter or scaling the data before relying on
# these scores.
classifier = LogisticRegression().fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
print(f"Training Data Score: {train_score}")
test_score = classifier.score(X_test, y_test)
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.09688013136288999
Testing Data Score: 0.06890684814972352


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
# Fit a random forest classifier (n_estimators=500, random_state=1 for repeatability)
# on the unscaled features and print its score on each split.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
print(f'Training Score: {train_score}')
test_score = clf.score(X_test, y_test)
print(f'Testing Score: {test_score}')

Training Score: 1.0
Testing Score: 0.3198638877073586


# Part 1: Result
---
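The logistic regression performed worse than the random forest classifier (testing scores of about 0.07 versus 0.32). This was an expected finding because a random forest can handle large data that isn't scaled. In addition, it has an approach that optimizes the decision trees it chooses. Note, however, that the random forest's training score of 1.0 against a testing score of about 0.32 indicates that it is overfitting the training data.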

# Part 2: Prediction
---
I predict that the logistic regression will perform better with scaled data since it is a distance-based algorithm. However, it will still perform worse than the random forest classifier. Also, the random forest classifier will perform about the same with the scaled data since it is a decision tree algorithm.

In [None]:
# Scale the data: the scaler is fitted on the training features only, and that
# same fitted scaler then transforms both the training and the testing features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_test)
)

In [83]:
# Fit a logistic regression on the scaled features and print its score on each split.
# NOTE(review): the output recorded for this cell still reports that the solver hit
# its iteration limit; consider raising max_iter before relying on these scores.
classifier = LogisticRegression().fit(X_train_scaled, y_train)
train_score = classifier.score(X_train_scaled, y_train)
print(f'Training Score: {train_score}')
test_score = classifier.score(X_test_scaled, y_test)
print(f'Testing Score: {test_score}')

Training Score: 0.4408866995073892
Testing Score: 0.1837515950659294


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Fit a random forest classifier (n_estimators=500, random_state=1 for repeatability)
# on the scaled features and print its score on each split.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)
train_score = clf.score(X_train_scaled, y_train)
print(f'Training Score: {train_score}')
test_score = clf.score(X_test_scaled, y_test)
print(f'Testing Score: {test_score}')

Training Score: 1.0
Testing Score: 0.3200765631646108


# Part 2: Result
---
As expected, the logistic regression performed significantly better with scaled data than with unscaled data, which makes sense since it is a distance-based algorithm. Thus, scaling the data has a larger impact on the model score for a logistic regression model than for a random forest classifier model. The other result is that the random forest classifier model using scaled data showed only a very minor increase in the model score, which was expected since it uses a decision tree algorithm that chooses the optimal decision tree when building the model.