### Preliminary Assessment

Given the binary nature of this dataset's output and the ability to trim out noisy variables if needed, I predict that the Logistic Regression model will outperform the Random Forest Classifier.

### Import Dependencies

In [None]:
# data collection
from os.path import join
import pandas as pd

# data visualization
from matplotlib import pyplot as plt

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

### Functions

In [None]:
# functions
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    regression = model.fit(X_train_scaled, y_train)
    print(f"Model: {type(regression).__name__}")
    print(f"Train Score:\t {regression.score(X_train_scaled, y_train):,.4f}")
    print(f"Test Score:\t {regression.score(X_test_scaled, y_test):,.4f}")
    print("")

def predict_model(model, data):
    """Fit ``model`` on the training split and predict the testing split.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    data : sequence
        Four items, in order: training features, testing features,
        training labels, testing labels.

    Returns
    -------
    pandas.DataFrame
        One row per testing sample with two columns: ``"Prediction"``
        (the model's output) and ``"Actual"`` (the testing labels).
    """
    X_train_scaled, X_test_scaled, y_train, y_test = data
    model.fit(X_train_scaled, y_train)
    # Predict on the testing features supplied in `data`, i.e. the same
    # representation the model was fitted on, rather than on a
    # notebook-level variable that is not part of this function's inputs.
    y_pred = model.predict(X_test_scaled)
    return pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

### Import & Prepare Data

In [None]:
# read the lending CSV from the resources folder, then show its first rows
raw_df = pd.read_csv(
    join("resources", "lending_data.csv"),
)
raw_df.head()

In [None]:
# separate the loan_status outcome (y) from the remaining columns (X),
# keeping both as plain arrays
y = raw_df.loc[:, "loan_status"].values
X = raw_df.drop(columns="loan_status").values

In [None]:
# hold out a testing split using the library's default proportions;
# the fixed seed keeps the partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# standardize the features: the scaler is fitted on the training rows only
# and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)

In [None]:
# bundle the splits in the order the helper functions unpack them:
# train features, test features, train labels, test labels
data = [
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
]

### Training and Testing

In [None]:
# test the models
# The random forest is seeded (same seed as the train/test split) so the
# printed scores are identical on every Restart & Run All.
test_model(LogisticRegression(), data)
test_model(RandomForestClassifier(random_state=1), data)

### Predictions

In [None]:
# make predictions with the models
# Each call fits a fresh estimator; the random forest is seeded so the
# resulting prediction frame does not change between runs.
lr_df = predict_model(LogisticRegression(), data)
rfc_df = predict_model(RandomForestClassifier(random_state=1), data)

In [None]:
# measure accuracy
# Compare the two columns element-wise instead of looping over rows:
# each list holds 1 where the prediction matched the actual label, else 0.
lr_acc = (lr_df["Prediction"] == lr_df["Actual"]).astype(int).tolist()
rfc_acc = (rfc_df["Prediction"] == rfc_df["Actual"]).astype(int).tolist()

# accuracy is the share of matching rows
print(f"Logistic Regression Prediction Accuracy: {sum(lr_acc) / len(lr_acc):,.3f}")
print(f"Random Forest Classifier Prediction Accuracy: {sum(rfc_acc) / len(rfc_acc):,.3f}")

### Assessment

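While the training and testing scores show the two models performing nearly the same, the accuracy measurement shows that the Logistic Regression (`0.968`) outperformed the Random Forest Classifier (`0.855`) by approximately 11 percentage points in prediction accuracy. Note that a classifier's `score` already reports accuracy on the data it is given, so the testing scores and these accuracy figures should agree closely for each model; a gap this large suggests the predictions were not computed on the same scaled testing data used for scoring and should be verified before drawing conclusions.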