In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler


---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
customer_data_df = pd.read_csv("./Resources/bank_customer_data.csv")

# Review the DataFrame
customer_data_df.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = customer_data_df['Exited']

# Separate the raw X variable data, the features, and drop the labels/outcomes column as well as additional columns not useful in predicting outcomes
X_raw_data = customer_data_df.copy()
X_raw_data.drop(['Exited','Surname','CustomerId','RowNumber'], axis=1, inplace=True)


In [4]:
# Convert/encode columns with categorical feature values into numerical values, save results as final 'X' feature DataFrame
X_encoded = pd.get_dummies(X_raw_data, dtype=int)

X = X_encoded


In [5]:
# Review the y variable Series
print(y[:5])


0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64


In [6]:
# Review the X variable DataFrame
X.head(10)


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,1,0
5,645,44,8,113755.78,2,1,0,149756.71,0,0,1,0,1
6,822,50,7,0.0,2,1,1,10062.8,1,0,0,0,1
7,376,29,4,115046.74,4,1,0,119346.88,0,1,0,1,0
8,501,44,4,142051.07,2,0,1,74940.5,1,0,0,0,1
9,684,27,2,134603.88,1,1,1,71725.73,1,0,0,0,1


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [7]:
# Check the balance of our target values
y.value_counts()


Exited
0    7963
1    2037
Name: count, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [8]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## 1: ORIGINAL DATA Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [9]:
# Define training and testing data sets for this first version of the model and label with '1'

X_train1 = X_train
X_test1 = X_test
y_train1 = y_train
y_test1 = y_test


In [10]:
# Instantiate the Logistic Regression model with a random_state parameter of 1, labeled with '1' for this first version of the model
classifier1 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier1.fit(X_train1, y_train1)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [11]:
# Make a prediction using the testing data
predictions1 = classifier1.predict(X_test1)
pd.DataFrame({"Prediction": predictions1, "Actual": y_test1}).head(10)


Unnamed: 0,Prediction,Actual
9953,0,0
3850,0,0
4962,0,0
3886,0,0
5437,0,0
8517,0,0
2041,0,0
1989,0,0
1933,1,0
9984,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [12]:
# Print the balanced_accuracy score of the 1st model
print(balanced_accuracy_score(y_test1, predictions1))


0.5135295260295261


In [13]:
# Generate a confusion matrix for the 1st model
cm1 = confusion_matrix(y_test1, predictions1)
cm1_df = pd.DataFrame(
    cm1, index=['actual_retained', 'actual_churned'], columns=['predicted_retained', 'predicted_churned']
)

print(cm1_df)


                 predicted_retained  predicted_churned
actual_retained                1946                 34
actual_churned                  497                 23


In [14]:
# Print the classification report for the 1st model
target_names = ['retained_customer', 'churned_customer']

cr1 = classification_report(y_test1, predictions1,
                            target_names=target_names)

print(cr1)


                   precision    recall  f1-score   support

retained_customer       0.80      0.98      0.88      1980
 churned_customer       0.40      0.04      0.08       520

         accuracy                           0.79      2500
        macro avg       0.60      0.51      0.48      2500
     weighted avg       0.71      0.79      0.71      2500



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model is quite effective at predicting both healthy and high-risk loans, as indicated by the strong f1-scores for each (as well as the strong underlying precision and recall scores). However, in this case the logistic regression model appears to be significantly less effective at predicting high-risk loans than at predicting healthy loans, based on the respective f1-scores of 0.88 vs 1.00). This may be partially due to the dataset being quite unbalanced, with a significant majority of loans being healthy, but for business purposes it may be ideal to find a model that skews in the other direction - 'over-flagging' loans for further review may be better than 'under-flagging', depending on the cost vs. benefit of manually reviewing potentially high-risk loans.

---

## 2: RESAMPLED/OVERSAMPLED DATA Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [15]:
# Instantiate the random oversampler model with a random_state parameter of 1
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_R, y_R = ros.fit_resample(X_train, y_train)


In [16]:
# Count the distinct values of the resampled labels data
y_R.value_counts()


Exited
0    5983
1    5983
Name: count, dtype: int64

In [17]:
# Define training and testing data sets for this second version of the model and label with '2'

X_train2 = X_R
X_test2 = X_test
y_train2 = y_R
y_test2 = y_test


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [18]:
# Instantiate the Logistic Regression model with a random_state parameter of 1, labeled with '2' for this first version of the model
classifier2 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier2.fit(X_train2, y_train2)

# Make a prediction using the testing data
predictions2 = classifier2.predict(X_test2)
pd.DataFrame({"Prediction": predictions2, "Actual": y_test2}).head(10)


Unnamed: 0,Prediction,Actual
9953,1,0
3850,0,0
4962,1,0
3886,0,0
5437,0,0
8517,0,0
2041,0,0
1989,0,0
1933,1,0
9984,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [19]:
# Print the balanced_accuracy score of the 2nd model 
print(balanced_accuracy_score(y_test, predictions2))


0.673931623931624


In [20]:
# Generate a confusion matrix for the 2nd model
cm2 = confusion_matrix(y_test2, predictions2)
cm2_df = pd.DataFrame(
    cm2, index=['actual_healthy', 'actual_at_risk'], columns=['predicted_healthy', 'predicted_at_risk']
)

print(cm2_df)


                 predicted_retained  predicted_churned
actual_retained                1946                 34
actual_churned                  497                 23
                predicted_healthy  predicted_at_risk
actual_healthy               1298                682
actual_at_risk                160                360


In [21]:
# Print the classification report for the 2nd model

cr2 = classification_report(y_test2, predictions2,
                            target_names=target_names)

print(cr2)


                   precision    recall  f1-score   support

retained_customer       0.80      0.98      0.88      1980
 churned_customer       0.40      0.04      0.08       520

         accuracy                           0.79      2500
        macro avg       0.60      0.51      0.48      2500
     weighted avg       0.71      0.79      0.71      2500

                   precision    recall  f1-score   support

retained_customer       0.89      0.66      0.76      1980
 churned_customer       0.35      0.69      0.46       520

         accuracy                           0.66      2500
        macro avg       0.62      0.67      0.61      2500
     weighted avg       0.78      0.66      0.69      2500



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** While the original logistic regression model is quite effective at predicting both healthy and high-risk loans, refitting a logistic regression model to the re-/over-sampled data produces a solidly more reliable model/prediction with a higher f1-score for predicting high-risk loans.

From a business perspective this over-sampling-based model may be much more ideal, as the  model 'missed' 52 fewer high-risk loans (just 1/14 (or ~7%) as many high-risk loans as the original model 'missed'), while only mistakenly predicting 14 additional healthy loans as being high-risk compared to the original model. This new model seems likely to be much more favorable from a profitabilty perspective and I would likely recommend that the bank put it into use for flagging potentially high-risk loans for manual/human review (though again it depends on the cost of manual review vs the benefit of identifying high-risk loans early).

## SCALED DATA Predict a Logistic Regression Model with Scaled Training Data

In [22]:
# Instantiate the StandardScaler model to normalize the original training and test data
sc = StandardScaler()


In [23]:
# Define training and testing data sets for this third version of the model, labled with '3', using the StandardScaler model fit to X_train to transform both X_train and X_test

X_train3 = sc.fit_transform(X_train)
X_test3 = sc.transform(X_test)
y_train3 = y_train
y_test3 = y_test


In [24]:
# Instantiate the Logistic Regression model with a random_state parameter of 1, labeled with '3' for this third version of the model
classifier3 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier3.fit(X_train3, y_train3)


In [25]:
# Make a prediction using the testing data
predictions3 = classifier3.predict(X_test3)
pd.DataFrame({"Prediction": predictions3, "Actual": y_test3}).head(10)


Unnamed: 0,Prediction,Actual
9953,0,0
3850,0,0
4962,0,0
3886,0,0
5437,0,0
8517,0,0
2041,0,0
1989,0,0
1933,1,0
9984,0,0


In [26]:
# Print the balanced_accuracy score of the 3rd model
print(balanced_accuracy_score(y_test3, predictions3))


0.593405205905206


In [27]:
# Generate a confusion matrix for the 3rd model
cm3 = confusion_matrix(y_test3, predictions3)
cm3_df = pd.DataFrame(
    cm3, index=['actual_retained', 'actual_churned'], columns=['predicted_retained', 'predicted_churned']
)

print(cm3_df)


                 predicted_retained  predicted_churned
actual_retained                1912                 68
actual_churned                  405                115


In [28]:
# Print the classification report for the 3rd model
target_names = ['retained_customer', 'churned_customer']

cr3 = classification_report(y_test3, predictions3,
                            target_names=target_names)

print(cr3)


                   precision    recall  f1-score   support

retained_customer       0.83      0.97      0.89      1980
 churned_customer       0.63      0.22      0.33       520

         accuracy                           0.81      2500
        macro avg       0.73      0.59      0.61      2500
     weighted avg       0.78      0.81      0.77      2500



## SCALED+RESAMPLED/OVERSAMPLED DATA Predict a Logistic Regression Model with Scaled+Oversampled Training Data

In [29]:
# Instantiate the random oversampler model with a random_state parameter of 1
ros = RandomOverSampler(random_state=1)

# Fit the scaled training data from the prior version of the model to the random_oversampler model
X_R, y_R = ros.fit_resample(X_train3, y_train3)


In [30]:
# Count the distinct values of the resampled labels/outcomes data
y_R.value_counts()


Exited
0    5983
1    5983
Name: count, dtype: int64

In [31]:
# Define training and testing data sets for this fourth version of the model and label with '4'

X_train4 = X_R
X_test4 = X_test3
y_train4 = y_R
y_test4 = y_test


In [32]:
# Instantiate the Logistic Regression model with a random_state parameter of 1
classifier4 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier4.fit(X_train4, y_train4)


In [33]:
# Make a prediction using the testing data
predictions4 = classifier4.predict(X_test4)
pd.DataFrame({"Prediction": predictions4, "Actual": y_test4}).head(10)


Unnamed: 0,Prediction,Actual
9953,0,0
3850,0,0
4962,0,0
3886,0,0
5437,0,0
8517,0,0
2041,1,0
1989,0,0
1933,1,0
9984,0,0


In [34]:
# Print the balanced_accuracy score of the 4th model
print(balanced_accuracy_score(y_test4, predictions4))


0.7118589743589743


In [35]:
# Generate a confusion matrix for the 4th model
cm4 = confusion_matrix(y_test4, predictions4)
cm4_df = pd.DataFrame(
    cm4, index=['actual_retained', 'actual_churned'], columns=['predicted_retained', 'predicted_churned']
)

print(cm4_df)


                 predicted_retained  predicted_churned
actual_retained                1452                528
actual_churned                  161                359


In [36]:
# Print the classification report for the 4th model
target_names = ['retained_customer', 'churned_customer']

cr4 = classification_report(y_test4, predictions4,
                            target_names=target_names)

print(cr4)


                   precision    recall  f1-score   support

retained_customer       0.90      0.73      0.81      1980
 churned_customer       0.40      0.69      0.51       520

         accuracy                           0.72      2500
        macro avg       0.65      0.71      0.66      2500
     weighted avg       0.80      0.72      0.75      2500

