In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
from sklearn.metrics import balanced_accuracy_score,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier,BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler

In [5]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [6]:
# Load the data
file_path = Path('LoanStats_2019Q1.csv.zip')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

len(df.columns)

86

In [7]:
df = pd.get_dummies(df, columns=["issue_d"
                                 ,"pymnt_plan"
                                 ,"home_ownership"
                                 ,"initial_list_status"
                                 ,"next_pymnt_d"
                                 ,"application_type"
                                 ,"hardship_flag"
                                 ,"debt_settlement_flag"
                                 ,"verification_status"])

In [8]:
LE = LabelEncoder()
le_ls = LE.fit(df["loan_status"])
df["loan_status"] = le_ls.transform(df["loan_status"])

In [9]:
len(df.columns)

96

# Split the Data into Training and Testing

In [10]:
# Create our features
X =  df.drop(columns="loan_status")

# Create our target
y = df["loan_status"]

In [11]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0,0.478007,0.373992,0.148001
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0,0.49952,0.483865,0.355104
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
y.value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

In [13]:
# Create X_train, X_test, y_train, y_test
X_train,X_test,y_train,y_test=train_test_split(X,y)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

* **Create the StandardScaler instance**

In [14]:
scaler = StandardScaler()

* **Fit the Standard Scaler with the training data**

In [15]:
X_scaler = scaler.fit(X_train)

* **Scale the training and testing data**

In [16]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

* **Resample the training data with the BalancedRandomForestClassifier**

In [17]:
rf_model = BalancedRandomForestClassifier(random_state=1)

In [18]:
rf_model = rf_model.fit(X_train_scaled,y_train)
rf_predictions = rf_model.predict(X_test_scaled)

* **Calculated the balanced accuracy score**

In [19]:
bal_acc_score = balanced_accuracy_score(y_test, rf_predictions)
bal_acc_score

0.7886461488397707

### <U>***I generated a different score then on the homework example***

* **Display the confusion matrix**

In [20]:
cm = confusion_matrix(y_test,rf_predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1 "], columns=["Prediction 0", "Prediction 1"])
cm_df

Unnamed: 0,Prediction 0,Prediction 1
Actual 0,57,27
Actual 1,1734,15387


* **Print the imbalanced classification report**

In [21]:
cr = classification_report_imbalanced(y_test,rf_predictions)
print("Random Forest Classification Report :")
print(cr)

Random Forest Classification Report :
                   pre       rec       spe        f1       geo       iba       sup

          0       0.03      0.68      0.90      0.06      0.78      0.60        84
          1       1.00      0.90      0.68      0.95      0.78      0.62     17121

avg / total       0.99      0.90      0.68      0.94      0.78      0.62     17205



* **List the features sorted in descending order by feature importance**

In [22]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True,)

[(0.07054913090573278, 'total_rec_prncp'),
 (0.0677123695179994, 'total_pymnt_inv'),
 (0.056870474009964284, 'total_rec_int'),
 (0.05626107050478853, 'last_pymnt_amnt'),
 (0.05603840165142847, 'total_pymnt'),
 (0.02553916950308028, 'int_rate'),
 (0.024026251503580412, 'issue_d_Jan-2019'),
 (0.020817295799601693, 'dti'),
 (0.017974806896227342, 'out_prncp_inv'),
 (0.017173349639989125, 'max_bal_bc'),
 (0.016805079993442974, 'mo_sin_old_il_acct'),
 (0.016584993957435484, 'bc_open_to_buy'),
 (0.016209905980895133, 'out_prncp'),
 (0.015795759232629544, 'mo_sin_old_rev_tl_op'),
 (0.015569757276389262, 'annual_inc'),
 (0.015275713381910284, 'avg_cur_bal'),
 (0.015149067421777782, 'total_bc_limit'),
 (0.015062104575545075, 'bc_util'),
 (0.01398114834249405, 'total_bal_il'),
 (0.013938740976692326, 'tot_hi_cred_lim'),
 (0.01391610679298811, 'loan_amnt'),
 (0.013697975325406574, 'total_rev_hi_lim'),
 (0.013271733382510465, 'total_bal_ex_mort'),
 (0.01310593538024286, 'mo_sin_rcnt_rev_tl_op'),
 

### Easy Ensemble Classifier

In [23]:
EE = EasyEnsembleClassifier(base_estimator=None, n_estimators=100, n_jobs=1,
            random_state=1, replacement=False, sampling_strategy='auto',
            verbose=0, warm_start=False)

* **Train the EasyEnsembleClassifier**

In [24]:
EE.fit(X_train, y_train)
ee_predictions = EE.predict(X_test)

* **Display the ``balanced_accuracy_score``**

In [25]:
balanced_accuracy_score(y_test, ee_predictions)

0.9336800253656745

* **Display the confusion matrix**

In [26]:
ee_cm = confusion_matrix(y_test,ee_predictions)
ee_cm

array([[   78,     6],
       [ 1048, 16073]], dtype=int64)

* **Print the imbalanced classification report**

In [27]:
ee_imbal_class = classification_report_imbalanced(y_test,ee_predictions)
print(ee_imbal_class)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.07      0.93      0.94      0.13      0.93      0.87        84
          1       1.00      0.94      0.93      0.97      0.93      0.87     17121

avg / total       1.00      0.94      0.93      0.96      0.93      0.87     17205



---
---

# <U>***Part Two***</U> -------------------->

* Generate the Features
* Generate the Target
* Split the Data into Training and Testing

In [28]:
X =  df.drop(columns="loan_status")
y = df["loan_status"]

In [29]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,initial_list_status_w,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.876121,0.383161,0.616839,0.86034,0.13966,1.0,1.0,0.478007,0.373992,0.148001
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.329446,0.486161,0.486161,0.346637,0.346637,0.0,0.0,0.49952,0.483865,0.355104
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
y.value_counts()

1    68470
0      347
Name: loan_status, dtype: int64

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_testing`).

* **Create the StandardScaler instance**

In [31]:
scaler = StandardScaler()

* **Fit the Standarad Scaler with the traning data**

In [32]:
X_train_scaler = scaler.fit(X_train)
X_test_scaler = scaler.fit(X_test)

* **Scale the Traning and testing data**

In [33]:
X_train_scaled = X_train_scaler.transform(X_train)
X_test_scaled = X_test_scaler.transform(X_test)

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

* **Resample the traning data with the ``RandomOverSampler``**

In [34]:
ROS = RandomOverSampler(random_state=1)

In [35]:
X_ros, y_ros = ROS.fit_resample(X_train,y_train)

In [36]:
Counter(y_ros)

Counter({1: 51349, 0: 51349})

* **Train the ``Logistic Regression`` model using the resampled data**

In [37]:
lr = LogisticRegression(random_state=1)

In [38]:
lr_over = lr.fit(X_ros,y_ros)
over_pred = lr_over.predict(X_test)

* **Calculated the balanced accuracy score**

In [39]:
over_bal = balanced_accuracy_score(y_test,over_pred)
over_bal

0.6371279631530201

* **Display the confusion matrix**

In [40]:
over_cm = confusion_matrix(y_test,over_pred)
over_cm

array([[   53,    31],
       [ 6107, 11014]], dtype=int64)

* **Print the imbalanced classification report**

In [41]:
over_imbal = classification_report_imbalanced(y_test,over_pred)
print(over_imbal)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.63      0.64      0.02      0.64      0.41        84
          1       1.00      0.64      0.63      0.78      0.64      0.41     17121

avg / total       0.99      0.64      0.63      0.78      0.64      0.41     17205



### SMOTE Oversampling

* Resample the traning data with SMOTE

In [42]:
X_smote,y_smote = SMOTE(random_state=1).fit_resample(X_train,y_train)

In [43]:
Counter(y_smote)

Counter({1: 51349, 0: 51349})

* **Train the Logistic Regression model using the resamled data**

In [44]:
lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100,
          n_jobs=None, penalty='l2', random_state=1, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [45]:
lr_smote = lr.fit(X_smote,y_smote)

* **Display the confusion matrix**

In [46]:
lr_predictions = lr_smote.predict(X_test)

In [47]:
cm = confusion_matrix(y_test,lr_predictions)
pd.DataFrame(cm)

Unnamed: 0,0,1
0,49,35
1,5753,11368


* **Display the ``calculated balanced accuracy score``**

In [48]:
bal_score = balanced_accuracy_score(y_test,lr_predictions)
bal_score

0.6236566205245021

* **Print the ``imbalanced classification report``**

In [49]:
imbalanced = classification_report_imbalanced(y_test,lr_predictions)
print(imbalanced)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.58      0.66      0.02      0.62      0.38        84
          1       1.00      0.66      0.58      0.80      0.62      0.39     17121

avg / total       0.99      0.66      0.58      0.79      0.62      0.39     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

* **Resample the data using the ``ClusterCentroids`` resampler**

In [50]:
RUS = RandomUnderSampler(random_state=1)

In [51]:
X_under,y_under = RUS.fit_resample(X_train,y_train)

In [52]:
Counter(y_train)

Counter({1: 51349, 0: 263})

* Train the Logistic Regression model using the resampled data

In [53]:
lr_under = lr.fit(X_under,y_under)

In [54]:
under_prediction = lr_under.predict(X_test)

* **Display the ``balanced_accuracy_score``**

In [55]:
bal_score = balanced_accuracy_score(y_test,under_prediction)

In [56]:
bal_score

0.6019529066226105

* **Display the ``Confusion matrix``**

In [57]:
cm2 = confusion_matrix(y_test,under_prediction)
cm2

array([[   50,    34],
       [ 6700, 10421]], dtype=int64)

* **Print the ``imbalanced_classification_report``**

In [58]:
imbal = classification_report_imbalanced(y_test,under_prediction)
print(imbal)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.60      0.61      0.01      0.60      0.36        84
          1       1.00      0.61      0.60      0.76      0.60      0.36     17121

avg / total       0.99      0.61      0.60      0.75      0.60      0.36     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

* **Resample the traning data with ``SMOTEENN``**

In [59]:
SMEE = SMOTEENN(random_state=1)

In [60]:
X_smoteenn,y_smoteenn = SMEE.fit_resample(X_train,y_train)
Counter(y_smoteenn)

Counter({0: 51334, 1: 46215})

* Train the ``Losgistic_Regression`` model using the resampled data

In [61]:

lr_smoteenn = lr.fit(X_smoteenn,y_smoteenn)
smoteenn_predictions = lr_smoteenn.predict(X_test)

* **Generate the ``balanced_accuracy_score``**

In [62]:
smoteenn_bal_score = balanced_accuracy_score(y_test,smoteenn_predictions)
smoteenn_bal_score

0.6363373718157317

* **Display the ``confusion_matrix``**

In [63]:
cm_smoteenn = confusion_matrix(y_test,smoteenn_predictions)
pd.DataFrame(cm_smoteenn)

Unnamed: 0,0,1
0,59,25
1,7357,9764


* **Print the ``imbalanced_classification_report**

In [64]:
smoteenn_imbal_score = classification_report_imbalanced(y_test,smoteenn_predictions)
print(smoteenn_imbal_score)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.01      0.70      0.57      0.02      0.63      0.41        84
          1       1.00      0.57      0.70      0.73      0.63      0.40     17121

avg / total       0.99      0.57      0.70      0.72      0.63      0.40     17205

