In [None]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

#Import RandomOverSampler for recommended model
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [None]:
# Locate the lending data. It is expected in the Resources folder; fall back
# to the notebook's working directory, where this cell originally read it from.
lending_data_path = Path('Resources') / 'lending_data.csv'
if not lending_data_path.exists():
    lending_data_path = Path('lending_data.csv')

# Read the CSV file into a Pandas DataFrame
lending_df = pd.read_csv(lending_data_path)

# Review the first rows of the DataFrame
lending_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Split the loaded data into the target (y) and the predictors (X)

# Labels: the loan_status column on its own
y = lending_df['loan_status']

# Features: every column except loan_status
X = lending_df.drop(columns=['loan_status'])

In [None]:
# Review the y variable Series: count the rows in each loan_status class
# to see how balanced the labels are
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [None]:
# Review the X variable DataFrame: preview the first rows of the features
# to confirm that loan_status is no longer among the columns
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Partition features and labels into training and testing subsets.
# No split size is passed, so train_test_split uses its default proportions;
# random_state=1 makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Create the Logistic Regression model (random_state=1 for reproducibility)
# and train it on the original training split in one step; fit() returns
# the estimator itself, so the name is bound to the fitted model.
logistic_regression = LogisticRegression(random_state=1).fit(X_train, y_train)

# Display the fitted estimator as the cell output
logistic_regression

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Make a prediction using the testing data: predict a label for every row
# of X_test with the model fitted on the original training data
y_test_pred = logistic_regression.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Build the confusion matrix for the baseline model's test predictions
cm = confusion_matrix(y_test, y_test_pred)

# Wrap it in a labeled DataFrame: rows are actual classes, columns are predictions
actual_labels = ['Actual 0', 'Actual 1']
predicted_labels = ['Predicted 0', 'Predicted 1']
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

# Display the labeled matrix
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18663,102
Actual 1,56,563


In [None]:
# Summarize precision, recall and f1-score per class for the baseline model,
# labeling class 0 as the healthy loan and class 1 as the high-risk loan
report_labels = ['healthy loan(0)', 'high-risk loan(1)']
report_text = classification_report(y_test, y_test_pred, target_names=report_labels)
print(report_text)

                   precision    recall  f1-score   support

  healthy loan(0)       1.00      0.99      1.00     18765
high-risk loan(1)       0.85      0.91      0.88       619

         accuracy                           0.99     19384
        macro avg       0.92      0.95      0.94     19384
     weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model predicted healthy loans (`0`) with a recall of 0.99 and high-risk loans (`1`) with a recall of 0.91. However, as `y.value_counts()` shows, the data is heavily imbalanced towards healthy loans (75,036 rows labeled `0` versus 2,500 rows labeled `1`), which could have affected the training and hence the predictions.

For imbalanced data, I can use either oversampling or undersampling. In the former case, datapoints of the minority class (`1`) are duplicated, which can lead to overfitting. In the latter case, the majority class (`0`) is reduced to the same size as the minority class, which might discard potentially valuable information from the majority class.

Oversampling seems more appropriate for balancing the data, as I don't want to lose valuable information from the majority class. It will increase the model's training time and memory requirements, but that is the trade-off for better predictions.

## Create a Logistic Regression Model with the resampled Data


###  Step 1: Resample the data using RandomOverSampler from imblearn.over_sampling

In [None]:
# Create the RandomOverSampler, seeded with random_state=1 for reproducibility
random_over_sampler = RandomOverSampler(random_state=1)

# Resample the training split only; the testing split is not passed in
resampled_training_data = random_over_sampler.fit_resample(X_train, y_train)
X_train_ros, y_train_ros = resampled_training_data

In [None]:
# Review the y_train_ros variable Series: count the rows in each class
# after oversampling to check how the labels are now distributed
y_train_ros.value_counts()

0    56271
1    56271
Name: loan_status, dtype: int64

### Step 2: Fit a logistic regression model by using the resampled data (`X_train_ros` and `y_train_ros`).

In [None]:
# Create a second Logistic Regression model (random_state=1 for reproducibility)
# and train it on the oversampled training data in one step; fit() returns
# the estimator itself, so the name is bound to the fitted model.
logistic_regression_ros = LogisticRegression(random_state=1).fit(X_train_ros, y_train_ros)

# Display the fitted estimator as the cell output
logistic_regression_ros

### Step 3: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Make a prediction using the testing data: predict a label for every row
# of X_test with the model fitted on the oversampled training data
y_test_ros_pred = logistic_regression_ros.predict(X_test)

### Step 4: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Build the confusion matrix for the oversampled model's test predictions
cm_ros = confusion_matrix(y_test, y_test_ros_pred)

# Wrap it in a labeled DataFrame: rows are actual classes, columns are predictions
actual_labels_ros = ['Actual 0', 'Actual 1']
predicted_labels_ros = ['Predicted 0', 'Predicted 1']
cm_ros_df = pd.DataFrame(data=cm_ros, index=actual_labels_ros, columns=predicted_labels_ros)

# Display the labeled matrix
cm_ros_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18649,116
Actual 1,4,615


In [None]:
# Summarize precision, recall and f1-score per class for the oversampled model,
# labeling class 0 as the healthy loan and class 1 as the high-risk loan
ros_report_labels = ['healthy loan(0)', 'high-risk loan(1)']
ros_report_text = classification_report(y_test, y_test_ros_pred, target_names=ros_report_labels)
print(ros_report_text)

                   precision    recall  f1-score   support

  healthy loan(0)       1.00      0.99      1.00     18765
high-risk loan(1)       0.84      0.99      0.91       619

         accuracy                           0.99     19384
        macro avg       0.92      0.99      0.95     19384
     weighted avg       0.99      0.99      0.99     19384



**Question:** How well does the logistic regression model with oversampled data predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model with oversampled data predicted healthy loans (`0`) with a recall of 0.99 and high-risk loans (`1`) with a recall of 0.99.

Both models have a high accuracy score (0.99). However, the second model's precision for high-risk loans is slightly lower than the first model's (0.84 versus 0.85).

As a recommendation, I would suggest model 2, as it decreases the number of false negatives (from 56 to 4), i.e. loans that were predicted as healthy but were actually high-risk, at the cost of a small increase in false positives (from 102 to 116). We would like to avoid missed high-risk loans as much as possible.