In [None]:
import os

# Kaggle boilerplate: print the path of every file found under the input directory.
# The loop variables are pre-bound so the cleanup below cannot raise NameError when
# os.walk() yields nothing (missing or empty directory, e.g. when run outside Kaggle).
dirname = filenames = filename = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Remove the loop variables so they do not linger in the notebook namespace.
del dirname, filenames, filename

## The Data

We will be using a **subset** of the LendingClub DataSet obtained from Kaggle: https://www.kaggle.com/wordsforthewise/lending-club


LendingClub is a US peer-to-peer lending company, headquartered in San Francisco, California. It was the first peer-to-peer lender to register its offerings as securities with the Securities and Exchange Commission (SEC), and to offer loan trading on a secondary market. LendingClub is the world's largest peer-to-peer lending platform.

### Our Goal

Given historical data on loans given out with information on whether or not the borrower defaulted (charge-off), can we build a model that can predict whether or not a borrower will pay back their loan? This way, in the future, when we get a new potential customer we can assess whether or not they are likely to pay back the loan. Keep in mind classification metrics when evaluating the performance of your model!

The "loan_status" column contains our label.

### Data Overview

----
-----
There are many LendingClub data sets on Kaggle. Here is the information on this particular data set:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>LoanStatNew</th>
      <th>Description</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>loan_amnt</td>
      <td>The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.</td>
    </tr>
    <tr>
      <th>1</th>
      <td>term</td>
      <td>The number of payments on the loan. Values are in months and can be either 36 or 60.</td>
    </tr>
    <tr>
      <th>2</th>
      <td>int_rate</td>
      <td>Interest Rate on the loan</td>
    </tr>
    <tr>
      <th>3</th>
      <td>installment</td>
      <td>The monthly payment owed by the borrower if the loan originates.</td>
    </tr>
    <tr>
      <th>4</th>
      <td>grade</td>
      <td>LC assigned loan grade</td>
    </tr>
    <tr>
      <th>5</th>
      <td>sub_grade</td>
      <td>LC assigned loan subgrade</td>
    </tr>
    <tr>
      <th>6</th>
      <td>emp_title</td>
      <td>The job title supplied by the Borrower when applying for the loan.*</td>
    </tr>
    <tr>
      <th>7</th>
      <td>emp_length</td>
      <td>Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.</td>
    </tr>
    <tr>
      <th>8</th>
      <td>home_ownership</td>
      <td>The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER</td>
    </tr>
    <tr>
      <th>9</th>
      <td>annual_inc</td>
      <td>The self-reported annual income provided by the borrower during registration.</td>
    </tr>
    <tr>
      <th>10</th>
      <td>verification_status</td>
      <td>Indicates if income was verified by LC, not verified, or if the income source was verified</td>
    </tr>
    <tr>
      <th>11</th>
      <td>issue_d</td>
      <td>The month which the loan was funded</td>
    </tr>
    <tr>
      <th>12</th>
      <td>loan_status</td>
      <td>Current status of the loan</td>
    </tr>
    <tr>
      <th>13</th>
      <td>purpose</td>
      <td>A category provided by the borrower for the loan request.</td>
    </tr>
    <tr>
      <th>14</th>
      <td>title</td>
      <td>The loan title provided by the borrower</td>
    </tr>
    <tr>
      <th>15</th>
      <td>zip_code</td>
      <td>The first 3 numbers of the zip code provided by the borrower in the loan application.</td>
    </tr>
    <tr>
      <th>16</th>
      <td>addr_state</td>
      <td>The state provided by the borrower in the loan application</td>
    </tr>
    <tr>
      <th>17</th>
      <td>dti</td>
      <td>A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.</td>
    </tr>
    <tr>
      <th>18</th>
      <td>earliest_cr_line</td>
      <td>The month the borrower's earliest reported credit line was opened</td>
    </tr>
    <tr>
      <th>19</th>
      <td>open_acc</td>
      <td>The number of open credit lines in the borrower's credit file.</td>
    </tr>
    <tr>
      <th>20</th>
      <td>pub_rec</td>
      <td>Number of derogatory public records</td>
    </tr>
    <tr>
      <th>21</th>
      <td>revol_bal</td>
      <td>Total credit revolving balance</td>
    </tr>
    <tr>
      <th>22</th>
      <td>revol_util</td>
      <td>Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.</td>
    </tr>
    <tr>
      <th>23</th>
      <td>total_acc</td>
      <td>The total number of credit lines currently in the borrower's credit file</td>
    </tr>
    <tr>
      <th>24</th>
      <td>initial_list_status</td>
      <td>The initial listing status of the loan. Possible values are – W, F</td>
    </tr>
    <tr>
      <th>25</th>
      <td>application_type</td>
      <td>Indicates whether the loan is an individual application or a joint application with two co-borrowers</td>
    </tr>
    <tr>
      <th>26</th>
      <td>mort_acc</td>
      <td>Number of mortgage accounts.</td>
    </tr>
    <tr>
      <th>27</th>
      <td>pub_rec_bankruptcies</td>
      <td>Number of public record bankruptcies</td>
    </tr>
  </tbody>
</table>

---
----

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the data dictionary and key it by feature name (the LoanStatNew column)
data_info = (
    pd.read_csv('../input/lendingclub-data-sets/lending_club_info.csv')
    .set_index('LoanStatNew')
)
data_info.head()

In [None]:
# Define a function to look up feature information
def feat_info(col_name):
    """Return the description of a feature from the data dictionary.

    Parameters
    ----------
    col_name : str
        Feature name; must be a label in the index of the global ``data_info`` frame.

    Returns
    -------
    The value stored in the ``'Description'`` column for ``col_name``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a label in the ``data_info`` index.
    """
    # One .loc[row, column] lookup instead of chained indexing.
    return data_info.loc[col_name, 'Description']

# Example lookup: show the description stored for loan_amnt
feat_info('loan_amnt')

In [None]:
# Read the loan data and print a summary of it (column dtypes and non-null counts)
df = pd.read_csv('../input/lendingclub-data-sets/lending_club_loan_two.csv')
df.info()

In [None]:
# Per-column overview: two sample rows, dtype, number of nulls and summary statistics.
# The table has to be the last expression of the cell to be rendered; previously a
# trailing `del` statement followed it, so the merged table was silently discarded.
(
    df.head(2).T
    .join(df.dtypes.rename('Data Type'))
    .join(df.isnull().sum().rename('is_null'))
    # describe() only covers numeric columns, hence the left join
    .join(df.describe().T, how='left')
)

------

# Section 1: Exploratory Data Analysis

**GOAL: Get an understanding for which variables are important, view summary statistics, and visualize the data**


----

In [None]:
# Show the share of each loan_status value as a pie chart
df['loan_status'].value_counts().plot.pie(
    colors=['lightblue', 'salmon'],
    autopct='%1.1f%%',
    legend=False,
);

We are dealing with an unbalanced problem
* Evaluation based on accuracy might be misleading
* Precision and recall are important evaluation metrics

In [None]:
# Distribution of the loan_amnt column, using 40 bins
plt.figure(figsize=(12, 4))
sns.histplot(data=df, x='loan_amnt', bins=40)

Spikes happen at 'even' money amounts such as 10,000, 15,000, or 20,000, which makes sense.

In [None]:
# Explore correlation between the continuous numeric feature variables.
# Restrict to numeric columns explicitly: the frame still holds string columns here,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of skipping them.
corr = df.select_dtypes(include='number').corr().round(2)
# Hide the upper triangle (diagonal included) so every pair is shown only once.
# The builtin `bool` replaces the `np.bool` alias, which was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(12, 7))
heatmap = sns.heatmap(corr,
                      mask=mask,
                      annot=True,
                      xticklabels=corr.columns.values,
                      yticklabels=corr.columns.values,
                      vmin=-1,
                      vmax=1,
                      cmap='viridis'
                      )
plt.xticks(rotation=60)
heatmap.set_title('Features Correlation Heatmap', fontdict={'fontsize':12}, pad=12);
del corr, mask

In [None]:
# Look up the descriptions of the correlated pair total_acc / open_acc
{feature: feat_info(feature) for feature in ('total_acc', 'open_acc')}

In [None]:
# Look up the descriptions of the correlated pair pub_rec_bankruptcies / pub_rec
{feature: feat_info(feature) for feature in ('pub_rec_bankruptcies', 'pub_rec')}

In [None]:
# Look up the descriptions of the correlated pair mort_acc / total_acc
{feature: feat_info(feature) for feature in ('mort_acc', 'total_acc')}

In [None]:
# Look up the descriptions of the correlated pair installment / loan_amnt
{feature: feat_info(feature) for feature in ('installment', 'loan_amnt')}

If the number of installments is similar regardless of the loan amount, then `loan_amnt` and `installment` are (almost) linearly dependent: they carry the same information, so one of them is redundant.

In [None]:
# Scatter loan_amnt against installment to inspect how the two features relate
sns.scatterplot(data=df, x='installment', y='loan_amnt');

In [None]:
# Explore the relationship between loan_status and loan amount:
# one box of loan_amnt per loan_status value
sns.boxplot(x='loan_status', y='loan_amnt', data=df);

In [None]:
# Calculate the summary statistics of loan_amnt, grouped by loan_status.
df.groupby('loan_status')['loan_amnt'].describe()

In [None]:
# Look up the descriptions of the grade and sub_grade columns assigned by LendingClub (LC)
{feature: feat_info(feature) for feature in ('grade', 'sub_grade')}

In [None]:
# Number of loans per sub_grade, with the sub grades sorted alphabetically
subgrade_order = sorted(df['sub_grade'].unique())
plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='sub_grade', order=subgrade_order, palette='coolwarm');

In [None]:
# Same count plot per sub_grade, now split by loan_status
subgrade_order = sorted(df['sub_grade'].unique())
plt.figure(figsize=(12, 4))
sns.countplot(
    data=df,
    x='sub_grade',
    hue='loan_status',
    order=subgrade_order,
    palette='coolwarm',
);

In [None]:
# Create a countplot per grade, split by the loan_status label (hue)
sns.countplot(x='grade', data=df, hue='loan_status')

In [None]:
# Explore grades F and G: count loans per sub_grade within these two grades, split by loan_status
f_and_g = df[df['grade'].isin(['F', 'G'])]
f_and_g_subgrade_order = sorted(f_and_g['sub_grade'].unique())
sns.countplot(
    data=f_and_g,
    x='sub_grade',
    hue='loan_status',
    order=f_and_g_subgrade_order,
    palette='coolwarm',
)
# The temporary subset is only needed for this plot
del f_and_g, f_and_g_subgrade_order

In [None]:
# Create the numeric 'loan_repaid' label from 'loan_status': "Fully Paid" -> 1, "Charged Off" -> 0.
# Any loan_status value missing from the mapping would become NaN.
df['loan_repaid'] = df['loan_status'].map({'Fully Paid': 1, 'Charged Off': 0})
df[['loan_status', 'loan_repaid']].head()

In [None]:
# Explore the correlation of loan_repaid with the numeric features.
# `plt.figure` without parentheses only referenced the function and created nothing,
# so it is now actually called.
plt.figure(figsize=(12, 4))
(
    # Numeric columns only: DataFrame.corr() raises on string columns in pandas >= 2.0
    df.select_dtypes(include='number')
    .corr()['loan_repaid']
    .drop('loan_repaid')   # the label's correlation with itself carries no information
    .sort_values()
    .plot.barh()
);

---
---
# Section 2: Data PreProcessing

**Goals: Remove or fill any missing data. Remove unnecessary or repetitive features. Convert categorical string features to dummy variables.**


# Missing Data

**Let's explore this missing data columns. We use a variety of factors to decide whether or not they would be useful, to see if we should keep, discard, or fill in the missing data.**

In [None]:
# Explore the shape of the dataframe: (number of rows, number of columns)
df.shape

In [None]:
# Bar chart of the percentage of missing values per column
plt.figure(figsize=(12, 4))
df.isnull().mean().mul(100).plot.bar();
plt.title('% Missing Values');

In [None]:
# Read the descriptions of `emp_title` and `emp_length` before deciding whether they can be dropped.
{feature: feat_info(feature) for feature in ('emp_title', 'emp_length')}

In [None]:
# Number of distinct job titles
df['emp_title'].nunique()

In [None]:
# Number of loans per job title
df['emp_title'].value_counts()

There are too many unique job titles to convert to dummy features

In [None]:
# Remove the `emp_title` column from the frame
df.drop(columns='emp_title', inplace=True)

In [None]:
# Examine emp_length column: number of loans per employment-length category
df['emp_length'].value_counts()

In [None]:
# Examine emp_length column: list its categories, ordered from most to least frequent
df['emp_length'].value_counts().sort_values(ascending=False).index

In [None]:
plt.figure(figsize=(12, 4))
# Employment lengths in ascending order. '4 years' and '5 years' were swapped before,
# which put the bars of every plot using this order in the wrong sequence.
emp_length_order = [ '< 1 year', '1 year', '2 years',
                    '3 years', '4 years', '5 years',
                    '6 years', '7 years', '8 years',
                    '9 years', '10+ years'
                   ]
# Number of loans per employment length, split by loan_status
sns.countplot(x='emp_length', data=df, order=emp_length_order, hue='loan_status');

The chart shows no significant relationship between the length of employment and the rate of charge-off.

In [None]:
# Explore further the importance of the emp_length column:
# the mean of loan_repaid (i.e. the share of repaid loans) per employment length,
# plotted in the order given by emp_length_order
df.groupby('emp_length')['loan_repaid'].mean().loc[emp_length_order].plot.bar();
# The ordering list is not used again
del emp_length_order

In [None]:
# emp_length has no significant effect on the rate of charge off, so remove the column
df.drop(columns='emp_length', inplace=True)

In [None]:
# Re-plot the percentage of missing values per column to see which features still have gaps
plt.figure(figsize=(12, 4))
df.isnull().mean().mul(100).plot.bar();
plt.title('% Missing Values');

In [None]:
# Compare the descriptions of `title` and `purpose` to see if they hold repeated information
{feature: feat_info(feature) for feature in ['title', 'purpose']}

In [None]:
# Preview the two columns side by side; a handful of rows is enough for the comparison,
# so the whole frame is no longer sent to the output
df[['title', 'purpose']].head(10)

The `title` column looks like a description of the `purpose` 

In [None]:
# Remove the `title` column from the frame
df.drop(columns='title', inplace=True)

In [None]:
# Re-plot the percentage of missing values per column to see which features still have gaps
plt.figure(figsize=(12, 4))
df.isnull().mean().mul(100).plot.bar();
plt.title('% Missing Values');

In [None]:
# Look up the description of the mort_acc feature
feat_info('mort_acc')

In [None]:
# Count how many loans have each mort_acc value
df['mort_acc'].value_counts()


In order to decide whether or not to fill the missing values of mort_acc, and how to do so, we are going to explore the other columns that have a high correlation with mort_acc.

In [None]:
# Correlation of mort_acc with every other numeric feature, sorted ascending.
# Numeric columns are selected explicitly because DataFrame.corr() raises on the
# remaining string columns in pandas >= 2.0.
df.select_dtypes(include='number').corr()['mort_acc'].drop('mort_acc').sort_values().plot.barh()

total_acc highly correlates with mort_acc
We are going to group_by total_acc and use the mean value of mort_acc for missing mort_acc value per group of total_acc

In [None]:
# Fill each missing mort_acc with the mean mort_acc of the loans that share its total_acc value
df['mort_acc'] = df['mort_acc'].fillna(
    df.groupby('total_acc')['mort_acc'].transform('mean')
)

In [None]:
# Re-plot the percentage of missing values per column to see which features still have gaps
plt.figure(figsize=(12, 4))
df.isnull().mean().mul(100).plot.bar();
plt.title('% Missing Values');

We are going to drop missing rows in revol_util and pub_rec_bankruptcies columns as they account for less than 0.5% of the total data.

In [None]:
# Drop the few rows that still miss revol_util or pub_rec_bankruptcies.
# The previous call used drop(..., axis=1), which removed both *columns* and so
# discarded two features instead of a small number of rows.
df.dropna(subset=['revol_util', 'pub_rec_bankruptcies'], inplace=True)

In [None]:
# Re-plot the percentage of missing values per column to see which features still have gaps
plt.figure(figsize=(12, 4))
df.isnull().mean().mul(100).plot.bar();
plt.title('% Missing Values');

## Categorical and Dummy Variables

**Now we are going to deal with the string values due to the categorical columns.**

In [None]:
# List all the columns that are currently non-numeric (object dtype).
df.select_dtypes(['object']).columns

---
**We now go through all the string features to see what we should do with them.**

---

In [None]:
# Examine the 'term' feature: look up its description
feat_info('term')

In [None]:
# Number of loans per term value
df['term'].value_counts()

One option is to use one-hot encoding, as term is actually a categorical variable; but as the length is also meaningful as a numeric variable, we are going to convert it into an integer.

In [None]:
# Convert term to an integer number of months.
# The digits are extracted with a regex rather than sliced from fixed character
# positions, so the conversion does not depend on the exact padding of the raw strings.
df['term'] = df['term'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# grade is already contained in sub_grade, so remove the redundant column
df.drop(columns='grade', inplace=True)

In [None]:
# One-hot encode sub_grade (the first level is dropped) and swap the dummy
# columns in for the original column.
dummies = pd.get_dummies(df['sub_grade'], drop_first=True)
df = pd.concat(
    [df.drop(columns='sub_grade'), dummies],
    axis=1,
)

In [None]:
# Look up the descriptions of verification_status, application_type, initial_list_status and purpose
{
    feature: feat_info(feature)
    for feature in ['verification_status', 'application_type', 'initial_list_status', 'purpose']
}

In [None]:
# One-hot encode verification_status, application_type, initial_list_status and purpose
# (first level of each dropped), then replace the four original columns with the dummies.
dummies = pd.get_dummies(
    df[['verification_status', 'application_type', 'initial_list_status', 'purpose']],
    drop_first=True,
)
df = pd.concat(
    [
        df.drop(columns=['verification_status', 'application_type', 'initial_list_status', 'purpose']),
        dummies,
    ],
    axis=1,
)
del dummies

In [None]:
# Examine home_ownership column: number of loans per category
df['home_ownership'].value_counts()

Because very few people are in the `NONE` and `ANY` categories, we are going to recategorize them as `OTHER`.

In [None]:
# Fold the NONE and ANY categories into OTHER
df['home_ownership'] = df['home_ownership'].replace({'NONE': 'OTHER', 'ANY': 'OTHER'})

In [None]:
# One-hot encode home_ownership (first level dropped) and swap the dummy
# columns in for the original column.
dummies = pd.get_dummies(df['home_ownership'], drop_first=True)
df = pd.concat(
    [df.drop(columns='home_ownership'), dummies],
    axis=1,
)
del dummies

In [None]:
# Feature engineer a zip code column: the last five characters of the address string.
# The vectorised .str accessor replaces the per-row Python lambda.
df['zip_code'] = df['address'].str[-5:]
df['zip_code'].value_counts()

In [None]:
# One-hot encode zip_code (first level dropped); the raw address column it was
# derived from is removed in the same step.
dummies = pd.get_dummies(df['zip_code'], drop_first=True)
df = pd.concat([df.drop(columns=['zip_code', 'address']), dummies], axis=1)

In [None]:
# Examine issue_d: look up its description
feat_info('issue_d')

In [None]:
# Number of loans per issue_d value
df['issue_d'].value_counts()

In [None]:
# issue_d would be data leakage: when predicting default we do not yet know
# whether or not the loan will be issued, so remove the column.
df.drop(columns='issue_d', inplace=True)

In [None]:
# Explore earliest_cr_line: look up its description
feat_info('earliest_cr_line')    

In [None]:
# Number of loans per earliest_cr_line value
df['earliest_cr_line'].value_counts()

In [None]:
# Keep only the year of earliest_cr_line (its last four characters) as an integer feature
df['earliest_cr_line'] = df['earliest_cr_line'].str.slice(-4).astype(int)

## Train Test Split

In [None]:
# Split the frame into the target vector y (loan_repaid) and the feature matrix X
# (every column except the two label columns), both as NumPy arrays
y = df['loan_repaid'].to_numpy()
X = df.drop(columns=['loan_status', 'loan_repaid']).to_numpy()
X.shape, y.shape

In [None]:
# Hold out 20% of the rows as the test set.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Normalizing the data

In [None]:
# Rescale the features with MinMaxScaler. The scaler is fitted on the training split
# only and then applied to both splits.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Creating the model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Build a sequential model and train it on the training split.
# Three ReLU hidden layers (78 -> 39 -> 19 units), each followed by 20% dropout,
# and a single sigmoid output unit for the binary loan_repaid label.
model = Sequential([
    Dense(units=78, activation='relu'),
    Dropout(0.2),
    Dense(units=39, activation='relu'),
    Dropout(0.2),
    Dense(units=19, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy')

# NOTE(review): the test split doubles as validation data here, so val_loss is not
# an independent estimate - consider a separate validation split.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=25,
    validation_data=(X_test, y_test),
)

In [None]:
# Save the trained model to 'my_model.h5' in the working directory
model.save('my_model.h5')

## Evaluating model performance

In [None]:
# Plot the per-epoch history recorded by model.fit (training loss and validation loss)
losses = pd.DataFrame(model.history.history)
losses.plot()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single sigmoid
# output unit, thresholding the predicted probability at 0.5 yields the same class labels.
predictions = (model.predict(X_test) > 0.5).astype('int32')

In [None]:
# Print the classification report for the test split (true labels vs. predictions)
print(classification_report(y_test, predictions))