In [None]:
#Our Goal
#Given historical data on loans given out, with information on whether or not the borrower defaulted (charge-off), can we build a model that can predict whether or not a borrower will pay back their loan? This way, in the future, when we get a new potential customer, we can assess whether or not they are likely to pay back the loan. Keep in mind classification metrics when evaluating the performance of your model!

#The "loan_status" column contains our label.
import pandas as pd
# Feature-description lookup table, indexed by the feature name column 'LoanStatNew'.
data_info = pd.read_csv(
    '../input/lendingclub-data-sets/lending_club_info.csv',
    index_col='LoanStatNew',
)
# Single .loc[row, column] lookup of the description for 'revol_util'.
print(data_info.loc['revol_util', 'Description'])

In [None]:
def feat_info(col_name):
    """Print the 'Description' entry of the `data_info` table for `col_name`.

    `col_name` is looked up in the index of the module-level `data_info`
    DataFrame; nothing is returned.
    """
    description = data_info.loc[col_name, 'Description']
    print(description)

In [None]:
feat_info('mort_acc')

**Loading data and other imports**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# might be needed depending on your version of Jupyter
%matplotlib inline

In [None]:
df = pd.read_csv('../input/lendingclub-data-sets/lending_club_loan_two.csv')


In [None]:
df.info()

**Section 1: Exploratory Data Analysis**

**OVERALL GOAL: Get an understanding of which variables are important, view summary statistics, and visualize the data.**

In [None]:
sns.countplot(x='loan_status',data=df)

In [None]:
plt.figure(figsize=(12,4))
# sns.distplot is deprecated in current seaborn releases; histplot draws the
# same count histogram (40 bins, no KDE curve).
sns.histplot(df['loan_amnt'],kde=False,bins=40)
plt.xlim(0,45000)

In [None]:
# Correlate numeric columns only: df still holds string columns here, and
# DataFrame.corr() on pandas >= 2.0 raises instead of silently skipping them.
df.select_dtypes(include='number').corr()

In [None]:
plt.figure(figsize=(12,7))
# Correlate numeric columns only; DataFrame.corr() on pandas >= 2.0 raises on
# string columns instead of skipping them.
corr_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix,annot=True,cmap='viridis')
# Pin the y-axis to the full height of the matrix rather than a hard-coded 10
# rows, so no row of the heatmap is cropped when there are more than 10
# numeric columns. (The fixed limit was presumably a workaround for an old
# matplotlib release that clipped heatmap rows.)
plt.ylim(len(corr_matrix), 0)

In [None]:
feat_info('installment')

In [None]:
feat_info('loan_amnt')

In [None]:
sns.scatterplot(x='installment',y='loan_amnt',data=df,)

In [None]:
sns.boxplot(x='loan_status',y='loan_amnt',data=df)

In [None]:
df.groupby('loan_status')['loan_amnt'].describe()

In [None]:
sorted(df['grade'].unique())

In [None]:
sorted(df['sub_grade'].unique())

In [None]:
sns.countplot(x='grade',data=df,hue='loan_status')

In [None]:
plt.figure(figsize=(12,4))
subgrade_order = sorted(df['sub_grade'].unique())
sns.countplot(x='sub_grade',data=df,order = subgrade_order,palette='coolwarm' )

In [None]:
plt.figure(figsize=(12,4))
subgrade_order = sorted(df['sub_grade'].unique())
sns.countplot(x='sub_grade',data=df,order = subgrade_order,palette='coolwarm' ,hue='loan_status')

In [None]:
# Subset of loans graded F or G, plotted per sub-grade and split by loan status.
f_and_g = df[df['grade'].isin(['F', 'G'])]

subgrade_order = sorted(f_and_g['sub_grade'].unique())
plt.figure(figsize=(12,4))
sns.countplot(
    x='sub_grade',
    data=f_and_g,
    order=subgrade_order,
    hue='loan_status',
)

In [None]:
df['loan_status'].unique()

In [None]:
df['loan_repaid'] = df['loan_status'].map({'Fully Paid':1,'Charged Off':0})

In [None]:
df[['loan_repaid','loan_status']]

In [None]:
# Correlation of every numeric feature with the 0/1 label, sorted ascending.
# Restricting to numeric columns avoids the error DataFrame.corr() raises on
# string columns with pandas >= 2.0.
df.select_dtypes(include='number').corr()['loan_repaid'].sort_values().drop('loan_repaid').plot(kind='bar')

**Section 2: Data PreProcessing
Section Goals: Remove or fill any missing data. Remove unnecessary or repetitive features. Convert categorical string features to dummy variables.**

In [None]:
df.head()

**Missing Data
Let's explore the columns with missing data. We use a variety of factors to decide whether or not they would be useful, to see if we should keep, discard, or fill in the missing data.**

In [None]:
len(df)

In [None]:
df.isnull().sum()

In [None]:
100* df.isnull().sum()/len(df)

In [None]:
feat_info('emp_title')
print('\n')
feat_info('emp_length')

In [None]:
df['emp_title'].nunique()

In [None]:
df['emp_title'].value_counts()

In [None]:
df = df.drop('emp_title',axis=1)

In [None]:
sorted(df['emp_length'].dropna().unique())

In [None]:
# Employment-length labels in ascending order: the two special-case labels,
# then '2 years' through '9 years', then the open-ended top bucket.
emp_length_order = (
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
)

In [None]:
plt.figure(figsize=(12,4))

sns.countplot(x='emp_length',data=df,order=emp_length_order)

In [None]:
plt.figure(figsize=(12,4))
sns.countplot(x='emp_length',data=df,order=emp_length_order,hue='loan_status')

In [None]:
# Number of "Charged Off" loans per emp_length category.
charged_off_rows = df.loc[df['loan_status'] == "Charged Off"]
emp_co = charged_off_rows.groupby("emp_length")['loan_status'].count()

In [None]:
# Number of "Fully Paid" loans per emp_length category.
fully_paid_rows = df.loc[df['loan_status'] == "Fully Paid"]
emp_fp = fully_paid_rows.groupby("emp_length")['loan_status'].count()

In [None]:
emp_len = emp_co/emp_fp

In [None]:
emp_len

In [None]:
emp_len.plot(kind='bar')

**TASK: Charge off rates are extremely similar across all employment lengths. Go ahead and drop the emp_length column.**

In [None]:
df = df.drop('emp_length',axis=1)

In [None]:
df.isnull().sum()

In [None]:
df['purpose'].head(10)

In [None]:
df['title'].head(10)

**TASK: The title column is simply a string subcategory/description of the purpose column. Go ahead and drop the title column.**

In [None]:
df = df.drop('title',axis=1)

In [None]:
feat_info('mort_acc')

In [None]:
df['mort_acc'].value_counts()

**There are many ways we could deal with this missing data. We could attempt to build a simple model to fill it in, such as a linear model, we could just fill it in based on the mean of the other columns, or you could even bin the columns into categories and then set NaN as its own category. There is no 100% correct approach! Let's review the other columns to see which most highly correlates to mort_acc**

In [None]:
print("Correlation with the mort_acc column")
# Numeric columns only: DataFrame.corr() on pandas >= 2.0 raises on the
# remaining string columns instead of skipping them.
df.select_dtypes(include='number').corr()['mort_acc'].sort_values()

**Looks like the total_acc feature correlates with mort_acc, which makes sense! Let's try this fillna() approach. We will group the dataframe by total_acc and calculate the mean value of mort_acc per total_acc entry.**

In [None]:
print("Mean of mort_acc column per total_acc")
# Select mort_acc before aggregating: averaging every column first is wasted
# work and raises on the string columns with pandas >= 2.0.
df.groupby('total_acc')['mort_acc'].mean()

In [None]:
# Mean mort_acc for each total_acc value (Series indexed by total_acc).
# The column is selected before .mean() so string columns are never averaged,
# which raises on pandas >= 2.0.
total_acc_avg = df.groupby('total_acc')['mort_acc'].mean()

In [None]:
total_acc_avg[2.0]

In [None]:
def fill_mort_acc(total_acc,mort_acc,avg_by_total_acc=None):
    '''
    Return a non-missing mort_acc value for one row.

    total_acc        : the row's total_acc value, used as the lookup key.
    mort_acc         : the row's mort_acc value; returned unchanged unless it is NaN.
    avg_by_total_acc : optional Series or dictionary mapping total_acc values to
                       the average mort_acc for that total_acc. When omitted, the
                       module-level total_acc_avg mapping is used, so existing
                       two-argument calls behave exactly as before.

    Returns mort_acc if it is not NaN, otherwise the average mort_acc stored
    under total_acc in the mapping (a missing key raises KeyError).
    '''
    if not np.isnan(mort_acc):
        return mort_acc
    if avg_by_total_acc is None:
        # Fall back to the notebook-level groupby averages.
        avg_by_total_acc = total_acc_avg
    return avg_by_total_acc[total_acc]

In [None]:
# Vectorized equivalent of calling fill_mort_acc on every row: rows whose
# mort_acc is NaN take the average mort_acc of their total_acc value from
# total_acc_avg; all other rows are left unchanged. This avoids a slow
# Python-level row-wise apply over the whole dataframe.
df['mort_acc'] = df['mort_acc'].fillna(df['total_acc'].map(total_acc_avg))

In [None]:
df.isnull().sum()

**revol_util and the pub_rec_bankruptcies have missing data points, but they account for less than 0.5% of the total data. Go ahead and remove the rows that are missing those values in those columns with dropna().**

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

**Categorical Variables and Dummy Variables
We're done working with the missing data! Now we just need to deal with the string values due to the categorical columns.**

In [None]:
df.select_dtypes(['object']).columns

In [None]:
df['term'].value_counts()

In [None]:
# Convert each term string to an integer using its first three characters.
df['term'] = df['term'].map(lambda term_text: int(term_text[:3]))

# Remove the grade column.
df = df.drop(columns='grade')

**Convert the subgrade into dummy variables. Then concatenate these new columns to the original dataframe. Remember to drop the original subgrade column and to add drop_first=True to your get_dummies call.**

In [None]:
subgrade_dummies = pd.get_dummies(df['sub_grade'],drop_first=True)

In [None]:
df = pd.concat([df.drop('sub_grade',axis=1),subgrade_dummies],axis=1)

In [None]:
df.columns

In [None]:
df.select_dtypes(['object']).columns

**Convert these columns: ['verification_status', 'application_type','initial_list_status','purpose'] into dummy variables and concatenate them with the original dataframe. Remember to set drop_first=True and to drop the original columns.**

In [None]:
dummies = pd.get_dummies(df[['verification_status', 'application_type','initial_list_status','purpose' ]],drop_first=True)
df = df.drop(['verification_status', 'application_type','initial_list_status','purpose'],axis=1)
df = pd.concat([df,dummies],axis=1)

In [None]:
df['home_ownership'].value_counts()

Convert these to dummy variables, but replace NONE and ANY with OTHER, so that we end up with just 4 categories, MORTGAGE, RENT, OWN, OTHER. Then concatenate them with the original dataframe. Remember to set drop_first=True and to drop the original columns.

In [None]:
df['home_ownership']=df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

dummies = pd.get_dummies(df['home_ownership'],drop_first=True)
df = df.drop('home_ownership',axis=1)
df = pd.concat([df,dummies],axis=1)

**Let's feature engineer a zip code column from the address in the data set. Create a column called 'zip_code' that extracts the zip code from the address column.**

In [None]:
# The zip code is taken as the last five characters of each address string.
df['zip_code'] = df['address'].str[-5:]

In [None]:
dummies = pd.get_dummies(df['zip_code'],drop_first=True)
df = df.drop(['zip_code','address'],axis=1)
df = pd.concat([df,dummies],axis=1)

### issue_d 

**This would be data leakage, we wouldn't know beforehand whether or not a loan would be issued when using our model, so in theory we wouldn't have an issue_date, drop this feature.**

In [None]:
df = df.drop('issue_d',axis=1)

### earliest_cr_line
**This appears to be a historical time stamp feature. Extract the year from this feature using a .apply function, then convert it to a numeric feature. Set this new data to a feature column called 'earliest_cr_year'. Then drop the earliest_cr_line feature.**

In [None]:
df['earliest_cr_year'] = df['earliest_cr_line'].apply(lambda date:int(date[-4:]))
df = df.drop('earliest_cr_line',axis=1)

In [None]:
df.select_dtypes(['object']).columns

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

**Drop the loan_status column, since it's a duplicate of the loan_repaid column we created earlier. We'll use the loan_repaid column since it's already in 0s and 1s.**

In [None]:
df = df.drop('loan_status',axis=1)

In [None]:
# 'grade' is dropped earlier in preprocessing (same cell that converts 'term'),
# so df['grade'] raises KeyError on a fresh top-to-bottom run. DataFrame.get
# returns the column if it is still present and None (no output) otherwise.
df.get('grade')

In [None]:
# 'grade' is dropped earlier in preprocessing (same cell that converts 'term'),
# so unconditionally reading df['grade'] raises KeyError on a fresh run.
# Only dummy-encode the column when it is actually still in the dataframe.
if 'grade' in df.columns:
    dummies1 = pd.get_dummies(df['grade'],drop_first=True)
    df = df.drop(['grade'],axis=1)
    df = pd.concat([df,dummies1],axis=1)

In [None]:
X = df.drop('loan_repaid',axis=1).values
y = df['loan_repaid'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

## Normalizing the Data

**Use a MinMaxScaler to normalize the feature data X_train and X_test. Recall we don't want data leakage from the test set so we only fit on the X_train data.**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
X_train = scaler.fit_transform(X_train)

In [None]:
X_test = scaler.transform(X_test)

# Creating the Model

**Run the cell below to import the necessary Keras functions.**

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm

**Build a sequential model that will be trained on the data. You have unlimited options here, but here is what the solution uses: a model that goes 78 --> 39 --> 19 --> 1 output neuron.**

In [None]:
# CODE HERE
model = Sequential()

# Choose whatever number of layers/neurons you want.


In [None]:
# Fully connected binary classifier: three ReLU layers (78, 39 and 19 units),
# each followed by 20% dropout, ending in a single sigmoid output unit.
model = Sequential([
    # first dense layer
    Dense(78, activation='relu'),
    Dropout(0.2),

    # hidden layer
    Dense(39, activation='relu'),
    Dropout(0.2),

    # hidden layer
    Dense(19, activation='relu'),
    Dropout(0.2),

    # output layer
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy loss, optimized with Adam.
model.compile(optimizer='adam', loss='binary_crossentropy')

In [None]:
model.fit(x=X_train, 
          y=y_train, 
          epochs=25,
          batch_size=256,
          validation_data=(X_test, y_test), 
          )

In [None]:
from tensorflow.keras.models import load_model

In [None]:
model.save('full_data_project_model.h5')  

# Section 3: Evaluating Model Performance.

**Plot out the validation loss versus the training loss.**

In [None]:
losses = pd.DataFrame(model.history.history)

In [None]:
losses[['loss','val_loss']].plot()

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a single
# sigmoid output it is equivalent to thresholding the predicted probability
# at 0.5, which yields the same 0/1 class labels.
predictions = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
print(classification_report(y_test,predictions))

In [None]:
confusion_matrix(y_test,predictions)

**TASK: Given the customer below, would you offer this person a loan?**

In [None]:
import random
random.seed(101)
# random.randint includes BOTH endpoints, so the upper bound must be
# len(df) - 1 for the result to always be a valid positional index for .iloc.
random_ind = random.randint(0,len(df) - 1)

# Feature values (label column removed) of the randomly selected row.
new_customer = df.drop('loan_repaid',axis=1).iloc[random_ind]
new_customer

In [None]:
# The model was trained on MinMax-scaled features, so the raw row must go
# through the scaler fitted on X_train before prediction. reshape(1, -1)
# infers the feature count instead of hard-coding it, and the sigmoid output
# is thresholded at 0.5 because Sequential.predict_classes was removed in
# TensorFlow 2.6.
new_customer_scaled = scaler.transform(new_customer.values.reshape(1, -1))
(model.predict(new_customer_scaled) > 0.5).astype("int32")

 **TASK: Now check, did this person actually end up paying back their loan?**

In [None]:
df.iloc[random_ind]['loan_repaid']