#### Downloading data from Kaggle
Upload your Kaggle API token (kaggle.json file) below to download data directly from kaggle.

In [1]:
from google.colab import files
files.upload()

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
!kaggle datasets download -d wordsforthewise/lending-club

Saving kaggle.json to kaggle.json
Downloading lending-club.zip to /content
 99% 1.25G/1.26G [00:06<00:00, 214MB/s]
100% 1.26G/1.26G [00:06<00:00, 212MB/s]


In [2]:
!unzip lending-club.zip

Archive:  lending-club.zip
  inflating: accepted_2007_to_2018Q4.csv.gz  
  inflating: accepted_2007_to_2018q4.csv/accepted_2007_to_2018Q4.csv  
  inflating: rejected_2007_to_2018Q4.csv.gz  
  inflating: rejected_2007_to_2018q4.csv/rejected_2007_to_2018Q4.csv  


### Reading and Clening Data

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from LoanGrader import LoanGrader

In [2]:

#columns to be used
cols= ["loan_amnt" ,'term' ,'int_rate' , 'installment' ,'grade'  ,'emp_title' ,'emp_length' ,'home_ownership' ,'annual_inc' , 'dti' ,'verification_status', 
        'pymnt_plan'  ,'purpose' ,'title' ,'zip_code' ,'addr_state' ,'dti' ,'fico_range_low' ,'fico_range_high' ,'tax_liens', 
       "open_acc", "pub_rec", "revol_bal", "revol_util", "total_acc", "mort_acc", 'application_type']



In [3]:
accepted_loans = pd.read_csv("/content/accepted_2007_to_2018Q4.csv.gz",  low_memory=False, usecols=cols)


In [4]:
accepted_loans.drop(accepted_loans.tail(2).index, inplace = True) #droping lost two rows

accepted_loans = accepted_loans[accepted_loans['application_type'] =="Individual"] #removing joint loan applications
data = accepted_loans.drop([ "application_type" ], axis=1 )

accepted_loans= accepted_loans[[(not i) for i in accepted_loans['loan_amnt'].isna()]] #removing all instances with NA values for loan amount. 

print("Loan Grades: ",sorted(accepted_loans["grade"].unique()))

Loan Grades:  ['A', 'B', 'C', 'D', 'E', 'F', 'G']


#### Imputing Missing Values

In [5]:
data=data.fillna({"emp_title": "not known", "home_ownership":"not known", "emp_length": "not known", 'title':"No title", "zip_code": "000XX"})
data=data.fillna(data.mean(numeric_only=True))

#preparing Data for training

In [6]:
data['text']=  data['emp_title'] + " " + data['purpose']+ " " + data['title']


In [7]:
data=data.drop([ 'emp_title',  'purpose', 'title'], axis=1)
data.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,zip_code,addr_state,dti,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,tax_liens,text
0,3600.0,36 months,13.99,123.03,C,10+ years,MORTGAGE,55000.0,Not Verified,n,190xx,PA,5.91,675.0,679.0,7.0,0.0,2765.0,29.7,13.0,1.0,0.0,leadman debt_consolidation Debt consolidation
1,24700.0,36 months,11.99,820.28,C,10+ years,MORTGAGE,65000.0,Not Verified,n,577xx,SD,16.06,715.0,719.0,22.0,0.0,21470.0,19.2,38.0,4.0,0.0,Engineer small_business Business
3,35000.0,60 months,14.85,829.9,C,10+ years,MORTGAGE,110000.0,Source Verified,n,076xx,NJ,17.06,785.0,789.0,13.0,0.0,7802.0,11.6,17.0,1.0,0.0,Information Systems Officer debt_consolidation...
4,10400.0,60 months,22.45,289.91,F,3 years,MORTGAGE,104433.0,Source Verified,n,174xx,PA,25.37,695.0,699.0,12.0,0.0,21929.0,64.5,35.0,6.0,0.0,Contract Specialist major_purchase Major purchase
5,11950.0,36 months,13.44,405.18,C,4 years,RENT,34000.0,Source Verified,n,300xx,GA,10.2,690.0,694.0,5.0,0.0,8822.0,68.4,6.0,0.0,0.0,Veterinary Tecnician debt_consolidation Debt c...


#### Encoding Categorical Data

In [8]:
def encode_categorical_data(columns):
    for col in columns:
        data[col]=pd.Categorical(data[col])
        data[col]= data[col].cat.codes

encode_categorical_data(["grade", "term", "pymnt_plan", "emp_length", "verification_status", "home_ownership", "zip_code", "addr_state"])

data.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,zip_code,addr_state,dti,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc,tax_liens,text
0,3600.0,0,13.99,123.03,2,1,1,55000.0,0,0,183,38,5.91,675.0,679.0,7.0,0.0,2765.0,29.7,13.0,1.0,0.0,leadman debt_consolidation Debt consolidation
1,24700.0,0,11.99,820.28,2,1,1,65000.0,0,0,555,41,16.06,715.0,719.0,22.0,0.0,21470.0,19.2,38.0,4.0,0.0,Engineer small_business Business
3,35000.0,1,14.85,829.9,2,1,1,110000.0,1,0,70,31,17.06,785.0,789.0,13.0,0.0,7802.0,11.6,17.0,1.0,0.0,Information Systems Officer debt_consolidation...
4,10400.0,1,22.45,289.91,5,3,1,104433.0,1,0,167,38,25.37,695.0,699.0,12.0,0.0,21929.0,64.5,35.0,6.0,0.0,Contract Specialist major_purchase Major purchase
5,11950.0,0,13.44,405.18,2,4,5,34000.0,1,0,291,10,10.2,690.0,694.0,5.0,0.0,8822.0,68.4,6.0,0.0,0.0,Veterinary Tecnician debt_consolidation Debt c...


#### Tokenizing Text Data

In [9]:
#Preparig text data to fit tokenizer
texts=list(data['text'])
modified_text=[]

for i in range(len(data['text'])):
    str= texts[i].replace('_', ' ')
    modified_text.append(str)
data['text']= modified_text

#fitting text data on tokenizer
tokenizer= tf.keras.preprocessing.text.Tokenizer(lower=True, split=' ', oov_token=1)
tokenizer.fit_on_texts(data['text'])
tokenizer.get_config()

{'char_level': False,
 'document_count': 2139958,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'index_docs': '{"3": 1213518, "2": 1214793, "1104": 339, "18": 44976, "24": 51217, "48": 24120, "192": 4222, "74": 13650, "33": 40170, "13": 48850, "14": 48565, "285": 2665, "23": 52608, "1590": 192, "607": 838, "39": 29953, "43": 27822, "20": 55626, "64": 16387, "700": 678, "27": 49255, "577": 904, "31": 42318, "7": 295252, "5": 502668, "4": 509371, "6": 448058, "529": 1020, "936": 438, "53": 22884, "15": 78268, "122": 7552, "1012": 390, "25": 51148, "10": 132460, "52": 23260, "51": 23473, "190": 4309, "77": 13190, "36": 31622, "16": 70886, "974": 413, "1418": 224, "22": 52898, "69": 14372, "37": 30778, "154": 5527, "266": 2983, "99": 9167, "272": 2847, "92": 10120, "149": 5606, "57": 19590, "26": 49650, "78": 13013, "198": 4114, "45": 25656, "76": 13242, "189": 4341, "127": 6840, "8": 159879, "9": 142346, "138": 6071, "349": 1994, "564": 942, "3715": 50, "32": 41883, "370": 1881, "3

In [10]:
#tokenizing text data
sequence= tokenizer.texts_to_sequences(data['text'])
sequence= tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post', value=0 )
data['sequence']=list(sequence)

#### Splitting dataset into Training, Validation and Test sets

>  Training set: 80% \\
   Validation set: 10% \\
   Test set: 10%








In [24]:

y= data['grade']
x=data.drop(['text', "grade"], axis=1)


x_train, x_test,y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=1)

x_test, x_val,y_test, y_val= train_test_split(x_test, y_test, test_size=0.5, random_state=1)


In [25]:
train_sequence= list(x_train['sequence'])
test_sequence= list(x_test['sequence'])
val_sequence= list(x_val['sequence'])

x_train= x_train.drop('sequence', axis=1 )
x_test= x_test.drop('sequence', axis=1)
x_val= x_val.drop('sequence', axis=1)

In [26]:
#normalizing datasets
x_train=(x_train-x_train.mean())/x_train.std()
x_val=(x_val-x_val.mean())/x_val.std()
x_test=(x_test-x_test.mean())/x_test.std()

In [27]:
#Using Tesorflow Dataset API to make input pipeline much more effecient 
training_data= tf.data.Dataset.from_tensor_slices((x_train.values, train_sequence, y_train.values))
validation_data= tf.data.Dataset.from_tensor_slices((x_val.values, val_sequence,y_val.values))
test_data= tf.data.Dataset.from_tensor_slices((x_test.values, test_sequence, y_test.values))

In [28]:
training_data= training_data.batch(256)
validation_data=validation_data.batch(513)
test_data= test_data.batch(512)

#Training Model

In [33]:
 with tf.device('/GPU:0'): 
    model= LoanGrader(2, 128, 4, 256, max_positional_encoding=25)


In [34]:
for x, seq, y in training_data:
    model(x, seq)
    break

model.summary()

Model: "loan_grader_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_2 (Encoder)          multiple                  11093376  
_________________________________________________________________
flatten_2 (Flatten)          multiple                  0         
_________________________________________________________________
concatenate_2 (Concatenate)  multiple                  0         
_________________________________________________________________
dense_58 (Dense)             multiple                  412416    
_________________________________________________________________
dense_59 (Dense)             multiple                  8256      
_________________________________________________________________
dense_60 (Dense)             multiple                  4160      
_________________________________________________________________
dense_61 (Dense)             multiple                

In [22]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights

array([ 0.74388348,  0.48534454,  0.49873275,  1.0001332 ,  2.36852292,
        7.65179186, 26.68775332])

In [35]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=3500):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    0.001,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

optimizer=tf.keras.optimizers.Adam(learning_rate=0.001) #CustomSchedule(128, warmup_steps=500)
#encoder_optimizer=  tf.keras.optimizers.Adam(learning_rate=CustomSchedule(64, warmup_steps=500))



model.compile(optimizer=optimizer)

In [36]:

with tf.device('/GPU:0'): 
    model.train(training_data, 10, validation_data=validation_data, class_weights=[1,1,1,1,1,1,1]) 

Epoch  1  :


'     100% Compleated  | Loss 0.3821 | Accuracy 83.40% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.3415 Accuracy | 84.30% | Time: 0.0 mins'

Epoch  2  :


'     100% Compleated  | Loss 0.2921 | Accuracy 86.67% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2969 Accuracy | 86.56% | Time: 0.0 mins'

Epoch  3  :


'     66% Compleated  | Loss 0.2733 | Accuracy 87.47% | Time: 7.0 mins'

'     100% Compleated  | Loss 0.2706 | Accuracy 87.61% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2651 Accuracy | 87.86% | Time: 0.0 mins'

Epoch  4  :


'     100% Compleated  | Loss 0.2536 | Accuracy 88.45% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2698 Accuracy | 87.72% | Time: 0.0 mins'

Epoch  5  :


'     100% Compleated  | Loss 0.2422 | Accuracy 89.01% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2539 Accuracy | 88.15% | Time: 0.0 mins'

Epoch  6  :


'     100% Compleated  | Loss 0.2332 | Accuracy 89.39% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2400 Accuracy | 88.86% | Time: 0.0 mins'

Epoch  7  :


'     100% Compleated  | Loss 0.2266 | Accuracy 89.71% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2562 Accuracy | 87.92% | Time: 0.0 mins'

Epoch  8  :


'     100% Compleated  | Loss 0.2218 | Accuracy 89.92% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2310 Accuracy | 89.39% | Time: 0.0 mins'

Epoch  9  :


'     100% Compleated  | Loss 0.2173 | Accuracy 90.11% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2466 Accuracy | 88.49% | Time: 0.0 mins'

Epoch  10  :


'     100% Compleated  | Loss 0.2134 | Accuracy 90.32% | Time: 11.0 mins'

'   Validation: 100% Compleated | Loss 0.2257 Accuracy | 89.62% | Time: 0.0 mins'

In [41]:
model.save_weights("model_weights/model")