In [1]:
##### # Imports

# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
from random import randint
from fastai.vision import *

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

import treeinterpreter as ti

# Seaborn for visualization
import seaborn as sns
sns.set(font_scale = 2)

# Splitting data into training and testing
from sklearn.model_selection import train_test_split
import sklearn.utils.validation

##import model
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn import metrics

#Machine Learning Models
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## DL imports
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

## Pytorch has this 
#export
from torch import optim


#Imputing missing values and scaling values
from sklearn.preprocessing import Imputer, MinMaxScaler

In [2]:
# Read in data into dataframes 
train_features = pd.read_csv('processed/training_features.csv')
test_features = pd.read_csv('processed/testing_features.csv')
train_labels = pd.read_csv('processed/training_labels.csv', names= ['score'])
test_labels = pd.read_csv('processed/testing_labels.csv' , names= ['score'])

# Display sizes of data
print('Training Feature Size: ', train_features.shape)
print('Testing Feature Size:  ', test_features.shape)
print('Training Labels Size:  ', train_labels.shape)
print('Testing Labels Size:   ', test_labels.shape)

Training Feature Size:  (20000, 67)
Testing Feature Size:   (22317, 67)
Training Labels Size:   (20001, 1)
Testing Labels Size:    (22318, 1)


In [3]:
train_features.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,grade,annual_inc,issue_d,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_int,total_rec_late_fee,last_pymnt_d,last_pymnt_amnt,collections_12_mths_ex_med,policy_code,acc_now_delinq,chargeoff_within_12_mths,delinq_amnt,...,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,emp_length_8 years,emp_length_9 years,emp_length_< 1 year,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,13000,13000,11525.0,0.1062,423.28,1,86000.0,0,12.35,0,0,10,0,9871,0.456,23,0,0,0,15161.06573,13440.86,2161.07,0.0,34,2907.14,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,7000,7000,6785.947887,0.0751,217.77,0,80000.0,0,14.88,0,0,10,1,8433,0.413,17,0,0,0,7838.664792,7582.93,838.66,0.0,36,455.06,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,12000,12000,11196.56473,0.1025,388.62,1,48645.0,0,13.64,0,0,10,0,13039,0.31,17,0,0,0,13922.04177,12966.9,1922.04,0.0,33,2701.47,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,22750,22750,21931.62159,0.1136,748.73,1,58600.0,0,24.66,0,1,8,0,1449,0.296,37,0,0,0,25428.89573,24460.37,2678.9,0.0,13,14957.35,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,15500,15500,15500.0,0.1645,548.36,4,415000.0,0,0.68,0,2,11,0,11886,0.563,31,0,0,0,19418.69958,19418.7,3918.7,0.0,25,5188.83,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
       'installment', 'annual_inc', 'dti', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc','out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_int',
       'total_rec_late_fee','last_pymnt_amnt', 'collections_12_mths_ex_med']

In [5]:
## Scale both Training and test set

x_train = train_features
# Create the scaler object with a range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(train_features[cols])

# Transform both the training and testing data
x_train[cols] = scaler.transform(train_features[cols])
#test_labels = scaler.transform(test_features)

In [6]:
x_train = x_train.drop(columns='last_pymnt_d')

In [7]:
x_valid = test_features
x_valid[cols] = scaler.transform(test_features[cols])
x_valid = x_valid.drop(columns='last_pymnt_d')

In [8]:
y_train = np.ravel(train_labels)
y_valid = np.ravel(test_labels)
print(y_train.shape, y_valid.shape)

(20001,) (22318,)


In [9]:
x_valid.shape

(22317, 66)

In [10]:
y_valid = y_valid[:22317]
y_train = y_train[:20000]

In [11]:
m = RandomForestClassifier(n_estimators=100)

In [12]:
m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
print("Accuracy:",metrics.accuracy_score(y_valid, m.predict(x_valid)))

Accuracy: 0.843975444728234


In [14]:
class RFmodel():
    def __init__(self, trees, criterion, x_train, y_train, x_valid, y_valid, depth=15, model=None):
            if(model != None):
                self.fit = False
                self.m = model
                self.x_train = x_train
                self.y_train = y_train
                self.x_valid = x_valid
                self.y_valid = y_valid
                
            else:
                self.fit = True
                self.m = RandomForestRegressor(max_features='auto', criterion=criterion, max_depth=depth,max_leaf_nodes=None, 
                          min_samples_split=4, min_samples_leaf=4, n_estimators=trees,n_jobs=-1, 
                          min_weight_fraction_leaf=0.0)
                
                self.x_train = x_train
                self.y_train = y_train
                self.x_valid = x_valid
                self.y_valid = y_valid
            print(self.m)
            
    def fit_model(self, dt = False):
        
        if(self.fit):
            print('Fitting model...')
            %time self.m = self.m.fit(self.x_train, self.y_train)

        if(dt):
            m = self.m
            print("Getting values from desicion trees...")
            dt_train = []
            dt_valid = []
            for dt in m.estimators_:
                dt_train.append(dt.predict(self.x_train))
                dt_valid.append(dt.predict(self.x_valid))
                                
            dt_train = torch.Tensor(dt_train)
            dt_valid = torch.Tensor(dt_valid)
            
            self.train_ds, self.valid_ds = self.getDataset(dt_train, dt_valid)
            return self.train_ds, self.valid_ds
            
    def getRfLoss(self):
        valid_pred = self.m.predict(self.x_valid)
        train_pred = self.m.predict(self.x_train)
        
        valid_loss = metrics.accuracy_score(valid_pred, self.y_valid)
        train_loss = metrics.accuracy_score(train_pred, self.y_train)
        
        return train_loss, valid_loss
    
    def getDataset(self, dt_train, dt_valid):
        print(dt_train.shape)
        print(dt_valid.shape)
         
        yb_train = np.array(self.y_train)
        yb_train = torch.Tensor(yb_train)
        
        yb_valid = np.array(y_valid)
        yb_valid = torch.Tensor(yb_valid)
        print(yb_valid.shape)
        print(yb_train.shape)

        xb_train = torch.Tensor(dt_train)
        xb_train = xb_train.t()
        
        xb_valid = torch.Tensor(dt_valid)
        xb_valid = xb_valid.t()
        print(xb_train.shape)
        print(xb_valid.shape)
        
        train_ds = TensorDataset(xb_train, yb_train)
        valid_ds = TensorDataset(xb_valid, yb_valid)
        
        return train_ds, valid_ds

In [15]:
rf = RFmodel(300, 'mae', x_train, y_train, x_valid, y_valid,model=m)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [16]:
train_ds, valid_ds = rf.fit_model(True)
len(train_ds[0])

Getting values from desicion trees...
torch.Size([100, 20000])
torch.Size([100, 22317])
torch.Size([22317])
torch.Size([20000])
torch.Size([20000, 100])
torch.Size([22317, 100])


2

In [17]:
t, v = rf.getRfLoss()
print(t, v)

1.0 0.843975444728234


In [18]:
class Recorder():
    def __init__(self):
        self.valid_losses, self.losses, self.lr, self.mse, self.mse_valid =[], [], [], [], []
        
    def accumulate(self,loss, mse, lr):
        self.losses.append(loss)
        self.mse.append(mse)
        self.lr.append(lr)
    
    def accumulate_v(self, loss, mse):
        self.valid_losses.append(loss)
        self.mse_valid.append(mse)
        
    def plot_loss(self):
        plt.plot(self.losses)
        return self.losses
    
    def plot_validloss(self):
        plt.plot(self.valid_losses)
        return self.valid_losses
    
    def plot_lr(self):
        plt.plot(self.lr)
        return self.lr
    
    def plot_mse(self):
        plt.plot(self.mse)
        plt.plot(self.mse_valid)
        return self.mse, self.mse_valid

In [46]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)


def getDNNmodel(n, lr = 1e-03):
    n = n
    nh = 1000
    nh2 = 500
    nh3 = 100
    c = 1
    model = nn.Sequential(nn.Linear(n,nh), 
                          nn.ReLU(), 
                          nn.Linear(nh,nh2), 
                          nn.ReLU(), 
                          nn.Linear(nh2, nh3),
                          nn.ReLU(),
                          nn.Linear(nh3,1),
                          nn.ReLU(),
                          nn.Sigmoid())

    
    return model, optim.Adam(model.parameters(), lr=lr)

In [47]:
accuracy = metrics.accuracy_score
loss_func = nn.BCELoss()

def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 3 epochs"""
    lr = 1e-03
    for param_group in optimizer.param_groups:
        lr = param_group['lr']
        if(epoch == 7):
            param_group['lr'] = lr / 2
            lr = lr / 10
    return lr

In [48]:
class RfDNN():
    def __init__(self, train_ds, valid_ds, error_metrics, trees, model=None, bs = 3000): 

        self.model = model
        self.train_ds = train_ds
        self.valid_ds = valid_ds
        self.bs = bs
        self.n = len(train_ds)
        self.nv = len(valid_ds)
        self.error_metrics = error_metrics
        self.record = Recorder()
        self.trees = trees
    
    @property
    def recorder(self):
        return self.record

    def runDNN(self, epoch):
        self.epoch = epoch
        
        if(self.model == None):
            self.model, self.opt = getDNNmodel(self.trees, lr= 1e-03)
            
        print(self.model)
        ## Refactored mini batch training
        bs = self.bs
        tot_loss,tot_acc, tot_mse = 0.,0.,0.
        for epoc in range(epoch):
            lr = adjust_learning_rate(self.opt, epoc)
            print('Learning rate: ', lr)
            for i in range((self.n-1)//bs + 1):
                
                self.xb, self.yb = train_ds[i*bs:i*bs+bs]
                xb, yb = self.xb, self.yb.unsqueeze(-1)
                pred = self.model(xb)
                loss = loss_func(pred, yb)
                loss.backward()
                self.opt.step()
                self.opt.zero_grad()
                print('#', end="")
                acc_loss = self.error_metrics(pred, yb)
                mse = self.error_metrics(pred, yb)
                self.record.accumulate([loss.detach().cpu() ,acc_loss], mse, lr)
            
            print()
            print('For epoch', epoc, ' loss:', loss.detach().cpu().numpy(), 'acc:',acc_loss.detach().numpy(),'mse:', mse)
            
            with torch.no_grad():
                
                for i in range((self.nv-1)//bs + 1):
                    xb, yb = valid_ds[i*bs:i*bs+bs]
                    yb = yb.unsqueeze(-1)
                    pred_v = self.model(xb)
                    loss_v = loss_func(pred_v, yb)
                    acc_v = self.error_metrics(pred_v, yb)
                    mse_v = self.error_metrics(pred_v, yb)
                    self.record.accumulate_v([loss_v.detach().cpu().numpy() ,acc_v], mse)
                print('Valid_loss:', loss_v.detach().cpu().numpy(),'valid_acc:',acc_v.numpy(), ' mse:', mse_v)
                print()
                
        
    def getValidError(self, rows):
        self.xv, self.yv = valid_ds[:rows]
        pred = self.model(self.xv)
        print('Valid accuracy: ', self.error_metrics(pred.squeeze(-1), self.yv))
        
        return pred, self.error_metrics(pred.squeeze(-1), self.yv)

In [49]:
rfDNN = RfDNN(train_ds, valid_ds,trees = 100, bs=5000,error_metrics=mse)

In [50]:
rfDNN.runDNN(20)

Sequential(
  (0): Linear(in_features=100, out_features=1000, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1000, out_features=500, bias=True)
  (3): ReLU()
  (4): Linear(in_features=500, out_features=100, bias=True)
  (5): ReLU()
  (6): Linear(in_features=100, out_features=1, bias=True)
  (7): ReLU()
  (8): Sigmoid()
)
Learning rate:  0.001
####
For epoch 0  loss: 0.39882985 acc: 0.11334107 mse: tensor(0.1133, grad_fn=<MseLossBackward>)
Valid_loss: 0.42570257 valid_acc: 0.12797458  mse: tensor(0.1280)

Learning rate:  0.001
####
For epoch 1  loss: 0.22456785 acc: 0.08685117 mse: tensor(0.0869, grad_fn=<MseLossBackward>)
Valid_loss: 0.595307 valid_acc: 0.14169392  mse: tensor(0.1417)

Learning rate:  0.001
####
For epoch 2  loss: 0.1387878 acc: 0.040674962 mse: tensor(0.0407, grad_fn=<MseLossBackward>)
Valid_loss: 0.5516141 valid_acc: 0.13907616  mse: tensor(0.1391)

Learning rate:  0.001
####
For epoch 3  loss: 0.11405052 acc: 0.0414157 mse: tensor(0.0414, grad_fn=<MseLossBackwar

In [51]:
preds, _ = rfDNN.getValidError(20000)

Valid accuracy:  tensor(0.1558, grad_fn=<MseLossBackward>)


In [52]:
for i in range(preds.shape[0]):
    if(preds[i] > 0.8):
        preds[i] = 1
    else: preds[i] = 0

In [53]:
metrics.accuracy_score(y_valid[:20000], preds)

0.84165