In [1]:
import numpy as np
import pandas as pd
import warnings; warnings.simplefilter('ignore')
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv") 

In [140]:
train.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,Z,"Turner, Baldwin and Rhodes",4.25,214000,360,2012-03-01,05/2012,95,1.0,...,0,0,0,1,0,0,0,0,0,1
1,672831657627,Y,"Swanson, Newton and Miller",4.875,144000,360,2012-01-01,03/2012,72,1.0,...,0,0,0,0,0,0,0,1,0,1
2,742515242108,Z,Thornton-Davis,3.25,366000,180,2012-01-01,03/2012,49,1.0,...,0,0,0,0,0,0,0,0,0,1
3,601385667462,X,OTHER,4.75,135000,360,2012-02-01,04/2012,46,2.0,...,0,0,0,0,0,1,1,1,1,1
4,273870029961,X,OTHER,4.75,124000,360,2012-02-01,04/2012,80,1.0,...,3,4,5,6,7,8,9,10,11,1


In [141]:
y = pd.DataFrame(train['m13'])
ID = test['loan_id']
train.isnull().sum()
test.isnull().sum()

loan_id                     0
source                      0
financial_institution       0
interest_rate               0
unpaid_principal_bal        0
loan_term                   0
origination_date            0
first_payment_date          0
loan_to_value               0
number_of_borrowers         0
debt_to_income_ratio        0
borrower_credit_score       0
loan_purpose                0
insurance_percent           0
co-borrower_credit_score    0
insurance_type              0
m1                          0
m2                          0
m3                          0
m4                          0
m5                          0
m6                          0
m7                          0
m8                          0
m9                          0
m10                         0
m11                         0
m12                         0
dtype: int64

In [142]:
print(train.shape)
print(test.shape)

(116058, 29)
(35866, 28)


In [143]:
train["m13"].value_counts()

0    115422
1       636
Name: m13, dtype: int64

In [144]:
train.dtypes

loan_id                       int64
source                       object
financial_institution        object
interest_rate               float64
unpaid_principal_bal          int64
loan_term                     int64
origination_date             object
first_payment_date           object
loan_to_value                 int64
number_of_borrowers         float64
debt_to_income_ratio        float64
borrower_credit_score       float64
loan_purpose                 object
insurance_percent           float64
co-borrower_credit_score    float64
insurance_type              float64
m1                            int64
m2                            int64
m3                            int64
m4                            int64
m5                            int64
m6                            int64
m7                            int64
m8                            int64
m9                            int64
m10                           int64
m11                           int64
m12                         

In [145]:
X1 = train.drop(['loan_id', 'first_payment_date','origination_date','m13','insurance_percent','insurance_type'],axis=1)
test = test.drop(['loan_id' ,'origination_date','first_payment_date','insurance_percent','insurance_type'],axis=1)

Here by label encoder I convert object values to numerical values 

In [146]:
from sklearn.preprocessing import LabelEncoder
lm = LabelEncoder()
a  = ['financial_institution']
for i in np.arange(len(a)):
    X1[a[i]] = lm.fit_transform(X1[a[i]])
    
for i in np.arange(len(a)):
    test[a[i]] = lm.fit_transform(test[a[i]])

By one hot encoding I convert 'source','loan_purpose' into dummy variables

In [147]:
sourc = pd.get_dummies(X1['source'])
LoanP = pd.get_dummies(X1['loan_purpose'])
X1 = pd.concat([X1,LoanP],axis = 1)
X1 = X1.drop(['source','loan_purpose'],axis=1)

sourc1 = pd.get_dummies(test['source'])
LoanP1 = pd.get_dummies(test['loan_purpose'])
Test = pd.concat([test,LoanP1],axis = 1)
Test = Test.drop(['source','loan_purpose'],axis=1)

By using 'ADASYN'( Adaptive Synthetic sampling approach) which is one of the oversampling techniques, I balanced the dataset and overcome the situation in which our model only predicts the majority class

In [148]:
from imblearn.over_sampling import ADASYN 
sm = ADASYN()
X, y = sm.fit_sample(X1, y)
X = pd.DataFrame(X, columns = X1.columns)
y=pd.DataFrame(y)

In [149]:
print(X.shape)


(230643, 24)


In [150]:
X1.head()

Unnamed: 0,financial_institution,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,co-borrower_credit_score,m1,...,m6,m7,m8,m9,m10,m11,m12,A23,B12,C86
0,18,4.25,214000,360,95,1.0,22.0,694.0,0.0,0,...,0,1,0,0,0,0,0,0,0,1
1,15,4.875,144000,360,72,1.0,44.0,697.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
2,17,3.25,366000,180,49,1.0,33.0,780.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0
3,8,4.75,135000,360,46,2.0,44.0,633.0,638.0,0,...,0,0,0,1,1,1,1,0,1,0
4,8,4.75,124000,360,80,1.0,43.0,681.0,0.0,0,...,5,6,7,8,9,10,11,0,0,1


I hyper tuned my random forest classifier by varying depth between [6,15] and min sample split between[2,5]. I used criterion as "Entropy" because entropy has an edge in some data cases involving a high imbalance. 
I have given class weight { 0:0.8, 1:1 } to minimize 'class1' error which means "It is ok that If I don't give loan to 2-3 good guys but I don't want to give loan to those people who can be loan defaulters".

In [151]:
model = RandomForestClassifier(random_state=42,max_depth= 10 ,min_samples_split= 5 ,n_estimators= 500,n_jobs=-1, min_samples_leaf=2,  criterion='entropy',class_weight= {0: 0.8, 1:1})
model.fit(X,y)
y_pred = model.predict(Test)

In [152]:
pd.DataFrame({'loan_id':ID,'m13':  y_pred}).set_index('loan_id').to_csv('submission.csv')
pd.Series(model.feature_importances_,index=X1.columns).sort_values(ascending=False)


m12                         0.222896
m11                         0.134618
m10                         0.086091
co-borrower_credit_score    0.066642
borrower_credit_score       0.062377
number_of_borrowers         0.060914
A23                         0.057362
m9                          0.056508
m8                          0.042183
m7                          0.032203
B12                         0.031399
C86                         0.027774
interest_rate               0.022741
financial_institution       0.020052
loan_term                   0.018583
m5                          0.017432
m6                          0.014583
m4                          0.006696
unpaid_principal_bal        0.006302
debt_to_income_ratio        0.004128
loan_to_value               0.004090
m1                          0.001848
m3                          0.001389
m2                          0.001188
dtype: float64

In [153]:
sum(y_pred)

153