In [1]:
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt  

pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline

In [2]:
train  = pd.read_csv('train.csv') 
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,Z,"Turner, Baldwin and Rhodes",4.25,214000,360,2012-03-01,05/2012,95,1.0,...,0,0,0,1,0,0,0,0,0,1
1,672831657627,Y,"Swanson, Newton and Miller",4.875,144000,360,2012-01-01,03/2012,72,1.0,...,0,0,0,0,0,0,0,1,0,1
2,742515242108,Z,Thornton-Davis,3.25,366000,180,2012-01-01,03/2012,49,1.0,...,0,0,0,0,0,0,0,0,0,1
3,601385667462,X,OTHER,4.75,135000,360,2012-02-01,04/2012,46,2.0,...,0,0,0,0,0,1,1,1,1,1
4,273870029961,X,OTHER,4.75,124000,360,2012-02-01,04/2012,80,1.0,...,3,4,5,6,7,8,9,10,11,1


In [4]:
train.dtypes

loan_id                       int64
source                       object
financial_institution        object
interest_rate               float64
unpaid_principal_bal          int64
loan_term                     int64
origination_date             object
first_payment_date           object
loan_to_value                 int64
number_of_borrowers         float64
debt_to_income_ratio        float64
borrower_credit_score       float64
loan_purpose                 object
insurance_percent           float64
co-borrower_credit_score    float64
insurance_type              float64
m1                            int64
m2                            int64
m3                            int64
m4                            int64
m5                            int64
m6                            int64
m7                            int64
m8                            int64
m9                            int64
m10                           int64
m11                           int64
m12                         

In [5]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()

In [6]:
train['source'].unique(), test['source'].unique()

(array(['Z', 'Y', 'X'], dtype=object), array(['Y', 'X', 'Z'], dtype=object))

In [7]:
train['source']=label.fit_transform(train['source'].astype('str'))
test['source']=label.fit_transform(test['source'].astype('str'))

In [8]:
train['loan_purpose'].unique(), test['loan_purpose'].unique()

(array(['C86', 'B12', 'A23'], dtype=object),
 array(['A23', 'C86', 'B12'], dtype=object))

In [9]:
train['loan_purpose']=label.fit_transform(train['loan_purpose'].astype('str'))
test['loan_purpose']=label.fit_transform(test['loan_purpose'].astype('str'))

In [10]:
orig_date=train['origination_date']
orig_date=list(orig_date)

In [11]:
day=[]
month=[]
year=[]
for i in orig_date:
    temp = [int(x)for x in i.split("-")]
    day.append(temp[2])
    month.append(temp[1])
    year.append(temp[0])

In [12]:
from collections import Counter
Counter(year), Counter(day), Counter(month)

(Counter({2012: 116058}),
 Counter({1: 116058}),
 Counter({3: 14631, 1: 49093, 2: 52334}))

In [13]:
train['orig_month']=month

In [14]:
test.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12
0,1,1,Browning-Hart,3.875,417000,360,01/02/12,Apr-12,75,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,OTHER,4.5,113000,360,01/02/12,Apr-12,80,2,...,0,0,0,0,0,0,0,0,0,0
2,3,1,OTHER,4.5,72000,360,01/01/12,Mar-12,75,1,...,0,0,0,0,0,0,0,0,0,0
3,4,0,"Miller, Mcclure and Allen",4.125,123000,180,01/02/12,Apr-12,41,2,...,0,0,0,0,0,0,0,0,0,0
4,5,0,Browning-Hart,3.25,166000,180,01/02/12,Apr-12,53,2,...,0,0,0,0,0,0,0,0,0,0


In [15]:
orig_date_test=test['origination_date']
orig_date_test=list(orig_date_test)

In [16]:
day_test=[]
month_test=[]
year_test=[]
for i in orig_date_test:
    temp = [int(x)for x in i.split("/")]
    day_test.append(temp[0])
    month_test.append(temp[1])
    year_test.append(2000+temp[2])

In [17]:
Counter(year_test), Counter(day_test), Counter(month_test)

(Counter({2012: 35866}),
 Counter({1: 35866}),
 Counter({2: 16423, 1: 15051, 3: 4392}))

In [18]:
test['orig_month']=month_test

In [19]:
first_pay=train['first_payment_date']
pay_month=[]
pay_year=[]
for i in first_pay:
    temp = [int(x)for x in i.split("/")]
    pay_month.append(temp[0])
    pay_year.append(temp[1])

In [20]:
Counter(pay_month), Counter(pay_year)

(Counter({5: 15014, 3: 47680, 4: 52840, 2: 524}), Counter({2012: 116058}))

In [21]:
train['first_pay_month']=pay_month

In [22]:
first_pay_test=test['first_payment_date']
pay_month_test=[]
pay_year_test=[]
for i in first_pay_test:
    temp = [x for x in i.split("-")]
    pay_month_test.append(temp[0])
    pay_year_test.append(int('20'+temp[1]))

In [23]:
Counter(pay_month_test), Counter(pay_year_test)

(Counter({'Apr': 16551, 'Mar': 14661, 'May': 4510, 'Feb': 144}),
 Counter({2012: 35866}))

In [24]:
pay_month_test_int=[]
for i in pay_month_test:
    if(i=='Apr'):
        pay_month_test_int.append(4)
    elif(i=='Mar'):
        pay_month_test_int.append(3)
    elif(i=='May'):
        pay_month_test_int.append(5)
    elif(i=='Feb'):
        pay_month_test_int.append(2)

In [25]:
Counter(pay_month_test),Counter(pay_month_test_int) 

(Counter({'Apr': 16551, 'Mar': 14661, 'May': 4510, 'Feb': 144}),
 Counter({4: 16551, 3: 14661, 5: 4510, 2: 144}))

In [26]:
test['first_pay_month']=pay_month_test_int

In [27]:
train['month_diff']=train['first_pay_month']-train['orig_month']
test['month_diff']=test['first_pay_month']-test['orig_month']

In [28]:
train['insurance_type']=train['insurance_type'].astype('int')
test['insurance_type']=test['insurance_type'].astype('int')

In [29]:
train['insurance_percent']=train['insurance_percent'].apply(lambda x:x/100)

In [30]:
test['insurance_percent']=test['insurance_percent'].apply(lambda x:x/100)

In [31]:
train['interest_rate']=train['interest_rate'].apply(lambda x:x/100)
test['interest_rate']=test['interest_rate'].apply(lambda x:x/100)

In [38]:
train['financial_institution']=train['financial_institution'].replace('OTHER',0)
test['financial_institution']=test['financial_institution'].replace('OTHER',0)

In [39]:
train.loc[train['financial_institution'] != 0, 'financial_institution'] = 1
test.loc[test['financial_institution'] != 0, 'financial_institution'] = 1

In [40]:
train.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,...,m7,m8,m9,m10,m11,m12,m13,orig_month,first_pay_month,month_diff
0,268055008619,2,1,0.0425,214000,360,2012-03-01,05/2012,95,1.0,...,1,0,0,0,0,0,1,3,5,2
1,672831657627,1,1,0.04875,144000,360,2012-01-01,03/2012,72,1.0,...,0,0,0,0,1,0,1,1,3,2
2,742515242108,2,1,0.0325,366000,180,2012-01-01,03/2012,49,1.0,...,0,0,0,0,0,0,1,1,3,2
3,601385667462,0,0,0.0475,135000,360,2012-02-01,04/2012,46,2.0,...,0,0,1,1,1,1,1,2,4,2
4,273870029961,0,0,0.0475,124000,360,2012-02-01,04/2012,80,1.0,...,6,7,8,9,10,11,1,2,4,2


In [41]:
train['total_delin']=train['m1']+train['m2']+train['m3']+train['m4']+train['m5']+train['m6']+train['m7']+train['m8']+train['m9']+train['m10']+train['m11']+train['m12']

In [42]:
test['total_delin']=test['m1']+test['m2']+test['m3']+test['m4']+test['m5']+test['m6']+test['m7']+test['m8']+test['m9']+test['m10']+test['m11']+test['m12']

In [44]:
train.columns

Index(['loan_id', 'source', 'financial_institution', 'interest_rate',
       'unpaid_principal_bal', 'loan_term', 'origination_date',
       'first_payment_date', 'loan_to_value', 'number_of_borrowers',
       'debt_to_income_ratio', 'borrower_credit_score', 'loan_purpose',
       'insurance_percent', 'co-borrower_credit_score', 'insurance_type', 'm1',
       'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9', 'm10', 'm11', 'm12',
       'm13', 'orig_month', 'first_pay_month', 'month_diff', 'total_delin'],
      dtype='object')

In [46]:
y=train['m13']
x=train.drop(['loan_id','m13','origination_date','first_payment_date'],axis=1)

In [47]:
test_id=test['loan_id']
test_data=test.drop(['loan_id','origination_date','first_payment_date'],axis=1)

In [48]:
submission=pd.DataFrame(test_id,columns=['loan_id'])

In [49]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [50]:
x_scaled=scaler.fit_transform(x)
test_scaled=scaler.fit_transform(test_data)

In [51]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3)

In [52]:
import xgboost as xgb

In [53]:
dtrain = xgb.DMatrix(x_train, label=y_train) 
dvalid = xgb.DMatrix(x_test, label=y_test) 
dtest = xgb.DMatrix(test_scaled)

  if getattr(data, 'base', None) is not None and \


In [54]:
params = {
    'objective':'binary:logistic',
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'gamma': 0.0
 }

In [55]:
def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(np.int)
    preds = (preds >= 0.3).astype(np.int)
    return [('f1_score', f1_score(labels, preds))]

In [56]:
from sklearn.metrics import f1_score

In [57]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(5,13)
     for min_child_weight in range(5,13)
 ]
max_f1 = 0. # initializing with 0 
best_params = None 
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
     # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

     # Cross-validation
    cv_results = xgb.cv(params,
        dtrain,feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )     
    # Finding best F1 Score

    mean_f1 = cv_results['test-f1_score-mean'].max()

    boost_rounds = cv_results['test-f1_score-mean'].idxmax()    
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))    
    if mean_f1 > max_f1:
            max_f1 = mean_f1
            best_params = (max_depth,min_child_weight) 

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with max_depth=5, min_child_weight=5
	F1 Score 0.5460534 for 5 rounds
CV with max_depth=5, min_child_weight=6
	F1 Score 0.5502559999999999 for 8 rounds
CV with max_depth=5, min_child_weight=7
	F1 Score 0.552834 for 5 rounds
CV with max_depth=5, min_child_weight=8
	F1 Score 0.5547862 for 5 rounds
CV with max_depth=5, min_child_weight=9
	F1 Score 0.5585354 for 5 rounds
CV with max_depth=5, min_child_weight=10
	F1 Score 0.5603276 for 6 rounds
CV with max_depth=5, min_child_weight=11
	F1 Score 0.5553938 for 5 rounds
CV with max_depth=5, min_child_weight=12
	F1 Score 0.5549139999999999 for 5 rounds
CV with max_depth=6, min_child_weight=5
	F1 Score 0.548572 for 8 rounds
CV with max_depth=6, min_child_weight=6
	F1 Score 0.547855 for 5 rounds
CV with max_depth=6, min_child_weight=7
	F1 Score 0.5536760000000001 for 5 rounds
CV with max_depth=6, min_child_weight=8
	F1 Score 0.554098 for 5 rounds
CV with max_depth=6, min_child_weight=9
	F1 Score 0.5598058 for 5 rounds
CV with max_depth=6, min_

In [58]:
params['max_depth'] = 6
params['min_child_weight'] = 10

In [59]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5,10)]
    for colsample in [i/10. for i in range(5,10)] ]
max_f1 = 0.
best_params = None 
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
     # Update our parameters
    params['colsample'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].idxmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample) 

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with subsample=0.5, colsample=0.5
	F1 Score 0.5678993999999999 for 8 rounds
CV with subsample=0.5, colsample=0.6
	F1 Score 0.5678993999999999 for 8 rounds
CV with subsample=0.5, colsample=0.7
	F1 Score 0.5678993999999999 for 8 rounds
CV with subsample=0.5, colsample=0.8
	F1 Score 0.5678993999999999 for 8 rounds
CV with subsample=0.5, colsample=0.9
	F1 Score 0.5678993999999999 for 8 rounds
CV with subsample=0.6, colsample=0.5
	F1 Score 0.5585907999999999 for 6 rounds
CV with subsample=0.6, colsample=0.6
	F1 Score 0.5585907999999999 for 6 rounds
CV with subsample=0.6, colsample=0.7
	F1 Score 0.5585907999999999 for 6 rounds
CV with subsample=0.6, colsample=0.8
	F1 Score 0.5585907999999999 for 6 rounds
CV with subsample=0.6, colsample=0.9
	F1 Score 0.5585907999999999 for 6 rounds
CV with subsample=0.7, colsample=0.5
	F1 Score 0.5673268 for 10 rounds
CV with subsample=0.7, colsample=0.6
	F1 Score 0.5673268 for 10 rounds
CV with subsample=0.7, colsample=0.7
	F1 Score 0.5673268 for 10 roun

In [60]:
params['subsample'] = .5
params['colsample_bytree'] = .5

In [61]:
max_f1 = 0. 
best_params = None 
for eta in [.5, .3, .1, .05, .03, .01]:
    print("CV with eta={}".format(eta))
     # Update ETA
    params['eta'] = eta

     # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].idxmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta 
print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

CV with eta=0.5
	F1 Score 0.5434844 for 8 rounds
CV with eta=0.3
	F1 Score 0.5607784 for 24 rounds
CV with eta=0.1
	F1 Score 0.5541763999999999 for 18 rounds
CV with eta=0.05
	F1 Score 0.5477346000000001 for 20 rounds
CV with eta=0.03
	F1 Score 0.5625576000000001 for 44 rounds
CV with eta=0.01
	F1 Score 0.0110412 for 0 rounds
Best params: 0.03, F1 Score: 0.5625576000000001


In [62]:
params['eta'] = 0.03

In [63]:
max_f1 = 0. 
best_params = None 
for gamma in [i/10 for i in range(0,10)]:
    print("CV with gamma={}".format(gamma))
     # Update gamma
    params['gamma'] = gamma

     # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].idxmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = gamma 
print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

CV with gamma=0.0
	F1 Score 0.5625576000000001 for 44 rounds
CV with gamma=0.1
	F1 Score 0.5625576000000001 for 44 rounds
CV with gamma=0.2
	F1 Score 0.5626778 for 40 rounds
CV with gamma=0.3
	F1 Score 0.5645632 for 44 rounds
CV with gamma=0.4
	F1 Score 0.5645632 for 44 rounds
CV with gamma=0.5
	F1 Score 0.566373 for 44 rounds
CV with gamma=0.6
	F1 Score 0.566373 for 44 rounds
CV with gamma=0.7
	F1 Score 0.5663814 for 44 rounds
CV with gamma=0.8
	F1 Score 0.5681912 for 44 rounds
CV with gamma=0.9
	F1 Score 0.5642722000000001 for 44 rounds
Best params: 0.8, F1 Score: 0.5681912


In [64]:
params['gamma'] = 0.8

In [65]:
params

{'objective': 'binary:logistic',
 'max_depth': 6,
 'min_child_weight': 10,
 'eta': 0.03,
 'subsample': 0.5,
 'colsample_bytree': 0.5,
 'gamma': 0.8,
 'colsample': 0.9}

In [66]:
xgb_model = xgb.train(
    params,
    dtrain,
    feval= custom_eval,
    num_boost_round= 1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=100
)

[0]	Validation-error:0.004739	Validation-f1_score:0.010571
Multiple eval metrics have been passed: 'Validation-f1_score' will be used for early stopping.

Will train until Validation-f1_score hasn't improved in 100 rounds.
[1]	Validation-error:0.004222	Validation-f1_score:0.010571
[2]	Validation-error:0.004136	Validation-f1_score:0.010571
[3]	Validation-error:0.004193	Validation-f1_score:0.010571
[4]	Validation-error:0.004107	Validation-f1_score:0.010571
[5]	Validation-error:0.004107	Validation-f1_score:0.010571
[6]	Validation-error:0.004165	Validation-f1_score:0.010571
[7]	Validation-error:0.004193	Validation-f1_score:0.010571
[8]	Validation-error:0.004251	Validation-f1_score:0.010571
[9]	Validation-error:0.004165	Validation-f1_score:0.010571
[10]	Validation-error:0.004165	Validation-f1_score:0.010571
[11]	Validation-error:0.004078	Validation-f1_score:0.010571
[12]	Validation-error:0.004107	Validation-f1_score:0.010571
[13]	Validation-error:0.004136	Validation-f1_score:0.010571
[14]	V

[135]	Validation-error:0.004251	Validation-f1_score:0.491909
[136]	Validation-error:0.004251	Validation-f1_score:0.487013
[137]	Validation-error:0.004251	Validation-f1_score:0.487013
[138]	Validation-error:0.004251	Validation-f1_score:0.488599
[139]	Validation-error:0.004251	Validation-f1_score:0.493506
[140]	Validation-error:0.004251	Validation-f1_score:0.501608
[141]	Validation-error:0.004251	Validation-f1_score:0.496774
Stopping. Best iteration:
[41]	Validation-error:0.004107	Validation-f1_score:0.519403



In [67]:
test_pred = xgb_model.predict(dtest)

In [68]:
m13_split = (test_pred >= 0.3).astype(np.int)

In [69]:
Counter(m13_split)

Counter({0: 35744, 1: 122})

In [70]:
submission['m13']=m13_split

In [71]:
submission.to_csv("sub/sub_xgb_featengg.csv",index=False)

In [72]:
submission.head()

Unnamed: 0,loan_id,m13
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
