In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgbm

In [None]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")
train.loan_default.mean() # dist of target var

In [None]:
# train characteristics
print(train.shape)
train.nunique()

In [None]:
# missing data
train.isnull().sum()

In [None]:
# imput missing employment type as missing
train["Employment.Type"]= train["Employment.Type"].fillna("Missing")
test["Employment.Type"]= test["Employment.Type"].fillna("Missing")

In [None]:
# review dtypes
train.dtypes

In [None]:
train.head().T

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

# append test and train sets
train_len = len(train)
merged = pd.concat([train,test],axis=0)
merged.head()

# add date type feats
merged['Date.of.Birth'] = pd.to_datetime(merged['Date.of.Birth'],format="%d-%m-%y")
merged['DisbursalDate'] = pd.to_datetime(merged['DisbursalDate'],format="%d-%m-%y")

# add other date features
merged['dob_year'] = merged['Date.of.Birth'].dt.year
merged['age'] = merged['dob_year'].max() - merged['dob_year']
merged['DD_year']= merged['DisbursalDate'].dt.year

# drop original date feats
merged = merged.drop(['Date.of.Birth','DisbursalDate'],axis=1)

# drop id
merged = merged.drop('UniqueID',axis=1)

# column selector
cat_cols = merged.select_dtypes(include=['object']).columns
num_cols = merged.columns[merged.dtypes!='object'] # select num cols all
num_cols = [c for c in num_cols if c!='UniqueID'] # remove unique ID

# split x and y
X = merged.drop('loan_default',axis=1)
y = merged.loan_default

# label enc cat colsn
# le = LabelEncoder()
# for c in cat_cols:
#    print(c)
#    X[c] = le.fit_transform(X[c])

# one hot
X_dummies = pd.get_dummies(X,columns=cat_cols)

# split out train test sets
train_processed = X_dummies.iloc[:train_len]
test_processed = X_dummies.iloc[train_len:]


# define y in train set only
y = y.iloc[:train_len]

# split train / val
xtrain,xval,ytrain,yval = train_test_split(train_processed,y,test_size=0.2,stratify=y)

print("train len:",len(train_processed))
print("xtrain len:",len(train))
print("train cols:",len(train_processed.columns))
print("xtrain cols:",len(train.columns))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

# flag to apply poly feats to analysis
usePoly = True

# append test and train sets
train_len = len(train)
merged = pd.concat([train,test],axis=0)
merged.head()

# add date type feats
merged['Date.of.Birth'] = pd.to_datetime(merged['Date.of.Birth'],format="%d-%m-%y")
merged['DisbursalDate'] = pd.to_datetime(merged['DisbursalDate'],format="%d-%m-%y")

# add other date features
merged['dob_year'] = merged['Date.of.Birth'].dt.year
merged['age'] = merged['dob_year'].max() - merged['dob_year']
merged['DD_year']= merged['DisbursalDate'].dt.year

# drop original date feats
merged = merged.drop(['Date.of.Birth','DisbursalDate'],axis=1)

# drop id
merged = merged.drop('UniqueID',axis=1)

# column selector
cat_cols = merged.select_dtypes(include=['object']).columns
num_cols = merged.columns[merged.dtypes!='object'] # select num cols all
num_cols = [c for c in num_cols if c!='UniqueID'] # remove unique ID

# split x and y
X = merged.drop('loan_default',axis=1)
y = merged.loan_default

# label enc cat colsn
# le = LabelEncoder()
# for c in cat_cols:
#    print(c)
#    X[c] = le.fit_transform(X[c])
    
# one hot
X_dummies = pd.get_dummies(X,columns=cat_cols)

# poly feats
if usePoly == True:
    poly = PolynomialFeatures(include_bias=False,interaction_only=True)
    X_dummies = poly.fit_transform(X_dummies)

# split out train test sets
train_processed = X_dummies.iloc[:train_len]
test_processed = X_dummies.iloc[train_len:]    
    
# define y in train set only
y = y.iloc[:train_len]

# split train / val
xtrain,xval,ytrain,yval = train_test_split(train_processed,y,test_size=0.2,stratify=y)

print("train len:",len(train_processed))
print("xtrain len:",len(train))
print("train cols:",len(train_processed.columns))
print("xtrain cols:",len(train.columns))

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

# append test and train sets
train_len = len(train)
merged = pd.concat([train,test],axis=0)
merged.head()

# add date type feats
merged['Date.of.Birth'] = pd.to_datetime(merged['Date.of.Birth'],format="%d-%m-%y")
merged['DisbursalDate'] = pd.to_datetime(merged['DisbursalDate'],format="%d-%m-%y")

# add other date features
merged['dob_year'] = merged['Date.of.Birth'].dt.year
merged['age'] = merged['dob_year'].max() - merged['dob_year']
merged['DD_year']= merged['DisbursalDate'].dt.year

# drop original date feats
merged = merged.drop(['Date.of.Birth','DisbursalDate'],axis=1)

# drop id
merged = merged.drop('UniqueID',axis=1)

# column selector
cat_cols = merged.select_dtypes(include=['object']).columns
num_cols = merged.columns[merged.dtypes!='object'] # select num cols all
num_cols = [c for c in num_cols if c!='UniqueID'] # remove unique ID

# split x and y
X = merged.drop('loan_default',axis=1)
y = merged.loan_default

# label enc cat colsn
# le = LabelEncoder()
# for c in cat_cols:
#    print(c)
#    X[c] = le.fit_transform(X[c])
    
# one hot
X_dummies = pd.get_dummies(X,columns=cat_cols)

# split out train test sets
train_processed = X_dummies.iloc[:train_len]
test_processed = X_dummies.iloc[train_len:]

# define y in train set only
y = y.iloc[:train_len]

# split train / val
xtrain,xval,ytrain,yval = train_test_split(train_processed,y,test_size=0.2,stratify=y)

print("train len:",len(train_processed))
print("xtrain len:",len(train))
print("train cols:",len(train_processed.columns))
print("xtrain cols:",len(train.columns))

train len: 233154
xtrain len: 233154
train cols: 566
xtrain cols: 41


In [None]:
# apply gridseatchcv
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {
    'n_estimators' : [300,400,500],
    'max_depth' : [3,5,7],
    'bagging_fraction' : [0.8],
    'feature_fraction' : [0.7,0.8],
    'num_leaves' : [9,11,13,15]
}

clf = lgbm.LGBMClassifier(class_weight={0:1,1:4})
gridcv = GridSearchCV(clf,param_grid,scoring='roc_auc',n_jobs=1,cv=4, return_train_score=True)
results = gridcv.fit(xtrain,ytrain)
results.best_score_

In [26]:
### Hyper Opt on LightGBM

from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold

# the trials object to store details of each iter
trials = Trials()

# define dictionary search space
space = {}

space['n_estimators']= 100 + 100*hp.randint('n_estimators',10)
space['max_depth']= hp.choice('max_depth',np.arange(3,20,dtype=int))
space['num_leaves']= hp.choice('num_leaves', np.arange(2,30,dtype=int))
space['min_data_in_leaf']= hp.choice('min_data_in_leaf', np.arange(20,150,dtype=int))
space['feature_fraction']= hp.uniform('feature_fraction', 0.5,0.9)
space['bagging_fraction']= hp.uniform('bagging_fraction', 0.5,0.9)
space['lambda_l1']= hp.uniform('reg_lambda', 5,30)

# create a counter object
counter = 0
FOLDS= 4
# set objective function
def objective(params):
        global counter
        clf = lgbm.LGBMClassifier(boosting_type='gbdt',
                                  metric='auc',
                                  learning_rate=0.05,
                                  scale_pos_weight=4,
                                  n_jobs=-1)
        clf.set_params(**params)
        # cross val by default uses stratified kfold if no fold object is passed
        score = cross_val_score(clf,xtrain,ytrain,cv=FOLDS,scoring='roc_auc')
        counter +=1
        print("Eval No:",counter,"Score:",1-score.mean(),"Params:",params)
        # cross val score returns ranges of mean and we want to minimize the function hence 1 - metric
        return 1-score.mean() 

In [None]:
# run function to minimize objective

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=400,trials=trials,verbose=0)
print("Hyperopt estimated optimum {}".format(best))


In [39]:
# get best param values
from hyperopt import space_eval
best_params = space_eval(space,best)
print(best_params)

{'bagging_fraction': 0.8938820204303998, 'feature_fraction': 0.8582316318535632, 'lambda_l1': 28.278990042794067, 'max_depth': 16, 'min_data_in_leaf': 90, 'n_estimators': 700, 'num_leaves': 18}


In [40]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

best_params = {'bagging_fraction': 0.8830336528248924,
 'feature_fraction': 0.6012577219030728,
 'lambda_l1': 11.304845997489313,
 'max_depth': 10,
 'min_data_in_leaf': 36,
 'n_estimators': 900,
 'num_leaves': 25}

# model
clf = lgbm.LGBMClassifier(**best_params,
                          learning_rate=0.05,
                         scale_pos_weight=4)
clf.fit(xtrain,ytrain)
preds_train = clf.predict(xtrain)
preds = clf.predict(xval)

print("accuracy on train:",roc_auc_score(ytrain,preds_train))
print("accuracy on val:",roc_auc_score(yval,preds))

  if diff:
  if diff:


accuracy on train: 0.6576876872916599
accuracy on val: 0.6207292586769982


In [34]:
# use feat importances to see if we can use poly feats on smaller data set
# poly feats

feat_cols = train_processed.columns
feat_imp = clf.feature_importances_

feature_imp_df = pd.DataFrame({'col':feat_cols,'importances':feat_imp})
feature_imp_df = feature_imp_df.sort_values('importances',ascending=False)
cols_for_poly = feature_imp_df.col[:60].values

# apply poly feats
poly = PolynomialFeatures(interaction_only=True)
train_processed_poly= poly.fit_transform(train_processed[cols_for_poly])
test_processed_poly = poly.transform(test_processed[cols_for_poly])

print(len(train_processed.columns))
print((train_processed_poly.shape[1]))

# split train / val
xtrain,xval,ytrain,yval = train_test_split(train_processed_poly,y,test_size=0.2,stratify=y)

566
1831


In [35]:
print(len(y))
print(train_processed_poly.shape[0])


233154
233154


In [36]:
# hyperopt for poly feats
counter = 0
FOLDS= 4
def objective_poly(params):
        global counter
        clf = lgbm.LGBMClassifier(boosting_type='gbdt',
                                  metric='auc',
                                  learning_rate=0.05,
                                  scale_pos_weight=4,
                                  n_jobs=-1)
        clf.set_params(**params)
        # cross val by default uses stratified kfold if no fold object is passed
        score = cross_val_score(clf,xtrain,ytrain,cv=FOLDS,scoring='roc_auc')
        counter +=1
        print("Eval No:",counter,"Score:",1-score.mean(),"Params:",params)
        # cross val score returns ranges of mean and we want to minimize the function hence 1 - metric
        return 1-score.mean() 

In [37]:
# run function to minimize objective poly

best = fmin(fn=objective_poly, space=space, algo=tpe.suggest, max_evals=300,trials=trials,verbose=0)
print("Hyperopt estimated optimum {}".format(best))

# get best param values
from hyperopt import space_eval
best_params = space_eval(space,best)
print(best_params)

Eval No: 1 Score: 0.3470315185307199 Params: {'bagging_fraction': 0.8434756212608181, 'feature_fraction': 0.7093820092517485, 'lambda_l1': 5.518617905065938, 'max_depth': 19, 'min_data_in_leaf': 128, 'n_estimators': 200, 'num_leaves': 5}
Eval No: 2 Score: 0.3403835844639158 Params: {'bagging_fraction': 0.8135705489297143, 'feature_fraction': 0.7865227398625927, 'lambda_l1': 7.0447729809209365, 'max_depth': 8, 'min_data_in_leaf': 137, 'n_estimators': 400, 'num_leaves': 7}
Eval No: 3 Score: 0.3384318012175753 Params: {'bagging_fraction': 0.7609159893137618, 'feature_fraction': 0.6445596784431933, 'lambda_l1': 28.265225868434204, 'max_depth': 3, 'min_data_in_leaf': 26, 'n_estimators': 600, 'num_leaves': 15}
Eval No: 4 Score: 0.3363299272188396 Params: {'bagging_fraction': 0.5873485645540982, 'feature_fraction': 0.5460163153466682, 'lambda_l1': 16.872010252925453, 'max_depth': 7, 'min_data_in_leaf': 29, 'n_estimators': 300, 'num_leaves': 28}
Eval No: 5 Score: 0.3365583672071746 Params: {'b

Eval No: 36 Score: 0.338231107072251 Params: {'bagging_fraction': 0.8768444860989949, 'feature_fraction': 0.79957567786391, 'lambda_l1': 24.343241893579687, 'max_depth': 16, 'min_data_in_leaf': 69, 'n_estimators': 300, 'num_leaves': 13}
Eval No: 37 Score: 0.3379513470929978 Params: {'bagging_fraction': 0.8306117863313051, 'feature_fraction': 0.842553701403856, 'lambda_l1': 12.281089696889872, 'max_depth': 17, 'min_data_in_leaf': 98, 'n_estimators': 700, 'num_leaves': 8}
Eval No: 38 Score: 0.3393622050809477 Params: {'bagging_fraction': 0.7984552985593789, 'feature_fraction': 0.8979393851055729, 'lambda_l1': 20.42593687471716, 'max_depth': 18, 'min_data_in_leaf': 79, 'n_estimators': 200, 'num_leaves': 15}
Eval No: 39 Score: 0.33690855235343986 Params: {'bagging_fraction': 0.736639657804447, 'feature_fraction': 0.6115884552999303, 'lambda_l1': 15.659904347386963, 'max_depth': 9, 'min_data_in_leaf': 70, 'n_estimators': 900, 'num_leaves': 26}
Eval No: 40 Score: 0.3574650411280209 Params: {

Eval No: 71 Score: 0.335358093758098 Params: {'bagging_fraction': 0.8825845396616837, 'feature_fraction': 0.8226190903132873, 'lambda_l1': 29.599634078311574, 'max_depth': 10, 'min_data_in_leaf': 142, 'n_estimators': 900, 'num_leaves': 15}
Eval No: 72 Score: 0.3353557350556007 Params: {'bagging_fraction': 0.7839899685120512, 'feature_fraction': 0.8520696810508658, 'lambda_l1': 29.680351969963514, 'max_depth': 10, 'min_data_in_leaf': 132, 'n_estimators': 900, 'num_leaves': 16}
Eval No: 73 Score: 0.33563615827324234 Params: {'bagging_fraction': 0.7869259443790222, 'feature_fraction': 0.8301678714313936, 'lambda_l1': 29.302019158671886, 'max_depth': 10, 'min_data_in_leaf': 132, 'n_estimators': 900, 'num_leaves': 14}
Eval No: 74 Score: 0.33539058982917114 Params: {'bagging_fraction': 0.764418116030254, 'feature_fraction': 0.8511372723456923, 'lambda_l1': 27.996583979981118, 'max_depth': 10, 'min_data_in_leaf': 57, 'n_estimators': 900, 'num_leaves': 15}
Eval No: 75 Score: 0.3352952191755179

Eval No: 106 Score: 0.34296937134251515 Params: {'bagging_fraction': 0.8668746801654298, 'feature_fraction': 0.6465741607486083, 'lambda_l1': 23.46585424903405, 'max_depth': 18, 'min_data_in_leaf': 59, 'n_estimators': 300, 'num_leaves': 6}
Eval No: 107 Score: 0.3396891263403349 Params: {'bagging_fraction': 0.5694102373042937, 'feature_fraction': 0.5453558463414245, 'lambda_l1': 11.570828991021038, 'max_depth': 8, 'min_data_in_leaf': 112, 'n_estimators': 1000, 'num_leaves': 26}
Eval No: 108 Score: 0.33944271099184087 Params: {'bagging_fraction': 0.6673096108746013, 'feature_fraction': 0.7774369251175621, 'lambda_l1': 18.212397602612903, 'max_depth': 14, 'min_data_in_leaf': 30, 'n_estimators': 200, 'num_leaves': 14}
Eval No: 109 Score: 0.33836240385349803 Params: {'bagging_fraction': 0.8114542916270633, 'feature_fraction': 0.7058595979447785, 'lambda_l1': 17.284431230201005, 'max_depth': 3, 'min_data_in_leaf': 119, 'n_estimators': 700, 'num_leaves': 25}
Eval No: 110 Score: 0.335916205906

Eval No: 141 Score: 0.33596164013900665 Params: {'bagging_fraction': 0.8421623816781068, 'feature_fraction': 0.8335456863346516, 'lambda_l1': 19.142873884762157, 'max_depth': 15, 'min_data_in_leaf': 130, 'n_estimators': 900, 'num_leaves': 15}
Eval No: 142 Score: 0.33760275491592096 Params: {'bagging_fraction': 0.7389423358055304, 'feature_fraction': 0.5753702934671838, 'lambda_l1': 25.806609738722738, 'max_depth': 6, 'min_data_in_leaf': 82, 'n_estimators': 1000, 'num_leaves': 29}
Eval No: 143 Score: 0.33581903462742924 Params: {'bagging_fraction': 0.8634918739153316, 'feature_fraction': 0.5529370535313384, 'lambda_l1': 29.913486847457207, 'max_depth': 12, 'min_data_in_leaf': 31, 'n_estimators': 800, 'num_leaves': 16}
Eval No: 144 Score: 0.33650817497367425 Params: {'bagging_fraction': 0.8539245983716597, 'feature_fraction': 0.8099993505809566, 'lambda_l1': 28.630968490773405, 'max_depth': 8, 'min_data_in_leaf': 132, 'n_estimators': 300, 'num_leaves': 22}
Eval No: 145 Score: 0.339258509

Eval No: 175 Score: 0.33609752163068585 Params: {'bagging_fraction': 0.8558042972132892, 'feature_fraction': 0.6850045747232465, 'lambda_l1': 14.727856079190786, 'max_depth': 10, 'min_data_in_leaf': 120, 'n_estimators': 900, 'num_leaves': 15}
Eval No: 176 Score: 0.3370969470978734 Params: {'bagging_fraction': 0.5659846393299325, 'feature_fraction': 0.7683428899328838, 'lambda_l1': 22.2208481592547, 'max_depth': 3, 'min_data_in_leaf': 77, 'n_estimators': 1000, 'num_leaves': 28}
Eval No: 177 Score: 0.3359167328169175 Params: {'bagging_fraction': 0.8117398791478719, 'feature_fraction': 0.8125008718693748, 'lambda_l1': 25.04467177319829, 'max_depth': 19, 'min_data_in_leaf': 117, 'n_estimators': 800, 'num_leaves': 16}
Eval No: 178 Score: 0.3366566317416597 Params: {'bagging_fraction': 0.8820249146308351, 'feature_fraction': 0.858696242472453, 'lambda_l1': 17.130085652038183, 'max_depth': 18, 'min_data_in_leaf': 143, 'n_estimators': 300, 'num_leaves': 22}
Eval No: 179 Score: 0.33775369711400

Eval No: 210 Score: 0.33571244969190894 Params: {'bagging_fraction': 0.8597981703845824, 'feature_fraction': 0.8557956829236331, 'lambda_l1': 19.943660069310457, 'max_depth': 15, 'min_data_in_leaf': 149, 'n_estimators': 400, 'num_leaves': 29}
Eval No: 211 Score: 0.33550416727997834 Params: {'bagging_fraction': 0.7444204551139472, 'feature_fraction': 0.7821264880820546, 'lambda_l1': 28.00599616326553, 'max_depth': 17, 'min_data_in_leaf': 122, 'n_estimators': 500, 'num_leaves': 22}
Eval No: 212 Score: 0.33884101852834414 Params: {'bagging_fraction': 0.8029509867618531, 'feature_fraction': 0.8240472607409572, 'lambda_l1': 23.53927492029469, 'max_depth': 8, 'min_data_in_leaf': 146, 'n_estimators': 800, 'num_leaves': 5}
Eval No: 213 Score: 0.33613574487456255 Params: {'bagging_fraction': 0.672350872342444, 'feature_fraction': 0.8148737562088713, 'lambda_l1': 22.430161222788758, 'max_depth': 12, 'min_data_in_leaf': 44, 'n_estimators': 900, 'num_leaves': 16}
Eval No: 214 Score: 0.337355171784

Eval No: 245 Score: 0.3355936401991745 Params: {'bagging_fraction': 0.7617347926298215, 'feature_fraction': 0.8187982651150814, 'lambda_l1': 22.415173920254702, 'max_depth': 10, 'min_data_in_leaf': 58, 'n_estimators': 600, 'num_leaves': 16}
Eval No: 246 Score: 0.33583568325291313 Params: {'bagging_fraction': 0.8540036520035249, 'feature_fraction': 0.7567020516884491, 'lambda_l1': 19.766447853118205, 'max_depth': 7, 'min_data_in_leaf': 107, 'n_estimators': 800, 'num_leaves': 21}
Eval No: 247 Score: 0.3358800462826701 Params: {'bagging_fraction': 0.7110541114575718, 'feature_fraction': 0.8871391176826844, 'lambda_l1': 28.550394110972324, 'max_depth': 18, 'min_data_in_leaf': 93, 'n_estimators': 1000, 'num_leaves': 17}
Eval No: 248 Score: 0.3361271456309787 Params: {'bagging_fraction': 0.7902563860399097, 'feature_fraction': 0.6452633533575138, 'lambda_l1': 22.96621790789403, 'max_depth': 4, 'min_data_in_leaf': 119, 'n_estimators': 700, 'num_leaves': 24}
Eval No: 249 Score: 0.3414794323412

Eval No: 279 Score: 0.33677552669509314 Params: {'bagging_fraction': 0.8818313223322514, 'feature_fraction': 0.8751852581577607, 'lambda_l1': 12.07626880644183, 'max_depth': 16, 'min_data_in_leaf': 94, 'n_estimators': 700, 'num_leaves': 18}
Eval No: 280 Score: 0.3351631297987506 Params: {'bagging_fraction': 0.8560614457849536, 'feature_fraction': 0.7993330309892885, 'lambda_l1': 29.397966004124957, 'max_depth': 16, 'min_data_in_leaf': 128, 'n_estimators': 700, 'num_leaves': 18}
Eval No: 281 Score: 0.33545224251494987 Params: {'bagging_fraction': 0.8551269242222773, 'feature_fraction': 0.7546445812888893, 'lambda_l1': 29.52458044169375, 'max_depth': 16, 'min_data_in_leaf': 128, 'n_estimators': 700, 'num_leaves': 18}
Eval No: 282 Score: 0.3358283502270798 Params: {'bagging_fraction': 0.842562471161455, 'feature_fraction': 0.7376937331863036, 'lambda_l1': 28.16682134866541, 'max_depth': 16, 'min_data_in_leaf': 28, 'n_estimators': 700, 'num_leaves': 18}
Eval No: 283 Score: 0.33573989092024

In [41]:
# prediction on poly feats
best_params = {'bagging_fraction': 0.8938820204303998,
               'feature_fraction': 0.8582316318535632,
               'lambda_l1': 28.278990042794067,
               'max_depth': 16,
               'min_data_in_leaf': 90,
               'n_estimators': 700,
               'num_leaves': 18}

# model
clf = lgbm.LGBMClassifier(**best_params,
                          learning_rate=0.05,
                         scale_pos_weight=4)
clf.fit(xtrain,ytrain)
preds_train = clf.predict(xtrain)
preds = clf.predict(xval)

print("accuracy on train:",roc_auc_score(ytrain,preds_train))
print("accuracy on val:",roc_auc_score(yval,preds))

  if diff:


accuracy on train: 0.6576876872916599
accuracy on val: 0.6207292586769982


  if diff:


In [43]:
# preds on test
preds_test = clf.predict(test_processed_poly)

sub['loan_default'] = preds_test

# save to csv
sub.to_csv("poly not full.csv",index=False)

  if diff:


In [44]:
# fit on entire data set and submit
clf.fit(train_processed_poly,y)

# preds on test
preds_test = clf.predict(test_processed_poly)

sub['loan_default'] = preds_test

# save to csv
sub.to_csv("poly feats full.csv",index=False)


  if diff:
