In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from datetime import date
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [20]:
# Load the competition train/test CSVs.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.
# NOTE(review): parse_dates=True names no columns here; the date fields are still
# handled as plain strings further down (via .str), so it looks like a no-op.
train, test = (
    pd.read_csv(csv_path, parse_dates=True)
    for csv_path in (
        r"C:\Users\Saiganne\Documents\analytics_vidhya\train.csv",
        r"C:\Users\Saiganne\Documents\analytics_vidhya\test.csv",
    )
)

In [21]:
# Share of each target class in the training data, as a percentage of all rows.
class_counts = train['loan_default'].value_counts()
for caption, target in (('No Default (Target:0)', 0), ('Default (Target:1)', 1)):
    print(caption, round(class_counts[target]/len(train) * 100,2), '% of the dataset')

No Default (Target:0) 78.29 % of the dataset
Default (Target:1) 21.71 % of the dataset


In [22]:
# Stack train and test so that feature preparation is applied to both at once.
# A 'data' marker column records the origin of each row; the test rows get a
# NaN target so both frames share the same columns (in train's column order).
# NOTE(review): the combined frame is called `all`, which shadows the Python builtin.
train = train.assign(data='train')
test = test.assign(loan_default=np.nan, data='test')[train.columns]
uid = test['UniqueID']  # kept aside for the submission file
all = pd.concat([train, test], axis=0)

In [23]:
# Remove the identifier-like columns and the mobile-availability flag in one call.
all = all.drop(columns=[
    'UniqueID',
    'supplier_id',
    'Current_pincode_ID',
    'Employee_code_ID',
    'MobileNo_Avl_Flag',
])
  
#all['PRIratio_overdue_no.accs']=(all['PRI.OVERDUE.ACCTS']/all['PRI.NO.OF.ACCTS'])
#all['PRIratio_overdue_no.accs']=np.where(np.isfinite(all['PRIratio_overdue_no.accs']),all['PRIratio_overdue_no.accs'],0)
#all['ratio_dis_sanc']=(all['PRI.DISBURSED.AMOUNT']/all['PRI.SANCTIONED.AMOUNT'])
#all['ratio_dis_sanc']=np.where(np.isfinite(all['ratio_dis_sanc']),all['ratio_dis_sanc'],0)
#all['ratio_def_acc']=(all['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']/all['NEW.ACCTS.IN.LAST.SIX.MONTHS'])
#all['ratio_def_acc']=np.where(np.isfinite(all['ratio_def_acc']),all['ratio_def_acc'],0)
#all.drop(['PRI.OVERDUE.ACCTS','PRI.NO.OF.ACCTS'],axis=1,inplace=True)  # tried dropping these two columns: AUC was 0.580 instead of 0.581
#all['months_to_pay']=round(all['PRI.CURRENT.BALANCE']/all['PRIMARY.INSTAL.AMT'])#total months to clear loan balance
#all['balance_amount']=all['PRI.DISBURSED.AMOUNT']-all['PRI.CURRENT.BALANCE']#gives balance ammount to pay for active loans
#all['months_to_pay']=np.where(np.isfinite(all['months_to_pay']),all['months_to_pay'],0)

In [24]:
# Derive an 'age' column from characters 6-7 of Date.of.Birth (read as a
# two-digit year), then drop the raw date column.
# A year of 00 maps to 19; any other value yy maps to 119 - yy.
# NOTE(review): presumably the reference year is 2019 and every non-zero yy is 19yy; confirm.
birth_yy = all['Date.of.Birth'].str.slice(6, 8).astype(int)
all['age'] = np.where(birth_yy == 0, 19, 119 - birth_yy)
all = all.drop(columns=['Date.of.Birth'])

def _duration_to_months(duration):
    """Turn a Series of '<years>yrs ... <months>mon' strings into total months.

    'yrs' is replaced by '-' and 'mon' is removed, so splitting on '-' leaves
    the years in column 0 and the months in column 1.
    """
    parts = (
        duration.str.replace('yrs', "-")
        .str.replace('mon', "")
        .str.split("-", expand=True)
        .astype(int)
    )
    return (parts[0] * 12) + parts[1]


# AVERAGE.ACCT.AGE and CREDIT.HISTORY.LENGTH are both rewritten in place as a
# total number of months.
for duration_col in ('AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH'):
    all[duration_col] = _duration_to_months(all[duration_col])

def _one_hot_replace(frame, column, prefix):
    """Return `frame` with `column` replaced by its one-hot dummy columns.

    The first category is dropped (drop_first=True). The dummy columns are
    named '<prefix>_<value>' and appended at the end of the frame, which is
    the same column order as concatenating first and dropping afterwards.
    """
    dummies = pd.get_dummies(frame[column], drop_first=True, prefix=prefix)
    return pd.concat([frame.drop(columns=[column]), dummies], axis=1)


# Missing employment type becomes its own category. The result is assigned
# back explicitly: `all[col].fillna(..., inplace=True)` is a chained in-place
# update that does not reach the frame under pandas Copy-on-Write.
all['Employment.Type'] = all['Employment.Type'].fillna("Not_employed")

# Only characters 3-4 of DisbursalDate are kept, as an integer, before encoding.
# NOTE(review): presumably this is the month of a dd-mm-yy date; confirm.
all['DisbursalDate'] = all['DisbursalDate'].str.slice(3, 5).astype(int)

# One-hot encode each categorical column, in this order, so the resulting
# column layout matches the original cell.
for column, prefix in [
    ('branch_id', 'branch'),
    ('manufacturer_id', 'manufac'),
    ('Employment.Type', 'E'),
    ('DisbursalDate', 'Disbursal'),
    ('State_ID', 'State_ID'),
]:
    all = _one_hot_replace(all, column, prefix)

In [25]:
# Encode PERFORM_CNS.SCORE.DESCRIPTION as an ordinal rank:
# every "not scored" / "no history" label maps to 0, and the lettered risk
# bands map to 1 (M-Very High Risk) up to 13 (A-Very Low Risk).
# Labels that are not listed here become NaN (Series.map behaviour).
unscored_labels = [
    'Not Scored: Sufficient History Not Available',
    'Not Scored: Only a Guarantor',
    'Not Scored: Not Enough Info available on the customer',
    'Not Scored: No Updates available in last 36 months',
    'Not Scored: No Activity seen on the customer (Inactive)',
    'Not Scored: More than 50 active Accounts found',
    'No Bureau History Available',
]
risk_bands_worst_to_best = [
    'M-Very High Risk',
    'L-Very High Risk',
    'K-High Risk',
    'J-High Risk',
    'I-Medium Risk',
    'H-Medium Risk',
    'G-Low Risk',
    'F-Low Risk',
    'E-Low Risk',
    'D-Very Low Risk',
    'C-Very Low Risk',
    'B-Very Low Risk',
    'A-Very Low Risk',
]
score_rank = dict.fromkeys(unscored_labels, 0)
score_rank.update(
    (band, rank) for rank, band in enumerate(risk_bands_worst_to_best, start=1)
)
all['PERFORM_CNS.SCORE.DESCRIPTION'] = all['PERFORM_CNS.SCORE.DESCRIPTION'].map(score_rank)

In [26]:
# Split the prepared frame back into its two parts using the 'data' marker.
# The marker is dropped from both; the test part also loses the placeholder target.
is_train_row = all['data'] == 'train'
is_test_row = all['data'] == 'test'
train = all[is_train_row].drop(columns=['data'])
test = all[is_test_row].drop(columns=['loan_default', 'data'])

In [27]:
# Re-wrap both parts as DataFrames (they already are, so the contents are unchanged).
train, test = pd.DataFrame(train), pd.DataFrame(test)

In [28]:
# Sanity check: train should have exactly one more column (the target) than test.
for frame in (train, test):
    print(frame.shape)

(233154, 149)
(112392, 148)


In [29]:
# Separate the target from the training features; the test frame has no target column.
y_train = train['loan_default']
x_train = train.drop(columns=['loan_default'])
x_test = test


In [30]:
# Keep the original column labels; the scaled frames below use default
# integer column labels and a fresh 0..n-1 row index.
traincol=x_train.columns
testcol=x_test.columns

# Standardise the features using statistics learned from the TRAINING data only.
# The earlier version called scaler.fit(x_train) and then scaler.fit(x_test);
# the second call replaced the first, so both sets were scaled with test-set
# means and variances.
scaler=StandardScaler()
x_train=pd.DataFrame(scaler.fit_transform(x_train))
x_test=pd.DataFrame(scaler.transform(x_test))

In [31]:
# Level-1 (base) learners for the stacked model.
#
# Only the non-default hyper-parameters are spelled out. The previous
# definitions were pasted from estimator reprs and carried arguments that
# current scikit-learn rejects (`presort`, `min_impurity_split`,
# `multi_class='warn'`, `solver='warn'`).
# `solver='liblinear'` is stated explicitly because that is what the old
# 'warn' placeholder resolved to, so the fitted model does not change with
# the library's default solver.
# A fixed seed makes the randomised feature sub-sampling (max_features=0.75)
# repeatable between runs.
RANDOM_STATE = 42

clf1=DecisionTreeClassifier(
    class_weight='balanced',
    criterion='entropy',
    max_depth=8,
    max_features=0.75,
    random_state=RANDOM_STATE,
)

clf2=LogisticRegression(
    C=0.05,
    class_weight='balanced',
    penalty='l2',
    solver='liblinear',
    random_state=RANDOM_STATE,
)


Algos=[clf1,clf2]

In [32]:
# Number of training rows; sizes the out-of-fold prediction table.
rows = len(x_train)
rows

233154

In [33]:
# Zero-filled holder for the base learners' out-of-fold probabilities: one column per model.
layer1 = pd.DataFrame(np.zeros((rows, 2)), columns=['clf1', 'clf2'])

In [34]:
#layer1

In [35]:
# Stratified 10-fold splitter used to build the out-of-fold predictions.
n_folds = 10
kf = StratifiedKFold(n_splits=n_folds)

In [36]:
# Build the out-of-fold (OOF) predictions that train the second-layer model:
# for every fold, each base learner is fitted on the other folds and predicts
# the held-out rows, so each row of `layer1` comes from a model that never
# saw that row.
#
# The index arrays get their own names (`fit_idx`, `holdout_idx`). The earlier
# loop variable was called `train`, which overwrote the `train` DataFrame and
# broke any re-run of the cells that use it.
for fold, (fit_idx, holdout_idx) in enumerate(kf.split(x_train, y_train), start=1):
    print('fold number : ', fold)

    # kf.split yields positions, so select by position (.iloc) rather than by
    # label; the fold slices are the same for every model, so build them once.
    x_fit = x_train.iloc[fit_idx]
    y_fit = y_train.iloc[fit_idx]
    x_holdout = x_train.iloc[holdout_idx]

    for i, clf in enumerate(Algos):
        print('Algo number :', i+1)

        clf.fit(x_fit, y_fit)
        # Column 1 of predict_proba is the probability of the second class in clf.classes_.
        layer1.iloc[holdout_idx, i] = clf.predict_proba(x_holdout)[:, 1]

fold number :  1
Algo number : 1
Algo number : 2
fold number :  2
Algo number : 1
Algo number : 2
fold number :  3
Algo number : 1
Algo number : 2
fold number :  4
Algo number : 1
Algo number : 2
fold number :  5
Algo number : 1
Algo number : 2
fold number :  6
Algo number : 1
Algo number : 2
fold number :  7
Algo number : 1
Algo number : 2
fold number :  8
Algo number : 1
Algo number : 2
fold number :  9
Algo number : 1
Algo number : 2
fold number :  10
Algo number : 1
Algo number : 2


In [37]:
# Zero-filled holder for the base learners' probabilities on the test rows.
rows = len(x_test)
layer2_test = pd.DataFrame(np.zeros((rows, 2)), columns=['clf1', 'clf2'])

In [38]:
#layer2_test

In [39]:
# Refit every base learner on the full training set and store its predicted
# probability (column 1 of predict_proba) for the test rows.
for position, model in enumerate(Algos):
    print( 'Algo number',position+1)
    test_proba = model.fit(x_train, y_train).predict_proba(x_test)
    layer2_test.iloc[:, position] = test_proba[:, 1]


Algo number 1
Algo number 2


In [40]:
# Second-layer (meta) model: a class-weighted logistic regression that
# combines the base learners' probabilities.
meta_params = {'class_weight': 'balanced'}
logr = LogisticRegression(**meta_params)

In [41]:
# Train the meta model on the out-of-fold probabilities.
logr.fit(X=layer1, y=y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [42]:
# Meta-model probabilities for the test rows; keep column 1 of predict_proba
# (the second class in logr.classes_).
test_class_proba = logr.predict_proba(layer2_test)
final_pred = test_class_proba[:, 1]

In [43]:
# Wrap the probability vector in a single-column frame named after the target.
final_pred = pd.DataFrame(final_pred, columns=['loan_default'])


In [44]:
# Put the test UniqueIDs next to the predictions and write the submission CSV.
# NOTE(review): the output file name looks like a typo of "analytics"; confirm before renaming.
final_pred = pd.concat([uid, final_pred], axis=1)
final_pred.to_csv("analtics_vidhya.csv", index=False)