In [None]:
## build a stack regression model for loan data

In [1]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import warnings 
warnings.filterwarnings('ignore')
from mypipes import *

In [2]:
file=r'/Users/lalitsachan/Dropbox/0.0 Data/census_income.csv'

cd= pd.read_csv(file)

In [3]:
cd.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
cd['Y']=(cd['Y']==' >50K').astype(int)

In [5]:
cd['Y'].value_counts()

0    24720
1     7841
Name: Y, dtype: int64

In [6]:
cd_train,cd_test=train_test_split(cd,test_size=0.2,random_state=2)

In [9]:
cd_train.reset_index(drop=True,inplace=True)
cd_test.reset_index(drop=True,inplace=True)

In [12]:
cat_vars=list(cd_train.select_dtypes(include=['object']).columns)

In [13]:
cat_vars=[_ for _ in cat_vars if _ not in ['Y','education']]

In [14]:
num_vars=list(cd_train.select_dtypes(exclude=['object']).columns)

In [15]:
p1=pdPipeline([
    ('cat_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(300))
])

p2=pdPipeline([
    ('num_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

data_pipe=FeatureUnion([
    ('cat_pipe',p1),
    ('num_pipe',p2)
])

In [16]:
data_pipe.fit(cd_train)

FeatureUnion(transformer_list=[('cat_pipe',
                                pdPipeline(steps=[('cat_select',
                                                   VarSelector(feature_names=['workclass',
                                                                              'marital.status',
                                                                              'occupation',
                                                                              'relationship',
                                                                              'race',
                                                                              'sex',
                                                                              'native.country'])),
                                                  ('missing_trt',
                                                   DataFrameImputer()),
                                                  ('create_dummies',
                                         

In [17]:
x_train=pd.DataFrame(data=data_pipe.transform(cd_train),
                     columns=data_pipe.get_feature_names())
x_test=pd.DataFrame(data=data_pipe.transform(cd_test),
                     columns=data_pipe.get_feature_names())
y_train=cd_train['Y']
y_test=cd_test['Y']

In [18]:
clf1=KNeighborsClassifier(n_neighbors=50)
clf2=RandomForestClassifier(class_weight='balanced',n_estimators=200)
clf3=RandomForestClassifier(class_weight=None,n_estimators=100)
clf4=RandomForestClassifier(n_estimators=50)
clf5=RandomForestClassifier(n_estimators=10)

Algos=[clf1,clf2,clf3,clf4,clf5]

In [19]:
rows=x_train.shape[0]

In [20]:
rows

26048

In [23]:
layer1=pd.DataFrame({'clf'+str(i):np.zeros(rows) for i in range(1,len(Algos)+1)})

In [25]:
layer1.shape

(26048, 5)

In [26]:
kf=KFold(n_splits=10)

In [27]:
fold=1
for train,left_out_chunk in kf.split(x_train):
    print('fold number : ', fold)
    
    for i,clf in enumerate(Algos):
        print('Algo number :',i+1)
        
        x_train_train=x_train.loc[train]
        y_train_train=y_train[train]
        
        x_train_left_out_chunk=x_train.loc[left_out_chunk]
        
        
        clf.fit(x_train_train,y_train_train)
        p=clf.predict_proba(x_train_left_out_chunk)[:,1]
        
        layer1.iloc[left_out_chunk,i]=p
        
    fold+=1  
    

fold number :  1
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  2
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  3
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  4
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  5
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  6
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  7
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  8
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  9
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5
fold number :  10
Algo number : 1
Algo number : 2
Algo number : 3
Algo number : 4
Algo number : 5


In [28]:
layer1

Unnamed: 0,clf1,clf2,clf3,clf4,clf5
0,0.16,0.000,0.00,0.00,0.0
1,0.12,0.000,0.00,0.00,0.0
2,0.10,0.000,0.00,0.00,0.0
3,0.18,0.000,0.01,0.00,0.0
4,0.20,0.000,0.01,0.00,0.0
5,0.30,0.000,0.00,0.00,0.0
6,0.16,0.995,1.00,1.00,1.0
7,0.32,0.995,1.00,1.00,1.0
8,0.16,0.005,0.00,0.00,0.0
9,0.28,0.000,0.00,0.00,0.0


In [29]:
rows=x_test.shape[0]
layer2_test=pd.DataFrame({'clf'+str(i):np.zeros(rows) for i in range(1,len(Algos)+1)})

In [32]:
layer2_test.shape

(6513, 5)

In [33]:
for i,clf in enumerate(Algos):
    
    print( 'Algo number',i+1)
    clf.fit(x_train,y_train)
    p=clf.predict_proba(x_test)[:,1]
    
    layer2_test.iloc[:,i]=p


Algo number 1
Algo number 2
Algo number 3
Algo number 4
Algo number 5


In [35]:
# layer2_test

In [36]:
# second layer linear model 
logr=LogisticRegression(class_weight='balanced')


In [37]:
logr.fit(layer1,y_train)

LogisticRegression(class_weight='balanced')

In [38]:
roc_auc_score(y_test,logr.predict_proba(layer2_test)[:,1])

1.0

In [39]:
xgb2=XGBClassifier(objective='binary:logistic',n_estimators=100,
                   max_depth=3,learning_rate=.1,scale_pos_weight=3)

In [40]:
xgb2.fit(layer1,y_train)

XGBClassifier(scale_pos_weight=3)

In [41]:
roc_auc_score(y_test,xgb2.predict_proba(layer2_test)[:,1])

1.0