# Classification Problem 

## Read and explore the data

In [394]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
# The bare 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was later removed; use whichever name this install provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [395]:
# Load the training split from JSON; the last line displays (rows, columns)
train = pd.read_json('train.json')
train.shape

(391282, 20)

In [396]:
# Load the test split: about 20% of all records, judging by the two shapes displayed
test = pd.read_json('test.json')
test.shape

(97405, 19)

### Missing data

In [397]:
# Fraction of missing entries per training column; show only columns with gaps
m = train.isnull().sum().div(len(train))
m[m.gt(0.00)]

esr       0.183604
intp      0.171086
pap       0.171086
pincp     0.171086
povpip    0.033454
retp      0.171086
schl      0.030592
wkhp      0.488292
dtype: float64

In [398]:
# Fraction of missing entries per test column; show only columns with gaps
m_test = test.isnull().sum().div(len(test))
m_test[m_test.gt(0.00)]

esr       0.183492
intp      0.171172
pap       0.171172
pincp     0.171172
povpip    0.033386
retp      0.171172
schl      0.030204
wkhp      0.486423
dtype: float64

Working hours might be related to working status, so if a person isn't working I can impute 0 hours.

In [399]:
# Rows: employment status (esr); columns: whether working hours (wkhp) are missing
pd.crosstab(train['esr'], train['wkhp'].isnull())

wkhp,False,True
esr,Unnamed: 1_level_1,Unnamed: 2_level_1
Employed,180553,0
Not in labor force,13645,113751
Unemployed,6024,5468


In [400]:
# Same employment-status vs. missing-hours table, for the test split
pd.crosstab(test['esr'], test['wkhp'].isnull())

wkhp,False,True
esr,Unnamed: 1_level_1,Unnamed: 2_level_1
Employed,45174,0
Not in labor force,3365,28178
Unemployed,1486,1329


This makes sense: most people who are not working have missing working hours.

In [401]:
# Set missing working hours to zero for people who are not employed,
# applying the same rule to both the training and the test split.
not_working = ['Not in labor force', 'Unemployed']
for frame in (train, test):
    hours_missing = frame.esr.isin(not_working) & frame.wkhp.isnull()
    frame.loc[hours_missing, 'wkhp'] = 0.0

In [117]:
# Names of the training columns that still contain missing values
m_train = train.isnull().sum().div(len(train))
variables = m_train.index[m_train > 0.00].tolist()

In [36]:
# Re-check the share of missing values per test column after the wkhp imputation
m_test = test.isnull().sum().div(len(test))
m_test[m_test.gt(0.00)]

In [214]:
# simple function to impute missing values
# more complex approach can be used in future application (e.g., KNN)
def impute_values(data):
    """Return a copy of ``data`` with missing values filled column by column.

    Numeric columns (any integer or float width) are filled with the column
    median; every other column is filled with its most frequent value.
    Columns without missing values are left untouched, and a non-numeric
    column that is entirely missing is left as is because there is no value
    to impute from.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index.
    """
    df = data.copy()

    # only visit columns that actually contain missing values
    for col in df.columns[df.isnull().any()]:
        series = df[col]
        is_number = (pd.api.types.is_numeric_dtype(series)
                     and not pd.api.types.is_bool_dtype(series))
        if is_number:
            fill_value = series.median()
        else:
            modes = series.mode()
            if modes.empty:
                # every entry is missing: nothing to impute from
                continue
            fill_value = modes.iloc[0]
        # assign the result back instead of calling fillna(inplace=True) on a
        # column selection, which does not update the frame under Copy-on-Write
        df[col] = series.fillna(fill_value)
    return df

In [252]:
# impute values
# NOTE(review): only `train` is imputed in the cells visible here; `test` is not passed through
imp_train = impute_values(train)

In [253]:
# function to create dummies
def get_dummies(data, variables):
    """One-hot encode the given columns and normalise the resulting column names.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    variables : list of str
        Columns to replace with dummy indicators. The first level of each
        column is dropped (``drop_first=True``) and every dummy is prefixed
        with the name of its source column.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the dummy columns. Column names are
        lower-cased, with '+', '.' and whitespace replaced by '_' and runs of
        '_' collapsed into one.
    """
    df = data.copy()
    variables = list(variables)

    # encode each column once and concatenate in a single step
    encoded = [pd.get_dummies(df[v], drop_first=True, prefix=v) for v in variables]

    # drop the source columns named in `variables` (the original referenced an
    # undefined name here) and append their dummies
    final = pd.concat([df.drop(variables, axis=1)] + encoded, axis=1)

    # regex=True is explicit because newer pandas treats patterns as literals by default
    final.columns = (
        final.columns
        .str.replace(r'[+.\s]', '_', regex=True)
        .str.replace(r'_+', '_', regex=True)
        .str.lower()
    )
    return final

In [254]:
# create dummies
# 'hicov' is encoded as well: its dummy column is what a later cell reads as the label
variables = ['cit', 'dear', 'deye', 'esr', 'hicov', 'mar', 
          'race', 'schl', 'sex', 'st', 'vet']
clean_train = get_dummies(imp_train, variables)

In [255]:
# confirm that the encoded training frame has no remaining gaps
m_train = clean_train.isnull().sum().div(len(clean_train))
m_train[m_train.gt(0)]  # no missing data

Series([], dtype: float64)

In [256]:
# Separate the target from the features, then remove the target and the
# identifier columns from the feature frame.
label_col = 'hicov_without_healthcare'
clean_labels = clean_train[label_col]
clean_train = clean_train.drop(columns=['id', 'puma', label_col])

# Create validation set

The training set is large enough (about 391k rows) to hold out 20% of it as a validation set.

In [257]:
from sklearn.model_selection import train_test_split

In [268]:
# Hold out 20% of the rows as a validation split; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(clean_train, clean_labels, test_size=0.20, 
                                                    random_state=123)

# xgboost

In [349]:
import xgboost as xgb
import sklearn.metrics as met

In [270]:
# Wrap the training split in a DMatrix, the input object passed to xgb.cv below
# NOTE(review): the 'churn' prefix looks inherited from another project; the label here is 'hicov_without_healthcare'
churn_dmatrix = xgb.DMatrix(data=X_train, label=y_train)

In [370]:
# Booster parameters for the native xgboost API (passed to xgb.cv).
# In that API the number of boosting rounds comes from `num_boost_round`;
# `n_estimators` is a setting of the scikit-learn wrapper and is not a booster
# parameter, so it is left out of this dict.
params = {
    'objective': 'binary:logistic',
    'max_depth': 10,
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
}

In [344]:
# 10-fold cross-validation on the training DMatrix, tracking AUC.
# Only 10 boosting rounds are run here (num_boost_round=10).
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=10, num_boost_round=10,
                   metrics='auc', as_pandas=True, seed=123)

In [386]:
# NOTE(review): this classifier is not built from the `params` dict used for xgb.cv
# (max_depth, colsample_bytree, ...), so the cross-validated AUC describes a different configuration.
clt = xgb.XGBClassifier(n_estimators=1000, objective='binary:logistic')

In [387]:
# Fit the classifier on the training part of the split
clt.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [391]:
# NOTE(review): predictions are made on the rows the model was fitted on;
# the held-out X_test / y_test split is not scored in the cells visible here.
pred = clt.predict(X_train)

In [392]:
# number of rows predicted as the positive class
pred.sum()

2281

In [393]:
# F1 of the training-set predictions against the training labels (an in-sample score)
met.f1_score(y_train, pred)

0.12737287749806528

In [346]:
# Report the 'test-auc-mean' value from the last row of the cross-validation results
print('AUC: %f' % (cv_results['test-auc-mean'].iloc[-1]))

AUC: 0.814038
