In [1]:
# In this assignment students need to predict whether a person makes over 50K per year
# or not from classic adult dataset using XGBoost.

In [2]:
import numpy as np
import pandas as pd
import xgboost.sklearn as XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [3]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', skiprows = 1, header = None)

In [4]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
              'occupation','relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
              'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [5]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
LE = LabelEncoder()
for l in train_set.columns:
    train_set[l] = LE.fit_transform(train_set[l])

for l in test_set.columns:
    test_set[l] = LE.fit_transform(test_set[l])

In [8]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null int32
fnlwgt            32561 non-null int64
education         32561 non-null int32
education_num     32561 non-null int64
marital_status    32561 non-null int32
occupation        32561 non-null int32
relationship      32561 non-null int32
race              32561 non-null int32
sex               32561 non-null int32
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null int32
wage_class        32561 non-null int32
dtypes: int32(9), int64(6)
memory usage: 2.6 MB


In [9]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
age               16281 non-null int64
workclass         16281 non-null int32
fnlwgt            16281 non-null int64
education         16281 non-null int32
education_num     16281 non-null int64
marital_status    16281 non-null int32
occupation        16281 non-null int32
relationship      16281 non-null int32
race              16281 non-null int32
sex               16281 non-null int32
capital_gain      16281 non-null int64
capital_loss      16281 non-null int64
hours_per_week    16281 non-null int64
native_country    16281 non-null int32
wage_class        16281 non-null int32
dtypes: int32(9), int64(6)
memory usage: 1.3 MB


In [10]:
X_train = train_set.drop('wage_class',axis=1)
y_train = train_set['wage_class']

In [11]:
X_test = test_set.drop('wage_class',axis=1)
y_test = test_set['wage_class']

In [12]:
paramst = {
        'min_child_weight': [0.5, 1, 1.5, 2, 2.5],
        'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
        'max_depth': [2, 3, 4, 5, 6]
        }

In [13]:
xgb_model  = xgb.XGBClassifier(n_estimators=300, objective='binary:logistic',
                    silent=True, nthread=1)

In [14]:
# Randomized Search with the XGB Classifier passed along with parameterized values 
random_search = RandomizedSearchCV(xgb_model, param_distributions=paramst,
                                   n_iter=5, scoring='roc_auc', n_jobs=4, cv=5,
                                   verbose=3, random_state=1001 )

In [15]:
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:  2.6min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=5, n_jobs=4,
          param_distributions={'min_child_weight': [0.5, 1, 1.5, 2, 2.5], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5], 'max_depth': [2, 3, 4, 5, 6]},
          pre_dispatch='2*n_jobs', random_state=1001, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=3)

In [16]:
print('\n Best estimator:')
bestmodel = random_search.best_estimator_
print(bestmodel)
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)


 Best estimator:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.4, max_delta_step=0,
       max_depth=2, min_child_weight=2.5, missing=None, n_estimators=300,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
0.8555098219437167

 Best hyperparameters:
{'min_child_weight': 2.5, 'max_depth': 2, 'learning_rate': 0.4}


In [17]:
bestmodelprediction = bestmodel.predict(X_test)

In [18]:
correct = 0
for i in range(len(bestmodelprediction)):
    if (y_test[i] == bestmodelprediction[i]):
        correct += 1
    
print('Predicted correctly {0}/{1}'.format(correct, len(bestmodelprediction)))

Predicted correctly 13917/16281


In [19]:
acct = accuracy_score(y_test,bestmodelprediction)
print('Accuracy score is',format(acct))
print('Error: {0:.4f}'.format(1-acct))

Accuracy score is 0.8548000737055463
Error: 0.1452


In [20]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [21]:
prediction = xgb_model.predict(X_test)

In [22]:
correct = 0
for i in range(len(prediction)):
    if (y_test[i] == prediction[i]):
        correct += 1
    
print('Predicted correctly {0}/{1}'.format(correct, len(prediction)))

Predicted correctly 14035/16281


In [23]:
acc = accuracy_score(y_test,prediction)
print('Accuracy score is',format(acc))
print('Error: {0:.4f}'.format(1-acc))

Accuracy score is 0.8620477857625453
Error: 0.1380
