In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.datasets import load_svmlight_files
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier

In [2]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)

In [3]:
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',\
                       skiprows = 1, header = None)

In [4]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',\
'occupation','relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',\
'native_country', 'wage_class']

In [5]:
train_set.columns = col_labels
test_set.columns = col_labels

In [6]:
train_set.shape, test_set.shape

((32561, 15), (16281, 15))

In [7]:
train_set.workclass = train_set.workclass.str.strip()
train_set.education = train_set.education.str.strip()
train_set.marital_status = train_set.marital_status.str.strip()
train_set.occupation = train_set.occupation.str.strip()
train_set.relationship = train_set.relationship.str.strip()
train_set.race = train_set.race.str.strip()
train_set.sex = train_set.sex.str.strip()
train_set.native_country = train_set.native_country.str.strip()
train_set.wage_class = train_set.wage_class.str.strip()

test_set.workclass = test_set.workclass.str.strip()
test_set.education = test_set.education.str.strip()
test_set.marital_status = test_set.marital_status.str.strip()
test_set.occupation = test_set.occupation.str.strip()
test_set.relationship = test_set.relationship.str.strip()
test_set.race = test_set.race.str.strip()
test_set.sex = test_set.sex.str.strip()
test_set.native_country = test_set.native_country.str.strip()
test_set.wage_class = test_set.wage_class.str.strip()

In [8]:
train_set.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'wage_class'],
      dtype='object')

In [9]:
train_set.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [10]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [11]:
train_set.wage_class.value_counts()

<=50K    24720
>50K      7841
Name: wage_class, dtype: int64

In [12]:
train_set['income'] = train_set.wage_class.map({'<=50K':0, '>50K': 1})
test_set['income'] = test_set.wage_class.map({'<=50K.':0, '>50K.': 1})

In [13]:
train_set.income.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [14]:
test_set.income.value_counts()

0    12435
1     3846
Name: income, dtype: int64

In [15]:
train_set = train_set.drop(['wage_class'], 1)
test_set = test_set.drop(['wage_class'], 1)

In [16]:
train_set.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [17]:
train_set.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income             int64
dtype: object

In [18]:
train_colsnum = train_set.dtypes[train_set.dtypes != 'object'].index
train_colsnum

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

In [19]:
train_colsstr = train_set.dtypes[train_set.dtypes == 'object'].index
train_colsstr

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')

In [20]:
test_colsnum = test_set.dtypes[test_set.dtypes != 'object'].index
test_colsnum

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

In [21]:
test_colsstr = test_set.dtypes[train_set.dtypes == 'object'].index
test_colsstr

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')

In [22]:
for cols in train_colsstr:
    train_set = train_set.join(pd.get_dummies(train_set[cols], prefix=cols, drop_first=True))

In [23]:
for cols in test_colsstr:
    test_set = test_set.join(pd.get_dummies(test_set[cols], prefix=cols, drop_first=True))

In [24]:
train_set.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       ...
       'native_country_Portugal', 'native_country_Puerto-Rico',
       'native_country_Scotland', 'native_country_South',
       'native_country_Taiwan', 'native_country_Thailand',
       'native_country_Trinadad&Tobago', 'native_country_United-States',
       'native_country_Vietnam', 'native_country_Yugoslavia'],
      dtype='object', length=109)

In [25]:
train_set.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,...,0,0,0,0,0,0,0,1,0,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,...,0,0,0,0,0,0,0,1,0,0
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,...,0,0,0,0,0,0,0,1,0,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,...,0,0,0,0,0,0,0,1,0,0
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,...,0,0,0,0,0,0,0,1,0,0


In [26]:
train_set_new = train_set.drop(train_colsstr, axis=1)
test_set_new = test_set.drop(test_colsstr, axis=1)

In [27]:
train_set_new.shape, test_set_new.shape

((32561, 101), (16281, 100))

In [28]:
# Get missing columns in the training test
missing_cols = set( train_set_new.columns ) - set( test_set_new.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    test_set_new[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
test_set_new = test_set_new[train_set_new.columns]

In [29]:
test_set_new['native_country_Holand-Netherlands'].unique()

array([0], dtype=int64)

In [30]:
train_set_new.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
x_train = train_set_new.drop('income', axis=1)
y_train = train_set_new['income']
x_test = test_set_new.drop('income', axis=1)
y_test = test_set_new['income']

In [32]:
test_set_new.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked',
       ...
       'native_country_Portugal', 'native_country_Puerto-Rico',
       'native_country_Scotland', 'native_country_South',
       'native_country_Taiwan', 'native_country_Thailand',
       'native_country_Trinadad&Tobago', 'native_country_United-States',
       'native_country_Vietnam', 'native_country_Yugoslavia'],
      dtype='object', length=101)

In [33]:
train_set_new.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked',
       ...
       'native_country_Portugal', 'native_country_Puerto-Rico',
       'native_country_Scotland', 'native_country_South',
       'native_country_Taiwan', 'native_country_Thailand',
       'native_country_Trinadad&Tobago', 'native_country_United-States',
       'native_country_Vietnam', 'native_country_Yugoslavia'],
      dtype='object', length=101)

In [34]:
from sklearn.datasets import load_svmlight_files
from sklearn.metrics import log_loss, accuracy_score

from xgboost.sklearn import XGBClassifier

In [35]:
params = {
    'objective': 'binary:logistic', # our data is binary (1 & 0) and bydefault logistic is used with binary data.
    'max_depth': 6,
    'learning_rate': 0.2,
    'silent': 1,
    'n_estimators': 30,
    'reg_alpha' : .27
}

In [36]:
bst = XGBClassifier(**params).fit(x_train, y_train)

In [37]:
bst.feature_importances_

array([1.3491473e-02, 2.5058871e-03, 6.5161638e-02, 5.1602006e-02,
       1.8273870e-02, 1.2237448e-02, 1.2115999e-02, 3.7843869e-03,
       0.0000000e+00, 1.3329238e-03, 3.0456320e-03, 8.3946977e-03,
       7.6580341e-03, 0.0000000e+00, 8.3382876e-04, 0.0000000e+00,
       9.4625581e-04, 5.3769612e-04, 8.4799068e-04, 5.7488482e-04,
       1.3037581e-03, 2.5301459e-03, 2.2076275e-03, 0.0000000e+00,
       1.7024836e-03, 3.3134692e-03, 0.0000000e+00, 1.8314660e-02,
       1.5798951e-03, 4.7137281e-03, 4.9235943e-01, 4.5569890e-04,
       6.2389635e-03, 0.0000000e+00, 0.0000000e+00, 6.3301255e-03,
       0.0000000e+00, 1.9736101e-03, 1.9565351e-02, 1.0372728e-02,
       1.3977093e-02, 1.9180650e-02, 3.3540200e-02, 0.0000000e+00,
       1.4416191e-02, 4.0067243e-03, 1.0978773e-02, 1.3869288e-02,
       2.2470921e-03, 5.7428107e-03, 1.3265411e-02, 3.5772730e-02,
       1.9725764e-03, 8.0818580e-03, 1.9811396e-03, 4.5930534e-03,
       0.0000000e+00, 7.2862576e-03, 8.2711205e-03, 0.0000000e

In [38]:
y_train_pred = bst.predict(x_train)
y_train_pred

correct = 0
for i in range(len(y_train_pred)):
    if (y_train[i] == y_train_pred[i]):
        correct += 1
        
acc = accuracy_score(y_train, y_train_pred)
print('Predicted correctly: {0}/{1}'.format(correct, len(y_train_pred)))
print('Training Accuracy: {0:.4f}'.format(acc))
print('Training Error: {0:.4f}'.format(1-acc))

Predicted correctly: 28564/32561
Training Accuracy: 0.8772
Training Error: 0.1228


In [39]:
y_pred = bst.predict(x_test)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [40]:
correct = 0
for i in range(len(y_pred)):
    if (y_test[i] == y_pred[i]):
        correct += 1
        
acc = accuracy_score(y_test, y_pred)
print('Predicted correctly: {0}/{1}'.format(correct, len(y_pred)))
print('Test Accuracy: {0:.4f}'.format(acc))
print('Test Error: {0:.4f}'.format(1-acc))

Predicted correctly: 14249/16281
Test Accuracy: 0.8752
Test Error: 0.1248


In [41]:
log_loss(y_test, y_pred)

4.310747827951482

In [42]:
from sklearn.metrics import precision_recall_curve

In [43]:
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

In [44]:
precision, recall, thresholds

(array([0.23622628, 0.79796321, 1.        ]),
 array([1.        , 0.63156526, 0.        ]),
 array([0, 1], dtype=int64))

<span style="color:black; font-size:1.3em;">Thus the model accuracy is 87.52 % where 14249 values out of 16281 have been predicted correctly.</span>