## Load Libraries

In [93]:
import pandas as pd
import numpy as np

In [94]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [103]:
import xgboost as xgb

## Functions

In [None]:
def encode_binary_variables(df, binary_variable):
    """Label-encode one column of ``df`` and return the frame.

    A fresh ``LabelEncoder`` is fitted on the column's current values and the
    column is overwritten with the resulting codes.

    Parameters
    ----------
    df : DataFrame holding the column to encode. The column is replaced on
        this same object, so the caller's frame is modified.
    binary_variable : label of the column to encode.

    Returns
    -------
    The same ``df`` object, with ``binary_variable`` replaced by its codes.
    """
    column_values = df[binary_variable]
    encoded_values = preprocessing.LabelEncoder().fit_transform(column_values)
    # Write the codes back over the original column (in place on `df`).
    df[binary_variable] = encoded_values
    return df

## Constants

In [96]:
# Column labels grouped by how they are treated. Every label except 'age'
# begins with a space; the strings must match the DataFrame's column labels
# exactly, so the leading space is significant.

# Numeric columns.
quant_variables = [
    'age',
    ' education-num',
    ' capital-gain',
    ' capital-loss',
    ' hours-per-week',
]

# Multi-valued categorical columns.
categorical_variables = [
    ' workclass',
    ' education',
    ' marital-status',
    ' occupation',
    ' relationship',
    ' race',
    ' native-country',
]

# Two-valued columns.
binary_variables = [
    ' sex',
    ' salary',
]

# ' fnlgt' followed by every categorical column, in the same order.
drop_variables = [' fnlgt'] + categorical_variables

## Load Data

In [97]:
# Read the census CSV; the path is relative to the working directory.
census_csv_path = "data/census.csv"
df = pd.read_csv(census_csv_path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Data Cleaning

In [98]:
## Drop the column that is not required
df = df.drop(columns=' fnlgt')

## Label-encode each binary column
for column_name in binary_variables:
    df = encode_binary_variables(df, column_name)

## One-hot encode the categorical columns
df = pd.get_dummies(df, columns=categorical_variables)

# Preview the transformed frame as the cell output.
df.head()

Unnamed: 0,age,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,13,1,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,13,1,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,9,1,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,7,1,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Data Check

## Train/test split

In [99]:
# Separate the target column from the feature columns.
target_column = ' salary'
y_data = df[target_column]
X_data = df.drop(columns=target_column)

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.3,
    random_state=42,
)

## Train Model

In [102]:
# Classifier settings, gathered in one place and passed as keyword arguments.
xgb_settings = {
    'n_estimators': 10,
    'seed': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xg_cl = xgb.XGBClassifier(**xgb_settings)

# Fit on the training split; the call's return value is shown as the cell output.
xg_cl.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

## Compute model metrics

In [101]:
# Predict labels for the held-out rows.
preds = xg_cl.predict(X_test)

# Accuracy = number of predictions equal to the true label / number of test rows.
n_correct = int((preds == y_test).sum())
accuracy = float(n_correct) / len(y_test)
print("accuracy: %f" % accuracy)

accuracy: 0.864674


## Inference