In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [2]:
import xgboost as xgb
from xgboost import XGBClassifier

In [3]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header = None)

In [4]:
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',skiprows = 1,header = None)

In [5]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num','marital_status', 'occupation','relationship',
              'race', 'sex', 'capital_gain',
'capital_loss', 'hours_per_week', 'native_country', 'wage_class']

In [6]:
train_set.columns = col_labels

In [7]:
test_set.columns = col_labels

In [8]:
train_set.shape

(32561, 15)

In [9]:
test_set.shape

(16281, 15)

In [10]:
data_set = pd.concat([train_set,test_set])

In [11]:
data_set

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [12]:
data_set.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64

In [13]:
data_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [14]:
data_set.shape

(48842, 15)

In [15]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education_num   48842 non-null  int64 
 5   marital_status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital_gain    48842 non-null  int64 
 11  capital_loss    48842 non-null  int64 
 12  hours_per_week  48842 non-null  int64 
 13  native_country  48842 non-null  object
 14  wage_class      48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 6.0+ MB


In [16]:
data_set['wage_class'].value_counts()

 <=50K     24720
 <=50K.    12435
 >50K       7841
 >50K.      3846
Name: wage_class, dtype: int64

In [17]:
data_set['wage_class'].unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [18]:
data_set['wage_class']= data_set['wage_class'].str.replace(" <=50K."," <=50K",case=True)

In [19]:
data_set['wage_class']= data_set['wage_class'].str.replace(" >50K."," >50K")

In [20]:
data_set['wage_class'].value_counts()

 <=50K    37155
 >50K     11687
Name: wage_class, dtype: int64

In [21]:
change_wage_class={
    ' <=50K':0,
    ' >50K':1
}

In [22]:
data_set['wage_class']=data_set['wage_class'].map(change_wage_class)

In [23]:
occupation_map=data_set['occupation'].value_counts().to_dict()

In [24]:
education_map=data_set['education'].value_counts().to_dict()

In [25]:
native_country_map=data_set['native_country'].value_counts().to_dict()

In [26]:
data_set['native_country']=data_set['native_country'].map(native_country_map)

In [27]:
data_set['education']=data_set['education'].map(education_map)

In [28]:
data_set['occupation']=data_set['occupation'].map(occupation_map)

In [29]:
data_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,8025,13,Never-married,5611,Not-in-family,White,Male,2174,0,40,43832,0
1,50,Self-emp-not-inc,83311,8025,13,Married-civ-spouse,6086,Husband,White,Male,0,0,13,43832,0
2,38,Private,215646,15784,9,Divorced,2072,Not-in-family,White,Male,0,0,40,43832,0
3,53,Private,234721,1812,7,Married-civ-spouse,2072,Husband,Black,Male,0,0,40,43832,0
4,28,Private,338409,8025,13,Married-civ-spouse,6172,Wife,Black,Female,0,0,40,138,0


In [30]:
relationship_map={
    ' Not-in-family':1,
    ' Husband':2,
    ' Wife':3,
    ' Own-child':4,
    ' Unmarried':5,
    ' Other-relative':6,
}

In [31]:
relationship_map

{' Not-in-family': 1,
 ' Husband': 2,
 ' Wife': 3,
 ' Own-child': 4,
 ' Unmarried': 5,
 ' Other-relative': 6}

In [32]:
data_set['relationship']=data_set['relationship'].map(relationship_map)

In [33]:
data_set['marital_status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [34]:
marital_status_map={
    ' Never-married':1,
    ' Married-civ-spouse':2,
    ' Divorced':3,
    ' Married-spouse-absent':4,
    ' Separated':5,
    ' Married-AF-spouse':6,
    ' Widowed':7,
}

In [35]:
data_set['marital_status']=data_set['marital_status'].map(marital_status_map)

In [36]:
data_set['race'].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [37]:
race_map={
    ' White':1,
    ' Black':2,
    ' Asian-Pac-Islander':3,
    ' Amer-Indian-Eskimo':4,
    ' Other':5,
}

In [38]:
data_set['race']=data_set['race'].map(race_map)

In [39]:
data_set['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [40]:
data_set['workclass'].mode()[0]

' Private'

In [41]:
data_set['workclass']=np.where(data_set['workclass']==' ?',data_set['workclass'].mode()[0],data_set['workclass'])

In [42]:
workclass_map={
    ' State-gov':1,
    ' Self-emp-not-inc':2,
    ' Private':3,
    ' Federal-gov':4,
    ' Local-gov':5,
    ' Self-emp-inc':6,
    ' Without-pay':7,
    ' Never-worked':8,
}

In [43]:
data_set['workclass']=data_set['workclass'].map(workclass_map)

In [44]:
data_set

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,1,77516,8025,13,1,5611,1,1,Male,2174,0,40,43832,0
1,50,2,83311,8025,13,2,6086,2,1,Male,0,0,13,43832,0
2,38,3,215646,15784,9,3,2072,1,1,Male,0,0,40,43832,0
3,53,3,234721,1812,7,2,2072,2,2,Male,0,0,40,43832,0
4,28,3,338409,8025,13,2,6172,3,2,Female,0,0,40,138,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,3,215419,8025,13,3,6172,1,1,Female,0,0,36,43832,0
16277,64,3,321403,15784,9,7,2809,6,2,Male,0,0,40,43832,0
16278,38,3,374983,8025,13,2,6172,2,1,Male,0,0,50,43832,0
16279,44,3,83891,8025,13,3,5611,4,3,Male,5455,0,40,43832,0


In [45]:
data_set=pd.get_dummies(data_set,drop_first=True)

In [46]:
data_set

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,wage_class,sex_ Male
0,39,1,77516,8025,13,1,5611,1,1,2174,0,40,43832,0,1
1,50,2,83311,8025,13,2,6086,2,1,0,0,13,43832,0,1
2,38,3,215646,15784,9,3,2072,1,1,0,0,40,43832,0,1
3,53,3,234721,1812,7,2,2072,2,2,0,0,40,43832,0,1
4,28,3,338409,8025,13,2,6172,3,2,0,0,40,138,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,3,215419,8025,13,3,6172,1,1,0,0,36,43832,0,0
16277,64,3,321403,15784,9,7,2809,6,2,0,0,40,43832,0,1
16278,38,3,374983,8025,13,2,6172,2,1,0,0,50,43832,0,1
16279,44,3,83891,8025,13,3,5611,4,3,5455,0,40,43832,0,1


In [47]:
y= data_set['wage_class']
x=data_set.drop(labels='wage_class', axis=1)

In [48]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.3,random_state=42)

In [49]:
model = XGBClassifier(objective='binary:logistic')
model.fit(train_x, train_y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=8, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       use_label_encoder=True, validate_parameters=1, verbosity=None)

In [50]:
y_pred = model.predict(train_x)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(train_y,predictions)
accuracy

0.9026587498903156

In [51]:
y_pred = model.predict(test_x)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(test_y,predictions)
accuracy

0.8714938920357606