In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics    
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [2]:
train_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', sep=' *, *',header = None,na_values="?")

  """Entry point for launching an IPython kernel.


In [3]:
test_set = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' , sep=' *, *',skiprows = 1, header = None,na_values= "?")

  """Entry point for launching an IPython kernel.


In [4]:
train_set.shape

(32561, 15)

In [5]:
test_set.shape

(16281, 15)

In [6]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       30725 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       30718 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      31978 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
train_set.isnull().sum()

0        0
1     1836
2        0
3        0
4        0
5        0
6     1843
7        0
8        0
9        0
10       0
11       0
12       0
13     583
14       0
dtype: int64

In [8]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to 16280
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       16281 non-null  int64 
 1   1       15318 non-null  object
 2   2       16281 non-null  int64 
 3   3       16281 non-null  object
 4   4       16281 non-null  int64 
 5   5       16281 non-null  object
 6   6       15315 non-null  object
 7   7       16281 non-null  object
 8   8       16281 non-null  object
 9   9       16281 non-null  object
 10  10      16281 non-null  int64 
 11  11      16281 non-null  int64 
 12  12      16281 non-null  int64 
 13  13      16007 non-null  object
 14  14      16281 non-null  object
dtypes: int64(6), object(9)
memory usage: 1.9+ MB


In [9]:
test_set.isnull().sum()

0       0
1     963
2       0
3       0
4       0
5       0
6     966
7       0
8       0
9       0
10      0
11      0
12      0
13    274
14      0
dtype: int64

In [10]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation','relationship', 'race', 'sex', 'capital_gain',
'capital_loss', 'hours_per_week', 'native_country', 'wage_class']


In [11]:
train_set.columns = col_labels
test_set.columns = col_labels

In [12]:
adult=pd.concat([train_set,test_set])

In [13]:
adult

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
16277,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [14]:
adult['wage_class'] = adult['wage_class'].replace('<=50K', 0).replace('>50K', 1)
adult['wage_class'] =adult['wage_class'].replace('<=50K.', 0).replace('>50K.', 1)


In [15]:
ad_target=adult['wage_class']

In [16]:
ad_target

0        0
1        0
2        0
3        0
4        0
        ..
16276    0
16277    0
16278    0
16279    0
16280    1
Name: wage_class, Length: 48842, dtype: int64

In [17]:
adult=adult.drop(['wage_class'],axis=1)

In [18]:
adult.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 16280
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education_num   48842 non-null  int64 
 5   marital_status  48842 non-null  object
 6   occupation      46033 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital_gain    48842 non-null  int64 
 11  capital_loss    48842 non-null  int64 
 12  hours_per_week  48842 non-null  int64 
 13  native_country  47985 non-null  object
dtypes: int64(6), object(8)
memory usage: 5.6+ MB


In [19]:
adult.isnull().sum()

age                  0
workclass         2799
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        2809
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     857
dtype: int64

In [20]:
adult.describe(include='all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
count,48842.0,46043,48842.0,48842,48842.0,48842,46033,48842,48842,48842,48842.0,48842.0,48842.0,47985
unique,,8,,16,,7,14,6,5,2,,,,41
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,


In [21]:
adult.fillna(method='pad',inplace=True)
adult.fillna(method='bfill',inplace=True)

In [22]:
adult.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
dtype: int64

In [23]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [24]:
adult.drop(['education','fnlwgt'],axis=1,inplace=True)

In [25]:
def data_transform(df):
    """Normalize features."""
    binary_data = pd.get_dummies(df,drop_first=True)
    feature_cols = binary_data[binary_data.columns]
    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(feature_cols), columns=feature_cols.columns)
    return data


In [26]:
adult = data_transform(adult)

In [27]:
adult.head()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,0.025996,1.136512,0.146932,-0.217127,-0.034087,-0.269229,-0.01431,-1.676343,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
1,0.828308,1.136512,-0.144804,-0.217127,-2.213032,-0.269229,-0.01431,-1.676343,-0.195216,3.310491,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
2,-0.046942,-0.419335,-0.144804,-0.217127,-0.034087,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
3,1.047121,-1.197259,-0.144804,-0.217127,-0.034087,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
4,-0.776316,1.136512,-0.144804,-0.217127,-0.034087,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,-3.247128,-0.043204,-0.02263


In [28]:
adult.shape

(48842, 81)

In [29]:
train_set=adult[:32561]

In [30]:
train_set.shape

(32561, 81)

In [31]:
test_set=adult[32561:]

In [32]:
test_set.shape

(16281, 81)

In [33]:
x_train=train_set
y_train=ad_target[:32561]

In [34]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: wage_class, Length: 32561, dtype: int64

In [35]:
from xgboost import XGBClassifier
model = XGBClassifier(objective='binary:logistic')
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [36]:
#training accuracy
y_train_pred=model.predict(x_train)
accuracy=accuracy_score(y_train,y_train_pred)
accuracy

0.8656368047664383

In [37]:
x_test=test_set
y_test=ad_target[32561:]

In [38]:
x_test.head()

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
32561,-0.995129,-1.197259,-0.144804,-0.217127,-0.034087,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
32562,-0.046942,-0.419335,-0.144804,-0.217127,0.77293,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
32563,-0.776316,0.74755,-0.144804,-0.217127,-0.034087,3.714313,-0.01431,-1.676343,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
32564,0.390683,-0.030373,0.886874,-0.217127,-0.034087,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263
32565,-1.505691,-0.030373,-0.144804,-0.217127,-0.841104,-0.269229,-0.01431,0.596537,-0.195216,-0.30207,...,-0.037063,-0.062161,-0.02074,-0.048581,-0.036505,-0.024791,-0.023518,0.307964,-0.043204,-0.02263


In [39]:
#test accuracy
y_test_pred=model.predict(x_test)
accuracy=accuracy_score(y_test,y_test_pred)
accuracy

0.8670229101406548

In [40]:
param_grid={
   
    ' learning_rate':[1,0.5,0.1,0.01,0.001],
    'max_depth': [3,5,10,20],
    'n_estimators':[10,50,100,200]
    
}

In [42]:
from sklearn.model_selection import GridSearchCV
grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),param_grid, verbose=3)

In [43]:
grid.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.839, total=   1.5s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.845, total=   1.5s
[CV]  learning_rate=1, max_depth=3, n_estimators=10 ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.0s remaining:    0.0s


[CV]   learning_rate=1, max_depth=3, n_estimators=10, score=0.848, total=   1.4s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.854, total=   5.6s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.857, total=   5.4s
[CV]  learning_rate=1, max_depth=3, n_estimators=50 ..................
[CV]   learning_rate=1, max_depth=3, n_estimators=50, score=0.860, total=   5.5s
[CV]  learning_rate=1, max_depth=3, n_estimators=100 .................
[CV]   learning_rate=1, max_depth=3, n_estimators=100, score=0.859, total=  10.7s
[CV]  learning_rate=1, max_depth=3, n_estimators=100 .................
[CV]   learning_rate=1, max_depth=3, n_estimators=100, score=0.863, total=  10.9s
[CV]  learning_rate=1, max_depth=3, n_estimators=100 .................
[CV]   learning_rate=1, max_depth=3, n_estimators=100, score=0.869, total=  10.5s
[CV]

[CV]   learning_rate=0.5, max_depth=3, n_estimators=100, score=0.869, total=   6.3s
[CV]  learning_rate=0.5, max_depth=3, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=3, n_estimators=200, score=0.866, total=  12.2s
[CV]  learning_rate=0.5, max_depth=3, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=3, n_estimators=200, score=0.869, total=  13.1s
[CV]  learning_rate=0.5, max_depth=3, n_estimators=200 ...............
[CV]   learning_rate=0.5, max_depth=3, n_estimators=200, score=0.874, total=  11.9s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=10 ................
[CV]   learning_rate=0.5, max_depth=5, n_estimators=10, score=0.850, total=   1.1s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=10 ................
[CV]   learning_rate=0.5, max_depth=5, n_estimators=10, score=0.853, total=   1.2s
[CV]  learning_rate=0.5, max_depth=5, n_estimators=10 ................
[CV]   learning_rate=0.5, max_depth=5, n_estimators=10, score=0.858, tot

[CV]   learning_rate=0.1, max_depth=5, n_estimators=10, score=0.853, total=   1.1s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=10 ................
[CV]   learning_rate=0.1, max_depth=5, n_estimators=10, score=0.858, total=   1.1s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=5, n_estimators=50, score=0.861, total=   4.5s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=5, n_estimators=50, score=0.864, total=   4.6s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=50 ................
[CV]   learning_rate=0.1, max_depth=5, n_estimators=50, score=0.868, total=   4.5s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100 ...............
[CV]   learning_rate=0.1, max_depth=5, n_estimators=100, score=0.867, total=   9.4s
[CV]  learning_rate=0.1, max_depth=5, n_estimators=100 ...............
[CV]   learning_rate=0.1, max_depth=5, n_estimators=100, score=0.871, total

[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.867, total=  17.9s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.871, total=  18.1s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=100 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=100, score=0.874, total=  17.2s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.870, total=  34.7s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.872, total=  35.2s
[CV]  learning_rate=0.01, max_depth=5, n_estimators=200 ..............
[CV]   learning_rate=0.01, max_depth=5, n_estimators=200, score=0.876, total=  26.9s
[CV]  learning_rate=0.01, max_depth=10, n_estimators=10 ..............
[CV]   learning_rate=0.01, max_depth=10, n_estimators=10, score=

[CV]   learning_rate=0.001, max_depth=5, n_estimators=200, score=0.876, total=  18.2s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=10, score=0.857, total=   2.1s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=10, score=0.860, total=   2.3s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=10 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=10, score=0.865, total=   2.2s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=50 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=50, score=0.864, total=   9.5s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=50 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=50, score=0.868, total=   9.8s
[CV]  learning_rate=0.001, max_depth=10, n_estimators=50 .............
[CV]   learning_rate=0.001, max_depth=10, n_estimators=50,

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 70.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={' learning_rate': [1, 0.5, 0.1, 0.01, 0.001],
                         'max_depth': [3, 5, 10, 20],
                         'n_estimator

In [44]:
# To  find the parameters givingmaximum accuracy
grid.best_params_

{' learning_rate': 1, 'max_depth': 5, 'n_estimators': 200}

In [47]:
est=grid.best_estimator_

In [48]:
est

XGBClassifier( learning_rate=1, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [49]:
est.score(x_test,y_test)

0.8729807751366624

In [51]:
import pickle

In [53]:
filename = 'xg_model.pickle'
import pickle

pickle.dump(est, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))