In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/adult-census-income/adult.csv


In [2]:
df = pd.read_csv('/kaggle/input/adult-census-income/adult.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection

In [5]:
# df = pd.read_csv("/kaggle/input/adult-census-income/adult.csv")
df["kfold"] = -1
df = df.sample(frac=1).reset_index(drop=True)
target_mapping = {
 "<=50K": 0,
 ">50K": 1
 }
df.loc[:, "income"] = df.income.map(target_mapping)
y = df[['income']]


In [6]:
y[:10]

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
5,1
6,0
7,1
8,1
9,0


In [7]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,kfold
0,32,Private,126838,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,0,-1
1,32,State-gov,131588,Some-college,10,Never-married,Tech-support,Unmarried,Black,Female,0,0,20,United-States,0,-1
2,25,Private,367306,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,0,-1
3,68,Self-emp-not-inc,89011,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,Canada,0,-1
4,21,Private,29810,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,50,United-States,0,-1


In [8]:
kf = model_selection.StratifiedKFold(n_splits=5)
for f, (t_, v_) in enumerate(kf.split(X=df, y = y.astype(str))):
    df.loc[v_, 'kfold'] = f

In [9]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income', 'kfold'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,kfold
0,32,Private,126838,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,0,0
1,32,State-gov,131588,Some-college,10,Never-married,Tech-support,Unmarried,Black,Female,0,0,20,United-States,0,0
2,25,Private,367306,Some-college,10,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,0,0
3,68,Self-emp-not-inc,89011,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,Canada,0,0
4,21,Private,29810,HS-grad,9,Never-married,Other-service,Not-in-family,White,Female,0,0,50,United-States,0,0


In [11]:
import itertools
def feature_engineering(df, cat_cols):
    combi = list(itertools.combinations(cat_cols, 2))
    for c1, c2 in combi:
        df.loc[:, c1 + "_" + c2] = df[c1].astype(str) + "_" + df[c2].astype(str)
    return df

In [12]:
import xgboost as xgb
def run(fold,df):
    num_cols = [
         "fnlwgt",
         "age",
         "capital.gain",
         "capital.loss",
         "hours.per.week"
         ]
#     df = df.drop(num_cols, axis=1)

    cat_cols = [
     c for c in df.columns if c not in num_cols
     and c not in ("kfold", "income")
     ]
    df = feature_engineering(df, cat_cols[:3])
    features = [
    f for f in df.columns if f not in ("kfold", "income")
     ]
    for col in features:
        if col not in num_cols:
            df.loc[:, col] = df[col].astype(str).fillna("NONE")
    
    for col in features:
        if col not in num_cols:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(df[col])
            df.loc[:, col] = lbl.transform(df[col])
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    x_train = df_train[features].values
    x_valid = df_valid[features].values
    model = xgb.XGBClassifier(
 n_jobs=-1,
#         max_depth=7, estimators=200
 )
 
    model.fit(x_train, df_train.income.values)
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values.astype(str), valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")
    


In [13]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning
)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)


In [14]:
for fold_ in range(5):
    run(fold_,df)

Fold = 0, AUC = 0.9261324669310942
Fold = 1, AUC = 0.9276291789470641
Fold = 2, AUC = 0.9180850396894195
Fold = 3, AUC = 0.927081011409418
Fold = 4, AUC = 0.9293306331938116


In [15]:
import xgboost as xgb

In [16]:
def xgb_run(fold):
    global df 
   
    features = [
    f for f in df.columns if f not in ("kfold", "income")
     ]
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    le = preprocessing.LabelEncoder()
    df_train['income'] = le.fit_transform(df_train['income'])
    df_valid['income'] = le.transform(df_valid['income'])
    ohe = preprocessing.OneHotEncoder()
    full_data = pd.concat(
        [df_train[features], df_valid[features]],
         axis=0
        )
    ohe.fit(full_data[features])
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])
    model = model = xgb.XGBClassifier(
     n_jobs=-1
     )
    model.fit(x_train, df_train.income.values)
    valid_preds = model.predict_proba(x_valid)[:, 1]
    auc = metrics.roc_auc_score(df_valid.income.values, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")
    

In [17]:
df.income.values.dtype

dtype('O')

In [18]:
for fold_ in range(5):
    xgb_run(fold_)

Fold = 0, AUC = 0.9221584486851848
Fold = 1, AUC = 0.9255100363871276
Fold = 2, AUC = 0.9121151798097878
Fold = 3, AUC = 0.9238427015223566
Fold = 4, AUC = 0.9245195423436365
