In [53]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os, sys, gc, warnings, random, datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

import lightgbm as lgb
import xgboost as xgb


from matplotlib import pyplot as plt
import seaborn as sns


In [54]:
def read_data(na_values=' ?', target='income', drop_cols=['education-num']):
    train = pd.read_csv('../data/train.csv', na_values=na_values)
    y = train.pop(target)
    train.drop(drop_cols, axis='columns', inplace=True)
    print('train.shape:', train.shape)
    display(train.head())
    
    test = pd.read_csv('../data/test.csv', na_values=na_values)
    test.drop(drop_cols, axis='columns', inplace=True)
    print('test.shape:', test.shape)
    display(test.head())
    
    return train, test, y

In [55]:
train, test, y = read_data()

train.shape: (29305, 14)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,1,25,Private,219199,11th,Divorced,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States
1,2,39,Private,52978,Some-college,Divorced,Other-service,Not-in-family,White,Female,0,1721,55,United-States
2,3,35,Private,196899,Bachelors,Never-married,Handlers-cleaners,Not-in-family,Asian-Pac-Islander,Female,0,0,50,Haiti
3,4,64,Private,135527,Assoc-voc,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States
4,5,24,Private,60783,Some-college,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States


test.shape: (19537, 14)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,29306,18,,245274,Some-college,Never-married,,Own-child,White,Male,0,0,16,United-States
1,29307,29,Private,83003,HS-grad,Married-civ-spouse,Other-service,Wife,White,Female,0,0,40,United-States
2,29308,45,Private,35136,Bachelors,Married-civ-spouse,Tech-support,Husband,Black,Male,0,0,40,United-States
3,29309,42,Self-emp-not-inc,64631,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
4,29310,41,Private,195821,Doctorate,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,1902,40,United-States


In [56]:
y.shape

(29305,)

In [57]:
train.isna().sum()

no                   0
age                  0
workclass         1663
fnlwgt               0
education            0
marital-status       0
occupation        1668
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     495
dtype: int64

In [58]:
test.isna().sum()

no                   0
age                  0
workclass         1136
fnlwgt               0
education            0
marital-status       0
occupation        1141
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     362
dtype: int64

In [59]:
features = [col for col in train.columns if col != 'no']
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [60]:
pd.concat([train,y], axis=1)[train[features].duplicated(keep=False)].sort_values(features)

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
7839,7840,17,Private,153021,12th,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,0
12266,12267,17,Private,153021,12th,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,0
17505,17506,19,Private,130431,5th-6th,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,36,Mexico,0
20332,20333,19,Private,130431,5th-6th,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,36,Mexico,0
5123,5124,19,Private,139466,Some-college,Never-married,Sales,Own-child,White,Female,0,0,25,United-States,0
6524,6525,19,Private,139466,Some-college,Never-married,Sales,Own-child,White,Female,0,0,25,United-States,0
787,788,19,Private,251579,Some-college,Never-married,Other-service,Own-child,White,Male,0,0,14,United-States,0
16146,16147,19,Private,251579,Some-college,Never-married,Other-service,Own-child,White,Male,0,0,14,United-States,0
17250,17251,19,,167428,Some-college,Never-married,,Own-child,White,Male,0,0,40,United-States,0
21015,21016,19,,167428,Some-college,Never-married,,Own-child,White,Male,0,0,40,United-States,0


In [61]:
features = [col for col in test.columns if col != 'no']
test[test[features].duplicated(keep=False)].sort_values(features)

Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
4299,33605,19,Private,146679,Some-college,Never-married,Exec-managerial,Own-child,Black,Male,0,0,30,United-States
14377,43683,19,Private,146679,Some-college,Never-married,Exec-managerial,Own-child,Black,Male,0,0,30,United-States
6952,36258,21,Private,243368,Preschool,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico
14702,44008,21,Private,243368,Preschool,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,50,Mexico
1322,30628,22,,334593,Some-college,Never-married,,Not-in-family,White,Male,0,0,40,United-States
15520,44826,22,,334593,Some-college,Never-married,,Not-in-family,White,Male,0,0,40,United-States
2081,31387,23,Private,240137,5th-6th,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,55,Mexico
13999,43305,23,Private,240137,5th-6th,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,55,Mexico
164,29470,25,Private,308144,Bachelors,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico
6551,35857,25,Private,308144,Bachelors,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico


In [62]:
grouped_trn = pd.concat([train,y], axis=1).groupby(features)['income'].agg(['count','mean'])
grouped_trn.head()
grouped_trn_dup = grouped_trn[grouped_trn['count']>1].reset_index()
grouped_trn_dup

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,count,mean
0,17,Private,153021,12th,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,2,0.0
1,19,Private,130431,5th-6th,Never-married,Farming-fishing,Not-in-family,White,Male,0,0,36,Mexico,2,0.0
2,19,Private,139466,Some-college,Never-married,Sales,Own-child,White,Female,0,0,25,United-States,2,0.0
3,19,Private,251579,Some-college,Never-married,Other-service,Own-child,White,Male,0,0,14,United-States,2,0.0
4,20,Private,107658,Some-college,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,2,0.0
5,21,Private,250051,Some-college,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,2,0.0
6,22,Private,137876,Some-college,Never-married,Protective-serv,Not-in-family,White,Male,0,0,20,United-States,2,0.0
7,23,Private,107882,Bachelors,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,2,0.0
8,23,Private,250630,Bachelors,Never-married,Sales,Not-in-family,White,Female,0,0,40,United-States,2,0.0
9,25,Private,195994,1st-4th,Never-married,Priv-house-serv,Not-in-family,White,Female,0,0,40,Guatemala,2,0.0


In [63]:
train_dedup = train.drop_duplicates(features, keep='last')
y_dedup = y[train_dedup.index]
train_dedup.reset_index(drop=True, inplace=True)
y_dedup.reset_index(drop=True, inplace=True)
print(train_dedup.shape, y_dedup.shape)
train_dedup.head()

(29282, 14) (29282,)


Unnamed: 0,no,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,1,25,Private,219199,11th,Divorced,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States
1,2,39,Private,52978,Some-college,Divorced,Other-service,Not-in-family,White,Female,0,1721,55,United-States
2,3,35,Private,196899,Bachelors,Never-married,Handlers-cleaners,Not-in-family,Asian-Pac-Islander,Female,0,0,50,Haiti
3,4,64,Private,135527,Assoc-voc,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States
4,5,24,Private,60783,Some-college,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,70,United-States


In [64]:
def freq_encode_full(df1, df2, col, normalize=True):
    df = pd.concat([df1[col],df2[col]])
    vc = df.value_counts(dropna=False, normalize=normalize).to_dict()
    nm = col + '_FE_FULL'
    df1[nm] = df1[col].map(vc)
    df1[nm] = df1[nm].astype('float32')
    df2[nm] = df2[col].map(vc)
    df2[nm] = df2[nm].astype('float32')
    return nm

In [65]:
for df in [train_dedup,test]:
    df['workclass-occupation'] = df['workclass'] + '#' + df['occupation']
    df['workclass-education'] = df['workclass'] + '#' + df['education']
    df['occupation-education'] = df['occupation'] + '#' + df['education']
    df['marital-status-relationship'] = df['marital-status'] + '#' + df['relationship']
    df['race-sex'] = df['race'] + '#' + df['sex']
    #df['native_country_race'] = df['native_country'] + '#' + df['race']
    #df['workclass_na'] = df['workclass'].isna().astype(int)
    #df['occupation_na'] = df['occupation'].isna().astype(int)
    #df['native_country_na'] = df['native_country'].isna().astype(int)
    #df['na_cnt'] = df.isna().sum(axis=1)
    df['capital-margin'] = df['capital-gain'] - df['capital-loss']
    df['capital-total'] = df['capital-gain'] + df['capital-loss']
    df['capital-margin-flag'] = np.nan
    df.loc[df['capital-margin']==0, 'capital-margin-flag'] = 'zero'
    df.loc[df['capital-margin']>0, 'capital-margin-flag'] = 'positive'
    df.loc[df['capital-margin']<0, 'capital-margin-flag'] = 'negative'
    #df['fnlwgt_log'] = np.log1p(df['fnlwgt'])

In [66]:
df.columns

Index(['no', 'age', 'workclass', 'fnlwgt', 'education', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country',
       'workclass-occupation', 'workclass-education', 'occupation-education',
       'marital-status-relationship', 'race-sex', 'capital-margin',
       'capital-total', 'capital-margin-flag'],
      dtype='object')

In [67]:
cate_cols = []
# LABEL ENCODING
for col in train_dedup.columns:
    if (train_dedup[col].dtype.name == 'object' or test[col].dtype.name == 'object' \
        or train_dedup[col].nunique() < 300) and col != 'age':
        cate_cols.append(col)
        le = LabelEncoder()
        le.fit(list(train_dedup[col].values) + list(test[col].values))
        train_dedup[col] = le.transform(list(train_dedup[col].values))
        test[col] = le.transform(list(test[col].values))
        
print('categorical feature:', cate_cols)

categorical feature: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'workclass-occupation', 'workclass-education', 'occupation-education', 'marital-status-relationship', 'race-sex', 'capital-margin', 'capital-total', 'capital-margin-flag']


In [68]:
for col in [col for col in cate_cols if col not in ['capital-gain','capital-loss','race']]:
    freq_encode_full(train_dedup, test, col)

In [69]:
remove_features = ['id','income']
features = [col for col in list(train_dedup) if col not in remove_features]
features

['no',
 'age',
 'workclass',
 'fnlwgt',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'workclass-occupation',
 'workclass-education',
 'occupation-education',
 'marital-status-relationship',
 'race-sex',
 'capital-margin',
 'capital-total',
 'capital-margin-flag',
 'workclass_FE_FULL',
 'education_FE_FULL',
 'marital-status_FE_FULL',
 'occupation_FE_FULL',
 'relationship_FE_FULL',
 'sex_FE_FULL',
 'hours-per-week_FE_FULL',
 'native-country_FE_FULL',
 'workclass-occupation_FE_FULL',
 'workclass-education_FE_FULL',
 'occupation-education_FE_FULL',
 'marital-status-relationship_FE_FULL',
 'race-sex_FE_FULL',
 'capital-margin_FE_FULL',
 'capital-total_FE_FULL',
 'capital-margin-flag_FE_FULL']

In [70]:
def make_xgb_prediction(train, y, test, features, model_params=None, folds=5):
    def xgb_f1_score(y_hat, data):
        y_true = data.get_label()
        y_hat = np.round(y_hat) 
        return 'f1', 1-f1_score(y_true, y_hat, average='micro')
    
    skf = StratifiedKFold(n_splits=folds, random_state=42, shuffle=True)
    
    x_train = train[features]
    x_test = test[features]

    y_preds = np.zeros(x_test.shape[0])
    y_oof = np.zeros(x_train.shape[0])
    score = 0

    feature_importance = pd.DataFrame()
    feature_importance['feature'] = features

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        print(f'Fold: {fold+1}')

        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(x_tr.shape, x_val.shape)

        dtrain = xgb.DMatrix(x_tr, label=y_tr)
        dvalid = xgb.DMatrix(x_val, label=y_val)

        clf = xgb.train(
            model_params,
            dtrain,
            num_boost_round=10000, 
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            verbose_eval=200,
            early_stopping_rounds=100,
            feval=xgb_f1_score
        )

        #feature_importance[f'fold_{fold+1}'] = clf.get_score()

        y_pred_val = clf.predict(dvalid)

        y_oof[val_idx] = y_pred_val
        print(f"Fold {fold + 1} | F1 Score: {f1_score(y_val, np.round(y_pred_val), average='micro')}")

        score += f1_score(y_val, np.round(y_pred_val), average='micro') / folds
        y_preds += clf.predict(xgb.DMatrix(x_test)) / folds

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean F1 score = {score}")
    print(f"OOF F1 score = {f1_score(y, np.round(y_oof), average='micro')}")
    
    return y_oof, y_preds, feature_importance

In [74]:
# xgb model params
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'max_depth': 6,
    #'colsample_bytree': 0.8,
    'subsample': 0.8,
    'disable_default_eval_metric': 1,
    #'eval_metric': 'logloss',
    'seed': 42,
} 

In [75]:
y_oof_xgb, y_preds_xgb, fi_xgb = make_xgb_prediction(train_dedup, y_dedup, test, features, \
                                                     model_params=xgb_params)

Fold: 1
(23425, 38) (5857, 38)
[0]	train-f1:0.24038	valid-f1:0.24040
[100]	train-f1:0.89020	valid-f1:0.89005
Fold 1 | F1 Score: 0.8796312105173297
Fold: 2
(23425, 38) (5857, 38)
[0]	train-f1:0.24038	valid-f1:0.24040
[100]	train-f1:0.89059	valid-f1:0.89056
Fold 2 | F1 Score: 0.867167491890046
Fold: 3
(23426, 38) (5856, 38)
[0]	train-f1:0.24042	valid-f1:0.24027
[100]	train-f1:0.88918	valid-f1:0.89771
Fold 3 | F1 Score: 0.8729508196721312
Fold: 4
(23426, 38) (5856, 38)
[0]	train-f1:0.24037	valid-f1:0.24044
[100]	train-f1:0.88901	valid-f1:0.88866
Fold 4 | F1 Score: 0.8773907103825137
Fold: 5
(23426, 38) (5856, 38)
[0]	train-f1:0.24037	valid-f1:0.24044
[99]	train-f1:0.89128	valid-f1:0.89242
Fold 5 | F1 Score: 0.8657786885245902

Mean F1 score = 0.8725837841973221
OOF F1 score = 0.872583839901646


In [None]:
sample_submission['income'] = y_preds

for ix, row in sample_submission.iterrows():
    if row['prediction'] > 0.5:
        sample_submission.loc[ix, 'prediction'] = 1
    else:
        sample_submission.loc[ix, 'prediction'] = 0

sample_submission = sample_submission.astype({"prediction": int})
sample_submission.to_csv('submission.csv', index=False)