In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
data_all = pd.concat([train.iloc[:, :-1], test])
y = train['income']

## marital-status labeling

In [4]:
data_all['marital-status'][data_all['marital-status']!=' Married-civ-spouse'] = 0
data_all['marital-status'][data_all['marital-status']==' Married-civ-spouse'] = 1
data_all['marital-status'] = data_all['marital-status'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## relationship labeling

In [5]:
data_all['relationship'][(data_all['relationship']!=' Husband') & (data_all['relationship']!=' Wife')] = 0
data_all['relationship'][(data_all['relationship']==' Husband') | (data_all['relationship']==' Wife')] = 1
data_all['relationship'] = data_all['relationship'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## drop un-important column

In [6]:
del data_all['education']
del data_all['no']
del data_all['fnlwgt']

## One-Hot Encoding

In [7]:
one_hot_encoding = pd.get_dummies(data_all)
one_hot_encoding.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 19536
Data columns (total 80 columns):
age                                           48842 non-null int64
education-num                                 48842 non-null int64
marital-status                                48842 non-null int64
relationship                                  48842 non-null int64
capital-gain                                  48842 non-null int64
capital-loss                                  48842 non-null int64
hours-per-week                                48842 non-null int64
workclass_ ?                                  48842 non-null uint8
workclass_ Federal-gov                        48842 non-null uint8
workclass_ Local-gov                          48842 non-null uint8
workclass_ Never-worked                       48842 non-null uint8
workclass_ Private                            48842 non-null uint8
workclass_ Self-emp-inc                       48842 non-null uint8
workclass_ Self-emp-

In [8]:
one_hot_encoding.columns

Index(['age', 'education-num', 'marital-status', 'relationship',
       'capital-gain', 'capital-loss', 'hours-per-week', 'workclass_ ?',
       'workclass_ Federal-gov', 'workclass_ Local-gov',
       'workclass_ Never-worked', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay', 'occupation_ ?',
       'occupation_ Adm-clerical', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners',
       'occupation_ Machine-op-inspct', 'occupation_ Other-service',
       'occupation_ Priv-house-serv', 'occupation_ Prof-specialty',
       'occupation_ Protective-serv', 'occupation_ Sales',
       'occupation_ Tech-support', 'occupation_ Transport-moving',
       'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black',
       'race_ Other', 'race_ White', 'sex_ Female', 'sex_ Male',
    

In [45]:
ohe_train = one_hot_encoding[:len(train)]
ohe_test = one_hot_encoding[len(train):]

In [36]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(ohe_train, y, test_size=0.3, random_state=0)

In [26]:
import xgboost as xgb
import time

In [38]:
params = {"learning_rate" : 0.02,
          "n_estimators" : 3000,
          "max_depth" : 7,
          "subsample" : 1,
          "colsample_bytree" : 0.5,
          "tree_method" : 'gpu_hist',
          "predictor" : 'gpu_predictor', 
          "objective" : "binary:logistic"}

clf = xgb.XGBClassifier(**params)

startTime = time.time()
clf.fit(X_train, y_train, eval_set = [(X_train,y_train), (X_test,y_test)],eval_metric='auc', early_stopping_rounds=100, verbose=200)
print("Spent", time.time() - startTime, "sec")

[0]	validation_0-auc:0.886594	validation_1-auc:0.876948
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[200]	validation_0-auc:0.934423	validation_1-auc:0.925197
[400]	validation_0-auc:0.94095	validation_1-auc:0.92894
[600]	validation_0-auc:0.943946	validation_1-auc:0.930157
[800]	validation_0-auc:0.946344	validation_1-auc:0.930493
Stopping. Best iteration:
[830]	validation_0-auc:0.946644	validation_1-auc:0.930522

Spent 21.220000743865967 sec


In [44]:
Imps = pd.DataFrame({"Features":ohe_train.columns, "Imps":clf.feature_importances_}).sort_values(by="Imps", ascending=False)
Imps.head(10)

Unnamed: 0,Features,Imps
3,relationship,0.30235
2,marital-status,0.161875
4,capital-gain,0.056255
1,education-num,0.039962
20,occupation_ Exec-managerial,0.030686
26,occupation_ Prof-specialty,0.029305
24,occupation_ Other-service,0.025089
5,capital-loss,0.021106
36,sex_ Female,0.019286
22,occupation_ Handlers-cleaners,0.017342


In [46]:
impVars = list(Imps[Imps.Imps != 0].iloc[:,0])

# 해당 변수로만 데이터셋 만들기
ohe_train = ohe_train[impVars]
ohe_test = ohe_test[impVars]

In [51]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## 학습셋/검증셋 구분
Xtrain, Xtest, ytrain, ytest = train_test_split(ohe_train,y,test_size=0.3)

## 파라미터 조합을 저장할 DataFrame 만들어두기
parameters = pd.DataFrame({"3colsample":[0], "4max_depth" : [0], "5gamma" : [0], "6reg_l2" : [0], "7reg_l1" : [0], "8AUC":[0]})

## 검증할 파라미터 조합 List로 만들기
colsample_bytree = [0.4, 0.6, 0.8]
max_depth = [4, 8,12]
gamma = [0,2]
reg_l2 = [0,0.5]
reg_l1 = [0,0.5]

In [54]:
i=0
for colsample in colsample_bytree:
    for max in max_depth:
        for gam in gamma:
            for l2 in reg_l2:
                for l1 in reg_l1:
                    
                    ## 파라미터 설정
                    params = {"learning_rate" : 0.02,
                                             "n_estimators" : 3000,
                                             "max_depth" : max,
                                             "colsample_bytree" : colsample,
                                             "gamma" : gam,
                                              "lambda" : l2,
                                              'alpha' : l1,
                                             "tree_method" : 'gpu_hist',
                                              "predictor" : 'gpu_predictor',
#                                               "n_jobs" : 14,
                                             "objective" : "binary:logistic"}
                    
                    clf = xgb.XGBClassifier(**params)
                        
                    s = time.time()  ## 적합 시작시간 저장
                    clf.fit(Xtrain, ytrain, eval_set = [(Xtrain,ytrain), (Xtest,ytest)], eval_metric='auc', early_stopping_rounds=100, verbose=500)
                    print(time.time() - s)  ## 적합 소요시간 출력

                    r =  pd.DataFrame({"3colsample":[colsample], "4max_depth" : [max], "5gamma" : [gam], "6reg_l2" : [l2], "7reg_l1" : [l1], "8AUC" : roc_auc_score(ytest,clf.predict_proba(Xtest)[:,1]) })
                    parameters = r.append(parameters)
                            
                    print(i, "finished")
                    i = i+1

[0]	validation_0-auc:0.856147	validation_1-auc:0.847934
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.92989	validation_1-auc:0.921907
[1000]	validation_0-auc:0.93575	validation_1-auc:0.925029
[1500]	validation_0-auc:0.938806	validation_1-auc:0.926283
[2000]	validation_0-auc:0.941228	validation_1-auc:0.926922
Stopping. Best iteration:
[2342]	validation_0-auc:0.942753	validation_1-auc:0.927227

28.425328493118286
0 finished
[0]	validation_0-auc:0.856147	validation_1-auc:0.847934
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.92989	validation_1-auc:0.921907
[1000]	validation_0-auc:0.93575	validation_1-auc:0.925029
[1500]	validation_0-auc:0.938806	validation_1-auc:0.926283
[2000]	validation_0-auc:0.941228	validation_1-auc

[500]	validation_0-auc:0.960202	validation_1-auc:0.925315
Stopping. Best iteration:
[478]	validation_0-auc:0.959558	validation_1-auc:0.925387

22.778999090194702
17 finished
[0]	validation_0-auc:0.875372	validation_1-auc:0.858615
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.960202	validation_1-auc:0.925315
Stopping. Best iteration:
[478]	validation_0-auc:0.959558	validation_1-auc:0.925387

22.80900001525879
18 finished
[0]	validation_0-auc:0.875372	validation_1-auc:0.858615
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.960202	validation_1-auc:0.925315
Stopping. Best iteration:
[478]	validation_0-auc:0.959558	validation_1-auc:0.925387

22.553112268447876
19 finished
[0]	validation_0-auc:0.875372	validation_1-auc:0.85

Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.948341	validation_1-auc:0.925395
Stopping. Best iteration:
[729]	validation_0-auc:0.95217	validation_1-auc:0.925844

16.86705994606018
35 finished
[0]	validation_0-auc:0.90904	validation_1-auc:0.897543
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.948341	validation_1-auc:0.925395
Stopping. Best iteration:
[729]	validation_0-auc:0.95217	validation_1-auc:0.925844

16.732257604599
36 finished
[0]	validation_0-auc:0.90904	validation_1-auc:0.897543
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.948341	validation_1-auc:0.925395
Stopping. Best i

Stopping. Best iteration:
[2028]	validation_0-auc:0.943056	validation_1-auc:0.926133

25.099172353744507
53 finished
[0]	validation_0-auc:0.870028	validation_1-auc:0.86505
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.930834	validation_1-auc:0.921408
[1000]	validation_0-auc:0.936845	validation_1-auc:0.924416
[1500]	validation_0-auc:0.940264	validation_1-auc:0.925797
[2000]	validation_0-auc:0.942962	validation_1-auc:0.926083
Stopping. Best iteration:
[2028]	validation_0-auc:0.943056	validation_1-auc:0.926133

24.998123168945312
54 finished
[0]	validation_0-auc:0.870028	validation_1-auc:0.86505
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[500]	validation_0-auc:0.930834	validation_1-auc:0.921408
[1000]	validation_0-auc:0.936845	validation_1-a

In [56]:
from sklearn.model_selection import StratifiedKFold

In [None]:
pred_test = pd.DataFrame(ohe_test.index)

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(ohe_train, y)
                        
for i, idx in enumerate(skf.split(ohe_train,y)):
    
    params = {"learning_rate" : 0.02,
              "n_estimators" : 3000,
              "max_depth" : 8,
              "colsample_bytree" : 0.4,
              "gamma" : 2,
              "lambda" : 0,
              'alpha' : 0.5,
              "tree_method" : 'gpu_hist',
              "predictor" : 'gpu_predictor',
              "objective" : "binary:logistic"}

       
    Xtrain = ohe_train.iloc[idx[0],:]
    Xtest = ohe_train.iloc[idx[1],:]

    ytrain = y.iloc[idx[0]]
    ytest = y.iloc[idx[1]]

    clf = xgb.XGBClassifier(**params)
                        
    s = time.time()    
    clf.fit(Xtrain, ytrain, eval_set = [(Xtrain,ytrain), (Xtest,ytest)], eval_metric='auc', early_stopping_rounds=100, verbose=500)
    print(time.time() - s)

    colname = "xgb_"+ str(i)
    pred_test[colname] =clf.predict_proba(ohe_test)[:,1]
                        
    print(i, "finished")

In [None]:
parameters.sort_values(by="8AUC", ascending=False)

## Train/Test Data Split

In [102]:
X = one_hot_encoding[:len(train)]
X_test = one_hot_encoding[len(train):]

## Scaling

In [103]:
sc = MinMaxScaler()
sc.fit(one_hot_encoding)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [104]:
X_tr = sc.transform(X)
X_te = sc.transform(X_test)

## LightGBM

In [105]:
from lightgbm import LGBMClassifier

In [None]:
model_lgbm = LGBMClassifier()
param_grid = {
    'learning_rate': [0.01, 0.15, 0.20],
    'max_depth': [5, 6, 7],
    'num_leaves': [31, 63, 127],
    'reg_alpha': [0.05 ,0.1, 0.15],
    'reg_lambda': [0, 0.1],
    'min_child_samples': [0, 3, 6],
    'min_data_in_leaf': [45, 55],
    'n_estimators':[195, 225]
    }
cv=KFold(n_splits=6, random_state=1)
gcv=GridSearchCV(model_lgbm, param_grid=param_grid, cv=cv, scoring='f1', n_jobs=-1)

gcv.fit(X_tr, y)

In [107]:
model_lgbm = gcv.best_estimator_
gcv.best_estimator_

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.15, max_depth=6,
               min_child_samples=0, min_child_weight=0.001, min_data_in_leaf=45,
               min_split_gain=0.0, n_estimators=195, n_jobs=-1, num_leaves=63,
               objective=None, random_state=None, reg_alpha=0.1, reg_lambda=0,
               silent=True, subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [108]:
cross_val_score(model_lgbm, X_tr, y).mean()



0.8707046191819453

## to_csv

In [109]:
y_pred = model_lgbm.predict(X_te)
y_pred = (y_pred > 0.5)
y_pred = pd.DataFrame(y_pred, columns=['income'])
y_pred.iloc[:, 0].value_counts()

False    15562
True      3975
Name: income, dtype: int64

In [110]:
y_pred['income'].replace(True, 1, inplace=True)
y_pred['income'].replace(False, 0, inplace=True)
y_pred = y_pred.astype('int64')

In [111]:
result = pd.concat([pd.DataFrame(test['no'].values, columns=['no']), y_pred], axis=1)
result.to_csv('./submission.csv', index=False)