In [2]:
import os
import sys
import time 
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings


os.environ["CUDA_VISIBLE_DEVICES"] = "0"

warnings.filterwarnings(action='once')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Test GPU:  True
Test CUDA:  True


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
"""
Utility functions:
    1. Normalize the data ...
"""

def normalize(X_train, X_test):
    
    X_all = X_train
    mean = np.mean(X_all, axis=0)
    std = np.std(X_all, axis=0)

    index = [0, 1, 3, 4, 5]
    mean_vec = np.zeros(X_all.shape[1])
    std_vec = np.ones(X_all.shape[1])
    mean_vec[index] = mean[index]
    std_vec[index] = std[index]

    X_all_std = (X_all - mean_vec) / std_vec
    X_train_std = X_all_std[0:X_train.shape[0]]
    X_test_std = X_all_std[X_train.shape[0]:]

    return X_train_std, X_test_std

In [4]:
'''
    1. Load csv file (UCI Census Income Dataset) 
    2. Trans df into numpy
    3. Normalize the data
    4. Spilt Train & Test Data
--- 
    Actually, I do lots of data preprocessing cause' I find the data sizes are imbalanced. Therefore,
    I use the imblearn package(Oversampling, undersampling, or combination). However, I couldn't get better score
    with these methods on Kaggle. Maybe, there are some problem in these datasets.(Sparse Data Point or something
    others...)
---

'''

X_train = pd.read_csv('./X_train')
y_train = pd.read_csv('./Y_train', header=None)
X_test = pd.read_csv('./X_test')

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)

X_train_std, X_test_std = normalize(X_train, X_test)

X_all_std = X_train_std
y_all = y_train

X_train_std, X_val_std, y_train, y_val = train_test_split(X_train_std, y_train, test_size=0.33, random_state=2019)

(32561, 106)
(32561, 1)
(32561, 106)
(32561, 1)


In [None]:
'''
    Hyperparams Tuning ... 
    * We can find best train loss in this method. However, it will cost lots of time.(Exec on CPU)
'''

In [20]:
estimator = GradientBoostingClassifier(loss = 'deviance', validation_fraction=0.2)

param_grid = {
    'learning_rate': [0.1],
    'n_estimators': [400, 500, 600, 700, 800, 900],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.85, 0.9]
}

gbc = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
gbc.fit(X_all_std, y_all)

# Best parameters found by grid search are: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'random_state': 2025}

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.2,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.1], 'n_estimators': [400, 500, 600, 700, 800, 900], 'max_depth': [3, 4, 5, 6], 'subsample': [0.85, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
print('Best parameters found by grid search are:', gbc.best_params_)
print(gbc.best_score_)
print(gbc.cv_results_['mean_test_score'])

Best parameters found by grid search are: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 600, 'subsample': 0.9}
0.8724240655999509
[0.87214766 0.87147201 0.87162556 0.87227051 0.87153343 0.87242407
 0.87190197 0.8719941  0.87131845 0.87150272 0.87082706 0.87107276
 0.87193268 0.87205553 0.8709192  0.87165628 0.87104204 0.87104204
 0.8695986  0.86999785 0.86938362 0.8703971  0.86790946 0.86855441
 0.86999785 0.87104204 0.86993643 0.86947575 0.8690765  0.86797089
 0.866681   0.86763306 0.86575965 0.86643531 0.86419336 0.86560609
 0.86692669 0.86953718 0.86572894 0.86646602 0.86631246 0.86575965
 0.86496115 0.86468475 0.86370197 0.86459261 0.86265778 0.86339486]


In [None]:
'''
    Train the classifier with the hyperparams(which found by the best score)
    - sklearn.GradientBoostingClassifier
    - lightLGB 
        * This GBM will get the highest score and we can exec this GBM over the GPU !
        * In kaggle - public - 87.79XXX, private - 87.4XXXX
'''

In [39]:
params = {
    'loss': 'deviance',
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'n_estimators': 200, 
    'random_state': 2025
}
## 32561 - 7841 
# clf = GradientBoostingClassifier(loss='deviance', n_estimators=500 , max_depth=5, random_state=2019,
#                                  learning_rate=0.1, subsample=1, max_features=40)
### csv - 27 0.876
# clf = GradientBoostingClassifier(loss='deviance', n_estimators=300 , max_depth=4, random_state=2019, learning_rate=0.1,
#                                 validation_fraction=0.2, tol=0.01)
### csv - 28 = csv 12 
# clf = GradientBoostingClassifier(loss='deviance', n_estimators=250 , max_depth=4, random_state=2019, learning_rate=0.1,
#                                 validation_fraction=0.2, tol=0.01)

### csv - 29
# clf = GradientBoostingClassifier(loss='deviance', n_estimators=275 , max_depth=4, random_state=2019, learning_rate=0.1,
#                                 validation_fraction=0.2, tol=0.01)

### csv - 30
clf = GradientBoostingClassifier(loss='deviance', n_estimators=300 , max_depth=4, random_state=2019, learning_rate=0.1,
                                validation_fraction=0.2, tol=0.005)
# clf = GradientBoostingClassifier(params)
clf.fit(X_all_std, y_all)
scores = cross_val_score(clf, X_all_std, y_all, cv=5)
print(clf.score(X_all_std, y_all))
print(np.mean(scores))

y_pred = clf.predict(X_test_std)
y_submit = pd.read_csv('./sample_submission.csv')

y_submit.label = y_pred.astype(int)
y_submit.to_csv('./submission_GBC_Test30.csv', index=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.888701206965388
0.8727005495718071


In [92]:
import lightgbm as lgb

# param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate': 0.05,'max_bin':200, 'n_estimators': 100}
# clf = lgb.LGBMRegressor(boosting_type='gbdt', num_leaves=300, objective='binary', learning_rate=0.05, max_bin=200
#                        , max_depth=4, n_estimators=500)
clf = lgb.LGBMRegressor(boosting_type='gbdt', num_leaves=300, objective='binary', learning_rate=0.05, max_bin=200
                       , max_depth=4, n_estimators=500)
clf.fit(X_all_std, y_all, eval_metric=['auc', 'binary_logloss'])
y_pred = clf.predict(X_test_std)
y_submit = pd.read_csv('./sample_submission.csv')

y_pred = np.around(y_pred)
y_submit.label = y_pred.astype(int)
y_submit.to_csv('./submission_LGB_Test2.csv', index=0)

  y = column_or_1d(y, warn=True)


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_bin=200,
       max_depth=4, min_child_samples=20, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=500, n_jobs=-1, num_leaves=300,
       objective='binary', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)