In [None]:
import pandas as pd
import numpy as np

import os
import time

import lightgbm as lgb

from sklearn.model_selection import ParameterSampler, StratifiedKFold, KFold
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import matplotlib.pyplot as plt
pd.options.display.max_rows = 500
%matplotlib inline

In [None]:
import io, math, requests

# Python 3 only: the CSV text is encoded to UTF-8 bytes before being uploaded.
def submit_prediction(df, sep=',', **kwargs):
    """Upload a prediction frame to the QScore platform as a CSV file.

    The frame is serialised with ``df.to_csv(sep=sep, **kwargs)`` and posted
    as the ``datafile`` form field of the submissions endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictions to submit.
    sep : str, default ','
        Field separator forwarded to ``DataFrame.to_csv``.
    **kwargs
        Extra keyword arguments forwarded to ``DataFrame.to_csv``
        (for example ``index=True``).

    Raises
    ------
    RuntimeError
        If the ``QSCORE_TOKEN`` environment variable is not set.
    Exception
        If the server answers 429 (submissions too close together) or any
        status code other than 200.
    """
    # The token is a credential and must not live in the notebook. Recover it
    # on the platform ("Submissions" > "Submit from your Python Notebook") and
    # export it as QSCORE_TOKEN before starting the kernel.
    # NOTE(review): the token that used to be hard-coded here should be revoked.
    token = os.environ.get('QSCORE_TOKEN')
    if not token:
        raise RuntimeError('Set the QSCORE_TOKEN environment variable to your QScore API token.')
    url = 'https://qscore.datascience-olympics.com/api/submissions'

    # Render the CSV as text, then encode explicitly: the upload needs bytes,
    # and writing straight into a BytesIO is not supported by every pandas
    # release.
    buffer = io.BytesIO(df.to_csv(sep=sep, **kwargs).encode('utf-8'))

    r = requests.post(url, headers={'Authorization': 'Bearer {}'.format(token)}, files={'datafile': buffer})
    if r.status_code == 429:
        remaining_ms = r.headers.get('x-rate-limit-remaining')
        if remaining_ms is None:
            # Header missing: still report the rate limit instead of a KeyError.
            raise Exception('Submissions are too close. Please retry later.')
        raise Exception('Submissions are too close. Next submission is only allowed in {} seconds.'.format(int(math.ceil(int(remaining_ms) / 1000.0))))
    if r.status_code != 200:
        raise Exception(r.text)

### Load datasets

In [None]:
%%time
# Folder holding the prepared inputs, and folder receiving the submission files.
data_path = '../../Data'
submission_path = '../../submission'

train = pd.read_csv(os.path.join(data_path, "X_train_prep.csv"))
test = pd.read_csv(os.path.join(data_path, "X_test_prep.csv"))
# The first column of y_train.csv is used as the index; the remaining values
# are flattened into a 1-D array.
target = pd.read_csv(os.path.join(data_path, "y_train.csv"), index_col=0).values.flatten()

# header=None reads the file without a header row. The previous header=-1 is
# rejected by current pandas ("Passing negative integer to header is invalid").
cat_cols = pd.read_csv(os.path.join(data_path, "cat_cols.csv"), header=None)
cat_cols = list(cat_cols.T.values.flatten())

### Prepare data

In [None]:
def get_sample(train, target, rate=0.05, seed=0):
    """Return a reproducible random subset of the rows of ``train`` and ``target``.

    Each row is kept independently with probability ``rate``, so the size of
    the sample is only approximately ``rate * len(train)``.

    Parameters
    ----------
    train : object supporting boolean-mask indexing (e.g. pandas.DataFrame)
        Features; ``len(train)`` fixes the length of the mask.
    target : object supporting boolean-mask indexing (e.g. numpy.ndarray)
        Labels, indexed with the same mask as ``train``.
    rate : float, default 0.05
        Probability of keeping a row.
    seed : int, default 0
        Seed of the private random generator. The default reproduces the mask
        obtained previously with ``np.random.seed(0)``.

    Returns
    -------
    tuple
        ``(train[mask], target[mask])``.
    """
    # A private generator keeps the draw reproducible without resetting the
    # global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    keep = rng.choice([True, False], len(train), p=[rate, 1 - rate])
    return train[keep], target[keep]

In [None]:
#train, target = get_sample(train, target)

In [None]:
# Row counts, reused when allocating the stacking / blending tables.
ntrain, ntest = train.shape[0], test.shape[0]

# LightGBM views of both frames; free_raw_data=False keeps the raw data
# attached to the Dataset objects.
lTrain = lgb.Dataset(
    train,
    label=target,
    categorical_feature=cat_cols,
    free_raw_data=False,
)
lTest = lgb.Dataset(
    test,
    categorical_feature=cat_cols,
    free_raw_data=False,
)

### Hyper-params

In [None]:
# Task type: used as LightGBM's `objective` and to choose the CV / ensembling strategy.
problematic = 'multiclass' # binary, regression, multiclass
# LightGBM evaluation metric; also the prefix of the '<metric>-mean' key read from the lgb.cv results.
metric = 'multi_logloss' # logloss, l1, l2, l2_root, binary_logloss, binary_error, auc, multi_error, multi_logloss
# Name of the sklearn.metrics function used to score the out-of-fold predictions.
sklearn_metric = 'log_loss' # roc_auc_score, accuracy_score, mean_absolute_error, mean_squared_error, log_loss
# Number of target classes (only applied when the objective is multiclass).
n_classes = 3
# Seed shared by the parameter sampler, the fold splitters and the models.
random_seed = 0
# Number of cross-validation folds.
n_folds = 5
# Number of random hyper-parameter combinations to evaluate.
n_iter = 30
# Upload every set of predictions to the scoring platform.
submit = True
# Also write every set of predictions to a CSV file under submission_path.
to_csv = True

In [None]:
# Baseline LightGBM configuration; the random search overrides part of it.
params = dict(
    boosting_type='gbdt',
    max_depth=-1,
    objective=problematic,
    metric=metric,
    nthread=-1,
    n_estimators=100000,
    num_leaves=128,
    learning_rate=0.05,
    max_bin=512,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=5,
    reg_lambda=10,
    random_state=random_seed,
)
# Options currently not set (kept as a tuning checklist):
#   min_split_gain: 0.5, min_child_weight: 1, min_child_samples: 5,
#   scale_pos_weight: 1, subsample_for_bin: 200, reg_sqrt: True (for regression)

# For the multiclass objective, also pass the number of classes.
if problematic == 'multiclass':
    params.update(num_class=n_classes)

# Candidate values explored by the random search.
gridParams = {
    'learning_rate': [0.01, 0.005, 0.001],
    'num_leaves' : [2**4, 2**5, 2**6, 2**7, 2**8],
    'colsample_bytree' : [0.4, 0.6, 0.8],
    'subsample' : [0.4, 0.6, 0.8],
    'reg_alpha' : [0.01, 0.1, 1],
    'reg_lambda' : [0.01, 0.1, 1]
    }

# Draw n_iter combinations; random_seed makes the draw reproducible.
param_list = list(ParameterSampler(gridParams, n_iter=n_iter, random_state=random_seed))

# Overlay each sampled combination on a copy of the baseline, so that `params`
# keeps its baseline values instead of ending up equal to the last candidate.
grid_search_params = []
for param in param_list:
    candidate = params.copy()
    candidate.update(param)
    grid_search_params.append(candidate)

### Find best params

In [None]:
%%time
# Stratified folds for every task except regression.
stratified = problematic != 'regression'

# Cross-validate every candidate and record its best mean validation score.
# NOTE(review): min() assumes a lower-is-better metric (true for the configured
# multi_logloss); confirm before switching `metric` to something like auc.
best_scores = {}
for i, param in enumerate(grid_search_params):
    cv_history = lgb.cv(param, lTrain, nfold=n_folds, stratified=stratified,
                        early_stopping_rounds=200, verbose_eval=200)
    entry = dict(best_score=min(cv_history[metric + '-mean']))
    entry.update(param)
    # Keep the full parameter dict as well, to re-train from it later.
    entry['params'] = dict(param)
    best_scores['model_{}'.format(i)] = entry

In [None]:
# One row per candidate, sorted by ascending best CV score.
grid_search_results = pd.DataFrame(best_scores).T.sort_values('best_score')
# The index holds 'model_<i>' labels, so the first row must be taken by
# position: integer keys on a labelled Series are deprecated in recent pandas.
best_params = grid_search_results['params'].iloc[0]
grid_search_results

### Retrain the top 3 models

In [None]:
def train_and_submit(params, train, target, test, n_folds, n_classes, cat_cols,
                     metric, random_seed=0, submit=True, to_csv=True):
    """Cross-validate one LightGBM configuration, score it and publish the test predictions.

    One model is trained per fold with early stopping on the held-out part.
    Out-of-fold predictions are collected for ``train`` and the per-fold
    predictions for ``test`` are averaged.

    Parameters
    ----------
    params : dict
        LightGBM parameters; ``params['objective']`` selects the fold splitter
        (``KFold`` for 'regression', ``StratifiedKFold`` otherwise) and the
        shape of the prediction arrays (2-D for 'multiclass').
    train, test : pandas.DataFrame
        Feature frames.
    target : numpy.ndarray
        Labels, indexed positionally with the fold indices.
    n_folds : int
        Number of cross-validation folds.
    n_classes : int
        Number of prediction columns for the 'multiclass' objective.
    cat_cols : list
        Categorical feature names passed to ``lgb.Dataset``.
    metric : str
        Name of the scoring function: 'log_loss', 'roc_auc_score',
        'accuracy_score', 'mean_absolute_error' or 'mean_squared_error'.
    random_seed : int, default 0
        Seed of the fold splitter.
    submit : bool, default True
        Upload the test predictions with ``submit_prediction``.
    to_csv : bool, default True
        Write the test predictions to ``submission_<score>.csv`` in the
        notebook-level ``submission_path`` folder.

    Returns
    -------
    tuple
        ``(feature_importance, oof, predictions, df_submission)`` where
        ``feature_importance`` is the per-feature mean over folds, sorted in
        descending order.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # Resolve the metric before any training so that a typo fails immediately
    # instead of after every fold has been fitted.
    scorers = {
        'log_loss': log_loss,
        'roc_auc_score': roc_auc_score,
        'accuracy_score': accuracy_score,
        'mean_absolute_error': mean_absolute_error,
        'mean_squared_error': mean_squared_error,
    }
    if metric not in scorers:
        raise ValueError('Not allowed metric: {!r} (expected one of {})'.format(metric, sorted(scorers)))
    f_score = scorers[metric]

    if params['objective'] == 'regression':
        folds = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    else:
        folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    if params['objective'] == 'multiclass':
        oof = np.zeros((len(train), n_classes))
        predictions = np.zeros((len(test), n_classes))
    else:
        oof = np.zeros(len(train))
        predictions = np.zeros(len(test))
    feature_importance = pd.DataFrame(np.zeros((train.shape[1], n_folds)), index=train.columns)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx], label=target[trn_idx],
                               categorical_feature=cat_cols, free_raw_data=False)
        val_data = lgb.Dataset(train.iloc[val_idx], label=target[val_idx],
                               categorical_feature=cat_cols, free_raw_data=False)
        # Train with the configuration received as argument (not the global
        # best_params), otherwise every call fits the same model. A copy is
        # passed so the caller's dict is identical for every fold.
        # NOTE(review): some LightGBM releases appear to remove entries from
        # the dict they receive - confirm for the installed version.
        clf = lgb.train(dict(params), trn_data, 1000000, valid_sets=[trn_data, val_data],
                        verbose_eval=200, early_stopping_rounds=200)
        oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
        predictions += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
        feature_importance.iloc[:, fold_] = clf.feature_importance()

    score = f_score(target, oof)
    print("CV score: {:<8.5f}".format(score))
    df_submission = pd.DataFrame(predictions, index=test.index)

    feature_importance = feature_importance.mean(axis=1).sort_values(ascending=False)

    if submit:
        submit_prediction(df_submission, sep=',', index=True)
    if to_csv:
        # Saved without the index: the stacking cell reads these files back
        # as plain prediction columns.
        df_submission.to_csv(os.path.join(submission_path, "submission_{}.csv".format(score)), index=False)

    return feature_importance, oof, predictions, df_submission

In [None]:
%%time
# Full cross-validated training of the best configuration found by the search.
best_model = train_and_submit(
    params=best_params,
    train=train, target=target, test=test,
    n_folds=n_folds, n_classes=n_classes, cat_cols=cat_cols,
    metric=sklearn_metric, random_seed=random_seed,
    submit=submit, to_csv=to_csv,
)

In [None]:
# Second-best configuration. The results table is indexed by 'model_<i>'
# labels, so the row is selected by position with .iloc (integer keys on a
# labelled Series are deprecated in recent pandas).
second_model = train_and_submit(params=grid_search_results['params'].iloc[1],
                                train=train,
                                target=target,
                                test=test,
                                n_folds=n_folds,
                                n_classes=n_classes,
                                cat_cols=cat_cols,
                                metric=sklearn_metric,
                                random_seed=random_seed,
                                submit=submit,
                                to_csv=to_csv)

In [None]:
# Third-best configuration. The results table is indexed by 'model_<i>'
# labels, so the row is selected by position with .iloc (integer keys on a
# labelled Series are deprecated in recent pandas).
third_model = train_and_submit(params=grid_search_results['params'].iloc[2],
                               train=train,
                               target=target,
                               test=test,
                               n_folds=n_folds,
                               n_classes=n_classes,
                               cat_cols=cat_cols,
                               metric=sklearn_metric,
                               random_seed=random_seed,
                               submit=submit,
                               to_csv=to_csv)

In [None]:
# Per-feature importance of the best model (first element returned by train_and_submit).
imp = best_model[0]
imp

### Retrain with feature selection

In [None]:
# Split the columns by whether their importance in the best model is non-zero.
used_mask = imp != 0
col_to_keep = train.loc[:, used_mask].columns.tolist()
col_to_drop = train.loc[:, ~used_mask].columns.tolist()

# Categorical columns that are dropped must also leave the categorical list.
drop_cat = list(set(col_to_drop).intersection(cat_cols))
new_cat_cols = [col for col in cat_cols if col not in drop_cat]
print(drop_cat)

# Persist the retained feature names next to the input data.
selected_features_file = os.path.join(data_path, "selected_features.csv")
pd.Series(col_to_keep).to_csv(selected_features_file, index=False)

In [None]:
%%time
# Best configuration again, restricted to the columns with non-zero importance.
best_model_feature_selection = train_and_submit(
    params=best_params,
    train=train.loc[:, col_to_keep],
    target=target,
    test=test.loc[:, col_to_keep],
    n_folds=n_folds, n_classes=n_classes, cat_cols=new_cat_cols,
    metric=sklearn_metric, random_seed=random_seed,
    submit=submit, to_csv=to_csv,
)

### Retrain without outliers

In [None]:
# Training set with the outlier rows removed, and the matching labels.
train_out = pd.read_csv(os.path.join(data_path, "X_train_prep_without_out.csv"))
# header=None reads the label file without a header row. The previous
# header=-1 is rejected by current pandas ("Passing negative integer to header
# is invalid").
target_out = pd.read_csv(os.path.join(data_path, "y_train_without_out.csv"), header=None).values.flatten()

#train_out, target_out = get_sample(train_out, target_out)

In [None]:
%%time
# Best configuration on the selected columns, trained on the outlier-free rows.
best_model_feature_selection_without_outliers = train_and_submit(
    params=best_params,
    train=train_out.loc[:, col_to_keep],
    target=target_out,
    test=test.loc[:, col_to_keep],
    n_folds=n_folds, n_classes=n_classes, cat_cols=new_cat_cols,
    metric=sklearn_metric, random_seed=random_seed,
    submit=submit, to_csv=to_csv,
)

### Stacking

In [None]:
# Output switches for the ensembling cells: write the CSV file and upload it.
to_csv = True
submit = True

In [None]:
# Stacking = plain average of the test predictions of the individual models.
if problematic == 'multiclass':
    # Only the "submission_<score>.csv" files written by train_and_submit take
    # part. The divisor must be the number of those files, not of every file
    # in the folder, otherwise outputs such as stacking.csv shrink the
    # averaged probabilities.
    submission_files = sorted(
        file_name for file_name in os.listdir(submission_path)
        if file_name.startswith('submission_')
    )
    if not submission_files:
        raise ValueError('No submission_*.csv file found in {}'.format(submission_path))

    stacking = pd.DataFrame(np.zeros((ntest, n_classes)))
    for file_name in submission_files:
        submission = pd.read_csv(os.path.join(submission_path, file_name))
        # Align the column labels so that .add() sums matching classes.
        submission.columns = stacking.columns.copy()
        stacking = stacking.add(submission / len(submission_files))
else:
    # Binary / regression: average the in-memory test predictions (third
    # element of each train_and_submit result).
    models = ['best_model', 'second_model', 'third_model', 'best_model_feature_selection']

    stacking = pd.DataFrame(np.zeros(ntest))

    for model in models:
        # eval() only resolves the hard-coded notebook variable names above.
        score = pd.DataFrame(eval(model)[2])
        stacking = stacking + score / len(models)

if submit:
    submit_prediction(stacking, sep=',', index=True)
if to_csv:
    stacking.to_csv(os.path.join(submission_path, "stacking.csv"), index=False)

In [None]:
# Blending = a random forest trained on the out-of-fold predictions of the
# individual models, then applied to their test predictions.
models = ['best_model', 'second_model', 'third_model', 'best_model_feature_selection']

if problematic in ('binary', 'regression'):
    # Each result is the tuple returned by train_and_submit:
    # index 1 = out-of-fold predictions, index 2 = test predictions.
    model_outputs = [best_model, second_model, third_model, best_model_feature_selection]

    # One column per model. This is only valid for 1-D predictions, which is
    # why the whole cell is skipped for multiclass (2-D probability tables).
    CV_scores = pd.DataFrame({name: out[1] for name, out in zip(models, model_outputs)},
                             columns=models)
    test_scores = pd.DataFrame({name: out[2] for name, out in zip(models, model_outputs)},
                               columns=models)

    if problematic == 'binary':
        clf = RandomForestClassifier(random_state=random_seed, n_estimators=100, max_depth=10)

        def blend_predict(features):
            """Probability of the positive class."""
            return clf.predict_proba(features)[:, 1]
    else:
        clf = RandomForestRegressor(random_state=random_seed, n_estimators=100, max_depth=10)
        blend_predict = clf.predict

    clf.fit(CV_scores.values, target)
    # The score is computed on the same rows the blender was fitted on, so it
    # is an optimistic estimate.
    preds = blend_predict(CV_scores.values)
    # sklearn_metric is a notebook constant naming an imported sklearn metric.
    f_score = eval(sklearn_metric)
    score = f_score(target, preds)
    print("Blend score: {:<8.5f}".format(score))

    estimates = blend_predict(test_scores.values)
    blending = pd.DataFrame(estimates, index=test.index)
    if submit:
        submit_prediction(blending, sep=',', index=True)
    if to_csv:
        blending.to_csv(os.path.join(submission_path, "blending_{}.csv".format(score)), index=False)
else:
    print("Blending is only implemented for binary and regression problems; skipped.")