In [1]:
import numpy as np
import pandas as pd
import os
import gc
import random
# Seed Python's built-in `random` module so any use of it is repeatable.
# This call does not seed NumPy's RNG; the LightGBM runs below receive their
# own explicit `seed` arguments.
random.seed(1024)

In [2]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from IPython.display import display

# Gradient Boosting
import lightgbm as lgb

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
# `run_line_magic` is the supported API; `get_ipython().magic(...)` is deprecated.
get_ipython().run_line_magic('matplotlib', 'inline')

In [3]:
# Column names shared by the train and test files.
id_col = "ID"
target_var = "target"

print("Loading Data...")
df_train = pd.read_csv("./train.csv", index_col=id_col)
df_test = pd.read_csv("./test.csv", index_col=id_col)

# Remember which rows belong to which split before the frames are merged.
training_index, test_index = df_train.index, df_test.index

# Pull the target out of the training frame (pop removes the column in place)
# and model it on the log1p scale.
y = np.log1p(df_train.pop(target_var))

for split_name, split_df in (("Train", df_train), ("Test", df_test)):
    n_rows, n_cols = split_df.shape
    print('{} shape: {} Rows, {} Columns'.format(split_name, n_rows, n_cols))
del split_name, split_df, n_rows, n_cols

print("Combine Train and Test")
df = pd.concat([df_train, df_test], axis=0)

# The per-split frames are no longer needed once combined; release their memory.
del df_train, df_test
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

Loading Data...
Train shape: 4459 Rows, 4991 Columns
Test shape: 49342 Rows, 4991 Columns
Combine Train and Test

All Data shape: 53801 Rows, 4991 Columns


In [4]:
# Modeling Datasets
test_df = df.loc[test_index,:]
vocab = df.columns

# LGBM Dataset
lgtrain = lgb.Dataset(df.loc[training_index,vocab],y ,feature_name = "auto")
print("Starting LightGBM. Train shape: {}, Test shape: {}".format(df.loc[training_index,:].shape,test_df.shape))
print("Feature Num: ",len(vocab))
del df; gc.collect();

Starting LightGBM. Train shape: (4459, 4991), Test shape: (49342, 4991)
Feature Num:  4991


In [None]:
# Baseline LightGBM configuration used by the cross-validation cells.
lgbm_params = dict(
    # Task / objective
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    # Tree shape and learning speed
    learning_rate=0.01,
    num_leaves=180,
    # Row / column subsampling
    feature_fraction=0.50,
    bagging_fraction=0.50,
    bagging_freq=4,
    max_depth=-1,
    # Regularisation (min_split_gain=0.2 was tried and left disabled)
    reg_alpha=0.3,
    reg_lambda=0.1,
    min_child_weight=10,
    # Data handling / hardware
    zero_as_missing=True,
    device='gpu',
)

In [None]:
# Find Optimal Parameters / Boosting Rounds
# Baseline 5-fold CV with early stopping for the parameters in `lgbm_params`.
lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=2000,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=75)

# Zero-based position of the lowest mean RMSE in the CV history.
optimal_rounds = int(np.argmin(lgb_cv['rmse-mean']))
best_cv_score = lgb_cv['rmse-mean'][optimal_rounds]
best_cv_stdv = lgb_cv['rmse-stdv'][optimal_rounds]

print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
    optimal_rounds,best_cv_score,best_cv_stdv))

# Start the results table from this single record. Building the frame directly
# avoids DataFrame.append (removed in pandas 2.x). The parameters are stored as
# a copy so later edits to `lgbm_params` cannot rewrite this row.
results = pd.DataFrame(
    [{"Rounds": optimal_rounds,
      "Score": best_cv_score,
      "STDV": best_cv_stdv,
      "LB": None,
      "Parameters": dict(lgbm_params)}],
    columns = ["Rounds","Score","STDV", "LB", "Parameters"])

In [None]:
# Widen text columns so the parameter dict of each run is shown in full,
# then list the runs from best (lowest) to worst score.
pd.set_option('display.max_colwidth', 800)
display(results.sort_values("Score"))

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

results = pd.DataFrame(columns = ["Rounds","Score","STDV", "Parameters"])

def objective(params):
    """Cross-validate one hyperparameter candidate and return its RMSE.

    Parameters
    ----------
    params : dict
        Candidate values for the searched hyperparameters (expected to contain
        'num_leaves' and 'feature_fraction'; any other keys are passed through
        to LightGBM unchanged). The dict is not modified.

    Returns
    -------
    float
        Lowest mean validation RMSE over the boosting rounds of a 3-fold CV.

    Side effects
    ------------
    Prints the candidate and its score, and adds one row to the module-level
    `results` table. The row stores the complete parameter set used for the
    CV run, so it can be passed to `lgb.train` as-is.
    """
    global results

    base_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        "learning_rate": 0.01,
        "bagging_fraction": 0.8,
        'bagging_freq': 4,
        "reg_alpha": 0.3,
        "reg_lambda": 0.1,
        "min_child_weight": 10,
        'zero_as_missing': True,
        'feature_fraction_seed': 5,
        'device': 'gpu',
    }

    # Normalise the candidate on a copy so the caller's dict is left untouched.
    trial_params = dict(params)
    trial_params['num_leaves'] = int(trial_params['num_leaves'])
    # Keep feature_fraction numeric (rounded to 3 decimals) rather than a string.
    trial_params['feature_fraction'] = round(float(trial_params['feature_fraction']), 3)
    cv_params = dict(base_params, **trial_params)

    cv_history = lgb.cv(
        params = cv_params,
        train_set = lgtrain,
        num_boost_round=2000,
        stratified=False,
        nfold = 3,
        verbose_eval=10000,
        seed = 1234,
        early_stopping_rounds=75)

    # Zero-based position of the lowest mean RMSE in the CV history.
    optimal_rounds = int(np.argmin(cv_history['rmse-mean']))
    score = cv_history['rmse-mean'][optimal_rounds]
    score_stdv = cv_history['rmse-stdv'][optimal_rounds]
    print("Params: {}".format(trial_params))
    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds,score,score_stdv))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x.
    trial_row = pd.DataFrame([{"Rounds": optimal_rounds,
                               "Score": score,
                               "STDV": score_stdv,
                               "Parameters": cv_params}])
    results = pd.concat([results, trial_row], ignore_index=True)
    print("###########################################################################################")

    return score

# Hyperparameter search space.
# The original literal defined 'num_leaves' twice; Python keeps only the last
# value for a repeated key, so the hp.quniform entry was never used. Only the
# effective hp.choice definition is kept here.
space = {
    'num_leaves': hp.choice('num_leaves', [12,36,60,100,140,180]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1.0),
    'max_depth': hp.choice('max_depth', [-1,4,6,8,12,16,24,32]),
}

# Run the TPE search, minimising the CV RMSE returned by `objective`.
# NOTE(review): per the hyperopt docs, for hp.choice dimensions `best` holds the
# index of the selected option rather than the option itself - confirm before
# using `best` directly as model parameters.
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=500)

Params: {'feature_fraction': '0.994', 'max_depth': 6, 'num_leaves': 36}
Optimal Round: 701
Optimal Score: 1.4475485462521605 + 0.008393263017899597
###########################################################################################
Params: {'feature_fraction': '0.663', 'max_depth': 16, 'num_leaves': 60}
Optimal Round: 393
Optimal Score: 1.4354457219911687 + 0.005831930719646793
###########################################################################################
Params: {'feature_fraction': '0.851', 'max_depth': 24, 'num_leaves': 12}
Optimal Round: 710
Optimal Score: 1.449511279197058 + 0.009030143247951589
###########################################################################################
Params: {'feature_fraction': '0.815', 'max_depth': 6, 'num_leaves': 140}
Optimal Round: 778
Optimal Score: 1.4465160447131524 + 0.0076460385052734454
###########################################################################################
Params: {'feature_fraction': '0.531',

In [None]:
# Sweep the learning rate, cross-validating each value with early stopping.
learning_rates = [0.05, 0.02, 0.01 ,0.005]
for param in learning_rates:
    print("Learning Rate: ", param)
    # Give every trial its own parameter dict. Mutating the shared `lgbm_params`
    # and storing that same object in each row would make every recorded row
    # show the last learning rate tried.
    trial_params = dict(lgbm_params, learning_rate=param)
    # Find Optimal Parameters / Boosting Rounds
    lgb_cv = lgb.cv(
        params = trial_params,
        train_set = lgtrain,
        num_boost_round=10000,
        stratified=False,
        nfold = 5,
        verbose_eval=200,
        seed = 23,
        early_stopping_rounds=75)

    # Zero-based position of the lowest mean RMSE in the CV history.
    optimal_rounds = int(np.argmin(lgb_cv['rmse-mean']))
    best_cv_score = lgb_cv['rmse-mean'][optimal_rounds]
    best_cv_stdv = lgb_cv['rmse-stdv'][optimal_rounds]

    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds,best_cv_score,best_cv_stdv))
    print("###########################################################################################")

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.x.
    trial_row = pd.DataFrame([{"Rounds": optimal_rounds,
                               "Score": best_cv_score,
                               "STDV": best_cv_stdv,
                               "LB": None,
                               "Parameters": trial_params}])
    results = pd.concat([results, trial_row], ignore_index=True)

In [None]:
# Show every recorded run, best (lowest) score first, with text columns wide
# enough to read the whole parameter dict.
pd.set_option('display.max_colwidth', 800)
display(results.sort_values("Score"))

In [None]:
# Best Parameters
# Pick the run with the lowest CV score. The scores are converted to numeric
# first because the column may be object-typed, and the row is looked up by
# label (.loc) because idxmin() returns an index label, not a position.
best_row = results.loc[pd.to_numeric(results["Score"]).idxmin()]

# Copy the parameters so later changes to them do not alter the results table.
final_model_params = dict(best_row["Parameters"])
optimal_rounds = int(best_row["Rounds"])
print("Parameters for Final Models:\n",final_model_params)
print("Score: {} +/- {}".format(best_row["Score"],best_row["STDV"]))
print("Rounds: ", optimal_rounds)

In [None]:
# Run Model with different Seeds
# Train the final configuration once per seed, collecting test predictions and
# feature importances from every run.
multi_seed_pred = dict()
seed_importance_frames = []

all_seeds = [0,20,400,4000,20000]
for seeds_x in all_seeds:
    print("Seed: ", seeds_x,)
    # Per-seed copy: leaves `final_model_params` (and any table row that
    # references the same dict) unchanged.
    seed_params = dict(final_model_params, seed=seeds_x)
    lgb_reg = lgb.train(
        seed_params,
        lgtrain,
        # optimal_rounds is a zero-based index into the CV history, hence +1.
        num_boost_round = optimal_rounds + 1,
        verbose_eval=200)

    # Feature Importance
    seed_importance_frames.append(pd.DataFrame({
        "feature": np.asarray(vocab),
        "importance": lgb_reg.feature_importance(),
    }))

    multi_seed_pred[seeds_x] =  list(lgb_reg.predict(test_df))
    print("###########################################################################################")
    del lgb_reg

# Concatenate once after the loop instead of growing a DataFrame per iteration.
all_feature_importance_df = pd.concat(seed_importance_frames, axis=0)

In [None]:
# Rank features by their mean importance and keep the 50 highest.
mean_importance = all_feature_importance_df[["feature", "importance"]].groupby("feature").mean()
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)
cols = ranked_importance[:50].index

# Keep every recorded row for those features; the bar plot aggregates them.
is_top_feature = all_feature_importance_df.feature.isin(cols)
best_features = all_feature_importance_df.loc[is_top_feature]

# NOTE(review): the title mentions "folds", but the rows plotted here are
# whatever was accumulated in all_feature_importance_df - confirm the wording.
fig, ax = plt.subplots(figsize=(8,10))
sns.barplot(x="importance", y="feature",
            data=best_features.sort_values(by="importance", ascending=False),
            ax=ax)
ax.set_title('LightGBM Features (avg over folds)')
fig.tight_layout()
fig.savefig('lgbm_importances.png')

# To DataFrame
# One column per seed; exact zeros are swapped for a tiny positive value.
sub_preds = pd.DataFrame.from_dict(multi_seed_pred).replace(0,0.000001)
del multi_seed_pred
gc.collect();

In [None]:
# Take Mean over Seed prediction
# Average the per-seed predictions (log1p scale) and map back with expm1.
mean_sub = np.expm1(sub_preds.mean(axis=1).rename(target_var))
mean_sub.index = test_index

# Label the file with the best recorded CV score. `best_cv_score` only holds
# the score of the most recent CV run, which need not match `optimal_rounds`.
final_cv_score = float(pd.to_numeric(results["Score"]).min())

# Submit
mean_sub.to_csv('mean_sub_ep{}_sc{}.csv'.format(optimal_rounds,round(final_cv_score,5))
            ,index = True, header=True)
mean_sub.head()