In [1]:
import lightgbm

In [2]:
import time
notebookstart= time.time()
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import random
random.seed(2018)
from sklearn import *
# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split

# Gradient Boosting
import lightgbm as lgb



In [3]:
id_col = "ID"
target_var = "target"

# House Keeping Parameters
Debug = False
Home = False
Build_Results_csv = False # if running for first time

results = pd.DataFrame(columns = ["Rounds","Score","STDV", "LB", "Parameters"])
print("Data Load Stage")

Data Load Stage


In [4]:
training = pd.read_csv('train.csv', index_col = id_col)
if Debug is True : training = training.sample(100)
traindex = training.index
testing = pd.read_csv('test.csv', index_col = id_col)
if Debug is True : testing = testing.sample(100)
testdex = testing.index

In [5]:
y = np.log1p(training[target_var])
training.drop(target_var,axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*training.shape))
print('Test shape: {} Rows, {} Columns'.format(*testing.shape))

Train shape: 4459 Rows, 4991 Columns
Test shape: 49342 Rows, 4991 Columns


In [6]:
print("Combine Train and Test")
df = pd.concat([training,testing],axis=0)
del training, testing
gc.collect()
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

# Modeling Datasets
test_df = df.loc[testdex,:]
vocab = df.columns

Combine Train and Test

All Data shape: 53801 Rows, 4991 Columns


In [7]:
lgtrain = lgb.Dataset(df.loc[traindex,vocab],y ,feature_name = "auto")
print("Starting LightGBM. Train shape: {}, Test shape: {}".format(df.loc[testdex,:].shape,test_df.shape))
print("Feature Num: ",len(vocab))
del df; gc.collect();

print("Light Gradient Boosting Regressor: ")
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    "learning_rate": 0.01,
    "num_leaves": 200,
    "feature_fraction": 0.50,
    "bagging_fraction": 0.50,
    'bagging_freq': 4,
    "max_depth": -1,
    "reg_alpha": 0.3,
    "reg_lambda": 0.1,
    #"min_split_gain":0.2,
    "min_child_weight":10,
    'zero_as_missing':True
}

modelstart= time.time()
# Find Optimal Parameters / Boosting Rounds
lgb_cv = lgb.cv(
    params = lgbm_params,
    train_set = lgtrain,
    num_boost_round=2500,
    stratified=False,
    nfold = 5,
    verbose_eval=50,
    seed = 23,
    early_stopping_rounds=75)

optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
best_cv_score = min(lgb_cv['rmse-mean'])

Starting LightGBM. Train shape: (49342, 4991), Test shape: (49342, 4991)
Feature Num:  4991
Light Gradient Boosting Regressor: 
[50]	cv_agg's rmse: 1.58742 + 0.0305161
[100]	cv_agg's rmse: 1.50458 + 0.0316924
[150]	cv_agg's rmse: 1.46337 + 0.0325538
[200]	cv_agg's rmse: 1.44265 + 0.0327991
[250]	cv_agg's rmse: 1.43354 + 0.032863
[300]	cv_agg's rmse: 1.43088 + 0.0337591
[350]	cv_agg's rmse: 1.42942 + 0.0337614
[400]	cv_agg's rmse: 1.43018 + 0.0342158


In [8]:
print("\nOptimal Round: {}\nOptimal Score: {} + {}".format(
    optimal_rounds,best_cv_score,lgb_cv['rmse-stdv'][optimal_rounds]))


Optimal Round: 339
Optimal Score: 1.4292676122745214 + 0.03369025245241405


In [9]:
results = results.append({"Rounds": optimal_rounds,
                          "Score": best_cv_score,
                          "STDV": lgb_cv['rmse-stdv'][optimal_rounds],
                          "LB": None,
                          "Parameters": lgbm_params}, ignore_index=True)
        
learning_rates = [0.012,0.008,0.016]
for param in learning_rates:
    print("Learning Rate: ", param)
    modelstart= time.time()
    lgbm_params["learning_rate"] = param
    # Find Optimal Parameters / Boosting Rounds
    lgb_cv = lgb.cv(
        params = lgbm_params,
        train_set = lgtrain,
        num_boost_round=10000,
        stratified=False,
        nfold = 5,
        verbose_eval=200,
        seed = 23,
        early_stopping_rounds=75)

    optimal_rounds = np.argmin(lgb_cv['rmse-mean'])
    best_cv_score = min(lgb_cv['rmse-mean'])

    print("Optimal Round: {}\nOptimal Score: {} + {}".format(
        optimal_rounds,best_cv_score,lgb_cv['rmse-stdv'][optimal_rounds]))

    results = results.append({"Rounds": optimal_rounds,
                              "Score": best_cv_score,
                              "STDV": lgb_cv['rmse-stdv'][optimal_rounds],
                              "LB": None,
                              "Parameters": lgbm_params}, ignore_index=True)

Learning Rate:  0.012
[200]	cv_agg's rmse: 1.4344 + 0.0334958
Optimal Round: 284
Optimal Score: 1.4287010145278027 + 0.034950291968031635
Learning Rate:  0.008
[200]	cv_agg's rmse: 1.45785 + 0.0314299
[400]	cv_agg's rmse: 1.42848 + 0.0341142
Optimal Round: 445
Optimal Score: 1.4269354837748918 + 0.0343219787670181
Learning Rate:  0.016
[200]	cv_agg's rmse: 1.42889 + 0.0343656
Optimal Round: 195
Optimal Score: 1.428719227824455 + 0.034146578426916964


In [10]:
if Home is True:
        with open('results.csv', 'a') as f:
            results.to_csv(f, header=False)

In [11]:
final_model_params = results.iloc[results["Score"].idxmin(),:]["Parameters"]
optimal_rounds = results.iloc[results["Score"].idxmin(),:]["Rounds"]

In [13]:
allmodelstart= time.time()
# Run Model with different Seeds
multi_seed_pred = dict()
all_feature_importance_df  = pd.DataFrame()

In [14]:
all_seeds = [27,22,300,401,7]
for seeds_x in all_seeds:
    modelstart= time.time()
    print("Seed: ", seeds_x,)
    # Go Go Go
    final_model_params["seed"] = seeds_x
    lgb_reg = lgb.train(
        final_model_params,
        lgtrain,
        num_boost_round = optimal_rounds + 1,
        verbose_eval=200)

    # Feature Importance
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = vocab
    fold_importance_df["importance"] = lgb_reg.feature_importance()
    all_feature_importance_df = pd.concat([all_feature_importance_df, fold_importance_df], axis=0)

    multi_seed_pred[seeds_x] =  list(lgb_reg.predict(test_df))
    #del lgb_reg

Seed:  27
Seed:  22
Seed:  300
Seed:  401
Seed:  7


In [15]:
all_feature_importance_df.head()

Unnamed: 0,feature,importance
0,48df886f9,0
1,0deb4b6a8,0
2,34b15f335,0
3,a8cb14b00,0
4,2f0771a37,0


In [14]:
print("All Model Runtime: %0.2f Minutes"%((time.time() - allmodelstart)/60))

sub_preds = pd.DataFrame.from_dict(multi_seed_pred).replace(0,0.000001)
del multi_seed_pred; gc.collect();

lgb_ans = np.expm1(sub_preds.mean(axis=1))
mean_sub = np.expm1(sub_preds.mean(axis=1).rename(target_var))
mean_sub.index = testdex

# Submit
mean_sub.to_csv('lgb.csv'
            ,index = True, header=True)
print("Runtime: %0.2f Minutes"%((time.time() - notebookstart)/60))


All Model Runtime: 3.65 Minutes
Runtime: 14.49 Minutes


In [15]:
import pickle

In [24]:
filename = 'finalized_lgb_model.sav'
pickle.dump( lgb_reg, open(filename, 'wb'))

ValueError: binary mode doesn't take an encoding argument

In [None]:
loaded_model = pickle.load(open(filename, 'rb'))