In [None]:
!pip install lightgbm

In [None]:
# Dependencies 

import os
import gc
from functools import partial, wraps
from datetime import datetime as dt
import warnings
warnings.simplefilter('ignore', FutureWarning)

import numpy as np 
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
# Read the training set, the test set and the sample submission from CSV
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
ss = pd.read_csv('Submissions/sample_submission.csv')

# RMSLE is not available as a built-in LightGBM metric, so define it here.

def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between two equally long sequences.

    Computes sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)).

    Parameters
    ----------
    y_true : array-like of non-negative numbers
        Ground-truth values on the original (non-log) scale.
    y_pred : array-like of non-negative numbers
        Predicted values on the original (non-log) scale.

    Returns
    -------
    float
        The RMSLE; 0.0 when both inputs are identical.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred)
    # Accept plain lists/tuples as well as NumPy arrays or pandas Series.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # np.log1p(v) already evaluates log(1 + v); adding a further 1 to the
    # argument would compute log(v + 2) and distort the metric.
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

In [None]:
# Target vector: natural log of the order counts
y = np.log(train['num_orders'].values)

# Predictor matrix: every training column except the row id and the target
X = np.array(train.drop(columns=['id', 'num_orders']))

In [None]:
# Test-set predictor matrix: every test column except the row id
X_test = np.array(test.drop(columns=['id']))

In [None]:
# Median of the raw (non-log) target over the training set
median_num_orders = np.median(train.loc[:, 'num_orders'].values)

In [None]:
# LightGBM training parameters for the current run.
# "was N" notes carry over the values from the original inline comments
# (presumably earlier settings that were tried -- confirm with the author).
params = dict(
    num_leaves=128,        # was 64
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    max_depth=7,           # was 6
    learning_rate=0.1,
    lambda_l1=0.1,
    feature_fraction=0.7,
    bagging_freq=6,
    bagging_fraction=0.2,
    bagging_seed=42,
    verbosity=-1,          # was 1
)

# Upper bound on boosting rounds (was 750)
n_estimators = 1000

- Baseline parameters of the LightGBM model:

```python
params = {
    'num_leaves' : 64,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'max_depth': 6, 
    'learning_rate': 0.1,
    "lambda_l1": 0.1,
    'feature_fraction': 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.4,
    "verbosity": -1}

n_estimators = 160
```

In [None]:
# Train, evaluate and predict, repeated over several random hold-out splits.
# Each round fits a model on a fresh 90/10 split (seeded by the round index),
# scores it on the held-out 10% and predicts the test set. The test-set
# predictions of all rounds are averaged at the end.
n_iters = 10
preds_buf = []  # per-round test-set predictions, original scale
err_buf = []    # per-round validation RMSLE
for i in range(n_iters):
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train)
    d_valid = lgb.Dataset(x_valid, label=y_valid, reference=d_train)

    # Early stopping and per-iteration logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments were removed
    # from lgb.train() in LightGBM 4.0 and raise a TypeError there.
    model = lgb.train(
        params,
        d_train,
        num_boost_round=n_estimators,
        valid_sets=[d_train, d_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=25),
            lgb.log_evaluation(period=1),
        ],
    )

    # The model is fitted on log(num_orders), so predictions are mapped back
    # with exp(). exp() is never negative, hence no "negative prediction"
    # clean-up is needed afterwards.
    valid_preds = np.exp(model.predict(x_valid, num_iteration=model.best_iteration))
    err = rmsle(np.exp(y_valid), valid_preds)
    err_buf.append(err)
    print(f'RMSLE = {err}')

    # Use the same (best) iteration for the test set as for validation.
    test_preds = np.exp(model.predict(X_test, num_iteration=model.best_iteration))
    preds_buf.append(test_preds)

print(f'Mean RMSLE = {np.mean(err_buf)} +/- {np.std(err_buf)}')
# Average predictions
preds = np.mean(preds_buf, axis=0)

1. Mean RMSLE = 0.5844806514684168 +/- 0.00211429735921519
2. Mean RMSLE = 0.5699200420771778 +/- 0.0025914016800524468
3. Mean RMSLE = 0.5615087285976806 +/- 0.002934098036840698
4. Mean RMSLE = 0.5559986378972583 +/- 0.0025083052868352653
5. Mean RMSLE = 0.5434706176600281 +/- 0.0032578217336534324
6. Mean RMSLE = 0.5276496861152888 +/- 0.002364611153512707
7. Mean RMSLE = 0.4788315817420143 +/- 0.0015999403935491486

In [None]:
# Build the submission table (test ids + averaged predictions) and write it to CSV
subm = pd.DataFrame({
    'id': test['id'].values,
    'num_orders': preds,
})
subm.to_csv('final_submission_lightgbm_sayak.csv', index=False)