In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
import datetime
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Print the names of the entries found in the ../input directory.
print(os.listdir(path="../input"))

In [None]:
# Read the training and test CSV files into DataFrames (train -> df, test -> df_test).
df, df_test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Summary statistics of the training frame, with the quartiles spelled out
# (these are the percentiles describe() reports by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Distinct store identifiers, in order of first appearance.
pd.unique(df['store'])

In [None]:
# Distinct item identifiers, in order of first appearance.
pd.unique(df['item'])

In [None]:
# Convert the 'date' column of both frames to pandas datetimes so the .dt
# accessor can be used on it later.
for _frame in (df, df_test):
    _frame['date'] = pd.to_datetime(_frame['date'])

In [None]:
# Number of distinct store ids in the training data.
stores = df['store'].unique().size

# One figure per store id 1..3: sales over time for items with id <= 3,
# restricted to rows whose year is 2015 or earlier, one line per item.
for store in range(1, 4):
    in_store = df['store'] == store
    low_item_ids = df['item'] <= 3
    up_to_2015 = df['date'].dt.year <= 2015
    df_store = df[in_store & low_item_ids & up_to_2015]

    fig, ax = plt.subplots(figsize=(20, 6))
    sns.lineplot(x='date', y='sales', hue='item', data=df_store, ax=ax)
    ax.set_title('Sales distribution in store %i between 2013 and 2015' % store)
    plt.show()

In [None]:
# Calendar components extracted from the 'date' column, in output column order.
_DATE_PARTS = ('year', 'month', 'day', 'dayofweek')


def _add_date_features(frame):
    """Return ``frame`` with its ``date`` column replaced by calendar features.

    The columns ``year``, ``month``, ``day`` and ``dayofweek`` are appended
    (read from the ``.dt`` accessor of ``date``) and ``date`` is dropped.

    If ``date`` is already gone and all four feature columns are present, the
    frame is returned unchanged, so re-running this cell is a no-op rather than
    a ``KeyError``. Any other frame without ``date`` still raises ``KeyError``.

    Args:
        frame: DataFrame with a datetime ``date`` column.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    already_expanded = 'date' not in frame.columns and all(
        part in frame.columns for part in _DATE_PARTS
    )
    if already_expanded:
        return frame

    date_accessor = frame['date'].dt
    features = {part: getattr(date_accessor, part) for part in _DATE_PARTS}
    return frame.assign(**features).drop(columns='date')


# Apply the same transformation to train and test so their feature columns match.
df = _add_date_features(df)
df_test = _add_date_features(df_test)

df.head()

In [None]:
# 'sales' is the prediction target; every other column of df is a feature.
y = df['sales']
X = df.drop(columns='sales')

In [None]:
# Hold out 20% of the rows for validation. Fixing random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel restarts.
# NOTE(review): this is a shuffled row-wise split of dated sales records, so
# validation rows are interleaved in time with training rows. Consider a
# chronological hold-out if the validation score should reflect forecasting.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# LightGBM training configuration, passed to lgb.train further down.
params = dict(
    objective='mean_squared_error',  # squared-error regression objective
    metric=['l2', 'mape'],           # both metrics are evaluated on every validation set
    num_leaves=40,
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_freq=5,
    lambda_l2=0.5,
    verbose=1,
)

In [None]:
# Wrap the training and hold-out splits in LightGBM Dataset objects.
lgb_train = lgb.Dataset(data=train_x, label=train_y)
lgb_valid = lgb.Dataset(data=test_x, label=test_y)

# Filled by lgb.train with the metric history of each entry in valid_sets;
# read afterwards by lgb.plot_metric.
evals_result = dict()

# Train for at most 10000 boosting rounds, evaluating on both the training and
# the hold-out data, with early stopping set to 50 rounds and progress logged
# every 500 rounds.
# NOTE(review): the early_stopping_rounds / evals_result / verbose_eval keyword
# arguments were removed from lgb.train in LightGBM 4.0 in favour of callbacks.
# Confirm the installed version before running.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    evals_result=evals_result,
    early_stopping_rounds=50,
    verbose_eval=500,
)

In [None]:
# Draw one learning-curve figure per recorded metric, in the order l2 then mape.
for _metric in ('l2', 'mape'):
    lgb.plot_metric(evals_result, metric=_metric)
    plt.show()

In [None]:
# Preview the first five rows of the test frame after feature engineering.
df_test.head(n=5)

In [None]:
# Peek at the first five rows of the sample submission file.
(
    pd.read_csv('../input/sample_submission.csv')
    .head(n=5)
)

In [None]:
# Feature columns fed to the model: every test column except the row identifier.
col = [name for name in df_test.columns if name != 'id']

# Predict, then round to the nearest whole number and cast to int.
sales = np.around(model.predict(df_test[col])).astype(int)

# Build the submission as a new frame. Assigning a column onto the
# df_test[['id']] slice directly triggers pandas' SettingWithCopyWarning,
# because it is ambiguous whether that slice is a view or a copy of df_test.
submission = df_test[['id']].assign(sales=sales)

In [None]:
# Preview the first five rows of the submission frame.
submission.head(n=5)

In [None]:
# Write the submission to Submission.csv in the working directory, without the index column.
submission.to_csv(path_or_buf="Submission.csv", index=False)

In [None]:
# Disabled experiment: the code below is wrapped in a triple-quoted string, so
# nothing in it executes. As the cell's only expression, the string itself is
# echoed as the cell output.
# The disabled code sets up a 5-fold GridSearchCV over an LGBMRegressor,
# varying learning_rate and max_depth, and fits it on train_x / train_y.
# NOTE(review): the grid looks adapted from an XGBoost example (its inline
# comments mention xgboost, and it uses keys such as `silent` and `nthread`).
# Verify the parameter names against LGBMRegressor before re-enabling.
'''lgb_reg = lgb.LGBMRegressor()

lgb_reg_params = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

lgb_reg_grid = GridSearchCV(lgb_reg,
                        lgb_reg_params,
                        cv = 5,
                        n_jobs = 5,
                        verbose=True)

lgb_reg_grid = lgb_reg_grid.fit(train_x,train_y,verbose=3)
'''

In [None]:
# Disabled experiment: a 5-fold lgb.cv call kept as a string literal, so it
# does not execute.
# NOTE(review): it references `param` and `train_data`, which are not defined in
# the visible part of this notebook (the cells above define `params` and
# `lgb_train`). Fix these names before re-enabling.
'''
num_round = 10
lgb.cv(param, 
       train_data, 
       num_round, 
       nfold=5,)
'''

In [None]:
# Disabled experiment: kept as a string literal, so it does not execute.
# The disabled code grid-searches n_estimators and num_leaves for an
# LGBMRegressor with 5-fold CV scored by negative MSE, then cross-validates
# the best estimator on X / y and prints the mean score as a percentage.
# NOTE(review): `gridSearchCV.best_estimator_` refers to a name that is not
# defined in the visible part of this notebook, and it would overwrite the
# `lgb_est` assigned on the preceding line. Resolve this before re-enabling.
# NOTE(review): cross_val_score is called without `scoring`, so the printed
# "%" is the estimator's default score rather than the negative MSE used in
# the grid search. Confirm that this is intended.
'''
lgb_params = {  'n_estimators': range(10, 300, 80), 
                'num_leaves': range(20,50,15)}

lgb_grid = GridSearchCV(lgb.LGBMRegressor(), lgb_params, verbose=3, cv=5, scoring='neg_mean_squared_error')
lgb_grid.fit(X, y)

lgb_est = lgb_grid.best_estimator_

lgb_est = gridSearchCV.best_estimator_
lgb_score = cross_val_score(lgb_est, X, y, cv=5)
print('LightGBoost Cross Validation Score: ', round(lgb_score.mean() * 100, 2).astype(str) + '%')
'''