# Load Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load Data

In [None]:
# All competition CSVs are read from the same input directory. Keeping that
# directory in one constant means pointing the notebook at a different data
# location is a one-line change instead of six.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

item_categories = pd.read_csv(os.path.join(DATA_DIR, "item_categories.csv"))
items = pd.read_csv(os.path.join(DATA_DIR, "items.csv"))
train_df = pd.read_csv(os.path.join(DATA_DIR, "sales_train.csv"))
sample_sub = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
shops = pd.read_csv(os.path.join(DATA_DIR, "shops.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Show the first five rows of the raw training frame as the cell output.
train_df.head(n=5)

# Data Cleaning
* drop rows with item_price <= 0 or item_price >= 100000
* if item_cnt_day < 0, set item_cnt_day = 0
* drop rows with item_cnt_day >= 1001

In [None]:
# Clip negative daily counts to zero.
# The assignment must target the `item_cnt_day` column only: assigning through
# a bare boolean row mask (`train_df[mask] = 0`) overwrites EVERY column of the
# matching rows with 0, including `item_price`, so the price filter below would
# then silently drop those rows instead of keeping them with a count of 0.
train_df.loc[train_df["item_cnt_day"] < 0, "item_cnt_day"] = 0

# Keep rows whose price lies strictly between 0 and 100000 and whose daily
# count is below 1001; everything else is treated as an outlier.
price_ok = (train_df["item_price"] > 0) & (train_df["item_price"] < 100000)
count_ok = train_df["item_cnt_day"] < 1001

# The price is only needed for the outlier filter, so drop it afterwards.
# A chained `drop` (rather than `inplace=True`) returns a fresh frame.
train_df = train_df[price_ok & count_ok].drop(columns=["item_price"])

# Work on an independent copy so later reshaping of X never touches train_df.
X = train_df.copy()

In [None]:
# Show the first five rows of the cleaned working copy as the cell output.
X.head(n=5)

# Summarize sales by month

In [None]:
# Total the daily counts per (month block, shop, item).
# Only `item_cnt_day` is summed. Without the explicit column selection,
# `.sum()` is applied to every remaining non-key column, and any string
# column still in the frame (presumably the raw `date` column — confirm
# against the CSV schema) is either concatenated or dropped depending on the
# pandas version.
group_keys = ["date_block_num", "shop_id", "item_id"]
X = (
    X.groupby(group_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
X.head()

# Pivot into one time series per (shop, item): months 0 ~ 33

In [None]:
# Reshape from long to wide: one row per (shop_id, item_id) pair and one
# column per date_block_num value; pairs with no record in a month get 0.
# reset_index() turns the two index levels back into ordinary leading columns.
X = pd.pivot_table(
    X,
    values="item_cnt_month",
    index=["shop_id", "item_id"],
    columns="date_block_num",
    fill_value=0,
).reset_index()
X.head()

In [None]:
# Show the first five rows of the test frame as the cell output.
test_df.head(n=5)

# Prepare training data

In [None]:
# Split the wide table into features and target:
#   features = every column except the last one
#   target   = the last column
wide_values = X.values
X_train = np.array(wide_values[:, :-1])
Y_train = np.array(wide_values[:, -1])
for split in (X_train, Y_train):
    print(split.shape)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the full training arrays.
# The fit call is left as the last expression so the fitted estimator is
# shown as the cell output.
model_linear = LinearRegression()
model_linear.fit(X=X_train, y=Y_train)

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Hyper-parameters passed to LightGBM as-is. The boosting-round budget and
# the early-stopping patience are supplied through this dict rather than as
# keyword arguments of lgb.train.
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1023,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.7,
    'learning_rate': 0.01,
    'num_rounds': 3000,
    'early_stopping_rounds': 300,
    'seed': 1,
}

# Hold out the final rows of the training arrays as the validation set.
n_holdout = 300
fit_rows = slice(None, -n_holdout)
holdout_rows = slice(-n_holdout, None)
lgb_train = lgb.Dataset(X_train[fit_rows], Y_train[fit_rows])
lgb_valid = lgb.Dataset(X_train[holdout_rows], Y_train[holdout_rows])

# NOTE(review): `verbose_eval` may not be accepted by newer LightGBM
# releases — confirm against the installed version.
model_lgb = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    verbose_eval=50,
)

# Persist the trained booster next to the notebook.
model_lgb.save_model('model_lgb.txt')

# Predict Test

In [None]:
# Predict the next month for every test row in one vectorised pass.
#
# The previous per-row loop filtered the whole of X with a boolean mask twice
# for every test row and called predict() once per row, which is
# O(len(test_df) * len(X)). A single left join plus one batched predict()
# yields the same id_list / pred_list.
model = model_lgb

key_cols = ["shop_id", "item_id"]

# X is laid out as [shop_id, item_id, <one column per month block>].
# The model was fitted on every column of X except the last, so the window is
# shifted by one month here: keep the two id columns, skip the first month
# column (position 2) and use every later month column.
feature_cols = key_cols + list(X.columns[3:])

# A left join keeps test_df's row order; `_merge` marks the (shop, item)
# pairs that have a row in X.
merged = test_df[["ID"] + key_cols].merge(
    X, on=key_cols, how="left", indicator=True
)
# (shop_id, item_id) is unique in X, so the join must not duplicate test rows.
assert len(merged) == len(test_df), "duplicate (shop_id, item_id) keys in X"

has_history = (merged["_merge"] == "both").values

# Pairs never seen in training default to a prediction of 0.0.
predictions = np.zeros(len(merged), dtype=float)
if has_history.any():
    raw_pred = model.predict(merged.loc[has_history, feature_cols].values)
    # Clamp predictions into [0, 20]: negatives become 0, values above 20
    # become 20.
    predictions[has_history] = np.clip(raw_pred, 0.0, 20.0)

id_list = merged["ID"].tolist()
pred_list = predictions.tolist()

In [None]:
# Sanity check: both lists should have one entry per test row.
print(len(id_list))
print(len(pred_list))

# Assemble the two-column submission frame in a single constructor call.
submission = pd.DataFrame({'ID': id_list, 'item_cnt_month': pred_list})
submission.head()

In [None]:
# Write the submission to the working directory without the row index.
submission_file = 'submission.csv'
submission.to_csv(submission_file, index=False)