In [None]:
#load libraries
!pip install lightgbm
!pip install xgboost
import lightgbm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor , BaggingRegressor
from catboost import CatBoostRegressor
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

In [None]:
# Load every competition CSV; all files live in the same Kaggle input folder.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

items_df = pd.read_csv(DATA_DIR + '/items.csv')
samples_df = pd.read_csv(DATA_DIR + '/sample_submission.csv')
itemsCat_df = pd.read_csv(DATA_DIR + '/item_categories.csv')
sales_df = pd.read_csv(DATA_DIR + '/sales_train.csv')
shopes_df = pd.read_csv(DATA_DIR + '/shops.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')

**Analysis**

In [None]:
# Print a banner followed by DataFrame.info() for each loaded dataset.
labelled_frames = [
    ('---------items----------', items_df),
    ('--------samples---------', samples_df),
    ('--------items Category---------', itemsCat_df),
    ('--------Sales---------', sales_df),
    ('--------Shopes---------', shopes_df),
    ('--------Test---------', test_df),
]
for banner, frame in labelled_frames:
    print(banner)
    frame.info()

In [None]:
# Derive calendar features from the day.month.year `date` strings.
# The column is parsed once and reused for all three features instead of
# re-parsing the full sales table for each one.
parsed_dates = pd.DatetimeIndex(pd.to_datetime(sales_df['date'], format='%d.%m.%Y'))
sales_df["month"] = parsed_dates.month
sales_df["year"] = parsed_dates.year
sales_df["day"] = parsed_dates.day
sales_df.head(10)

In [None]:
# Report how many distinct items and shops appear in the sales history.
for label, column_name in (("unique item", "item_id"), ("unique shop", "shop_id")):
    print(label, len(sales_df[column_name].unique()))

In [None]:
# Total units sold per calendar month in 2013.
sales_2013 = sales_df.loc[sales_df["year"] == 2013, ["month", "item_cnt_day"]]
df = sales_2013.groupby("month", as_index=False).sum()
plt.plot(df["month"], df["item_cnt_day"])

In [None]:
# Total units sold per calendar month in 2014.
sales_2014 = sales_df.loc[sales_df["year"] == 2014, ["month", "item_cnt_day"]]
df = sales_2014.groupby("month", as_index=False).sum()
plt.plot(df["month"], df["item_cnt_day"])

In [None]:
# Total units sold per calendar month in 2015.
sales_2015 = sales_df.loc[sales_df["year"] == 2015, ["month", "item_cnt_day"]]
df = sales_2015.groupby("month", as_index=False).sum()
plt.plot(df["month"], df["item_cnt_day"])

In [None]:
#sales_df["category"]=sales_df["item_id"].apply(lambda x: items_df[items_df["item_id"]==x]["item_category_id"].values[0])

In [None]:
# Roll daily sales up to one row per item/shop/month combination, summing the
# daily counts into a monthly total named `item_cnt_month`.
group_keys = ["item_id", "shop_id", "month", "year", "date_block_num"]
train_df = (
    sales_df
    .groupby(group_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)
train_df.head(5)

In [None]:
# Box plot of the monthly totals to make outliers visible.
train_df[["item_cnt_month"]].boxplot()

In [None]:
# Cap the monthly totals to the [0, 1200] range, then redraw the box plot.
# NOTE(review): the 1200 ceiling is a hand-picked outlier cut-off; confirm it
# against the target range the competition actually scores on.
lower_cap, upper_cap = 0, 1200
train_df["item_cnt_month"] = train_df["item_cnt_month"].clip(lower=lower_cap, upper=upper_cap)
train_df.boxplot(column=["item_cnt_month"])

In [None]:
# Disabled: per-item category lookup from items_df.
#test_df["category"]=test_df["item_id"].apply(lambda x: items_df[items_df["item_id"]==x]["item_category_id"].values[0])

# Tag every test row with the same constant date features:
# year 2015, month 11, date_block_num 34.
constant_features = {"year": 2015, "month": 11, "date_block_num": 34}
for column_name, constant in constant_features.items():
    test_df[column_name] = constant
test_df.head(5)

In [None]:
# Hold out a random 10% of the monthly rows for validation (fixed seed).
# NOTE(review): rows are time-indexed (date_block_num), so a random split lets
# later months inform predictions for earlier ones; consider holding out the
# most recent block(s) instead — confirm the intended validation scheme.
features = ["item_id", "shop_id", "month", "year", "date_block_num"]
feature_matrix = train_df[features]
target = train_df["item_cnt_month"]
train_X, val_X, train_y, val_y = train_test_split(
    feature_matrix, target, test_size=0.1, random_state=0
)

**Apply different machine learning models**

In [None]:
#1 - random forest: 50 trees with a fixed seed, scored on the validation split
forest = RandomForestRegressor(n_estimators=50, random_state=0)
forest.fit(train_X, train_y)
forest.score(val_X, val_y)

In [None]:
# Validation RMSE of the random forest.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases no longer accept; the value is the same.
y_predicted = forest.predict(val_X)
np.sqrt(mean_squared_error(val_y, y_predicted))

In [None]:
#2 - Light gradient boosting
# Per-iteration RMSE on both the training and validation splits is logged via
# the `log_evaluation` callback; the `verbose` argument of `fit` is not
# accepted by current LightGBM releases. period=1 logs every boosting round.
lgb = LGBMRegressor().fit(
    train_X,
    train_y,
    eval_set=[(train_X, train_y), (val_X, val_y)],
    eval_metric='rmse',
    callbacks=[lightgbm.log_evaluation(period=1)],
)
lgb.score(val_X, val_y)

In [None]:
# Validation RMSE of the LightGBM model.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases no longer accept; the value is the same.
y_predicted = lgb.predict(val_X)
np.sqrt(mean_squared_error(val_y, y_predicted))

In [None]:
#3 - XGradient boosting
# `eval_metric` is configured on the estimator itself: passing it to `fit`
# is deprecated/removed in recent XGBoost releases. `verbose=True` still
# prints the metric for each evaluation set on every boosting round.
xgb = XGBRegressor(eval_metric='rmse').fit(
    train_X,
    train_y,
    eval_set=[(train_X, train_y), (val_X, val_y)],
    verbose=True,
)
xgb.score(val_X, val_y)

In [None]:
# Validation RMSE of the XGBoost model.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases no longer accept; the value is the same.
y_predicted = xgb.predict(val_X)
np.sqrt(mean_squared_error(val_y, y_predicted))

In [None]:
#4 - gradient boosting: scikit-learn defaults with a fixed seed, scored on the validation split
gb = GradientBoostingRegressor(random_state=0)
gb.fit(train_X, train_y)
gb.score(val_X, val_y)

In [None]:
# Validation RMSE of the gradient boosting model.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases no longer accept; the value is the same.
y_predicted = gb.predict(val_X)
np.sqrt(mean_squared_error(val_y, y_predicted))

In [None]:
# Refit XGBoost on every monthly row and write the submission file.
full_X = train_df[features]
full_y = train_df["item_cnt_month"]

# `eval_metric` is set on the estimator because passing it to `fit` is
# deprecated/removed in recent XGBoost releases.
# The (val_X, val_y) rows were split out of train_df, so here they are part of
# the training data and their logged RMSE is not a held-out score.
xgb = XGBRegressor(eval_metric='rmse').fit(
    full_X,
    full_y,
    eval_set=[(full_X, full_y), (val_X, val_y)],
    verbose=True,
)
pred = xgb.predict(test_df[features])

# The training target was clipped to [0, 1200], so predictions outside that
# range (notably negative counts) are clamped back into it before rounding.
# NOTE(review): the competition is understood to score against targets clipped
# to [0, 20]; if confirmed, tighten this bound and the training clip to match.
pred = np.clip(pred, 0, 1200)

submission = pd.DataFrame({"ID": test_df["ID"], "item_cnt_month": pred.round()})
submission.to_csv("submission.csv", index=False)