In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predict Future Sales
## Load Data

In [None]:
# Directory that holds the competition CSV files (Kaggle's read-only input area).
DATA_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

# Raw competition tables.
items = pd.read_csv(f"{DATA_DIR}/items.csv")
item_categoria = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv")

# Attach the category attributes to every item. The left join keeps all items;
# `validate` fails loudly if a category id is duplicated in the lookup table,
# which would otherwise silently duplicate item (and later sales) rows.
items_full = items.merge(
    item_categoria,
    on="item_category_id",
    how="left",
    validate="many_to_one",
)

In [None]:
# Competition files used for the final submission:
# `submissao` is the sample submission file, `teste` is the test set.
submissao = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv")
teste = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/test.csv")

In [None]:
# Build the monthly sales table: one row per (month, shop, item).
#
# 1. Enrich every sales record with shop and item/category attributes.
# 2. Parse the date and derive the monthly period.
# 3. Compute the revenue of each record *before* aggregating, so the monthly
#    revenue is the sum of price * quantity rather than
#    mean(price) * sum(quantity), which is only correct when the price never
#    changes within the month.
# 4. Aggregate per month/shop/item and derive the major category, i.e. the text
#    before the first '-' in the category name.
#
# The aggregated columns keep their original `*_day` names because later cells
# rename them to `*_month`.
sales_full = (
    sales
    .merge(shops, on="shop_id", how="left")
    .merge(items_full, on="item_id", how="left")
    .assign(
        date=lambda df: pd.to_datetime(df["date"], format="%d.%m.%Y"),
        month=lambda df: df["date"].dt.to_period("M"),
        item_sales_day=lambda df: df["item_price"] * df["item_cnt_day"],
    )
    .groupby([
        "month",
        "shop_id",
        "shop_name",
        "item_category_id",
        "item_category_name",
        "item_id",
        "item_name",
    ])
    .agg({"item_price": "mean", "item_cnt_day": "sum", "item_sales_day": "sum"})
    .reset_index()
    .assign(
        item_category_name=lambda df: df["item_category_name"].str.strip(),
        # Vectorised equivalent of splitting each name on '-' and keeping the first part.
        item_major_category=lambda df: (
            df["item_category_name"].str.split("-").str[0].str.strip()
        ),
    )
)

sales_full.head()

In [None]:
# Most recent monthly period present in the aggregated sales table.
sales_full.month.max()

In [None]:
# Total units sold per month across all shops and items.
# The column is selected before summing: `sum('item_cnt_day')` does not select a
# column, it passes the string into the first positional parameter of
# `GroupBy.sum` (`numeric_only`) and only worked by accident.
# `groupby` already returns the months in sorted order, so no extra sort is needed.
monthly_units = sales_full.groupby('month')[['item_cnt_day']].sum()

ax = monthly_units.plot.line(y='item_cnt_day', title='Total items sold per month')
ax.set(xlabel='month', ylabel='item_cnt_day');

In [None]:
# Number of rows of the aggregated sales table in which each item name appears.
sales_full['item_name'].value_counts()

In [None]:
# Rows ordered by shop, best-selling rows first within each shop.
# A single two-key sort yields the same row ordering as sorting inside a
# `groupby('shop_name').apply(...)`, without the Python-level loop over groups
# and without relying on `apply` operating on the grouping column.
# The result keeps the original flat index instead of a (shop_name, row) MultiIndex.
sales_full.drop(columns='month').sort_values(
    by=['shop_name', 'item_cnt_day'],
    ascending=[True, False],
)

In [None]:
# Rows ordered by shop and major category, best-selling rows first within each pair.
# A single three-key sort yields the same row ordering as sorting inside a
# `groupby([...]).apply(...)`, without the Python-level loop over groups and
# without relying on `apply` operating on the grouping columns.
# The result keeps the original flat index instead of a group-keyed MultiIndex.
sales_full.sort_values(
    by=['shop_name', 'item_major_category', 'item_cnt_day'],
    ascending=[True, True, False],
)

In [None]:
# Narrow view of the monthly table used for the shop / major-category summary:
# item id, shop name, units sold and major category.
summary_columns = ['item_id', 'shop_name', 'item_cnt_day', 'item_major_category']
sales_clean = sales_full[summary_columns]

In [None]:
# Let pandas display up to 100 rows before truncating the output.
pd.options.display.max_rows = 100

In [None]:
# Total units sold per (shop, major category), largest totals first.
# Only `item_cnt_day` is summed: `sum('item_cnt_day')` does not select a column
# (the string lands in the `numeric_only` positional parameter), so the original
# call also added up `item_id`, which is an identifier and has no meaningful sum.
(
    sales_clean
    .groupby(['shop_name', 'item_major_category'])[['item_cnt_day']]
    .sum()
    .sort_values(by='item_cnt_day', ascending=False)
)

In [None]:
# Preview the first rows of the monthly sales table.
sales_full.head()

In [None]:
# Preview the sample submission file.
submissao.head()

In [None]:
# Preview the test set.
teste.head()

## Feature Engineering

## Modeling

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Modelling table derived from the monthly sales table:
#   * the aggregated columns are renamed from `*_day` to `*_month`;
#   * every column whose label contains 'name', plus the major category, is dropped;
#   * the monthly period is kept as `year_month` and also split into numeric
#     `month` and `year` columns.
monthly = sales_full.rename(
    columns={'item_cnt_day': 'item_cnt_month', 'item_sales_day': 'item_sales_month'}
)
has_name = monthly.columns.str.contains('name')
monthly = monthly.loc[:, ~has_name].drop(columns='item_major_category')
monthly = monthly.assign(
    year_month=monthly['month'],
    month=monthly['month'].dt.month,
    year=monthly['month'].dt.year,
)

# Target keeps `year_month` so it can be split by date in the same way as the features.
y = monthly[['year_month', 'item_cnt_month']]

# Features: everything except the target and the revenue column.
# NOTE(review): `item_price` is the mean price of the same month being predicted;
# confirm that it will be available at prediction time.
X = monthly.drop(columns=['item_cnt_month', 'item_sales_month'])

In [None]:
# Time-based split: months before `cut_date` are used for training, `cut_date`
# and later months are held out for evaluation.
cut_date = '2015-08'

# Independent copies, so that adding columns to a split later (e.g. predictions
# on `y_test`) modifies a real frame instead of a slice of `X` / `y`.
X_train = X[X.year_month < cut_date].copy()
X_test = X[X.year_month >= cut_date].copy()

y_train = y[y.year_month < cut_date].copy()
y_test = y[y.year_month >= cut_date].copy()

# The two parts must partition the data, and features/targets must stay aligned.
assert len(X_train) + len(X_test) == len(X)
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

X_train.head()

In [None]:
# Preview the training target rows.
y_train.head()

In [None]:
# `year_month` was only needed for the date split and is not a model input.
# `errors='ignore'` keeps this cell re-runnable: a second run used to fail with
# a KeyError because the column had already been dropped.
X_train = X_train.drop(columns='year_month', errors='ignore')
X_test = X_test.drop(columns='year_month', errors='ignore')
y_train = y_train.drop(columns='year_month', errors='ignore')

# The seed is pinned explicitly so reruns are reproducible; `verbose=100` logs
# every 100th iteration instead of every iteration.
model = CatBoostRegressor(random_seed=0, verbose=100)
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows with the fitted model.
y_predictions = model.predict(X_test)

In [None]:
# Add the predictions next to the actual values. `assign` builds a new frame, so
# this does not write into a slice of `y` (which triggers SettingWithCopyWarning),
# and re-running the cell simply replaces the column.
y_test = y_test.assign(predictions=y_predictions)

In [None]:
# Preview the held-out actual values next to the model predictions.
y_test.head()

In [None]:
# Line plot of actual vs. predicted monthly counts for the held-out rows,
# drawn against the row index.
actual_vs_predicted = y_test[['item_cnt_month', 'predictions']]
actual_vs_predicted.plot(figsize=(20, 10))

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as mape

In [None]:
# Hold-out error metrics.
# MAPE divides each error by the actual value, so any row whose actual count is
# zero or close to zero makes it explode. MAE and RMSE are reported alongside it
# because they stay meaningful for such rows.
actual = y_test['item_cnt_month']
predicted = y_test['predictions']
errors = predicted - actual

pd.Series(
    {
        'mape': mape(actual, predicted),
        'mae': errors.abs().mean(),
        'rmse': np.sqrt((errors ** 2).mean()),
    },
    name='holdout_metrics',
)