In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Carregando o Dataset

In [None]:
# Kaggle input directories: translated lookup tables, competition data, supplementary data.
# Each path keeps its trailing slash so a file name can be appended directly.
sPathTranslated, sPathTrain, sPathSup = (
    '/kaggle/input/predict-future-sales-translated-dataset/',
    '/kaggle/input/competitive-data-science-predict-future-sales/',
    '/kaggle/input/predict-future-sales-supplementary/',
)


In [None]:
# Translated lookup tables, each indexed by its own id column
dfShops = pd.read_csv(f'{sPathTranslated}shops_en.csv', index_col='shop_id')
dfItems = pd.read_csv(f'{sPathTranslated}items_en.csv', index_col='item_id')
dfCateg = pd.read_csv(f'{sPathTranslated}item_categories_en.csv', index_col='item_category_id')

# Competition files: sales history (default integer index), test rows and sample submission (indexed by ID)
dfSalesTrain = pd.read_csv(f'{sPathTrain}sales_train.csv')
dfSalesTest = pd.read_csv(f'{sPathTrain}test.csv', index_col='ID')
dfSubm = pd.read_csv(f'{sPathTrain}sample_submission.csv', index_col='ID')

# Supplementary calendar table
dfCalendar = pd.read_csv(f'{sPathSup}calendar.csv')

In [None]:
# Helper imports. `product` is used when building the month/shop/item grid and
# LabelEncoder for the categorical ids; Counter, KMeans and category_encoders are
# imported here but not referenced in the cells visible in this notebook.
from itertools import product
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
# Shared encoder instance, reused below to turn category group names into integer ids
le = LabelEncoder()
import category_encoders as ce
import warnings

# Silence every Python warning for the rest of the session
# (this also hides deprecation warnings from the libraries used below)
warnings.filterwarnings("ignore")

# **Tratando Features**

In [None]:
# Display the category table as loaded (indexed by item_category_id)
dfCateg

In [None]:
# Derive a coarse group for each category: the regex captures the leading run of
# word/whitespace characters of the category name (everything before the first
# other character), and surrounding whitespace is then trimmed.
dfCateg['grupo'] = (
    dfCateg['item_category_name']
    .str.extract(r'(^[\w\s]*)', expand=False)
    .str.strip()
)

# Encode every distinct group name as an integer id using the shared LabelEncoder
dfCateg['group_id'] = le.fit_transform(dfCateg['grupo'].values)
dfCateg.sample(5)

In [None]:
# Display the item table as loaded (indexed by item_id)
dfItems

In [None]:
# Attach the category name, group and group_id to every item via item_category_id.
# Category columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not fail on overlapping column names (no-op on the first run).
dfItems = (
    dfItems
    .drop(columns=dfCateg.columns, errors='ignore')
    .join(dfCateg, on='item_category_id')
)
dfItems.sample(10)

**Pré-processamento de Shops/Cats/Items**


Observações:

Cada "shop_name" começa com o nome da cidade.

Cada "category" ou categoria do produto contém um tipo e subtipo em seu nome.


In [None]:
# Preview the first rows of the shop table before cleaning the names
dfShops.head()

In [None]:
# Clean the shop names: remove '!' characters and trim whitespace on both ends
dfShops['shop_name'] = dfShops['shop_name'].str.replace('!', '').str.strip()

# The city is taken as the first space-separated token of the cleaned name
dfShops['city'] = [tokens[0] for tokens in dfShops['shop_name'].str.split(' ')]

# Integer code per distinct city (a fresh encoder, independent of the shared `le`)
dfShops['city_code'] = LabelEncoder().fit_transform(dfShops['city'])

# Keep exactly these three columns, in this order
dfShops = dfShops.loc[:, ['shop_name', 'city_code', 'city']]

In [None]:
# Remove shop 10 from the shop table (its sales are relabelled as shop 11 in the
# next cell). errors='ignore' keeps the cell re-runnable: once the row is gone,
# running it again is a no-op instead of raising KeyError.
dfShops = dfShops.drop(index=[10], errors='ignore')
dfShops

In [None]:
# Shop 10 is folded into shop 11 (its row was dropped from dfShops), so relabel its
# sales. The test rows get the same relabelling so that train and test agree on the
# shop id; this is a no-op if the test set has no shop 10.
dfSalesTrain.loc[dfSalesTrain.shop_id == 10, 'shop_id'] = 11
dfSalesTest.loc[dfSalesTest.shop_id == 10, 'shop_id'] = 11

# Show every training row that now belongs to shop 11
dfSalesTrain.loc[dfSalesTrain.shop_id == 11]

In [None]:
# Number of missing values in each column of the training sales
dfSalesTrain.isna().sum()

# Tratando Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Box plot of the daily units sold, x-axis fixed to [-100, 3000]
fig_cnt, ax_cnt = plt.subplots(figsize=(10, 4))
ax_cnt.set_xlim(-100, 3000)
sns.boxplot(x=dfSalesTrain['item_cnt_day'], ax=ax_cnt)

# Box plot of the item prices, x-axis from the minimum price to 110% of the maximum
fig_price, ax_price = plt.subplots(figsize=(10, 4))
ax_price.set_xlim(dfSalesTrain['item_price'].min(), dfSalesTrain['item_price'].max() * 1.1)
sns.boxplot(x=dfSalesTrain['item_price'], ax=ax_price)

# Discard the extreme rows: keep only prices below 100000 and daily counts below 1001
keep_rows = (dfSalesTrain['item_price'] < 100000) & (dfSalesTrain['item_cnt_day'] < 1001)
dfSalesTrain = dfSalesTrain[keep_rows]

Foi detectado um item com preço abaixo de zero. Ele será substituído pela mediana dos preços positivos desse mesmo item, na mesma loja e no mesmo mês.

In [None]:
# Median of the positive prices of item 2973 sold by shop 32 during month 4
same_sale = (
    (dfSalesTrain['shop_id'] == 32)
    & (dfSalesTrain['item_id'] == 2973)
    & (dfSalesTrain['date_block_num'] == 4)
    & (dfSalesTrain['item_price'] > 0)
)
median = dfSalesTrain.loc[same_sale, 'item_price'].median()

# Overwrite every negative price in the training data with that median
negative_price = dfSalesTrain['item_price'] < 0
dfSalesTrain.loc[negative_price, 'item_price'] = median

In [None]:
# Display the training sales after the outlier filter and the negative-price fix
dfSalesTrain

In [None]:
# Build the monthly training table `df_sales`.
# For every month present in dfSalesTrain, one row is created for each combination
# of the shops and the items that have at least one sale in that month.
index_cols = ['date_block_num', 'shop_id', 'item_id']
meses = dfSalesTrain['date_block_num'].unique()

grid_parts = []
for mes in meses:
    # Filter the month once and reuse it for both id lists
    vendas_mes = dfSalesTrain[dfSalesTrain['date_block_num'] == mes]
    shop_ids = vendas_mes['shop_id'].unique()
    item_ids = vendas_mes['item_id'].unique()
    # int32 matches the dtype of the final frame, so ids above the int16 range
    # (32767) cannot overflow in the intermediate array
    grid_parts.append(np.array(list(product([mes], shop_ids, item_ids)), dtype='int32'))

grid = pd.DataFrame(np.vstack(grid_parts), columns=index_cols, dtype=np.int32)

# Monthly units sold per (month, shop, item); combinations without sales get 0
monthly_sales = (
    dfSalesTrain
    .groupby(index_cols, as_index=False)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
df_sales = pd.merge(grid, monthly_sales, how='left', on=index_cols).fillna(0)

# Mean price of each item over all of its training rows (one value per item, not per month)
avg_price = (
    dfSalesTrain
    .groupby('item_id', as_index=False)
    .agg({'item_price': 'mean'})
    .rename(columns={'item_price': 'avg_item_price'})
)
df_sales = pd.merge(df_sales, avg_price, how='left', on='item_id').fillna(0)

# Clip the target into [0, 20] and order the rows by month, shop and item
df_sales['item_cnt_month'] = df_sales['item_cnt_month'].clip(0, 20)
df_sales = df_sales.sort_values(index_cols)

In [None]:
# Display the month/shop/item grid (columns: date_block_num, shop_id, item_id)
grid

In [None]:
# Display the test rows (indexed by ID)
dfSalesTest

In [None]:
# Label every test row with date_block_num 34, the block number that follows the training months
dfSalesTest = dfSalesTest.assign(date_block_num=34)

# Stack the daily training rows and the test rows into one frame with a fresh 0..n-1 index.
# NOTE(review): the train/validation split further down reads `df_sales`, not this `dfSales` frame.
dfSales = pd.concat((dfSalesTrain, dfSalesTest), ignore_index=True)


In [None]:
# Display the concatenated train + test frame
dfSales

In [None]:
# Replace every missing value in the concatenated frame with 0
dfSales = dfSales.fillna(0)

In [None]:
# Fixed hold-out split by month instead of a grid search (which would be very slow
# with XGBoost): months before 33 train the model, month 33 validates it and drives
# early stopping.
target_col = 'item_cnt_month'
train_mask = df_sales['date_block_num'] < 33
val_mask = df_sales['date_block_num'] == 33

X_train = df_sales.loc[train_mask].drop(target_col, axis=1)
y_train = df_sales.loc[train_mask, target_col]
X_val = df_sales.loc[val_mask].drop(target_col, axis=1)
y_val = df_sales.loc[val_mask, target_col]

# Test set. df_sales is built from dfSalesTrain only, so it contains no month-34 rows
# and filtering it for month 34 always yields an empty frame. The test features are
# therefore built from dfSalesTest: month 34, the shop/item ids, and the per-item
# average price already present in df_sales (0 for items never seen in training,
# matching the fillna(0) used when df_sales was built).
# Assumes dfSalesTest has shop_id and item_id columns - verify against test.csv.
avg_price_by_item = (
    df_sales
    .drop_duplicates('item_id')
    .set_index('item_id')['avg_item_price']
)
X_test = dfSalesTest.assign(
    date_block_num=34,
    avg_item_price=dfSalesTest['item_id'].map(avg_price_by_item).fillna(0),
)[X_train.columns]

In [None]:
import time
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sklearn.model_selection as skt
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from xgboost import plot_importance
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet

In [None]:
# Train the XGBoost regressor and report the elapsed wall-clock time in seconds.
ts = time.time()

# eval_metric and early_stopping_rounds are set on the estimator: passing them to
# fit() is deprecated since XGBoost 1.6 and rejected by later releases.
# NOTE(review): this form needs xgboost >= 1.6 - confirm the installed version.
# learning_rate / random_state are the canonical names for the eta / seed aliases.
xgb = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.3,
    random_state=42,
    eval_metric="rmse",
    early_stopping_rounds=10,
)

# Early stopping monitors the last entry of eval_set, i.e. the month-33 validation
# data; the training set is listed first only so its RMSE is logged alongside.
xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True,
)

time.time() - ts

in progress... 