In [None]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
import time

from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf, arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

from xgboost import XGBRegressor
from xgboost import plot_importance

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory holding the competition CSV files; every frame below is read from it.
DATA_DIR = "../input/competitive-data-science-predict-future-sales"

# Explicit dtypes are passed for every file so pandas does not fall back to its
# 64-bit defaults.
item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv",
                       dtype={'item_category_name': 'str', 'item_category_id': 'int32'})

items = pd.read_csv(f"{DATA_DIR}/items.csv",
                    dtype={'item_name': 'str', 'item_id': 'int32', 'item_category_id': 'int32'})

# The date column is written day-first (the strptime pattern used further down
# in this notebook is "%d.%m.%Y"). Without dayfirst=True an ambiguous value
# such as 02.01.2013 would be read as 1 February instead of 2 January.
sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv",
                    parse_dates=['date'],
                    dayfirst=True,
                    dtype={'date': 'str', 'date_block_num': 'int32', 'shop_id': 'int32',
                           'item_id': 'int32', 'item_price': 'float32', 'item_cnt_day': 'int16'})

shops = pd.read_csv(f"{DATA_DIR}/shops.csv",
                    dtype={'shop_name': 'str', 'shop_id': 'int32'})

test = pd.read_csv(f"{DATA_DIR}/test.csv",
                   dtype={'ID': 'int32', 'shop_id': 'int32', 'item_id': 'int32'})

submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
def downcast_dtypes(df):
    """Shrink wide numeric columns of ``df`` in place to reduce memory use.

    ``float64`` columns are cast to ``float32``. ``int64`` / ``int32`` columns
    are cast to ``int16`` when every value fits into that range; otherwise
    ``int32`` is used if it fits, and the column is left untouched if it does
    not. The range check prevents values outside the int16 range from
    silently wrapping around.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-site convenience.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        if df[col].empty:
            # Nothing can overflow in an empty column; use the smallest target.
            df[col] = df[col].astype(np.int16)
            continue
        lo, hi = df[col].min(), df[col].max()
        # Try the narrowest type first and stop at the first one that holds the range.
        for target in (np.int16, np.int32):
            bounds = np.iinfo(target)
            if bounds.min <= lo and hi <= bounds.max:
                df[col] = df[col].astype(target)
                break
    return df



In [None]:
'''
Goal: predict the monthly sales of every item in every shop (one forecast per shop/item combination).
'''

In [None]:
# Pass the test frame through downcast_dtypes so its numeric columns use
# narrower dtypes (less memory).
test = downcast_dtypes(test)

In [None]:
# Preview the first rows of the test set. Swap in item_cat / items / sales /
# shops here to inspect the other frames the same way.
test.head()

In [None]:
# Only the last expression of a cell is rendered automatically, so the preview
# is displayed explicitly; otherwise it would be evaluated and thrown away.
display(sales.head(10))
sales.columns

In [None]:
# Largest item_id that occurs in the sales data.
s = sales["item_id"]
s.max()

In [None]:
# sales.date = sales.date.apply(lambda x:datetime.datetime.strptime(x, "%d.%m.%Y"))


In [None]:
# Display the daily sales frame as loaded from sales_train.csv.
sales

In [None]:
# Collapse the daily rows to one row per (month block, shop, item): first and
# last sale date, mean price and total units sold.
# The column subset must be a list (double brackets): selecting several groupby
# columns with a bare tuple is rejected by current pandas releases.
monthly_sales = (
    sales
    .groupby(["date_block_num", "shop_id", "item_id"])[["date", "item_price", "item_cnt_day"]]
    .agg({"date": ["min", "max"], "item_price": "mean", "item_cnt_day": "sum"})
)
monthly_sales.head(20)

In [None]:
"""
Questions to explore:
- Which shop sells the most items?
- Which item is sold the most?
- Within each shop, which item is sold the most?
- Which item category contains the most items?
- Which item category has the most units sold?

"""

In [None]:
# Number of items per category, restricted to the 20 largest categories.
# After count() every remaining column holds the row count of its group, so
# `item_id` is the number of items in the category, not an id.
x = items.groupby(["item_category_id"]).count()
x = x.sort_values(by="item_id", ascending=False)
x = x.iloc[0:20].reset_index()

plt.figure(figsize=(12, 8))
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
ax = sns.barplot(data=x, x="item_category_id", y="item_id", alpha=0.6)
plt.title("Items per category")
plt.ylabel("Number of items")
plt.xlabel("Category")
plt.show()

In [None]:
# Most expensive items: highest price ever recorded for each item, top 50.
# Grouping by item_id (rather than grouping by price and counting rows) keeps
# `item_id` a real identifier, so the x axis shows items and not row counts.
expns_items = (
    sales
    .groupby("item_id", as_index=False)["item_price"].max()
    .sort_values(by="item_price", ascending=False)[:50]
)

plt.figure(figsize=(12, 8))
# `order` keeps the bars in descending-price order; x / y are keyword
# arguments because recent seaborn releases reject positional data vectors.
ax = sns.barplot(data=expns_items, x="item_id", y="item_price",
                 order=expns_items["item_id"].tolist(), alpha=0.6)
plt.title("Most expensive items")
plt.xlabel("the item_id")
plt.ylabel("the price")

plt.show()

In [None]:
# Total units sold per month; date_block_num is the consecutive month index
# used throughout the sales data.
ts_monthly = sales.groupby(["date_block_num"], as_index=False)["item_cnt_day"].sum()

plt.figure(figsize=(12, 8))
# Keyword x / y: recent seaborn releases reject positional data vectors.
ax = sns.barplot(data=ts_monthly, x="date_block_num", y="item_cnt_day", alpha=0.6)
plt.title("sales by month")
plt.xlabel("the month")
# The bars are summed item counts, not prices.
plt.ylabel("items sold")

plt.show()

In [None]:
# Preview the first ten item categories.
item_cat.head(10)

In [None]:
# Enrich every daily sales row with its item, shop and category attributes.
# DataFrame.join matches the left frame's `on` column against the *index* of
# the right frame, so this relies on each lookup table's row index being equal
# to its id column -- NOTE(review): confirm that holds for the input files.
# Columns duplicated by the joins get the '_' suffix and are dropped at the end.
train = (
    sales
    .join(items, on='item_id', rsuffix='_')
    .join(shops, on='shop_id', rsuffix='_')
    .join(item_cat, on='item_category_id', rsuffix='_')
    .drop(['item_id_', 'shop_id_', 'item_category_id_'], axis=1)
)

In [None]:
'''There is an obvious yearly pattern: sales rise towards the end of each year and then drop.'''

In [None]:
# Report the dimensions of the joined training frame.
n_train_rows, n_train_cols = train.shape
print('Train rows: ', n_train_rows)
print('Train columns: ', n_train_cols)

In [None]:
# Report the dimensions of the daily sales frame. (A bare `sales` expression
# that was not the last statement of the cell rendered nothing and was removed.)
print('Sales rows: ', sales.shape[0])
print('Sales columns: ', sales.shape[1])

In [None]:
# Display the joined training frame.
train

In [None]:
# Keep only the date, id, price and count columns of the joined frame.
monthly_cols = ['date', 'date_block_num', 'shop_id', 'item_category_id',
                'item_id', 'item_price', 'item_cnt_day']
train_monthly = train[monthly_cols]
train_monthly

In [None]:
# Keep only the training rows whose shop_id and item_id both occur in the test set.
test_shop_ids = test.shop_id.unique()
test_item_ids = test.item_id.unique()

# Both conditions are evaluated on `train` and combined into one mask, so the
# mask always has exactly the index of the frame it filters. .copy() makes
# lk_train an independent frame that can be modified safely afterwards.
in_test = train["shop_id"].isin(test_shop_ids) & train["item_id"].isin(test_item_ids)
lk_train = train[in_test].copy()

print("data before leaking", train.shape[0])
print("data after leaking", lk_train.shape[0])


In [None]:
# (rows, columns) of the training frame restricted to test shops and items.
lk_train.shape

In [None]:
# Box plots of the daily item count and the item price to spot outliers.
plt.figure(figsize=(12,5))
plt.xlim(-100, 3000)
sns.boxplot(x=lk_train.item_cnt_day)

plt.figure(figsize=(12,5))
# Both axis limits come from lk_train, the frame actually plotted; the upper
# limit previously came from the unfiltered `train` frame.
plt.xlim(lk_train.item_price.min(), lk_train.item_price.max()*1.1)
sns.boxplot(x=lk_train.item_price)

In [None]:
# Preview the first five rows of the filtered training frame.
lk_train.head(5)

In [None]:
# Remove rows whose daily item count exceeds 1000 and print how many were found.
outliers_item_cnt = lk_train[lk_train["item_cnt_day"] > 1000].index
print(len(outliers_item_cnt))
# Rebind instead of dropping in place: lk_train was produced by slicing
# another frame, and in-place mutation of such a result is what pandas'
# chained-assignment warning is about.
lk_train = lk_train.drop(index=outliers_item_cnt)

In [None]:
# Derive categorical features from the text columns.
# city: first space-separated token of shop_name.
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
# Normalise the one city spelled with a leading '!'. The column must be named
# in .loc: without it the assignment overwrites EVERY column of the matching
# rows (shop_id and shop_name included) with the string.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])

# type / subtype: the parts of item_category_name before and after the first '-'.
item_cat['split'] = item_cat['item_category_name'].str.split('-')
item_cat['type'] = item_cat['split'].map(lambda x: x[0].strip())
item_cat['type_code'] = LabelEncoder().fit_transform(item_cat['type'])
# Names without a '-' have no second part; the type is reused as the subtype.
item_cat['subtype'] = item_cat['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_cat['subtype_code'] = LabelEncoder().fit_transform(item_cat['subtype'])
item_cat

In [None]:
# Drop the text columns and aggregate the daily rows to one row per
# (month block, shop, item, category).
monthly_keys = ['date_block_num', 'shop_id', 'item_id', 'item_category_id']
monthly_stats = {'item_price': ['sum', 'mean'],
                 'item_cnt_day': ['sum', 'mean', 'count']}

train_monthly = lk_train[['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_price', 'item_cnt_day']]
train_monthly = (
    train_monthly
    .sort_values('date')
    .groupby(monthly_keys, as_index=False)
    .agg(monthly_stats)
)
# The aggregation yields two-level column labels; replace them with flat names
# (group keys first, then the statistics in the order requested above).
train_monthly.columns = monthly_keys + ['item_price_sum', 'item_price_mean',
                                        'item_cnt_sum', 'item_cnt_mean', 'item_cnt_count']
type(train_monthly)
# train_monthly.head(5)



In [None]:
# Column names of the monthly aggregate.
train_monthly.columns

In [None]:
# Build every (date_block_num, shop_id, item_id) combination so that months in
# which a shop/item pair has no sales still get a row.
ts = time.time()
shop_ids = train_monthly['shop_id'].unique()
item_ids = train_monthly['item_id'].unique()

# Month blocks 0..33 are enumerated.
N_MONTH_BLOCKS = 34

# MultiIndex.from_product builds the cartesian product without a Python-level
# triple loop; rows come out in the same order as nested
# month -> shop -> item loops would produce them.
combination_df = (
    pd.MultiIndex
    .from_product([range(N_MONTH_BLOCKS), shop_ids, item_ids],
                  names=['date_block_num', 'shop_id', 'item_id'])
    .to_frame(index=False)
)
combination_df = downcast_dtypes(combination_df)
# print(time.time()-ts)
combination_df
            

In [None]:
#diffrent way of creating combinations -- takes more time
# ts = time.time()
# comb_matrix = []
# for i in range(34):
#     sales_comb = train_monthly[train_monthly.date_block_num==i]
#     comb_matrix.append(np.array(list(product([i], train_monthly.shop_id.unique(), sales.item_id.unique()))))
    
# comb_df = pd.DataFrame(np.vstack(comb_matrix), columns=['date_block_num','shop_id','item_id'])
# comb_df


In [None]:
# Time-based features derived from the month index: calendar year (block 0
# falls in 2013) and zero-based month of the year.
train_monthly['year'] = train_monthly['date_block_num'] // 12 + 2013
train_monthly['month'] = train_monthly['date_block_num'] % 12

# Monthly revenue. lk_train has one row per daily sale while train_monthly has
# one row per group, so price * count cannot be assigned across directly: the
# two frames share no row index and pandas would align unrelated rows.
# Revenue is computed per daily row, summed per group and merged on the keys.
# NOTE(review): this assumes item_category_id is fully determined by item_id,
# so the three keys identify a train_monthly row -- confirm.
revenue_keys = ['date_block_num', 'shop_id', 'item_id']
monthly_revenue = (
    lk_train
    .assign(revenue=lk_train['item_price'] * lk_train['item_cnt_day'])
    .groupby(revenue_keys, as_index=False)['revenue']
    .sum()
)
train_monthly = (
    train_monthly
    .drop(columns='revenue', errors='ignore')  # keeps the cell safe to re-run
    .merge(monthly_revenue, on=revenue_keys, how='left')
)
# lk_train['item_cnt_day'].isna().sum()

In [None]:
# Column names of train_monthly after the feature columns were added.
# (Use train_monthly.isna().sum() to check for missing values.)
train_monthly.columns

In [None]:
# Attach the monthly target (units sold per month/shop/item) to the combination grid.
cols = ['date_block_num','shop_id','item_id']
group = lk_train.groupby(cols).agg({'item_cnt_day':['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

# Left merge keeps every grid row; combinations without sales get NaN, which is
# turned into 0. The target is then clipped to the range [0, 20].
combination_df = combination_df.merge(group, how='left', on=cols)
monthly_target = combination_df['item_cnt_month'].fillna(0)
combination_df['item_cnt_month'] = monthly_target.clip(0,20).astype(np.float16)
combination_df

In [None]:
#adding city code, item_category_id, type_code and subtype_code to target matrix
# Look up city_code by each row's shop_id in the `shops` frame. `shop` is not
# that frame, and assigning a shops column directly would align on row index
# rather than on shop_id. drop_duplicates keeps the lookup index unique, which
# Series.map requires.
shop_city_code = shops.drop_duplicates('shop_id').set_index('shop_id')['city_code']
combination_df['city_code'] = combination_df['shop_id'].map(shop_city_code).astype(np.int8)