In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy
from scipy import stats

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(root, file_name)
        print(full_path)

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

### Upload all dataset paths

In [None]:
# All competition CSVs sit in one input directory; keeping the prefix in a single
# constant means relocating the data is a one-line change.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# Logical table name -> CSV file name inside DATA_DIR.
DATASET_FILES = {
    'categories': 'item_categories.csv',
    'items': 'items.csv',
    'sales': 'sales_train.csv',
    'shops': 'shops.csv',
    'test': 'test.csv',
}

# Logical table name -> path string handed to pd.read_csv in the next cell.
dataset_paths = {
    table: f'{DATA_DIR}/{file_name}' for table, file_name in DATASET_FILES.items()
}
print('Paths are ready')

### Load dataset

In [None]:
# Read every CSV listed in dataset_paths into a DataFrame, keyed by table name.
dataset = {}
for table_name, csv_path in dataset_paths.items():
    dataset[table_name] = pd.read_csv(csv_path)
dataset.keys()

### Look at the dataset tables

In [None]:
def take_a_view(dataset: pd.DataFrame):
    """Print a structural summary of a table and return its first rows.

    Prints the output of ``DataFrame.info()`` followed by the number of distinct
    rows that occur more than once in ``dataset``.

    Args:
        dataset: the table to inspect.

    Returns:
        ``dataset.head()``.
    """
    # info() writes to stdout itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    dataset.info()

    # Keep every row that has at least one twin, then collapse twins so each
    # repeated row is counted once. Unlike value_counts(), duplicated() does not
    # discard rows that contain missing values.
    repeated_rows = dataset[dataset.duplicated(keep=False)].drop_duplicates()

    print(f"""
        Searching for duplicates:
        Found: {len(repeated_rows)}
    """)

    return dataset.head()

1) Categories

In [None]:
# Count ids / names that appear more than once in the categories table.
categories_table = dataset['categories']
repeated_category_ids = (categories_table['item_category_id'].value_counts() > 1).sum()
repeated_category_names = (categories_table['item_category_name'].value_counts() > 1).sum()

print(f"""
        Searching for id duplicates:
        Found: {repeated_category_ids}
    """)
print(f"""
        Searching for category name duplicates:
        Found: {repeated_category_names}
    """)
take_a_view(categories_table)

2) Items

In [None]:
# Count ids / names that appear more than once in the items table.
items_table = dataset['items']
repeated_item_ids = (items_table['item_id'].value_counts() > 1).sum()
repeated_item_names = (items_table['item_name'].value_counts() > 1).sum()

print(f"""
        Searching for id duplicates:
        Found: {repeated_item_ids}
    """)
# The label used to say "category name" (copied from the categories cell),
# although the column being checked is item_name.
print(f"""
        Searching for item name duplicates:
        Found: {repeated_item_names}
    """)
take_a_view(items_table)

3) Sales

In [None]:
take_a_view(dataset['sales'])

In [None]:
# Same duplicate check as above, but ignoring the sold quantity column.
ds = dataset['sales'].drop(columns='item_cnt_day')

take_a_view(ds)

The duplicates were possibly created when the same item was sold several times in one shop on the same day (perhaps the data was uploaded in portions more than once a day).

In [None]:
duplications = dataset['sales'].value_counts()

In [None]:
duplications[duplications > 1]

Despite the unknown nature of these duplicates, I can safely leave them unchanged (or regroup them later), because their influence is too small (both in how often they appear and in their impact on the item count) to cause big mistakes.

4) Shops

In [None]:
# Count ids / names that appear more than once in the shops table.
shops_table = dataset['shops']
repeated_shop_ids = (shops_table['shop_id'].value_counts() > 1).sum()
repeated_shop_names = (shops_table['shop_name'].value_counts() > 1).sum()

print(f"""
        Searching for id duplicates:
        Found: {repeated_shop_ids}
    """)
# The label used to say "category name" (copied from the categories cell),
# although the column being checked is shop_name.
print(f"""
        Searching for shop name duplicates:
        Found: {repeated_shop_names}
    """)
take_a_view(shops_table)

5) Tests

In [None]:
take_a_view(dataset['test'])

## Analyze lexical units

So we have the following string-based columns that can give us some necessary information:
* item_name
* category_name
* shop_name
    

First of all I need to find the way to explore this data. Let's start with simple splitting of sentences

In [None]:
import re

# Regular expression matching one maximal run of word characters.
WORD_PATTERN = r'(?:\w+)'

def tokenize(sentence: str) -> list:
    """Split ``sentence`` into lower-cased word tokens.

    Every non-overlapping match of ``WORD_PATTERN`` becomes one token, in order
    of appearance; anything between matches (spaces, punctuation) is dropped.
    """
    words = re.findall(WORD_PATTERN, sentence)
    return [word.lower() for word in words]

assert tokenize("Мама мыла раму") == ['мама', 'мыла', 'раму']

In [None]:
class WordsAnalyzer:
    """Token index over one text column of a lookup table.

    Each row contributes a ``(text, value)`` pair taken from the ``index`` and
    ``value`` columns. ``process()`` tokenizes the text and records, per token,
    the values of all rows whose text contains it. The same token vocabulary
    is used to build 0/1 row vectors for hierarchical clustering.

    Args:
        frame: source table.
        index: name of the column whose values are tokenized (the text).
        value: name of the column whose values are collected per token (the id).
    """

    def __init__(self, frame: pd.DataFrame, index, value):
        self._frame = frame
        self._index = index
        self._value = value
        # (text, value) pairs in the row order of `frame`.
        self._categories = list(zip(frame.loc[:, index], frame.loc[:, value]))
        self.vectors = None
        self._dataset = None
        # Initialised here so that to_series()/describe() raise the intended
        # RuntimeError (not AttributeError) when process() has not run yet.
        self._proceed = False

    def _require_processed(self):
        """Raise ``RuntimeError`` unless ``process()`` has already built the index."""
        if not self._proceed:
            raise RuntimeError('Analyzer need to process data first')

    def process(self):
        """Build (once) and return the mapping ``token -> list of row values``.

        A value is appended once per occurrence of the token, so a token that
        appears twice in one text lists that row's value twice.
        """
        if self.vectors is None:
            token_index = {}
            for text, row_value in self._categories:
                for token in tokenize(text):
                    token_index.setdefault(token, []).append(row_value)
            self.vectors = token_index
            self._proceed = True
        return self.vectors

    def labels(self):
        """Return the index labels of the source frame as a list."""
        return self._frame.index.to_list()

    def from_idxs(self, idx_list):
        """Look up ``value`` column entries for the given ``index`` column entries.

        ``idx_list`` may be any iterable; it is converted to a list because
        recent pandas versions reject a set as a ``.loc`` indexer.
        """
        return self._frame.set_index(self._index).loc[list(idx_list), self._value].to_list()

    def from_vals(self, val_list):
        """Look up ``index`` column entries for the given ``value`` column entries.

        ``val_list`` may be any iterable (including the set returned by
        ``from_tokens``); it is converted to a list before indexing.
        """
        return self._frame.set_index(self._value).loc[list(val_list), self._index].to_list()

    def to_series(self) -> pd.Series:
        """Return the token index as a Series: token -> tuple of row values.

        Raises:
            RuntimeError: if ``process()`` has not been called.
        """
        self._require_processed()
        return pd.Series([tuple(ids) for ids in self.vectors.values()], self.vectors.keys())

    def describe(self):
        """Print how many tokens occur in 1, 2, 3, ... collected values.

        Raises:
            RuntimeError: if ``process()`` has not been called.
        """
        series = self.to_series()
        print(f"Token statistic (token appearance quantity -> amount of the following tokens):\n{series.map(len).value_counts()}")
        return None

    def from_tokens(self, token_list: list) -> set:
        """Return the set of row values whose text contains every given token.

        A token that is not in the vocabulary matches no row, so the result is
        an empty set. An empty ``token_list`` returns every row value.

        Raises:
            RuntimeError: if ``process()`` has not been called.
        """
        self._require_processed()
        result = set(row_value for _, row_value in self._categories)
        for token in token_list:
            result &= set(self.vectors.get(token, ()))
        return result

    def _name_to_vec(self, string):
        """Encode ``string`` as a 0/1 int8 vector over the token vocabulary."""
        # A set makes each membership test constant-time.
        present = set(tokenize(string))
        return np.array([np.int8(token in present) for token in self.vectors.keys()])

    def _to_dataset(self):
        """Return (and cache) the matrix of row vectors, one row per text."""
        # Make sure the vocabulary exists; process() is a no-op when it does.
        self.process()
        if self._dataset is None:
            self._dataset = np.array([
                self._name_to_vec(text) for text, _ in self._categories
            ])
        return self._dataset

    def process_hierarchical_cluster(self):
        """Return the SciPy linkage matrix (cosine metric, complete linkage)."""
        from scipy.cluster.hierarchy import linkage

        linked = linkage(self._to_dataset(), metric='cosine', method='complete')

        return linked

    def show_clusters(self):
        """Draw a dendrogram of the clustering, labelled with the row texts."""
        from scipy.cluster.hierarchy import dendrogram
        label_list = [text for text, _ in self._categories]
        linked = self.process_hierarchical_cluster()

        plt.figure(figsize=(30, 24))
        dendrogram(linked,
                    orientation='right',
                    labels=label_list,
                    distance_sort='descending',
                    show_leaf_counts=True)
        plt.show()
        return

So I decided to use clustering methods to look at the groups of shops based on their tags. I have two candidate algorithms, k-means and hierarchical clustering, and I chose the second one because its results are easy to visualize.

I used the 'cosine' distance between my categories because I find it more suitable for text analysis (cosine similarity gives a smaller error for texts of different length).

Then I had to choose the linkage type between clusters. I chose 'complete' because it uses the maximal dissimilarity between cluster representatives, so my clusters can be more granular and link slowly, but the clusters that are created will have stronger relationships. The alternative was 'average', because it minimizes the average dissimilarity between representatives.

### Shop clusterisation

In [None]:
shop_tokens = WordsAnalyzer(dataset['shops'], 'shop_name', 'shop_id')
shop_tokens.process()

shop_tokens.show_clusters()

### Category clusterisation

In [None]:
category_tokens = WordsAnalyzer(dataset['categories'], 'item_category_name', 'item_category_id')
category_tokens.process()

category_tokens.show_clusters()

## In Sales items need to be joined with their category

In [None]:
# Attach each sold item's category id so sales can be aggregated per category.
# The guard keeps the cell idempotent: merging a second time would produce
# `item_category_id_x` / `item_category_id_y` instead of `item_category_id`.
if 'item_category_id' not in dataset['sales'].columns:
    dataset['sales'] = dataset['sales'].merge(
        dataset['items'][['item_id', 'item_category_id']],
        on='item_id',
        # Fail loudly if item_id were ever repeated in `items`; a repeated key
        # would silently multiply sales rows.
        validate='many_to_one',
    )

take_a_view(dataset['sales'])

In [None]:
# Parse the date column with an explicit format instead of astype('datetime64[ns]').
# NOTE(review): the competition's data description gives the dates as
# day.month.year text (e.g. "02.01.2013" = 2 January 2013) - confirm against the
# raw file. Letting pandas guess can read such values month-first; with an
# explicit format a mismatch raises instead of silently shifting dates.
dataset['sales']['date'] = pd.to_datetime(dataset['sales']['date'], format='%d.%m.%Y')

raw_dataset = dataset['sales']

date_indexed = dataset['sales'].set_index('date', drop=True)

from datetime import date

# Keep only rows dated strictly before 1 November 2015. The explicit copy makes
# the result an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
date_indexed = date_indexed.loc[date_indexed.index < np.datetime64(date(2015, 11, 1))].copy()
date_indexed.tail()

In [None]:
def weekly_item_counts(frame, key):
    """Sum ``item_cnt_day`` per week and per value of ``key``.

    Args:
        frame: sales table indexed by date (its index must support ``to_period``).
        key: name of the column whose values become the output columns.

    Returns:
        DataFrame with one row per weekly period and one column per ``key``
        value; combinations with no rows are filled with 0.
    """
    weeks = frame.index.to_period("W")
    return frame.groupby([weeks, key]).item_cnt_day.sum().unstack(-1).fillna(0)


# Everything the later analysis cells read, grouped by how it was derived.
statistics = {
    'word_based': {
        'categories': category_tokens,
        'shops': shop_tokens
    },
    'sales_cnt_based': {
        'shops': weekly_item_counts(date_indexed, 'shop_id'),
        'categories': weekly_item_counts(date_indexed, 'item_category_id'),
        'items': weekly_item_counts(date_indexed, 'item_id')
    }
}

In [None]:
# sns.pairplot(dataset['sales'])

In [None]:
# NOTE: despite its name, `weekly_statistic` holds one row per *day* (the
# grouping period is "D"); the name is kept because later cells refer to it.
weekly_statistic = pd.DataFrame(date_indexed.groupby(date_indexed.index.to_period("D")).item_cnt_day.sum())
weekly_statistic['month'] = weekly_statistic.index.month
weekly_statistic['day'] = weekly_statistic.index.day
weekly_statistic['year'] = weekly_statistic.index.year
# NOTE(review): `week` is taken from the period while `year` is the calendar
# year, so days around New Year may land in a (year, week) pair that mixes two
# different weeks - confirm whether this matters for the weekly profiles below.
weekly_statistic['week'] = weekly_statistic.index.week

from calendar import month_name

# month_name[0] is an empty string, so skip it.
month_names = month_name[1:]

figure, axis = plt.subplots(4, 3 ,figsize=(28, 19))

# One panel per month: day of month on x, items sold that day on y, one line per year.
months = {}
for idx, name in enumerate(month_names):
    months[name] = weekly_statistic.loc[weekly_statistic.month == (idx + 1), ['item_cnt_day', 'day', 'year']]
    panel = axis[idx % 4][idx // 4]
    sns.lineplot(ax=panel, data=months[name], y='item_cnt_day', x='day', hue='year')
    # The plotted values are daily totals, so the title says "Daily", not "Weekly".
    panel.set_title(f'Daily sales in - {name}')


In [None]:
weekly_statistic

In [None]:
figure, axis = plt.subplots(1, 2,figsize=(20, 12))

def normal_95_ceil(series):
    """Return ``mean + 1.96 * std`` of ``series`` (via ``np.mean`` / ``np.std``)."""
    center = np.mean(series)
    half_width = 1.96 * np.std(series)
    return center + half_width

def normal_95_floor(series):
    """Return ``mean - 1.96 * std`` of ``series`` (via ``np.mean`` / ``np.std``)."""
    center = np.mean(series)
    half_width = 1.96 * np.std(series)
    return center - half_width

# Items sold per (year, week); computed once and reused by both panels.
weekly_sums = weekly_statistic.groupby(['year', 'week']).item_cnt_day.sum()

# Per week number: mean across years plus the mean +/- 1.96 std band.
# The string 'mean' yields the same `mean` column as np.mean did, without the
# deprecation warning recent pandas emits for NumPy callables in agg().
data = weekly_sums.groupby(['week']).agg(['mean', normal_95_ceil, normal_95_floor])
sns.lineplot(data=data, ax=axis[0])
# Relative deviation of every (year, week) total from that week's mean.
sns.histplot((weekly_sums - data['mean']) / data['mean'], kde=True, ax=axis[1])

On the left side we can see our weekly sales with their 95% prediction intervals (1.96 std, if we assume the residuals to be normally distributed).
On the right side you can see the multiplicative errors (used in order to ignore the level scale). The distribution is negatively skewed and a bit biased, so we can't assume a normal distribution of the residuals, and it seems that we might have some trend.

In [None]:
figure, axis = plt.subplots(1, 2,figsize=(20, 12))

# Same weekly profile as above, restricted to years before 2015.
weekly_sums_pre_2015 = weekly_statistic[weekly_statistic.year < 2015].groupby(['year', 'week']).item_cnt_day.sum()

# 'mean' (string) instead of np.mean: same `mean` column, no deprecation warning.
data = weekly_sums_pre_2015.groupby(['week']).agg(['mean', normal_95_ceil, normal_95_floor])
sns.lineplot(data=data, ax=axis[0])
sns.histplot((weekly_sums_pre_2015 - data['mean']) / data['mean'], kde=True, ax=axis[1], stat='probability')

In [None]:
# Total items sold in each weekly period.
week_of_sale = date_indexed.index.to_period(freq="W")
weekly_total_cnt = date_indexed['item_cnt_day'].groupby(week_of_sale).sum()

plt.figure(figsize=(20, 12))
plt.title('Sales tendency')

weekly_total_cnt.plot(xlabel='Date', ylabel='Weekly items sold')

In [None]:
# income = quantity * unit price per sales row. assign() builds a new frame, so
# the column is not written into a filtered slice (which raises
# SettingWithCopyWarning). Both names are rebound to the new frame because the
# original cell made `dataset_income` an alias of `date_indexed`.
date_indexed = date_indexed.assign(income=date_indexed['item_cnt_day'] * date_indexed['item_price'])
dataset_income = date_indexed

# NOTE: one row per *day* despite the "weekly" prefix (the grouping period is "D").
weekly_statistic_income = pd.DataFrame(dataset_income.groupby(date_indexed.index.to_period("D")).income.sum())
weekly_statistic_income['month'] = weekly_statistic_income.index.month
weekly_statistic_income['day'] = weekly_statistic_income.index.day
weekly_statistic_income['year'] = weekly_statistic_income.index.year
weekly_statistic_income['week'] = weekly_statistic_income.index.week

from calendar import month_name

# month_name[0] is an empty string, so skip it.
month_names = month_name[1:]

figure, axis = plt.subplots(4, 3 ,figsize=(28, 19))

# One panel per month: day of month on x, that day's income on y, one line per year.
months = {}
for idx, name in enumerate(month_names):
    months[name] = weekly_statistic_income.loc[weekly_statistic_income.month == (idx + 1), ['income', 'day', 'year']]
    panel = axis[idx % 4][idx // 4]
    sns.lineplot(ax=panel, data=months[name], y='income', x='day', hue='year')
    # The plotted values are daily income totals, not weekly sales counts.
    panel.set_title(f'Daily income in - {name}')


In [None]:
figure, axis = plt.subplots(1, 2,figsize=(20, 12))

# Income per (year, week); computed once and reused by both panels.
weekly_income = weekly_statistic_income.groupby(['year', 'week']).income.sum()

# 'mean' (string) instead of np.mean: same `mean` column, no deprecation warning.
data = weekly_income.groupby(['week']).agg(['mean', normal_95_ceil, normal_95_floor])
sns.lineplot(data=data, ax=axis[0])
# Relative deviation of every (year, week) income from that week's mean.
sns.histplot((weekly_income - data['mean']) / data['mean'], kde=True, ax=axis[1])

In [None]:
figure, axis = plt.subplots(1, 2,figsize=(20, 12))

# Same weekly income profile as above, restricted to years before 2015.
weekly_income_pre_2015 = weekly_statistic_income[weekly_statistic_income.year < 2015].groupby(['year', 'week']).income.sum()

# 'mean' (string) instead of np.mean: same `mean` column, no deprecation warning.
data = weekly_income_pre_2015.groupby(['week']).agg(['mean', normal_95_ceil, normal_95_floor])
sns.lineplot(data=data, ax=axis[0])
sns.histplot((weekly_income_pre_2015 - data['mean']) / data['mean'], kde=True, ax=axis[1], stat='probability')

This estimation is less skewed but biased as the previous one

In [None]:
# Total income in each weekly period.
income_week = dataset_income.index.to_period(freq='W')
weekly_total_sales = dataset_income['income'].groupby(income_week).sum()

plt.figure(figsize=(20, 12))
plt.title('Sales tendency')

weekly_total_sales.plot(xlabel='Date', ylabel='Weekly income')

In [None]:
class CrossCategoryAnalyzer:
    """Clusters the values of one column by the values of another they co-occur with.

    Args:
        frame: table holding both columns (one row per observation).
        srcs: two-element sequence of label sources, one per column in
            ``columns``. A source exposes ``_categories`` as a list of
            ``(name, id)`` pairs; an entry may be ``None``, in which case raw
            ids are used as labels for that column.
        columns: ``[row_column, col_column]`` - names of the two columns.
    """

    def __init__(self, frame, srcs, columns):
        self._frame = frame
        self._categories = columns
        self._srcs = srcs

    def _relation_frame(self):
        """Return the 0/1 co-occurrence table: rows = columns[0], cols = columns[1]."""
        row_key, col_key = self._categories[0], self._categories[1]
        counts = self._frame.groupby(col_key)[row_key].value_counts()
        presence = counts.map(lambda x: 1 if x != 0 else 0)
        # unstack leaves NaN for pairs that never occur together -> 0.
        return presence.unstack(-1).fillna(0).transpose()

    def _oriented_relation(self, is_forward=True):
        """Return the relation table with the clustered values on the rows."""
        relation = self._relation_frame()
        return relation if is_forward else relation.transpose()

    def _labels_for(self, ids, is_forward=True):
        """Translate ids into names using the matching source, by id.

        Ids without a name (or every id when the source is ``None``) are
        returned unchanged.
        """
        src = self._srcs[0 if is_forward else 1]
        ids = list(ids)
        if src is None:
            return ids
        names_by_id = {row_id: name for name, row_id in src._categories}
        return [names_by_id.get(row_id, row_id) for row_id in ids]

    @staticmethod
    def _link(matrix):
        """Run SciPy hierarchical clustering (cosine metric, complete linkage)."""
        from scipy.cluster.hierarchy import linkage

        return linkage(matrix, metric='cosine', method='complete')

    def relation_matrix(self):
        """Return the co-occurrence table as a NumPy array.

        Entry ``[i, j]`` is 1 when the i-th value of ``columns[0]`` appears in
        at least one row together with the j-th value of ``columns[1]``,
        otherwise 0.
        """
        return self._relation_frame().to_numpy()

    def process_hierarchical_cluster(self, is_forward=True):
        """Return the linkage matrix.

        Args:
            is_forward: cluster the values of ``columns[0]`` when True,
                the values of ``columns[1]`` when False.
        """
        return self._link(self._oriented_relation(is_forward).to_numpy())

    def show_clusters(self, is_forward=True):
        """Draw the dendrogram for the chosen direction.

        Labels are matched to the clustered ids by id, not by the row order
        of the label source, so a missing or differently ordered id cannot
        shift the labels.
        """
        from scipy.cluster.hierarchy import dendrogram

        # Build the relation once and use it for both labels and linkage.
        relation = self._oriented_relation(is_forward)
        label_list = self._labels_for(relation.index, is_forward)
        linked = self._link(relation.to_numpy())

        plt.figure(figsize=(30, 24))
        dendrogram(linked,
                    orientation='top',
                    labels=label_list,
                    distance_sort='descending',
                    show_leaf_counts=True)
        plt.show()

## Right now we gonna explore relationship between our shops and categories

In [None]:
shops_to_categories_rel = CrossCategoryAnalyzer(dataset['sales'], [statistics['word_based']['shops'], statistics['word_based']['categories']], ['shop_id', 'item_category_id'])
shops_to_categories_rel.show_clusters()

In [None]:
shops_to_categories_rel._frame

So here we can see two small clusters with a very large cosine dissimilarity:

1) ['Курск ТЦ "Пушкинский"', 'Воронеж ТРЦ "Максимир"', 'Москва ТЦ "МЕГА Белая Дача II"', 'Москва ТЦ "МЕГА Теплый Стан" II', 'Уфа ТЦ "Семья" 2', 'Ярославль ТЦ "Альтаир"'] + ['Москва ТЦ "Ареал" (Беляево)', 'Москва ТЦ "Семеновский"', 'Москва МТРЦ "Афи Молл"']

2) ['Чехов ТРЦ "Карнавал"', 'Коломна ТЦ "Рио"'] + ['Москва ТРК "Атриум"', 'Омск ТЦ "Мега"']

In [None]:
shops_to_items_rel = CrossCategoryAnalyzer(dataset['sales'], [statistics['word_based']['shops'], None], ['shop_id', 'item_id'])
shops_to_items_rel.show_clusters()

## All in all it's just my thoughts so now we'll try to check if sales in this clusters have something in common 

In [None]:
class SalesAnalyzer:
    """Plots and clusters the columns of a time-indexed sales table.

    Args:
        frame: table with one column per entity id (one row per time step).
        src: optional label source exposing ``_categories`` as a list of
            ``(name, id)`` pairs, used to show names instead of ids.
    """

    def __init__(self, frame, src=None):
        self._frame = frame
        self._src = src

    def to_df(self):
        """Return the underlying frame (never modified by this class)."""
        return self._frame

    def _column_labels(self):
        """Return one display label per frame column, matched by id.

        Columns without a name in ``src`` (or all of them when ``src`` is
        ``None``) keep their original label.
        """
        columns = list(self._frame.columns)
        if self._src is None:
            return columns
        names_by_id = {row_id: name for name, row_id in self._src._categories}
        return [names_by_id.get(column, column) for column in columns]

    def _labelled_frame(self):
        """Return a copy of the frame whose columns carry display labels.

        Working on a copy keeps the caller's frame (shared with other cells)
        untouched; the previous code renamed its columns in place.
        """
        labelled = self._frame.copy()
        labelled.columns = self._column_labels()
        return labelled

    def heatmap(self, absolute=False):
        """Draw the column-to-column correlation heatmap.

        Args:
            absolute: plot ``abs(correlation)`` instead of the signed value.
        """
        plt.figure(figsize=(25,14))

        matrix = self._labelled_frame()

        cat_corr = matrix.corr().abs() if absolute else matrix.corr()

        sns.heatmap(cat_corr)

    def multiplot(self, index_list=None, by_label=False, title='Some tendecy'):
        """Plot selected columns as lines over the frame index.

        Args:
            index_list: columns to plot; ``None`` plots all of them.
            by_label: when True (and ``src`` is set) columns are addressed and
                shown by name; otherwise by their original column label.
            title: figure title.
        """
        matrix = self._labelled_frame() if by_label else self._frame
        data = matrix[index_list] if index_list is not None else matrix

        data.plot(figsize=(20, 12), title=title)

    def process_hierarchical_cluster(self):
        """Return the linkage matrix over columns (correlation metric, average linkage)."""
        from scipy.cluster.hierarchy import linkage

        linked = linkage(self._frame.transpose(), metric='correlation', method='average')

        return linked

    def show_clusters(self):
        """Draw a dendrogram of the column clustering."""
        from scipy.cluster.hierarchy import dendrogram

        label_list = self._column_labels()
        linked = self.process_hierarchical_cluster()

        plt.figure(figsize=(30, 24))
        dendrogram(linked,
                    orientation='right',
                    labels=label_list,
                    distance_sort='ascending',
                    show_leaf_counts=True)
        plt.show()
        return

In [None]:
shop_analyzer = SalesAnalyzer(statistics['sales_cnt_based']['shops'], statistics['word_based']['shops'])

In [None]:
shop_analyzer.heatmap(True)

## Let's cover some shops with similar categories

In [None]:
shop_analyzer.multiplot(index_list=['Курск ТЦ "Пушкинский"', 'Воронеж ТРЦ "Максимир"', 'Москва ТЦ "МЕГА Белая Дача II"', 'Москва ТЦ "МЕГА Теплый Стан" II', 'Уфа ТЦ "Семья" 2', 'Ярославль ТЦ "Альтаир"', 'Москва ТЦ "Ареал" (Беляево)', 'Москва ТЦ "Семеновский"', 'Москва МТРЦ "Афи Молл"'], by_label=True, title='Shops with common categories')

In [None]:
# Second shop cluster found in the shops-by-categories dendrogram. The title now
# matches the cell above; "common names" belongs to the next section.
shop_analyzer.multiplot(index_list=['Чехов ТРЦ "Карнавал"', 'Коломна ТЦ "Рио"', 'Москва ТРК "Атриум"', 'Омск ТЦ "Мега"'], by_label=True, title='Shops with common categories')

## Shops with common naming

In [None]:
shop_analyzer.multiplot(index_list=statistics['word_based']['shops'].from_vals(statistics['word_based']['shops'].from_tokens(['москва', 'тц'])), by_label=True, title='Shops with common names')

In [None]:
shop_analyzer.multiplot(index_list=statistics['word_based']['shops'].from_vals(statistics['word_based']['shops'].from_tokens(['мега', 'тц'])), by_label=True, title='Shops with common names')

In [None]:
shop_analyzer.multiplot(index_list=statistics['word_based']['shops'].from_vals(statistics['word_based']['shops'].from_tokens(['якутск'])), by_label=True, title='Shops with common names')

As we can see, shops with common categories have something in common in their sales.

In [None]:
shop_analyzer.show_clusters()

## Discovering categories

In [None]:
category_analyzer = SalesAnalyzer(statistics['sales_cnt_based']['categories'], statistics['word_based']['categories'])

In [None]:
category_analyzer.heatmap(True)

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['игровые', 'консоли'])), by_label=True, title='Category sales tendecy')

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['игры'])), by_label=True, title='Category sales tendecy')

Games are less dependent on seasonality, but have a peak in summer (mostly games are published here)

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['книги'])), by_label=True, title='Category sales tendecy')

Russian people start reading manga & comics

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['литература'])), by_label=True, title='Category sales tendecy')

Maybe some trouble with dataset with literature

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['музыка'])), by_label=True, title='Category sales tendecy')

People less and less use self-made audio 

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['подарки'])), by_label=True, title='Category sales tendecy')

Seasonality itself

In [None]:
category_analyzer.multiplot(index_list=statistics['word_based']['categories'].from_vals(statistics['word_based']['categories'].from_tokens(['ps3'])), by_label=True, title='Category sales tendecy')

In [None]:
category_analyzer.show_clusters()

Pretty reasonable, but strange at times

In [None]:
# Despite the "weekly" prefix, this is one total per distinct value of the
# date index (the grouping key is the index itself, not a weekly period).
weekly_total_cnt_ = date_indexed['item_cnt_day'].groupby(dataset_income.index).sum()
weekly_total_cnt_

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of the daily totals: every lag up to a year, a zoom on lags
# 25-39, and a few hand-picked lags.
all_lags = list(range(366))
monthly_zoom_lags = list(range(25, 40))
plot_acf(weekly_total_cnt_, lags=all_lags)
_ = plot_acf(weekly_total_cnt_, lags=monthly_zoom_lags)
_ = plot_acf(weekly_total_cnt_, lags=[0, 1, 7, 28, 92, 365])

As we can see, the autocorrelation function suggests that significant seasonal periods can probably be found at lags [1-7, 30-31, 91-92, 365].

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive seasonal decomposition of the daily totals, once per candidate
# period length (in days); each result is stored under its key and plotted.
decomposition = {}
for season, period in {'week': 7, 'month': 31, 'quater': 92, 'year': 365}.items():
    decomposition[season] = seasonal_decompose(weekly_total_cnt_, model='additive', period=period)
    _ = decomposition[season].plot()

In [None]:
figure, axis = plt.subplots(2, 2,figsize=(20, 12))

# (panel title, key in `decomposition`), in row-major panel order.
residual_panels = [('Week', 'week'), ('Month', 'month'), ('Quater', 'quater'), ('Year', 'year')]

# Every panel uses stat="probability" so the four histograms share the same
# y-axis meaning; the "Year" panel previously fell back to raw counts.
for panel, (panel_title, key) in zip(axis.flat, residual_panels):
    panel.title.set_text(panel_title)
    sns.histplot(decomposition[key].resid.dropna(), kde=True, ax=panel, stat="probability")

In [None]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# `fits` was never defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): the daily totals series is used as the base series because the
# cells below forecast 120 steps with seasonal_periods=365 and regroup the index
# by week - confirm this is the series that was originally intended.
fitted = pd.DataFrame({'base': weekly_total_cnt_})

## Lets use ETS with additive trend and seasonality to predict 4 month (from july 2015)

In [None]:
_, axs = plt.subplots(1, 1, figsize=(30, 24))

# Number of trailing observations held out from fitting and forecast instead.
holdout = 120

forecaster = ExponentialSmoothing(fitted['base'].iloc[:-holdout], trend='add', seasonal='add', seasonal_periods=365).fit()

# In-sample fitted values align on the index; the held-out rows are left empty.
fitted['ETS(A,A)'] = forecaster.fittedvalues
# Fill the held-out rows positionally through a single .iloc assignment.
# The chained form `fitted[col][-120:] = ...` writes into a temporary object
# and may never reach `fitted` (SettingWithCopyWarning / copy-on-write).
ets_column = fitted.columns.get_loc('ETS(A,A)')
fitted.iloc[-holdout:, ets_column] = np.asarray(forecaster.forecast(holdout))
fitted.plot(ax = axs)

In [None]:
# Forecast error (actual minus ETS value) over the last 120 rows.
forecast_error = fitted['base'] - fitted['ETS(A,A)']
residuals = forecast_error[-120:]
sns.histplot(residuals, bins=10, kde=True)
residuals.describe()

In [None]:
_, axs = plt.subplots(1, 1, figsize=(16, 9))

# Sum the actual and fitted series per weekly period. .sum() is the direct
# spelling of agg(np.sum), which recent pandas flags as deprecated.
fitted.groupby(fitted.index.to_period('W')).sum().plot(ax=axs)

### But we still have 83 categories, 60 shops and 22k unique items. How do we cover them all? I was told to create simple regression predictors providing something like naive, seasonal naive and mean time-based forecast predictors plus the overall trend. I know these methods, but I also read about exponential smoothing and found it more suitable, because it estimates trend and seasonality by itself (while a regression estimate of these components is less flexible).

In [None]:
dframe = pd.DataFrame(weekly_total_cnt_)

def name_wrapper(x, name):
    """Set ``x.__name__`` to ``name`` and return ``x``."""
    x.__name__ = name
    return x

# One 0/1 (int8) indicator column per month for the first 11 entries of
# month_names; the 12th month gets no column.
for month_number, month_label in enumerate(month_names[:11], start=1):
    dframe[f'is_{month_label}'] = np.int8(dframe.index.month == month_number)

# Move the date index into an ordinary column.
dframe = dframe.reset_index()
dframe