In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import scipy
from scipy import stats

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

print("Setup Complete")

In [None]:
# Every competition CSV lives in the same input directory; only the file name differs.
_INPUT_DIR = '../input/competitive-data-science-predict-future-sales'

# Logical table name -> path of the CSV file it is loaded from.
dataset_paths = {
    table: f'{_INPUT_DIR}/{file_name}'
    for table, file_name in [
        ('categories', 'item_categories.csv'),
        ('items', 'items.csv'),
        ('sales', 'sales_train.csv'),
        ('shops', 'shops.csv'),
        ('test', 'test.csv'),
    ]
}
print('Paths are ready')

In [None]:
# Load every CSV listed in dataset_paths into a DataFrame, keyed by its table name.
dataset = {}
for table, csv_path in dataset_paths.items():
    dataset[table] = pd.read_csv(csv_path)
dataset.keys()

In [None]:
# Short aliases for the tables used throughout the notebook.
sales, items, categories, shops = (
    dataset[table] for table in ('sales', 'items', 'categories', 'shops')
)

In [None]:
# Convert the raw `date` column to datetime64 so it can be compared and bucketed.
# NOTE(review): astype() leaves format detection to pandas. If the CSV stores
# dates day-first (e.g. dd.mm.yyyy), some rows may be read month-first — confirm
# against the file and consider pd.to_datetime(..., format=...) instead.
sales.date = sales.date.astype('datetime64[ns]')

print("Before:", sales.shape)

from datetime import date

# Keep only the rows dated before 1 November 2015.
sales = sales.loc[sales.date < np.datetime64(date(2015, 11, 1))]

# Training subset: additionally drop rows whose daily count is >= 1000 or whose
# price is outside the open interval (0, 60000). `.copy()` detaches it from `sales`.
sales_train = sales[
    (sales["item_cnt_day"] < 1000)
    & (sales["item_price"] > 0)
    & (sales["item_price"] < 60000)
].copy()
print("After:", sales_train.shape)

# Note: this displays the date-filtered `sales`, not `sales_train`.
sales.head()

In [None]:
# Exact-name overrides, checked before any splitting is attempted
# (see create_transformer): raw name -> generalized name.
force_category = {
    # item category name -> global category name
    'category': {
        "PC - Гарнитуры/Наушники": "Аксессуары",
        "Игры MAC - Цифра": "Игры",
        "Игры Android - Цифра": "Игры",
        "Чистые носители (шпиль)": "Чистые носители",
        "Чистые носители (штучные)": "Чистые носители",
    },
    # shop name -> city (or pseudo-city) name
    'shop': {
        'Интернет-магазин ЧС': 'Интернет-магазин',
        'Цифровой склад 1С-Онлайн': 'Склад',
        # Mapped to itself so that the name is not cut at its first space.
        'Выездная Торговля': 'Выездная Торговля',
        '!Якутск Орджоникидзе, 56 фран': 'Якутск',
        '!Якутск ТЦ "Центральный" фран': 'Якутск',
    },
}

# Separator per name kind: names that are not overridden above are reduced to
# the text before the first occurrence of this separator.
pattern = {
    'category': ' - ',
    'shop': ' ',
}

def create_transformer(force_category, pattern):
    """Build a function that maps a raw name to its generalized name.

    Args:
        force_category: mapping of exact raw names to the value returned for them.
        pattern: separator string; for names that are not in ``force_category``
            the text before its first occurrence is returned.

    Returns:
        A one-argument callable. Names that are neither overridden nor contain
        the separator are returned unchanged.
    """
    def _wrapped(value):
        # Explicit overrides win over splitting.
        if value not in force_category:
            pieces = value.split(pattern)
            return value if len(pieces) <= 1 else pieces[0]
        return force_category[value]

    return _wrapped
    
# Name generalizers: item category name -> global category, shop name -> city.
# Each one applies its override table first and otherwise keeps the text before
# the first separator (' - ' for categories, ' ' for shops).
make_cat_name = create_transformer(force_category['category'], pattern['category'])
make_city_name = create_transformer(force_category['shop'], pattern['shop'])

In [None]:
class PreprocessignPipeline:
    """Ordered sequence of named preprocessing stages.

    ``tasks`` maps a stage name to a callable taking the current dataset.
    ``task_queue`` lists the stage names in execution order. Calling the
    pipeline with a dataset returns a ``PipelineIterator`` bound to it; nothing
    is executed until that iterator is consumed or ``proceed_all()`` is called.
    """

    class PipelineIterator:
        """A single (lazy) run of the pipeline over one dataset."""

        def __init__(self, dataset, tasks, task_queue):
            self.tasks = tasks
            self.task_queue = task_queue
            self.dataset = dataset
            self.current_task = None
            # stage name -> dataset as it was after that stage
            self.result_storage = {}
            # True only once every stage in task_queue has completed
            self.proceed = False

        def _run_task(self, task, dataset):
            """Run one stage, store its result and return the dataset to pass on.

            A stage returning ``None`` is treated as "worked in place": the
            incoming dataset object is kept and passed to the next stage.
            """
            self.current_task = self.tasks[task]
            try:
                outcome = self.current_task(dataset)
            except Exception:
                # Report which stage failed, then let the error propagate.
                print(f'Exception occured in stage {task}')
                raise
            if outcome is not None:
                dataset = outcome
            self.result_storage[task] = dataset
            print(f'Stage - {task} complete')
            return dataset

        def __iter__(self):
            """Yield the result of each stage in queue order.

            On a completed run the stored results are replayed without
            re-executing the stages; otherwise the stages are executed one by
            one, each result being yielded as soon as it is available.
            """
            if self.proceed:
                for task in self.task_queue:
                    yield self.result_storage[task]
                return

            dataset = self.dataset
            for task in self.task_queue:
                dataset = self._run_task(task, dataset)
                yield self.result_storage[task]
            self.proceed = True

        def proceed_all(self):
            """Run every stage (unless already done) and return ``result_storage``.

            The run is marked as complete only after the last stage succeeded,
            so a failure in the middle does not leave a partially filled
            ``result_storage`` flagged as finished. If a stage raises, the
            exception propagates after the stage name has been printed.
            """
            if not self.proceed:
                dataset = self.dataset
                for task in self.task_queue:
                    dataset = self._run_task(task, dataset)
                self.proceed = True
            return self.result_storage

    def __init__(self, tasks, task_queue):
        self.tasks = tasks
        self.task_queue = task_queue

    def __call__(self, dataset):
        """Bind the pipeline to ``dataset`` and return a fresh, not yet executed run."""
        return self.PipelineIterator(dataset, self.tasks, self.task_queue)

In [None]:
data_preprocessing = {}

# Add columns created by transformers
def append_columns(dataset, columns, transformers):
    """Add (or overwrite) columns of ``dataset`` in place.

    ``columns`` and ``transformers`` are paired positionally: each transformer
    is called with the dataset and its result is stored under the matching
    column name. Columns are written one after another, so a later transformer
    can read a column written by an earlier one. Nothing is returned.
    """
    pairs = zip(columns, transformers)
    for name, make_values in pairs:
        dataset[name] = make_values(dataset)

# Stage 1: join the item, shop and category tables onto every sale row
# (via item_id, shop_id and item_category_id respectively).
data_preprocessing['id_merging_stage'] = lambda dataset: dataset.merge(
    items, 
    on='item_id'
).merge(
    shops,
    on='shop_id'
).merge(
    categories,
    on='item_category_id'
)

# Stage 2: total item_cnt_day per (date, shop, item category), keeping the
# month number and the shop/category names, sorted by date.
data_preprocessing['summarizing_and_name_merging_stage'] = lambda dataset: dataset.groupby(
    ['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_category_name', 'shop_name']
).item_cnt_day.sum().reset_index().sort_values('date')

# Stage 3: add the generalized category / city names and their integer codes.
# The two encoders read the name columns written by the first two transformers,
# so the order of the lists below matters. append_columns works in place and
# returns None, which the pipeline treats as "keep the same dataset object".
data_preprocessing['add_generalized_names_and_encode_stage'] = lambda dataset: append_columns(
    dataset=dataset, 
    columns=[
        'global_item_category_name',
        'city_name',
        'global_item_category_name_id',
        'city_id',
    ], 
    transformers=[
        lambda _dataset: _dataset["item_category_name"].apply(
            make_cat_name
        ),
        lambda _dataset: _dataset['shop_name'].apply(
            make_city_name
        ),
        lambda _dataset: LabelEncoder().fit_transform(_dataset['global_item_category_name']),
        lambda _dataset: LabelEncoder().fit_transform(_dataset['city_name']),
    ]
)

        
# Stage 4: rename date_block_num to month_block in place (rename(..., inplace=True)
# returns None). After this stage the column date_block_num no longer exists.
data_preprocessing['date_block_num_renaming'] = lambda dataset: dataset.rename(columns={'date_block_num': 'month_block'}, inplace=True)
        
# Stage 5: integer week and day numbers, obtained by label-encoding the weekly
# and daily periods of `date` that are present in the dataset.
data_preprocessing['date_encoding_stage'] = lambda dataset: append_columns(
    dataset=dataset, 
    columns=[
        'week_block',
        'day_block',
    ], 
    transformers=[
        lambda _dataset: LabelEncoder().fit_transform(_dataset['date'].dt.to_period('W')),
        lambda _dataset: LabelEncoder().fit_transform(_dataset['date'].dt.to_period('D')),
    ]
)

In [None]:
# Stages run in exactly this order; each stage receives the output of the previous one.
stage_order = [
    'id_merging_stage',
    'summarizing_and_name_merging_stage',
    'add_generalized_names_and_encode_stage',
    'date_block_num_renaming',
    'date_encoding_stage',
]

pipeline = PreprocessignPipeline(tasks=data_preprocessing, task_queue=stage_order)

In [None]:
# Two lazy runs over the same stages: one on the date-filtered `sales`, one on
# the additionally outlier-filtered `sales_train`. Calling the pipeline only
# binds the dataset; no stage is executed yet.
pipeline_test = pipeline(sales)
pipeline_train = pipeline(sales_train)

In [None]:
# Execute every stage of both runs. The returned dicts are discarded because
# the same results stay available as `result_storage` on each run object.
_ = pipeline_test.proceed_all()
_ = pipeline_train.proceed_all()

In [None]:
# Output of the last pipeline stage for each split, keyed by split name.
task_df = {}

task_df['test'] = pipeline_test.result_storage['date_encoding_stage']
task_df['train'] = pipeline_train.result_storage['date_encoding_stage']

In [None]:
pair_columns = ['city_id', 'global_item_category_name_id', 'city_name', 'global_item_category_name']

# value_counts() is used only to enumerate the distinct (city, global category)
# combinations in sorted order; the counts themselves are discarded.
pair_counts = task_df['test'].loc[:, pair_columns].value_counts().sort_index()

# One row per combination, numbered 0..n-1 in that sorted order.
idx = pd.DataFrame({'id': list(range(pair_counts.size))}, pair_counts.index).reset_index()

In [None]:
# Attach the pair `id` to both splits by matching on the four pair columns.
pair_keys = ['city_id', 'global_item_category_name_id', 'city_name', 'global_item_category_name']

for split_name in ('test', 'train'):
    task_df[split_name] = task_df[split_name].merge(idx, on=pair_keys)

In [None]:
# Human-readable label of each pair: "<city name> - <global category name>".
idx['pair_name'] = idx['city_name'] + ' - ' + idx['global_item_category_name']

## ETS-based predictions

### Worst estimation

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error as mse

class ETSModelRaw:
    """Exponential-smoothing forecaster fitted independently for every ``id`` series.

    The input frames are expected to contain an ``id`` column, an
    ``item_cnt_day`` column and a time-block column that depends on ``freq``
    (``month_block`` / ``week_block`` / ``day_block``).

    Args:
        trend, seasonal, seasonal_periods: forwarded to ``ExponentialSmoothing``.
        freq: 'M', 'W' or 'D'; selects the time-block column and the size of the
            validation windows.
    """

    # Number of blocks in one cycle per frequency; validation starts after two cycles.
    _BLOCKS_PER_CYCLE = {'M': 12, 'W': 52, 'D': 365}
    # Column holding the time-block number for each frequency.
    _BLOCK_COLUMN = {'M': 'month_block', 'W': 'week_block', 'D': 'day_block'}
    # Number of consecutive blocks forecast (and summed) in one validation window.
    _STEP_SIZE = {'M': 1, 'W': 4, 'D': 28}

    def __init__(self, trend='add', seasonal='add', freq='M', seasonal_periods=12):
        self._model_kwargs = {
            'trend': trend,
            'seasonal': seasonal,
            'seasonal_periods': seasonal_periods,
            'freq': freq
        }
        self._forecasters = []

    def _smooth_line(self, series):
        """Fit one exponential-smoothing model to a single series."""
        return ExponentialSmoothing(
                    series,
                    trend=self._model_kwargs['trend'],
                    seasonal=self._model_kwargs['seasonal'],
                    seasonal_periods=self._model_kwargs['seasonal_periods']
        ).fit(optimized=True)

    def _block_column(self, dataset):
        """Name of the time-block column of ``dataset`` for the configured frequency.

        For monthly models the pre-rename column ``date_block_num`` is accepted
        as a fallback when ``month_block`` is absent.
        """
        column = self._BLOCK_COLUMN[self._model_kwargs['freq']]
        if (
            column == 'month_block'
            and column not in dataset.columns
            and 'date_block_num' in dataset.columns
        ):
            return 'date_block_num'
        return column

    @staticmethod
    def _pivot(dataset, block_column):
        """Sum item_cnt_day into a (time block x id) matrix; missing cells become 0."""
        return dataset.groupby([block_column, 'id']).item_cnt_day.sum().unstack(0).fillna(0).transpose()

    def _fit_serieses(self, serieses):
        """Fit one model per series in a process pool and return them in input order."""
        from multiprocessing import Pool
        with Pool() as pool:
            return pool.map(self._smooth_line, serieses)

    def fit(self, dataset):
        """Fit one model per id on the whole ``dataset`` and return the fitted models.

        The series are built on the time-block column that matches ``freq``
        (not on a fixed ``date_block_num``, which the preprocessing pipeline
        renames to ``month_block``).
        """
        _dataset = self._pivot(dataset, self._block_column(dataset))
        serieses = [_dataset.loc[:, id_] for id_ in _dataset.columns]

        self._forecasters = self._fit_serieses(serieses)

        return self._forecasters

    def predict(self, cnt):
        """Total forecast over the next ``cnt`` blocks, one value per fitted model."""
        return [forecaster.forecast(cnt).sum() for forecaster in self._forecasters]

    def validate(self, train, test):
        """Expanding-window validation.

        Starting after two full cycles, the models are refitted on all train
        blocks before the window, the next ``step_size`` blocks are forecast
        and the per-id totals are compared with the totals observed in ``test``.

        Returns:
            ``[window_starts, mse_per_window, reports]`` where every report is a
            DataFrame with the columns ``true_values`` and ``predicted``.

        Raises:
            KeyError: if ``freq`` is not one of 'M', 'W', 'D'.
            ValueError: if train and test do not contain the same ids.
        """
        freq = self._model_kwargs['freq']
        step_size = self._STEP_SIZE[freq]

        _dataset = self._pivot(train, self._block_column(train))
        _test = self._pivot(test, self._block_column(test))

        # Predictions are compared with the test values position by position,
        # so both matrices must have identical id columns.
        if not _test.columns.equals(_dataset.columns):
            raise ValueError('train and test must contain the same set of ids')

        entry = 2 * self._BLOCKS_PER_CYCLE[freq]
        _min = entry
        maximal = _dataset.index.max()

        errors = [
            [], [], []
        ]

        print('Start validation')

        # Only windows that fit completely into the available blocks are
        # evaluated; a trailing partial window would index past the last row.
        while entry + step_size - 1 <= maximal:
            serieses = [_dataset.loc[_dataset.index < entry, id_] for id_ in _dataset.columns]

            self._forecasters = self._fit_serieses(serieses)

            # Rows are selected by position; this matches the block labels as
            # long as the block numbers are contiguous and start at 0.
            window_rows = [entry + i for i in range(step_size)]
            y_vals = _test.iloc[window_rows].sum().transpose().to_numpy()
            train_vals = _dataset.iloc[window_rows].sum().transpose().to_numpy()
            # Diagnostic only: how far the train totals are from the test totals.
            print(f'difference {mse(y_vals,train_vals)}')

            predicted = self.predict(step_size)

            errors[0].append(entry)

            assert len(y_vals) == len(predicted), 'predictions should be the same size'

            errors[1].append(mse(y_vals, predicted))
            print(f'Split {entry-_min}/{maximal-_min}')
            print(f'MSE error: {errors[1][-1]}')

            report = pd.DataFrame({'true_values': y_vals, 'predicted': predicted})
            errors[2].append(report)

            entry += step_size

        return errors

In [None]:
# The monthly model uses the class defaults (freq='M', seasonal_periods=12);
# the weekly model overrides both.
month_model = ETSModelRaw()
week_model = ETSModelRaw(freq='W', seasonal_periods=52)

In [None]:
import warnings
# Silence FutureWarning / RuntimeWarning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting and
# modelling libraries used below.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Validation results per frequency: [window starts, MSE per window, reports].
errors = {}

# Models are fitted on the outlier-filtered split and scored against the unfiltered one.
errors['month'] = month_model.validate(task_df['train'], task_df['test'])

In [None]:
# Same validation with the weekly model.
errors['week'] = week_model.validate(task_df['train'], task_df['test'])

In [None]:
# Top row: monthly validation, bottom row: weekly validation.
# Left: distribution of the per-window MSE, right: MSE against the window start.
_, axs = plt.subplots(2, 2, figsize=(20, 9))

axs[0][0].set_title("MSE histogram in validation scheme")
axs[0][1].set_title("MSE scatterplot")
axs[0][1].set_xlabel('window')
axs[0][1].set_ylabel('MSE')

axs[1][0].set_title("MSE histogram in validation scheme [week]")
axs[1][1].set_title("MSE scatterplot [week]")
axs[1][1].set_xlabel('window')
axs[1][1].set_ylabel('MSE')

# x and y are passed by keyword: recent seaborn releases accept only `data`
# as a positional argument, so the positional form fails there.
sns.scatterplot(x=errors['month'][0], y=errors['month'][1], ax=axs[0][1])
sns.histplot(errors['month'][1], ax=axs[0][0])

# The weekly scatterplot leaves out the last window; the histogram includes it.
sns.scatterplot(x=errors['week'][0][:-1], y=errors['week'][1][:-1], ax=axs[1][1])
sns.histplot(errors['week'][1], ax=axs[1][0])

In [None]:
def stat_info(errors, n_ids=None):
    """Aggregate the validation reports into residual statistics.

    Args:
        errors: result of ``ETSModelRaw.validate``; only ``errors[2]`` (the
            list of per-window reports) is read.
        n_ids: number of rows (ids) in every report. By default it is taken
            from the first report instead of being hard-coded.

    Returns:
        ``(mean_residuals, residual_series)``:
        ``mean_residuals`` holds, per id, the mean absolute residual
        (``abs_resid``) and the mean signed residual (``resid``) over all
        windows; ``residual_series`` holds the signed residuals indexed by
        ``(index, window)``, or ``None`` when there are no reports.
    """
    reports = errors[2]
    if n_ids is None:
        n_ids = len(reports[0]) if len(reports) > 0 else 0

    mean_residuals = pd.DataFrame(
        {'abs_resid': [0] * n_ids, 'resid': [0] * n_ids},
        index=range(n_ids),
    )

    # Collected per window and concatenated once at the end, instead of
    # re-concatenating a growing frame on every iteration.
    per_window = []
    for window, report in enumerate(reports):
        residuals = get_statistics(report)
        mean_residuals['abs_resid'] += residuals['abs_residuals']
        mean_residuals['resid'] += residuals['residuals']

        resids = residuals['residuals'].reset_index()
        resids['window'] = window
        per_window.append(resids.set_index(['index', 'window']))

    if not per_window:
        return mean_residuals, None

    mean_residuals /= len(reports)
    return mean_residuals, pd.concat(per_window)

In [None]:
def get_statistics(report):
    """Return a copy of ``report`` extended with residual columns.

    ``report`` must provide ``true_values`` and ``predicted``. Added columns:
    ``residuals`` (predicted - true), ``id`` (the row index), ``abs_residuals``
    and ``percentage`` (twice the residual divided by the sum of true and
    predicted value). The input frame is left untouched.
    """
    residuals = report['predicted'] - report['true_values']
    return report.assign(
        residuals=residuals,
        id=report.index,
        abs_residuals=residuals.abs(),
        percentage=2 * residuals / (report['true_values'] + report['predicted']),
    )

In [None]:
# Per frequency: the (mean residuals per id, per-window residuals) tuple
# returned by stat_info.
proceed_residuals = {}

proceed_residuals['month'] = stat_info(errors['month'])
proceed_residuals['week'] = stat_info(errors['week'])

### Stable interval exploration

In [None]:
from time import sleep

# An unconditional `while True` here stops "Restart & Run All" from ever
# reaching the cells below. The wait is therefore opt-in: set the flag to True
# only when the kernel has to be kept busy on purpose.
KEEP_KERNEL_ALIVE = False

while KEEP_KERNEL_ALIVE:
    sleep(3)

#### 1) Month

In [None]:
# Monthly residuals of the first eight validation windows (0-7), labelled with the pair info.
month_windows = proceed_residuals['month'][1].reset_index()
stable_residual_interval = (
    month_windows[month_windows.window < 8]
    .rename(columns={'index': 'id'})
    .merge(idx)
)
stable_residual_interval['abs_residuals'] = stable_residual_interval['residuals'].abs()
sns.boxplot(data=stable_residual_interval, x='residuals')

In [None]:
# Share of the summed squared error that comes from residuals larger than 200 in absolute value.
all_residuals = stable_residual_interval.residuals
total_error = all_residuals.dot(all_residuals)

outliers = stable_residual_interval.loc[stable_residual_interval.abs_residuals > 200]
outliers_error = outliers.residuals.dot(outliers.residuals)

outlier_share = len(outliers) / len(stable_residual_interval)
print(f'{outlier_share} residuals cause {outliers_error/total_error} mistake')

In [None]:
"""
    Theory of 'giant outliers':
    
    We consider that: 
        <<less than 5% of residuals in each window cause more than 80% of total error for this window>> 
    
    --> So we would check top 5% residuals by absolute error in each window (top 22 id pairs)
"""

In [None]:
# For each of the windows 0-7: the 22 pairs with the largest absolute residual
# and the share of the window's summed squared error they account for.
giants = []
totals = []

for i in range(8):
    slice_ = stable_residual_interval[stable_residual_interval.window == i]
    total = slice_.residuals @ slice_.residuals
    # Explicit copy: the 'part' column is added to an independent frame rather
    # than to a slice of `slice_` (chained assignment).
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    # Aligned on the row index, so every row gets its own share of the total.
    top['part'] = (slice_.residuals ** 2) / total
    giants.append(top)
    totals.append(total)
    print(f'Window {i}: 5% cause {(top.residuals @ top.residuals)/(total)}')

In [None]:
matrix_columns = ['pair_name', 'window', 'residuals']
matrix_index = ['pair_name', 'window']

giant_outliers = pd.concat(giants)

# Names of all pairs that were among the giants in at least one window.
giant_pivot = giant_outliers.loc[:, matrix_columns].set_index(matrix_index).unstack().fillna(0)
pair_name_bag = tuple(giant_pivot.index)

giant_outliers_matrix = {}

# Residuals of those pairs in every window of the interval (pair x window, gaps filled with 0).
target = stable_residual_interval[stable_residual_interval.pair_name.isin(pair_name_bag)]

giant_outliers_matrix['pair_name'] = (
    target.loc[:, matrix_columns].set_index(matrix_index).unstack().fillna(0)
)

In [None]:
giant_outliers_matrix['pair_name'].transpose().plot(figsize=(30, 24))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (giant_outliers.pair_name.value_counts())

In [None]:
"""
    EDA of results
"""

In [None]:
def explore_plot(dataset, periodicy, global_categories, cities):
    """Plot the summed item_cnt_day per period for the selected city/category pairs.

    Args:
        dataset: sales frame that can be merged with the global ``idx`` table.
        periodicy: name of the time-block column used as the x axis.
        global_categories: global category names to keep.
        cities: city names to keep.

    Returns:
        The object returned by ``DataFrame.plot`` (one line per pair name).
    """
    merged = dataset.merge(idx)
    # The mask is built from the merged frame itself, so it always lines up
    # with the rows being filtered, whichever dataset is passed in.
    selected = merged[
        merged.global_item_category_name.isin(global_categories)
        & merged.city_name.isin(cities)
    ]
    return selected.groupby([periodicy, 'pair_name']).item_cnt_day.sum().unstack().fillna(0).plot(figsize=(20, 10))

In [None]:
def residual_plot(dataset, global_categories, cities):
    """Plot the residuals per validation window for the selected city/category pairs.

    Args:
        dataset: residual frame with ``window`` and ``residuals`` columns that
            can be merged with the global ``idx`` table.
        global_categories: global category names to keep.
        cities: city names to keep.

    Returns:
        The object returned by ``DataFrame.plot`` (one line per pair name).
    """
    merged = dataset.merge(idx)
    # Filter with a mask computed on the merged frame, not on the pre-merge
    # frame whose index does not have to match the merge result.
    selected = merged[
        merged.global_item_category_name.isin(global_categories)
        & merged.city_name.isin(cities)
    ]
    return selected.loc[:,['pair_name', 'window', 'residuals']].set_index(['window', 'pair_name']).unstack().fillna(0).plot(figsize=(20, 10))

In [None]:
residual_plot(stable_residual_interval, ['Игры PC', 'Кино', 'Подарки', 'Игры'], ['Москва', 'СПб', 'Интернет-магазин', 'Якутск'])

In [None]:
explore_plot(task_df['test'], 'month_block', ['Игры PC', 'Кино', 'Подарки', 'Игры'], ['Москва', 'СПб', 'Интернет-магазин', 'Якутск'])

#### 2) Week

In [None]:
# Weekly residuals of the validation windows 1-8, labelled with the pair info.
week_windows = proceed_residuals['week'][1].reset_index()
stable_residual_interval = (
    week_windows[(week_windows.window < 9) & (week_windows.window > 0)]
    .rename(columns={'index': 'id'})
    .merge(idx)
)
stable_residual_interval['abs_residuals'] = stable_residual_interval['residuals'].abs()
sns.boxplot(data=stable_residual_interval, x='residuals')

In [None]:
# Share of the summed squared error that comes from residuals larger than 160 in absolute value.
all_residuals = stable_residual_interval.residuals
total_error = all_residuals.dot(all_residuals)

outliers = stable_residual_interval.loc[stable_residual_interval.abs_residuals > 160]
outliers_error = outliers.residuals.dot(outliers.residuals)

outlier_share = len(outliers) / len(stable_residual_interval)
print(f'{outlier_share} residuals cause {outliers_error/total_error} mistake')

In [None]:
# For each of the weekly windows 1-8: the 22 pairs with the largest absolute
# residual and the share of the window's summed squared error they account for.
giants = []
totals = []

for i in range(8):
    slice_ = stable_residual_interval[stable_residual_interval.window == i + 1]
    total = slice_.residuals @ slice_.residuals
    # Explicit copy: the 'part' column is added to an independent frame rather
    # than to a slice of `slice_` (chained assignment).
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    # Aligned on the row index, so every row gets its own share of the total.
    top['part'] = (slice_.residuals ** 2) / total
    giants.append(top)
    totals.append(total)
    print(f'Window {i + 1}: 5% cause {(top.residuals @ top.residuals)/(total)}')

In [None]:
matrix_columns = ['pair_name', 'window', 'residuals']
matrix_index = ['pair_name', 'window']

giant_outliers = pd.concat(giants)

# Names of all pairs that were among the giants in at least one window.
giant_pivot = giant_outliers.loc[:, matrix_columns].set_index(matrix_index).unstack().fillna(0)
pair_name_bag = tuple(giant_pivot.index)

giant_outliers_matrix = {}

# Residuals of those pairs in every window of the interval (pair x window, gaps filled with 0).
target = stable_residual_interval[stable_residual_interval.pair_name.isin(pair_name_bag)]

giant_outliers_matrix['pair_name'] = (
    target.loc[:, matrix_columns].set_index(matrix_index).unstack().fillna(0)
)

In [None]:
giant_outliers_matrix['pair_name'].transpose().plot(figsize=(30, 24))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (giant_outliers.pair_name.value_counts())

In [None]:
# Distribution of the per-window residuals, restricted to |residual| < 500.
month_resid = proceed_residuals['month'][1].residuals
week_resid = proceed_residuals['week'][1].residuals

_, axs = plt.subplots(1, 2, figsize=(18, 5))

axs[0].set_title("Histogram plot [month]")
axs[1].set_title("Histogram plot [week]")
# axs[2].set_title("Histogram plot [day]")

sns.histplot(month_resid[month_resid.abs() < 500], ax=axs[0])
sns.histplot(week_resid[week_resid.abs() < 500], ax=axs[1])
# sns.histplot(proceed_residuals['day']['resid'], ax=axs[2])

## Unstable regions

### Month

In [None]:
# Monthly residuals of the validation windows after window 7, labelled with the pair info.
month_windows = proceed_residuals['month'][1].reset_index()
unstable_residual_interval = (
    month_windows[month_windows.window > 7]
    .rename(columns={'index': 'id'})
    .merge(idx)
)
unstable_residual_interval['abs_residuals'] = unstable_residual_interval['residuals'].abs()

In [None]:
# Residual spread of the two unstable monthly windows (8 and 9), side by side.
_, axs = plt.subplots(1, 2, figsize=(18, 5))

# The series is passed as `x=`: a positional Series is read as `x` by older
# seaborn releases but as wide-form `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=unstable_residual_interval[unstable_residual_interval.window == 8].residuals, ax=axs[0])
sns.boxplot(x=unstable_residual_interval[unstable_residual_interval.window == 9].residuals, ax=axs[1])

In [None]:
# For the monthly windows 8 and 9: the 22 pairs with the largest absolute
# residual and the share of the window's summed squared error they account for.
# giants[0] belongs to window 8, giants[1] to window 9.
giants = []
totals = []

for i in range(8,10):
    slice_ = unstable_residual_interval[unstable_residual_interval.window == i]
    total = slice_.residuals @ slice_.residuals
    # Explicit copy: the 'part' column is added to an independent frame rather
    # than to a slice of `slice_` (chained assignment).
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    # Aligned on the row index, so every row gets its own share of the total.
    top['part'] = (slice_.residuals ** 2) / total
    giants.append(top)
    totals.append(total)
    print(f'Window {i}: 5% cause {(top.residuals @ top.residuals)/(total)}')

In [None]:
giants[0]

In [None]:
giants[1]

In [None]:
explore_plot(task_df['test'], 'month_block', ['Билеты (Цифра)', 'Служебные'], ['Склад', 'Интернет-магазин'])

In [None]:
explore_plot(task_df['test'], 'month_block', ['Игры', 'Билеты (Цифра)', 'Служебные', 'Подарки'], ['Москва'])

### Week

In [None]:
# Weekly residuals of window 0 and of the windows after window 8, labelled with the pair info.
week_windows = proceed_residuals['week'][1].reset_index()
unstable_residual_interval = (
    week_windows[(week_windows.window > 8) | (week_windows.window < 1)]
    .rename(columns={'index': 'id'})
    .merge(idx)
)
unstable_residual_interval['abs_residuals'] = unstable_residual_interval['residuals'].abs()

In [None]:
# Residual spread of the three unstable weekly windows (0, 9 and 10), side by side.
_, axs = plt.subplots(1, 3, figsize=(18, 5))

# The series is passed as `x=`: a positional Series is read as `x` by older
# seaborn releases but as wide-form `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x=unstable_residual_interval[unstable_residual_interval.window == 0].residuals, ax=axs[0])
sns.boxplot(x=unstable_residual_interval[unstable_residual_interval.window == 9].residuals, ax=axs[1])
sns.boxplot(x=unstable_residual_interval[unstable_residual_interval.window == 10].residuals, ax=axs[2])

In [None]:
# For the weekly windows 0, 9 and 10: the 22 pairs with the largest absolute
# residual and the share of the window's summed squared error they account for.
# giants[0], giants[1], giants[2] belong to the windows 0, 9 and 10.
giants = []
totals = []

for i in [0,9,10]:
    slice_ = unstable_residual_interval[unstable_residual_interval.window == i]
    total = slice_.residuals @ slice_.residuals
    # Explicit copy: the 'part' column is added to an independent frame rather
    # than to a slice of `slice_` (chained assignment).
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    # Aligned on the row index, so every row gets its own share of the total.
    top['part'] = (slice_.residuals ** 2) / total
    giants.append(top)
    totals.append(total)
    print(f'Window {i}: 5% cause {(top.residuals @ top.residuals)/(total)}')

In [None]:
giants[0]

In [None]:
giants[1]

In [None]:
giants[2]

In [None]:
explore_plot(task_df['test'], 'week_block', ['Подарки'], ['Москва'])

In [None]:
explore_plot(task_df['test'], 'week_block', ['Билеты (Цифра)', 'Служебные'], ['Склад', 'Интернет-магазин'])