In [None]:
import numpy as np
import pandas as pd

import scipy
from scipy import stats

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

# Directory that upload_df() writes freshly computed CSV artifacts into.
OUTPUT_PATH = './'

# When True, later cells load the precomputed CSVs listed in `preproceed_paths`
# instead of re-running the tsfresh rolling / extraction / selection steps.
UPLOAD_PREVIOUS = True

# Key into the `features` dict built later in the notebook ('filtered' or
# 'scaled'); selects which feature table is passed to the validators.
FEATURE_TYPE = 'scaled'

print("Setup Complete")

In [None]:
dataset_paths = {
    'categories': '../input/competitive-data-science-predict-future-sales/item_categories.csv',
    'items': '../input/competitive-data-science-predict-future-sales/items.csv',
    'sales': '../input/competitive-data-science-predict-future-sales/sales_train.csv',
    'shops': '../input/competitive-data-science-predict-future-sales/shops.csv',
    'test': '../input/competitive-data-science-predict-future-sales/test.csv'
}
preproceed_paths = {
    'df_rolled': '/kaggle/input/fstsfresh/df_rolled.csv',
    'features': '/kaggle/input/fstsfresh/features.csv',
    'features_filtered': '/kaggle/input/fstsfresh/features_filtered.csv'
}
print('Paths are ready')

In [None]:
dataset = { name: pd.read_csv(path) for name, path in dataset_paths.items()}
dataset.keys()

In [None]:
# Load the cached tsfresh artifacts and show *their* keys (the cell used to
# echo `dataset.keys()` a second time, which says nothing about this load).
proceed = {name: pd.read_csv(path) for name, path in preproceed_paths.items()}
proceed.keys()

In [None]:
sales = dataset['sales']
items = dataset['items']
categories = dataset['categories']
shops = dataset['shops']

In [None]:
from datetime import date

# Parse the date column with an explicit format instead of letting pandas
# guess: with day-first text such as "02.01.2013" a guessed month-first parse
# silently swaps day and month (or fails on days > 12).
# NOTE(review): assumes the CSV stores dates as dd.mm.yyyy — confirm.
sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')

print("Before:", sales.shape)

# Keep only sales strictly before 1 November 2015.
sales = sales.loc[sales['date'] < pd.Timestamp(date(2015, 11, 1))]

# Training copy with extreme daily counts and non-positive / extreme prices
# removed; `sales` itself keeps those rows and is used as the "test" source.
sales_train = sales[
    (sales["item_cnt_day"] < 1000)
    & (sales["item_price"] > 0)
    & (sales["item_price"] < 60000)
].copy()
print("After:", sales.shape)

sales.head()

In [None]:
force_category = {
    'category': {
        "PC - Гарнитуры/Наушники": "Аксессуары",
        "Игры MAC - Цифра": "Игры",
        "Игры Android - Цифра": "Игры",
        "Чистые носители (шпиль)": "Чистые носители",
        "Чистые носители (штучные)": "Чистые носители",
    },
    'shop': {
        'Интернет-магазин ЧС': 'Интернет-магазин',
        'Цифровой склад 1С-Онлайн': 'Склад',
        'Выездная Торговля': 'Выездная Торговля',
        '!Якутск Орджоникидзе, 56 фран': 'Якутск',
        '!Якутск ТЦ "Центральный" фран': 'Якутск',
    },
}

pattern = {
    'category': ' - ',
    'shop': ' ',
}

def create_transformer(force_category, pattern):
    """Build a function that maps a raw name to its generalized name.

    The returned callable first looks the value up in ``force_category`` and
    returns the override when one exists. Otherwise it splits the value on
    ``pattern`` and returns the first piece, or the value itself when the
    split produced no more than one piece.
    """
    def _wrapped(value):
        # Explicit overrides take priority over the generic prefix rule.
        if value in force_category:
            return force_category[value]

        pieces = value.split(pattern)
        return pieces[0] if len(pieces) > 1 else value

    return _wrapped
    
make_cat_name = create_transformer(force_category['category'], pattern['category'])
make_city_name = create_transformer(force_category['shop'], pattern['shop'])

In [None]:
class PreprocessignPipeline:
    """Ordered collection of named preprocessing stages.

    ``tasks`` maps a stage name to a callable taking the current dataset;
    ``task_queue`` lists the stage names in execution order. Calling the
    pipeline with a dataset returns a ``PipelineIterator`` bound to it.
    """

    class PipelineIterator:
        """Runs the queued stages over one dataset and keeps every stage result."""

        def __init__(self, dataset, tasks, task_queue):
            self.tasks = tasks
            self.task_queue = task_queue
            self.dataset = dataset
            self.current_task = None
            # stage name -> dataset as it was after that stage
            self.result_storage = {}
            # True only once *every* queued stage has finished successfully
            self.proceed = False

        def _run_task(self, task, dataset):
            """Run one stage, store its result and return the dataset to pass on.

            A stage returning ``None`` is treated as having modified the
            dataset in place, so the incoming object is carried forward.
            """
            self.current_task = self.tasks[task]
            try:
                proceed_task = self.current_task(dataset)
            except Exception:
                print(f'Exception occured in stage {task}')
                raise
            if proceed_task is not None:
                dataset = proceed_task
            self.result_storage[task] = dataset
            print(f'Stage - {task} complete')
            return dataset

        def __iter__(self):
            """Yield the result of each stage in queue order.

            Stages are executed lazily on the first full pass; later passes
            replay the stored results.
            """
            if self.proceed:
                for task in self.task_queue:
                    yield self.result_storage[task]
                return

            dataset = self.dataset
            for task in self.task_queue:
                dataset = self._run_task(task, dataset)
                yield self.result_storage[task]
            self.proceed = True

        def proceed_all(self):
            """Run every stage (once) and return the stage-name -> result dict.

            The completion flag is set only after the whole queue succeeded,
            so a failure part-way through does not mark the run as done.
            """
            if not self.proceed:
                dataset = self.dataset
                for task in self.task_queue:
                    dataset = self._run_task(task, dataset)
                self.proceed = True
            return self.result_storage

    def __init__(self, tasks, task_queue):
        self.tasks = tasks
        self.task_queue = task_queue

    def __call__(self, dataset):
        return self.PipelineIterator(dataset, self.tasks, self.task_queue)

In [None]:
data_preprocessing = {}

# Add columns created by transformers
def append_columns(dataset, columns, transformers):
    """Add one column per (name, transformer) pair to ``dataset`` in place.

    Each transformer receives the dataset as it is at that moment, so a later
    transformer may read a column created by an earlier one. Returns ``None``.

    Raises:
        ValueError: if ``columns`` and ``transformers`` differ in length
            (``zip`` would otherwise drop the unmatched tail silently).
    """
    columns = list(columns)
    transformers = list(transformers)
    if len(columns) != len(transformers):
        raise ValueError(
            f'Got {len(columns)} column names for {len(transformers)} transformers'
        )
    for column, transformer in zip(columns, transformers):
        dataset[column] = transformer(dataset)

# Attach item, shop and category attributes to every sale row by joining the
# `items`, `shops` and `categories` lookup tables on their id columns.
data_preprocessing['id_merging_stage'] = lambda dataset: dataset.merge(
    items, 
    on='item_id'
).merge(
    shops,
    on='shop_id'
).merge(
    categories,
    on='item_category_id'
)

# Sum item_cnt_day per (date, month block, shop, category); the name columns
# are part of the group key so they survive the aggregation. Rows are then
# ordered by date.
data_preprocessing['summarizing_and_name_merging_stage'] = lambda dataset: dataset.groupby(
    ['date', 'date_block_num', 'shop_id', 'item_category_id', 'item_category_name', 'shop_name']
).item_cnt_day.sum().reset_index().sort_values('date')

# Add the generalized category / city names and their label-encoded ids.
# append_columns works in place and the transformers run in list order, so the
# two LabelEncoder steps read the name columns created by the first two.
data_preprocessing['add_generalized_names_and_encode_stage'] = lambda dataset: append_columns(
    dataset=dataset, 
    columns=[
        'global_item_category_name',
        'city_name',
        'global_item_category_name_id',
        'city_id',
    ], 
    transformers=[
        lambda _dataset: _dataset["item_category_name"].apply(
            make_cat_name
        ),
        lambda _dataset: _dataset['shop_name'].apply(
            make_city_name
        ),
        lambda _dataset: LabelEncoder().fit_transform(_dataset['global_item_category_name']),
        lambda _dataset: LabelEncoder().fit_transform(_dataset['city_name']),
    ]
)

# Build the dense (date_block_num x id) monthly matrix in long form:
#   1. sum daily counts per shop / category / month / city / global category / id,
#   2. sum those per (month, id),
#   3. unstack + fillna(0) + stack so every (month, id) pair exists, with 0
#      where nothing was sold.
# The result has the columns date_block_num, id and item_cnt_month.
# (The expression is parenthesised rather than backslash-continued: the old
# version ended with a dangling `\` and repeated step 1 twice.)
data_preprocessing['create_full_matrix_stage'] = lambda _dataset: (
    _dataset.set_index('date')
    .groupby([
        'shop_id',
        'item_category_id',
        'date_block_num',
        'city_id',
        'global_item_category_name_id',
        'id'
    ]).item_cnt_day.sum()
    .reset_index().rename(columns={'item_cnt_day': 'item_cnt_month'})
    .groupby(['date_block_num', 'id']).item_cnt_month.sum().unstack().fillna(0)
    .stack().reset_index().rename(columns={0: 'item_cnt_month'})
)

In [None]:
pipeline = PreprocessignPipeline(
    tasks=data_preprocessing, 
    task_queue = [
        'id_merging_stage',
        'summarizing_and_name_merging_stage',
        'add_generalized_names_and_encode_stage',
    ]
)

In [None]:
pipeline_test = pipeline(sales)
pipeline_train = pipeline(sales_train)

In [None]:
_ = pipeline_test.proceed_all()
_ = pipeline_train.proceed_all()

In [None]:
task_df = {}

task_df['test'] = pipeline_test.result_storage['add_generalized_names_and_encode_stage']
task_df['train'] = pipeline_train.result_storage['add_generalized_names_and_encode_stage']

In [None]:
task_df['test']

In [None]:
idx = task_df['test'].loc[:,['city_id', 'global_item_category_name_id', 'city_name', 'global_item_category_name']].value_counts().sort_index()
idx = pd.DataFrame({'id': [i for i in range(idx.size)]}, idx.index)
idx.reset_index(inplace=True)

In [None]:
task_df['test'] = task_df['test'].merge(idx, on=['city_id', 'global_item_category_name_id', 'city_name', 'global_item_category_name'])
task_df['train'] = task_df['train'].merge(idx, on=['city_id', 'global_item_category_name_id', 'city_name', 'global_item_category_name'])

In [None]:
idx['pair_name'] = idx['city_name'] + ' - ' + idx['global_item_category_name']

In [None]:
# Keep a labelled copy of the per-day test frame before it is collapsed into
# the monthly matrix below. (This definition used to live in the cell *after*
# the one displaying it, which raised NameError on a fresh kernel.)
task_df_copy = {}

task_df_copy['test'] = task_df['test'].copy()
task_df_copy['test']['pair_name'] = task_df_copy['test']['city_name'] + ' - ' + task_df_copy['test']['global_item_category_name']

# Replace both frames by their dense monthly (date_block_num, id) form.
task_df['test'] = data_preprocessing['create_full_matrix_stage'](task_df['test'])
task_df['train'] = data_preprocessing['create_full_matrix_stage'](task_df['train'])

task_df_copy['test']

In [None]:
raw_dataset = task_df['train']

In [None]:
from tsfresh import extract_features, select_features

In [None]:
def get_target_values(dataset):
    """Pivot monthly counts into an ``id`` x ``date_block_num`` table.

    Sums ``item_cnt_month`` per (date_block_num, id) and moves the
    ``date_block_num`` level into the columns.
    """
    monthly_totals = dataset.groupby(['date_block_num', 'id'])['item_cnt_month'].sum()
    return monthly_totals.unstack(level=0)

def upload_df(dataset, path, name):
    """Write ``dataset`` as CSV to ``<path>/<name>.csv`` (index included).

    Args:
        dataset: object exposing ``to_csv`` (a pandas DataFrame / Series).
        path: target directory.
        name: file name without the ``.csv`` extension.
    """
    target = os.path.join(path, name + '.csv')
    # pandas expects a handle opened with newline='' so that it controls the
    # line terminator itself; plain write mode is enough (nothing is read back).
    with open(target, 'w', newline='', encoding='utf-8') as writer:
        dataset.to_csv(writer)

In [None]:
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame

if not UPLOAD_PREVIOUS:
    df_rolled = roll_time_series(raw_dataset, column_id='id', column_sort='date_block_num', min_timeshift=11, max_timeshift=33, rolling_direction=1)
    upload_df(df_rolled.reset_index(), OUTPUT_PATH, 'df_rolled')
else:
    df_rolled = proceed['df_rolled']
    df_rolled = df_rolled.set_index('Unnamed: 0').sort_index()

In [None]:
df_rolled

In [None]:
df_rolled.id.value_counts().sort_index()

In [None]:
y_rolled = get_target_values(task_df['train'])
y_rolled_test = get_target_values(task_df['test'])
y_rolled_test = y_rolled_test.loc[:, 11:].unstack()
y_rolled = y_rolled.loc[:, 11:].unstack()

In [None]:
y_rolled = y_rolled.reset_index().set_index(['id', 'date_block_num']).loc[:,0].fillna(0)
y_rolled_test = y_rolled_test.reset_index().set_index(['id', 'date_block_num']).loc[:,0].fillna(0)

In [None]:
y_rolled.index

In [None]:
if not UPLOAD_PREVIOUS:
    features = extract_features(df_rolled, column_id='id', column_sort='date_block_num')
    upload_df(features.reset_index(), OUTPUT_PATH, 'features')
else:
    features = proceed['features'].drop('Unnamed: 0', axis=1)
    features = features.rename(columns={'level_0':'id', 'level_1':'date_block_num'}).set_index(['id', 'date_block_num'])

In [None]:
features

In [None]:
# Keep only the targets whose (id, date_block_num) key also has a feature row.
# A boolean mask keeps the original row order and avoids indexing with a
# `set`, which current pandas rejects.
y_rolled = y_rolled[y_rolled.index.isin(features.index)]

In [None]:
y_rolled.reset_index()

In [None]:
from tsfresh.utilities.dataframe_functions import impute

impute(features)

In [None]:
if not UPLOAD_PREVIOUS:
    features_filtered = select_features(features, y_rolled)
    features_filtered = features_filtered.reset_index().rename(columns={'level_0': 'id', 'level_1': 'date_block_num'})
    upload_df(features_filtered.reset_index(), OUTPUT_PATH, 'features_filtered')
else:
    features_filtered = proceed['features_filtered'].drop('index', axis=1)
    features_filtered.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
features_filtered

In [None]:
# assert False

In [None]:
features_filtered[features_filtered.date_block_num == 11].set_index(['id', 'date_block_num'])

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
features = {}

In [None]:
def standartize(dataset):
    """Standard-scale ``dataset`` column-wise and return the scaled array.

    Accepts either an object with ``to_numpy()`` (e.g. a DataFrame) or
    anything ``StandardScaler`` can consume directly. Errors raised by the
    scaler propagate unchanged instead of being hidden behind a retry.
    """
    values = dataset.to_numpy() if hasattr(dataset, 'to_numpy') else dataset
    return StandardScaler().fit_transform(values)

In [None]:
df = pd.DataFrame(
    data=standartize(features_filtered), 
    index=features_filtered.index, 
    columns=features_filtered.columns
)
df['id'] = features_filtered['id']
df['date_block_num'] = features_filtered['date_block_num']
scaled_features = df

In [None]:
features['filtered'] = features_filtered
features['scaled'] = scaled_features

In [None]:
y_rolled.reset_index()[y_rolled.reset_index().date_block_num == 11].set_index(['id', 'date_block_num'])

In [None]:
from sklearn.metrics import mean_squared_error as mse

def get_features(date, train_df):
    """Return the rows of ``train_df`` for month ``date``.

    The result is indexed by (id, date_block_num) and sorted by that index.
    """
    in_month = train_df['date_block_num'] == date
    month_rows = train_df[in_month]
    return month_rows.set_index(['id', 'date_block_num']).sort_index()
        
def get_target(date, target_vector):
    """Return the targets for month ``date`` as a frame.

    ``target_vector`` is flattened with ``reset_index``, filtered on
    ``date_block_num`` and re-indexed by (id, date_block_num), sorted.
    """
    flat = target_vector.reset_index()
    selected = flat[flat['date_block_num'] == date]
    return selected.set_index(['id', 'date_block_num']).sort_index()

class RegressionValidator:
    """Walk-forward validation of a regressor over consecutive months.

    For every month ``d`` in the window the model is fitted on the features
    and targets of ``d`` and scored (MSE) on the features of ``d + 1``
    against ``test_target``.
    """

    def __init__(self, model, *args, **kwargs):
        """Instantiate ``model`` with the given positional and keyword arguments."""
        # Positional arguments used to be accepted but silently dropped.
        self.model = model(*args, **kwargs)

    def validate(self, X_features, target_vector, test_target, start_date=23, max_date=32):
        """Run the walk-forward scheme.

        Args:
            X_features: frame with ``id`` and ``date_block_num`` columns plus
                the feature columns.
            target_vector: targets used for fitting, indexed by
                (id, date_block_num).
            test_target: targets used for scoring, same layout.
            start_date: first month used for fitting (default 23).
            max_date: exclusive upper bound for the fitting month (default 32).

        Returns:
            ``[dates, mse_values, reports]`` — three parallel lists; each
            report is a frame with ``true_values`` and ``predicted`` columns.
        """
        errors = [[], [], []]

        for current_date in range(start_date, max_date):
            current_features = get_features(current_date, X_features)
            validation_window = get_features(current_date + 1, X_features)

            current_target = get_target(current_date, target_vector)
            validation_target = get_target(current_date + 1, test_target)

            self.model.fit(current_features, current_target)
            predictions = self.model.predict(validation_window)

            # Some estimators return an (n, 1) column for a one-column target;
            # collapse that to a flat vector so it lines up with the truth.
            if predictions.transpose().shape[0] == 1:
                predictions = predictions.transpose()[0]
            else:
                predictions = predictions.transpose()

            truth = validation_target.loc[:, 0]
            assert truth.shape == predictions.shape, (
                f'Shapes are truth:{truth.shape} and pred:{predictions.shape}\n'
                f'Current validation set: {current_date}'
            )

            errors[0].append(current_date)
            errors[1].append(mse(truth.to_list(), predictions))
            errors[2].append(
                pd.DataFrame({'true_values': truth.to_list(), 'predicted': predictions})
            )

        return errors
        
        

In [None]:
def get_statistics(report):
    """Return a copy of ``report`` extended with residual statistics.

    Added columns, in order: ``residuals`` (predicted - true), ``id`` (the
    row index), ``abs_residuals`` and ``percentage``
    (2 * residual / (true + predicted)). ``report`` itself is not modified.
    """
    signed = report['predicted'] - report['true_values']
    return report.assign(
        residuals=signed,
        id=report.index,
        abs_residuals=signed.abs(),
        percentage=2 * signed / (report['true_values'] + report['predicted']),
    )

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV

In [None]:
errors = {}

## Basic regressions (Naive, Ridge, Lasso, PLS)

In [None]:
# Validate plain, ridge and lasso regression with the same walk-forward scheme.
validator_lin = RegressionValidator(LinearRegression)
errors['original'] = validator_lin.validate(features[FEATURE_TYPE], y_rolled, y_rolled_test)
validator_ridge = RegressionValidator(Ridge, alpha=1e-10)
errors['ridge'] = validator_ridge.validate(features[FEATURE_TYPE], y_rolled, y_rolled_test)
validator_lasso = RegressionValidator(Lasso, alpha=0.275)
errors['lasso'] = validator_lasso.validate(features[FEATURE_TYPE], y_rolled, y_rolled_test)

_, axs = plt.subplots(3, 2, figsize=(20, 17))

# One row per model: MSE histogram on the left, MSE per month on the right.
for row, model_name in enumerate(['original', 'ridge', 'lasso']):
    dates, mse_values = errors[model_name][0], errors[model_name][1]
    hist_ax, scatter_ax = axs[row]

    hist_ax.set_title(f"MSE histogram in validation scheme [{model_name}]")
    scatter_ax.set_title(f"MSE scatterplot [{model_name}]")
    scatter_ax.set_xlabel('date_block_num')
    scatter_ax.set_ylabel('MSE')

    # x / y must be keywords: recent seaborn accepts only `data` positionally.
    sns.scatterplot(x=dates, y=mse_values, ax=scatter_ax)
    sns.histplot(mse_values, ax=hist_ax)

In [None]:
def stat_info(errors, n_series=None):
    """Aggregate the per-window validation reports of one model.

    Args:
        errors: the ``[dates, mse_values, reports]`` list produced by
            ``RegressionValidator.validate``; only ``errors[2]`` is read.
        n_series: number of rows in the mean-residual table. Defaults to the
            length of the first report (previously hard-coded to 438).

    Returns:
        ``(mean_residuals, residual_series)`` where ``mean_residuals`` holds
        the per-row mean of ``abs_resid`` and ``resid`` over all windows, and
        ``residual_series`` stacks every window's residuals under an
        ``(index, window)`` MultiIndex (``None`` when there are no reports).
    """
    reports = errors[2]
    if n_series is None:
        n_series = len(reports[0]) if reports else 0

    mean_residuals = pd.DataFrame({'abs_resid': 0.0, 'resid': 0.0}, index=range(n_series))

    # Collect the per-window frames and concatenate once at the end instead of
    # growing a frame inside the loop.
    window_residuals = []
    for window, report in enumerate(reports):
        residuals = get_statistics(report)
        mean_residuals['abs_resid'] += residuals['abs_residuals']
        mean_residuals['resid'] += residuals['residuals']

        resids = residuals['residuals'].reset_index()
        resids['window'] = window
        window_residuals.append(resids.set_index(['index', 'window']))

    if not window_residuals:
        return mean_residuals, None

    mean_residuals /= len(reports)
    return mean_residuals, pd.concat(window_residuals)

In [None]:
maximal_feature = get_features(31, features[FEATURE_TYPE])
maximal_target = get_target(31, y_rolled)

validation_feature = get_features(32, features[FEATURE_TYPE])
validation_target = get_target(32, y_rolled)

alpha = [1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

In [None]:
# NOTE(review): newer scikit-learn releases appear to rename
# store_cv_values / cv_values_ to store_cv_results / cv_results_ — confirm
# against the installed version.
ridge_res = RidgeCV(alpha, store_cv_values=True).fit(maximal_feature, maximal_target)

# Total absolute stored CV value per candidate alpha (first target only),
# shown on a log-log scale.
cv_error_per_alpha = [
    abs(ridge_res.cv_values_.transpose()[i][0]).sum() for i in range(len(alpha))
]
sns.scatterplot(x=np.log(alpha), y=np.log(cv_error_per_alpha))

In [None]:
# from time import sleep

# while True:
#     sleep(3)

## Error exploration

In [None]:
proceed_residuals = {}

proceed_residuals['original'] = stat_info(errors['original'])
proceed_residuals['ridge'] = stat_info(errors['ridge'])
proceed_residuals['lasso'] = stat_info(errors['lasso'])

### Original linear regression

In [None]:
stable_residual_interval = proceed_residuals['original'][1].reset_index().rename(columns={'index': 'id'}).merge(idx)
stable_residual_interval['abs_residuals'] = abs(stable_residual_interval['residuals'])
sns.boxplot(data=stable_residual_interval, x='residuals')

In [None]:
total_error = stable_residual_interval.residuals @ stable_residual_interval.residuals
outliers = stable_residual_interval[stable_residual_interval.abs_residuals > 93]
outliers_error = outliers.residuals @ outliers.residuals
print(f'{len(outliers)/len(stable_residual_interval)} residuals cause {outliers_error/total_error} mistake')

In [None]:
giants = []
totals = []

# Windows are numbered 0..n-1, so take the count from the data instead of
# hard-coding 9.
n_windows = stable_residual_interval.window.nunique()
for i in range(n_windows):
    slice_ = stable_residual_interval[stable_residual_interval.window == i]
    totals.append(slice_.residuals @ slice_.residuals)
    # The 22 largest absolute residuals of this window; .copy() so the new
    # `part` column is written to an independent frame, not to a slice.
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    top['part'] = (top.residuals ** 2) / totals[i]
    giants.append(top)
    print(f'Window {i}: 5% cause {(giants[i].residuals @ giants[i].residuals)/(totals[i])}')

In [None]:
giant_outliers = pd.concat(giants)

pair_name_bag = tuple(giant_outliers.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0).index)

giant_outliers_matrix = {}

target = stable_residual_interval[stable_residual_interval.pair_name.isin(pair_name_bag)]

giant_outliers_matrix['pair_name'] = target.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0)

In [None]:
giant_outliers_matrix['pair_name'].transpose().plot(figsize=(30, 24))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (giant_outliers.pair_name.value_counts())
sns.lineplot(x=giant_outliers.pair_name.value_counts().value_counts().index.to_list(), y=giant_outliers.pair_name.value_counts().value_counts().to_list())

### Ridge

In [None]:
stable_residual_interval = proceed_residuals['ridge'][1].reset_index().rename(columns={'index': 'id'}).merge(idx)
stable_residual_interval['abs_residuals'] = abs(stable_residual_interval['residuals'])
sns.boxplot(data=stable_residual_interval, x='residuals')

In [None]:
total_error = stable_residual_interval.residuals @ stable_residual_interval.residuals
outliers = stable_residual_interval[stable_residual_interval.abs_residuals > 47]
outliers_error = outliers.residuals @ outliers.residuals
print(f'{len(outliers)/len(stable_residual_interval)} residuals cause {outliers_error/total_error} mistake')

In [None]:
giants = []
totals = []

# Windows are numbered 0..n-1, so take the count from the data instead of
# hard-coding 9.
n_windows = stable_residual_interval.window.nunique()
for i in range(n_windows):
    slice_ = stable_residual_interval[stable_residual_interval.window == i]
    totals.append(slice_.residuals @ slice_.residuals)
    # The 22 largest absolute residuals of this window; .copy() so the new
    # `part` column is written to an independent frame, not to a slice.
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    top['part'] = (top.residuals ** 2) / totals[i]
    giants.append(top)
    print(f'Window {i}: 5% cause {(giants[i].residuals @ giants[i].residuals)/(totals[i])}')

In [None]:
giant_outliers = pd.concat(giants)

pair_name_bag = tuple(giant_outliers.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0).index)

giant_outliers_matrix = {}

target = stable_residual_interval[stable_residual_interval.pair_name.isin(pair_name_bag)]

giant_outliers_matrix['pair_name'] = target.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0)

In [None]:
giant_outliers_matrix['pair_name'].transpose().plot(figsize=(30, 24))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (giant_outliers.pair_name.value_counts())
sns.lineplot(x=giant_outliers.pair_name.value_counts().value_counts().index.to_list(), y=giant_outliers.pair_name.value_counts().value_counts().to_list())

### Lasso

In [None]:
stable_residual_interval = proceed_residuals['lasso'][1].reset_index().rename(columns={'index': 'id'}).merge(idx)
stable_residual_interval['abs_residuals'] = abs(stable_residual_interval['residuals'])
sns.boxplot(data=stable_residual_interval, x='residuals')

In [None]:
total_error = stable_residual_interval.residuals @ stable_residual_interval.residuals
outliers = stable_residual_interval[stable_residual_interval.abs_residuals > 47]
outliers_error = outliers.residuals @ outliers.residuals
print(f'{len(outliers)/len(stable_residual_interval)} residuals cause {outliers_error/total_error} mistake')

In [None]:
giants = []
totals = []

# Windows are numbered 0..n-1, so take the count from the data instead of
# hard-coding 9.
n_windows = stable_residual_interval.window.nunique()
for i in range(n_windows):
    slice_ = stable_residual_interval[stable_residual_interval.window == i]
    totals.append(slice_.residuals @ slice_.residuals)
    # The 22 largest absolute residuals of this window; .copy() so the new
    # `part` column is written to an independent frame, not to a slice.
    top = slice_.sort_values('abs_residuals').tail(22).copy()
    top['part'] = (top.residuals ** 2) / totals[i]
    giants.append(top)
    print(f'Window {i}: 5% cause {(giants[i].residuals @ giants[i].residuals)/(totals[i])}')

In [None]:
giant_outliers

In [None]:
giant_outliers = pd.concat(giants)

pair_name_bag = tuple(giant_outliers.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0).index)

giant_outliers_matrix = {}

target = stable_residual_interval[stable_residual_interval.pair_name.isin(pair_name_bag)]

giant_outliers_matrix['pair_name'] = target.loc[:,['pair_name', 'window', 'residuals']].set_index(['pair_name', 'window']).unstack().fillna(0)

In [None]:
giant_outliers_matrix['pair_name'].transpose().plot(figsize=(30, 24))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print (giant_outliers.pair_name.value_counts())
sns.lineplot(x=giant_outliers.pair_name.value_counts().value_counts().index.to_list(), y=giant_outliers.pair_name.value_counts().value_counts().to_list())

# Magic with non-selected features

In [None]:
if not UPLOAD_PREVIOUS:
    raw_features = extract_features(df_rolled, column_id='id', column_sort='date_block_num')
    upload_df(raw_features.reset_index(), OUTPUT_PATH, 'features')
else:
    raw_features = proceed['features'].drop('Unnamed: 0', axis=1)
    raw_features = raw_features.rename(columns={'level_0':'id', 'level_1':'date_block_num'}).set_index(['id', 'date_block_num'])

In [None]:
# Treat infinities as missing values.
raw_features = raw_features.replace([np.inf, -np.inf], np.nan)
# Keep the columns that have at least 1000 non-missing values. `how` is
# omitted: it cannot be combined with `thresh` in current pandas, and
# `thresh` was the argument that took effect.
raw_features_na = raw_features.dropna(axis=1, thresh=1000)

In [None]:
raw_features_pruned = raw_features.dropna(axis=1, how='any')

### a) Pruned features

In [None]:
features_pruned = {'raw': raw_features_pruned}

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['pruned_raw'] = validator_lin.validate(raw_features_pruned.reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
# The first two windows are left out of the scatter plot (not the histogram).
sns.scatterplot(x=errors['pruned_raw'][0][2:], y=errors['pruned_raw'][1][2:], ax=axs[1])
sns.histplot(errors['pruned_raw'][1], ax=axs[0])

In [None]:
df = pd.DataFrame(
    data=standartize(raw_features_pruned), 
    index=raw_features_pruned.index, 
    columns=raw_features_pruned.columns
)
raw_features_pruned_scaled = df

raw_features_pruned_scaled

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['pruned_scaled'] = validator_lin.validate(raw_features_pruned_scaled.reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['pruned_scaled'][0], y=errors['pruned_scaled'][1], ax=axs[1])
sns.histplot(errors['pruned_scaled'][1], ax=axs[0])

In [None]:
from gc import collect
from sklearn.decomposition import PCA

In [None]:
def produce_pca(dataset, components='mle', random_state=0):
    """Project ``dataset`` onto its principal components.

    Args:
        dataset: 2-D array-like of features.
        components: forwarded to ``PCA(n_components=...)``.
        random_state: seed forwarded to ``PCA`` so that runs using a
            randomized SVD solver are reproducible.

    Returns:
        The transformed array returned by ``PCA.fit_transform``.
    """
    decomposer = PCA(n_components=components, random_state=random_state)
    return decomposer.fit_transform(dataset)

In [None]:
def produce_df(dataset):
    """Wrap ``dataset`` in a DataFrame and attach the series keys.

    The ``id`` and ``date_block_num`` columns are taken from the notebook-level
    ``raw_features_pruned_scaled`` frame (its index turned into columns) and
    are aligned to the new frame by row label.
    """
    keys = raw_features_pruned_scaled.reset_index()
    df = pd.DataFrame(data=dataset)
    for key_column in ('id', 'date_block_num'):
        df[key_column] = keys[key_column]
    return df

In [None]:
raw_features_pruned_pca = {n_comp: produce_df(produce_pca(raw_features_pruned_scaled, n_comp)) for n_comp in range(50, 311, 5)}

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['pruned_pca'] = {}
for i in raw_features_pruned_pca:
    errors['pruned_pca'][i] = validator_lin.validate(raw_features_pruned_pca[i].reset_index(), y_rolled, y_rolled_test)

In [None]:
sns.scatterplot(x=[i for i in raw_features_pruned_pca  if i > 270], y=[np.mean(errors['pruned_pca'][i][1]) for i in raw_features_pruned_pca if i > 270])

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['pruned_pca_validated'] = validator_lin.validate(raw_features_pruned_pca[300].reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['pruned_pca_validated'][0], y=errors['pruned_pca_validated'][1], ax=axs[1])
sns.histplot(errors['pruned_pca_validated'][1], ax=axs[0])

### b) Missing values filling

1) Filling with zeros

In [None]:
def apply_standart(dataset, droped_idx=False):
    """Return a standard-scaled copy of ``dataset`` as a DataFrame.

    The scaled values come from ``standartize``; the index and column labels
    of the input are reused. ``droped_idx`` is accepted but not used.
    """
    scaled_values = standartize(dataset)
    return pd.DataFrame(data=scaled_values, index=dataset.index, columns=dataset.columns)

In [None]:
raw_features_na_zero = raw_features_na.fillna(0)

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['na_zero'] = validator_lin.validate(apply_standart(raw_features_na_zero).reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['na_zero'][0], y=errors['na_zero'][1], ax=axs[1])
sns.histplot(errors['na_zero'][1], ax=axs[0])

2) Filling with an outlier sentinel value

In [None]:
raw_features_na_high = raw_features_na.fillna(-9999999)

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['na_high'] = validator_lin.validate(apply_standart(raw_features_na_high).reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['na_high'][0], y=errors['na_high'][1], ax=axs[1])
sns.histplot(errors['na_high'][1], ax=axs[0])

3) Filling with the column mean

In [None]:
def apply_to(dataset, func):
    """Return a copy of ``dataset`` with NaNs filled column by column.

    For each column, ``func`` is applied to the *non-missing* values and the
    result is used to fill that column's missing entries. Feeding only the
    observed values keeps statistics such as ``np.median`` from returning NaN.

    The input frame is left untouched, so several fill strategies can be
    derived from the same source frame.

    Args:
        dataset: DataFrame that may contain NaNs.
        func: callable mapping a Series of observed values to a scalar.

    Returns:
        A new DataFrame with the same shape and labels.
    """
    filled = dataset.copy()
    for column in filled.columns:
        observed = filled[column].dropna()
        if observed.empty:
            # Nothing to derive a fill value from; leave the column as it is.
            continue
        filled[column] = filled[column].fillna(func(observed))
    return filled

In [None]:
raw_features_na_mean = apply_to(raw_features_na, np.mean)

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['na_mean'] = validator_lin.validate(apply_standart(raw_features_na_mean).reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['na_mean'][0], y=errors['na_mean'][1], ax=axs[1])
sns.histplot(errors['na_mean'][1], ax=axs[0])

4) Filling with the column median

In [None]:
raw_features_na_median = apply_to(raw_features_na, np.median)

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['na_median'] = validator_lin.validate(apply_standart(raw_features_na_median).reset_index(), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['na_median'][0], y=errors['na_median'][1], ax=axs[1])
sns.histplot(errors['na_median'][1], ax=axs[0])

5) KNN-Imputer

In [None]:
from sklearn.impute import KNNImputer

def knn_impute(dataset, n_neighbors=2):
    """Fill missing values with ``KNNImputer`` and return the imputed array.

    Args:
        dataset: DataFrame (anything with ``to_numpy()``) or an array-like the
            imputer can consume directly.
        n_neighbors: number of neighbours used by the imputer (default 2).

    Errors raised by the imputer propagate unchanged instead of being hidden
    behind a retry.
    """
    imputer = KNNImputer(n_neighbors=n_neighbors)
    values = dataset.to_numpy() if hasattr(dataset, 'to_numpy') else dataset
    return imputer.fit_transform(values)

In [None]:
raw_features_na_knn = produce_df(knn_impute(raw_features_na))

In [None]:
produce_df(apply_standart(raw_features_na_knn).reset_index().drop('index', axis=1))

In [None]:
validator_lin = RegressionValidator(LinearRegression)
errors['na_knn'] = validator_lin.validate(produce_df(apply_standart(raw_features_na_knn).reset_index().drop('index', axis=1)), y_rolled, y_rolled_test)

_, axs = plt.subplots(1, 2, figsize=(14, 4))

axs[0].set_title("MSE histogram in validation scheme")
axs[1].set_title("MSE scatterplot")
axs[1].set_xlabel('date_block_num')
axs[1].set_ylabel('MSE')

# x / y must be keywords: recent seaborn accepts only `data` positionally.
sns.scatterplot(x=errors['na_knn'][0], y=errors['na_knn'][1], ax=axs[1])
sns.histplot(errors['na_knn'][1], ax=axs[0])