In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Disclaimer

#### In this notebook I use some useful functions and ideas that were presented by other Kaggle users in the discussion and kernel sections.
#### In order to make this notebook more readable I commented out all visualizations. If there is a need to see them, feel free to remove those comments. 


In [None]:
!pip install workalendar

In [None]:
# imports

import matplotlib.pyplot as plt
%matplotlib inline
from workalendar.europe import Russia
from itertools import product
from sklearn.preprocessing import LabelEncoder
import calendar
import seaborn as sns
from xgboost import XGBRegressor
from xgboost import plot_importance
import gc
import re
import pickle

In [None]:
# load data

# Directory holding the competition CSV files; change it here (once) when the
# data lives somewhere else.
INPUT_DIR = "/kaggle/input/competitive-data-science-predict-future-sales"

train = pd.read_csv(f"{INPUT_DIR}/sales_train.csv")
test = pd.read_csv(f"{INPUT_DIR}/test.csv")
items = pd.read_csv(f"{INPUT_DIR}/items.csv")
item_categories = pd.read_csv(f"{INPUT_DIR}/item_categories.csv")
shops = pd.read_csv(f"{INPUT_DIR}/shops.csv")

## Cleaning train dataset

In [None]:
# lets plot all columns in train (except index) 


#for i in train.columns[1:]:
#    plt.figure(figsize=(10,4));
#    sns.boxplot(train[i]);
#    plt.title(i);

In [None]:
# there are some outliers, remove them

# Keep only rows with a strictly positive price below 100000 and a strictly
# positive daily count below 1000.
# .copy() detaches the result from the unfiltered frame, so the in-place
# shop_id corrections applied to `train` afterwards write to a real frame
# instead of a slice (avoids SettingWithCopyWarning / lost writes).
train = train[
    (train["item_price"] > 0)
    & (train["item_price"] < 100000)
    & (train["item_cnt_day"] > 0)
    & (train["item_cnt_day"] < 1000)
].copy()
#train

## Cleaning shops dataset

In [None]:
# inspect shops dataset
#shops

In [None]:
# several shops are duplicates, remove them from test, train and shops

# Duplicate shop ids and the id each one is folded into (duplicate -> kept):
#    0 -> 57   Yakutsk, Ordzhonikidze 56
#    1 -> 58   Yakutsk, "Tsentralny" shopping centre
#   10 -> 11   Zhukovsky, Chkalova St. 39m²
duplicate_shop_ids = {0: 57, 1: 58, 10: 11}

for duplicate_id, kept_id in duplicate_shop_ids.items():
    train.loc[train.shop_id == duplicate_id, 'shop_id'] = kept_id
    test.loc[test.shop_id == duplicate_id, 'shop_id'] = kept_id

# remove from shops
# .copy() makes `shops` an independent frame: new columns are assigned to it
# further down, which would otherwise be writes to a slice.
shops = shops[(shops["shop_id"] > 1) & (shops["shop_id"] != 10)].copy()

In [None]:
# add shop city name and population (taken from wikipedia)

# One population value per remaining shop row, in the row order of `shops`.
# NOTE(review): -1 presumably marks shops that are not tied to a city
# (e.g. online / outbound trade) — confirm against the shop names.
shops["shop_city_population"] = [
    784048, 235336, 327356, 1018790, 1003638, 1003638, 1003638, -1, 106872, -1, 1176187, 1176187, 331351, 144707, 1016385, 1016385, 428741, 11979529,
 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 11979529, 178672, 1259921, 1259921, 1523801, 1523801, 1160670, 
    1103733, 1103733, 1103733, 5028000, 5028000, 1171598, 1171598, 108490, 325511, 547989, 634171, 634171, 634171, 1077719, 1077719, 221084, -1, 65359, 286456, 286456, 599169
]

# Normalise the shop name: lower-case, strip punctuation and digits.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default; raw strings avoid invalid-escape warnings.
shops['shop_name'] = (
    shops['shop_name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)

# City = first word of the cleaned name, label-encoded.
# Plain column assignment (not .loc[:, col] = ...) so the column takes the
# integer dtype of the encoder output instead of staying object.
shops['shop_city'] = shops['shop_name'].str.partition(' ')[0]
shops['shop_city'] = LabelEncoder().fit_transform(shops['shop_city'])

# Shop type = first marker found in the name; longer markers are tested first
# because the shorter ones are substrings of them. 'NaN' (a string) is the
# fallback category when no marker matches.
shop_type_markers = ('мтрц', 'трц', 'трк', 'тц', 'тк')
shops['shop_type'] = shops['shop_name'].apply(
    lambda name: next((marker for marker in shop_type_markers if marker in name), 'NaN')
)
shops['shop_type'] = LabelEncoder().fit_transform(shops['shop_type'])

# drop text column

shops = shops.drop(columns=["shop_name"])

#shops

## Cleaning item_categories dataset

In [None]:
# item_category_name_2: label-encoded first whitespace-separated token of the
# category name.
first_token = item_categories["item_category_name"].map(lambda name: name.split()[0])
item_categories["item_category_name_2"] = LabelEncoder().fit_transform(first_token)

# the raw text column is no longer needed
item_categories = item_categories.drop(columns=["item_category_name"])

#item_categories

## Cleaning items dataset

In [None]:
import re
def name_correction(x):
    """Return a normalised version of the item name ``x``.

    The name is lower-cased, cut at the first '[' and at the first '(',
    every run of characters outside A-Z, a-z, 0-9, А-Я, а-я is replaced by a
    single space, and surrounding whitespace is stripped.
    """
    head = x.lower().split('[', 1)[0].split('(', 1)[0]
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

In [None]:
# use item name as a text feature. this idea was suggested by Konstantin Yakovlev and nicely done by lonewolf45

# Split the raw name once at the first '[' and once at the first '('.
# name1 = text before '(', name2 = text after '[', name3 = text after '('.
# .str.get(1) yields NaN when the separator is absent (filled with "0" below).
# Tuple-unpacking the .str accessor and passing n positionally no longer work
# on pandas 2.x, hence the explicit n= and .str.get().
bracket_parts = items["item_name"].str.split("[", n=1)
paren_parts = items["item_name"].str.split("(", n=1)
items["name1"] = paren_parts.str.get(0)
items["name2"] = bracket_parts.str.get(1)
items["name3"] = paren_parts.str.get(1)

# Collapse every run of non-alphanumeric characters to a single space.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching.
items["name2"] = items["name2"].str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items["name3"] = items["name3"].str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items = items.fillna("0")

items["item_name"] = items["item_name"].apply(name_correction)
# Drop the last character of name2 (the space left by the closing bracket),
# leaving the "0" placeholder untouched.
items["name2"] = items["name2"].apply(lambda x: x[:-1] if x != "0" else "0")

# type = first word of name2, or its first 8 characters when that word is "xbox".
items["type"] = items["name2"].apply(lambda x: x[0:8] if x.split(" ")[0] == "xbox" else x.split(" ")[0])
items.loc[(items["type"] == "x360") | (items["type"] == "xbox360") | (items["type"] == "xbox 360"), "type"] = "xbox 360"
items.loc[items["type"] == "", "type"] = "mac"
items["type"] = items["type"].apply(lambda x: x.replace(" ", ""))
# NOTE(review): some of the literals below appear to contain Cyrillic
# look-alike letters on purpose (to fold mistyped platform names onto the
# Latin spelling) — keep them byte-for-byte.
items.loc[(items["type"] == 'pc') | (items["type"] == 'pс') | (items["type"] == "pc"), "type"] = "pc"
items.loc[items["type"] == 'рs3', "type"] = "ps3"


In [None]:
# Count items per `type` and collect the types that have fewer than 40 items.
group_sum = items.groupby(["type"]).agg({"item_id": "count"}).reset_index()
to_drop = group_sum.loc[group_sum["item_id"] < 40, "type"].tolist()

# NOTE(review): the rare-type list is matched against name2 (not type), and
# type is dropped right after — confirm this is the intended feature.
items["name2"] = items["name2"].map(lambda value: "etc" if value in to_drop else value)
items = items.drop(columns=["type"])

# Label-encode the two text features.
items["name2"] = LabelEncoder().fit_transform(items["name2"])
items["name3"] = LabelEncoder().fit_transform(items["name3"])

# drop item name and name1 from item dataset

items = items.drop(columns=["item_name", "name1"])

## Creating matrix with all combinations of shops, items and months and adding more features

In [None]:
# For each of the 34 training months, pair every shop that sold something in
# that month with every item sold in that month.
cols = ["date_block_num", "shop_id", "item_id"]
matrix = []
for i in range(34):
    sales = train.loc[train["date_block_num"] == i]
    shop_ids = sales["shop_id"].unique()
    item_ids = sales["item_id"].unique()
    matrix.append(np.array(list(product([i], shop_ids, item_ids))))

# Stack the monthly grids into one frame ordered by month, shop, item.
matrix = pd.DataFrame(np.vstack(matrix), columns=cols).sort_values(cols)

#matrix

In [None]:
# add date_block_num to test (the month right after the 34 training months)

test["date_block_num"] = 34

# add data from test, train, item_categories, items

# Append the test rows below the training grid. `keys=` was dropped: it is
# ignored together with ignore_index=True, and its length (3) did not match the
# number of concatenated frames (2), which newer pandas versions complain about.
matrix = pd.concat([matrix, test.drop(columns=["ID"])], ignore_index=True, sort=False)
matrix = matrix.fillna(0)

# Attach the shop, item and item-category features prepared above.
matrix = pd.merge(matrix, shops, on=["shop_id"], how="left")
matrix = pd.merge(matrix, items, on=["item_id"], how="left")
matrix = pd.merge(matrix, item_categories, on=["item_category_id"], how="left")

In [None]:
%%time
# add month day, week day and weekend count

def count_days(date_block_num): # function by Arnab Chakraborty
    """Return the number of calendar days in the month of ``date_block_num``.

    Block 0 is January 2013 and each following block is the next month.
    Uses ``calendar.monthrange`` instead of a hand-written leap-year rule
    (and no longer shadows the builtin ``list``).
    """
    # int() keeps integral floats / numpy integers working with the calendar module
    years_since_2013, month_index = divmod(int(date_block_num), 12)
    return calendar.monthrange(2013 + years_since_2013, month_index + 1)[1]

def count_holidays(date_block_num):
    """Return the hard-coded holiday count for the month of ``date_block_num``.

    The calendar month is ``1 + date_block_num % 12``; months that are not
    listed in the table count as 0.
    """
    holidays_by_month = {1: 1, 2: 1, 3: 1, 5: 2, 6: 1, 11: 1, 12: 2}
    return holidays_by_month.get(1 + date_block_num % 12, 0)
    
def count_weekdays(date_block_num):
    """Return the number of Monday-Friday days in the month of ``date_block_num``.

    Block 0 is January 2013. The count is ``np.busday_count`` between the first
    day of that month and the first day of the following month (end exclusive).
    The zero-padded 'YYYY-MM' strings are built with a format spec, so the
    per-month special cases and the ValueError swallow (which printed and
    silently returned None) are no longer needed.
    """
    block = int(date_block_num)
    year_offset, month_index = divmod(block, 12)
    # the following block is by construction the next calendar month,
    # which also takes care of the December -> January rollover
    next_year_offset, next_month_index = divmod(block + 1, 12)
    month_start = f'{2013 + year_offset}-{month_index + 1:02d}'
    next_month_start = f'{2013 + next_year_offset}-{next_month_index + 1:02d}'
    return np.busday_count(month_start, next_month_start)
    
# Every calendar feature depends only on the month index, so each helper is
# evaluated once per distinct date_block_num and the result is broadcast with
# map() instead of calling the helper for every row of the matrix.
month_blocks = matrix["date_block_num"].unique().tolist()
for feature_name, counter in (
    ("days_in_month", count_days),
    ("holidays_in_month", count_holidays),
    ("weekdays_in_month", count_weekdays),
):
    matrix[feature_name] = matrix["date_block_num"].map(
        {block: counter(block) for block in month_blocks}
    )

In [None]:
# add item_cnt_month feature: units sold per (month, shop, item)

group = (
    train.groupby(["date_block_num", "shop_id", "item_id"])["item_cnt_day"]
    .sum()
    .rename("item_cnt_month")
    .reset_index()
)
matrix = matrix.merge(group, on=cols, how="left")
# combinations without a sale get 0; the target is clipped to [0, 20]
matrix["item_cnt_month"] = matrix["item_cnt_month"].fillna(0).clip(0, 20)

In [None]:
# add first item appearance indicators

# item_first_interaction = 1 on the rows of the first month in which an item
# appears in the matrix, 0 elsewhere.
group1 = matrix.groupby(['item_id'])['date_block_num'].min().reset_index()
group1['item_first_interaction'] = 1
matrix = pd.merge(matrix, group1[['item_id', 'date_block_num', 'item_first_interaction']], on=['item_id', 'date_block_num'], how='left')

# First month (after month 0) in which each shop/item pair appears in the matrix.
# NOTE(review): the filter is on date_block_num > 0; given the feature name
# ("sold before") a filter on item_cnt_month > 0 may have been intended — confirm.
group2 = matrix[matrix['date_block_num'] > 0].groupby(['shop_id', 'item_id'])['date_block_num'].min().reset_index()
group2['first_date_block_num'] = group2['date_block_num']
matrix = pd.merge(matrix, group2[['item_id', 'shop_id', 'first_date_block_num']], on=['item_id', 'shop_id'], how='left')

# Columns are reassigned instead of calling fillna(inplace=True) on a column
# selection: that chained form is not guaranteed to modify `matrix` (and never
# does under pandas Copy-on-Write).
# 100 is larger than any month index, so pairs without a first month compare as "not before".
matrix['first_date_block_num'] = matrix['first_date_block_num'].fillna(100)
matrix['shop_item_sold_before'] = (matrix['first_date_block_num'] < matrix['date_block_num'])
matrix = matrix.drop(columns=['first_date_block_num'])

matrix['item_first_interaction'] = matrix['item_first_interaction'].fillna(0)
# shop_item_sold_before is a boolean comparison result and cannot hold NaN,
# so it needs no fill.

del group
gc.collect()

In [None]:
# function for creating lagged features, used in top tier notebooks

def lag_feature(df, lags, col):
    """Append lagged copies of ``col`` to ``df`` and return the merged frame.

    For every ``lag`` in ``lags`` a column ``<col>_lag_<lag>`` is added that
    holds the value ``col`` had for the same shop_id/item_id ``lag`` months
    earlier (NaN when no such row exists). New columns are stored as float16.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        lag_col = col + '_lag_' + str(lag)
        # moving the source rows `lag` months forward lines them up with the
        # rows that should receive their value
        shifted = base.rename(columns={col: lag_col}).assign(
            date_block_num=lambda frame: frame['date_block_num'] + lag
        )
        df = df.merge(shifted, on=keys, how='left')
        df[lag_col] = df[lag_col].astype('float16')
    return df

In [None]:
%%time

# item_cnt_month of the previous 1, 2 and 3 months
matrix = lag_feature(matrix, lags=[1, 2, 3], col='item_cnt_month')

# mean price of each item per month (all shops pooled)
group = (
    train.groupby(["item_id", "date_block_num"])["item_price"]
    .mean()
    .reset_index()
    .rename(columns={"item_price": "avg_item_price"})
)
matrix = pd.merge(matrix, group, on=["item_id", "date_block_num"], how="left")

# mean price of each item per shop per month
group = (
    train.groupby(["shop_id", "item_id", "date_block_num"])["item_price"]
    .mean()
    .reset_index()
    .rename(columns={"item_price": "avg_item_price_shop"})
)
matrix = pd.merge(matrix, group, on=["shop_id", "item_id", "date_block_num"], how="left")

# relative deviation of the shop's price from the all-shop price
matrix["price_diff"] = (
    matrix["avg_item_price_shop"].sub(matrix["avg_item_price"]).div(matrix["avg_item_price"])
)

# every NaN in the matrix (missing prices as well as missing lags) becomes 0
matrix = matrix.fillna(0)

# price_diff of the previous 1, 2 and 3 months
matrix = lag_feature(matrix, lags=[1, 2, 3], col='price_diff')

# only the lagged price features are kept
matrix = matrix.drop(columns=['avg_item_price', 'avg_item_price_shop', 'price_diff'])

In [None]:
# add sales lags for similar items with nearby ids
# the idea of using this feature belongs to uladzimirkapeika

def lag_similar_items(df, lags, col):
    """Append lagged values of ``col`` taken from the item with the next id.

    For every ``i`` in ``lags`` a column ``<col>_lag_<i>_sim`` is added. A row
    with item_id ``k`` receives the value that item ``k + 1`` had in the same
    shop ``i`` months earlier (NaN when no such row exists).

    The new columns are stored as float16, matching ``lag_feature``; the
    original ended each iteration with a self-assignment that had no effect.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    tmp = df[keys + [col]]
    for i in lags:
        sim_col = col + '_lag_' + str(i) + '_sim'
        shifted = tmp.copy()
        shifted.columns = keys + [sim_col]
        shifted['date_block_num'] += i
        # relabel item k + 1 as item k so the merge picks up the neighbour's value
        shifted['item_id'] -= 1
        df = pd.merge(df, shifted, on=keys, how='left')
        df[sim_col] = df[sim_col].astype('float16')
    return df

# neighbouring-item sales of the previous 1, 2 and 3 months
matrix = lag_similar_items(df=matrix, lags=[1, 2, 3], col='item_cnt_month')

In [None]:
# Preview the columns on a handful of rows. Transposing the whole matrix
# would copy every row into an object-dtype frame just to render a preview.
matrix.head().T

In [None]:
# add target encodings: mean item_cnt_month per group of key columns

# (encoding column, grouping keys), merged in this order
# NOTE(review): items are merged on item_id, so each item should carry a single
# item_category_id and name2; grouping by item_id plus one of those then equals
# grouping by item_id alone, and the shop-level encoding equals item_cnt_month
# itself. Category-level keys (without item_id) may have been intended — confirm.
target_encodings = [
    ("item_target_enc", ['date_block_num', 'item_id']),
    ("item_target_shop_enc", ['date_block_num', 'item_id', 'shop_id']),
    ("item_target_cat1_enc", ['date_block_num', 'item_id', 'item_category_id']),
    ("item_target_city_enc", ['date_block_num', 'item_id', 'shop_city']),
    ("item_target_name2_enc", ['date_block_num', 'item_id', 'name2']),
]
for enc_name, enc_keys in target_encodings:
    group = matrix.groupby(enc_keys)['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": enc_name})
    matrix = pd.merge(matrix, group, on=enc_keys, how='left')

# add avg category sales for last 3 months for new items

# item_first_interaction is 1 on an item's first month and 0 otherwise; the
# previous `< 0` filter never matched, which left this feature at 0 everywhere.
group = matrix[matrix['item_first_interaction'] == 1].groupby(['date_block_num','item_category_id'])['item_cnt_month'].mean().reset_index().rename(columns={"item_cnt_month": "new_item_cat_enc"})
matrix = pd.merge(matrix, group, on=['date_block_num','item_category_id'], how='left')

# fill NaNs with zeros

matrix = matrix.fillna(0)

# lag encodings (only values from previous months are kept as features)

for enc_name in [
    'item_target_enc',
    'item_target_shop_enc',
    'item_target_cat1_enc',
    'item_target_name2_enc',
    'item_target_city_enc',
    'new_item_cat_enc',
]:
    matrix = lag_feature(matrix, [1, 2, 3], enc_name)

# drop unlagged encodings (they are computed from the current month's target)

matrix = matrix.drop(columns=['item_target_enc', 'item_target_shop_enc', 'item_target_cat1_enc', 'item_target_city_enc', 'new_item_cat_enc', 'item_target_name2_enc'])

# fill NaNs with zeros

matrix = matrix.fillna(0)

# remove the first 3 months, which don't have lag values

# .copy() so later column assignments on `matrix` do not write to a slice
matrix = matrix[(matrix['date_block_num'] > 2)].copy()
matrix.head()

In [None]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are replaced in place.
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range fits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Per column:
    * datetime and categorical columns are left untouched;
    * object / string columns become ``category``;
    * signed and unsigned integer columns get the smallest integer type of the
      same signedness whose range (bounds included) holds the column's min/max;
    * float columns become float16 (only with ``use_float16``), float32 or
      float64 depending on their range;
    * every other dtype (bool, timedelta, extension dtypes, ...) is left as is,
      because a "downcast" to float would make it larger or fail.

    The memory usage before and after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # isinstance check replaces the deprecated is_categorical_dtype()
        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            # skip datetime type or categorical type
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            # bool, timedelta, complex and extension dtypes: nothing safe to do
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = unsigned_types if col_type.kind == 'u' else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # an empty column has NaN min/max, fails every test and is kept as is
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# shrink the column dtypes of the feature matrix (float16 stays disabled)

matrix = reduce_mem_usage(df=matrix, use_float16=False)

In [None]:
# persist the finished feature matrix in the working directory

matrix.to_pickle(path='matrix.pkl')