### Introduction

Data Science is nothing without data, and Machine Learning does not get far without feature engineering. The data for the <a href='https://www.kaggle.com/c/competitive-data-science-predict-future-sales'>Predict Future Sales</a> competition is a good reminder of that.

In this notebook I plan to:
* explore data given for the competition
* get some insights based on (at least quick) EDA
* generate some additional (hopefully useful) features for the dataset
* make some model and train it
* test/validate the results
* make at least baseline predictions and submission

P.S. I update the notebook whenever I can devote some time to it. If some parts of my plan are not covered yet, I am on my way to covering them in the next versions.

In [None]:
#!conda create -n rapids-0.16 -c rapidsai -c nvidia -c conda-forge \
#    -c defaults rapids=0.16 python=3.7 cudatoolkit=10.1 -y

In [None]:
# Random seed; passed as `random_state` to `train_test_split` further down.
seed=66

In [None]:
import os
import re
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
#import cudf

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size and seaborn theme for all plots below.
plt.rcParams['figure.figsize'] = (12, 5);
sns.set_style('whitegrid')

In [None]:
# Load the competition CSV files from the Kaggle input directory:
# lookup tables (items, shops, cats), the sales history (train), the test set
# and the sample submission.
items = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/items.csv")
shops = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/shops.csv")
cats = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv")
train = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv")
test = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/test.csv")
sub_df = pd.read_csv(
    "/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv")

In [None]:
# Preview the first three rows of the sales history and of the test set.
train.head(3)
test.head(3)

In [None]:
# Preview the lookup tables: items, shops and item categories.
items.head(3)
shops.head(3)
cats.head(3)

Thanks to @abdalazez for spotting some duplicates in the set! Check out <a href='https://www.kaggle.com/abdalazez/predict-future-sales-2020'>his notebook</a>. Overall, we discover many duplicates throughout the dataset, and the ones I deal with here may not be the full list. But take a look at a few of them.

In [None]:
# Show the shop rows with ids 0 and 57 side by side; shop 0 is merged into
# shop 57 in the following cell.
shops[shops['shop_id']==0]
shops[shops['shop_id']==57]

In [None]:
# Fold duplicated shop ids (and one duplicated item id) into a single id,
# applying the same remapping to both the train and the test frame.
_shop_id_remap = {0: 57, 1: 58, 10: 11, 23: 24}
for _frame in (train, test):
    for _old_id, _new_id in _shop_id_remap.items():
        _frame.loc[_frame.shop_id == _old_id, 'shop_id'] = _new_id
    _frame.loc[_frame.item_id == 69, 'item_id'] = 70

# Normalise shop names so that the first space-separated token can be read as
# the city and the second one as the shop category.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', "shop_name"] = 'СергиевПосад ТЦ "7Я"'
for _needle, _canonical in (
    ('Жуковский ул. Чкалова 39м', 'Жуковский ул. Чкалова 39м'),
    ('Якутск Орджоникидзе, 56', 'Якутск Орджоникидзе, 56'),
    ('!Якутск ТЦ "Центральный" фран', 'Якутск ТЦ "Центральный" фран'),
):
    # Case-insensitive match; every matching shop gets the canonical name.
    _matches = shops.shop_name.str.contains(_needle, case=False)
    shops.loc[_matches, "shop_name"] = _canonical

_name_tokens = shops.shop_name.str.split(" ")
shops["city"] = _name_tokens.map(lambda parts: parts[0])
shops["category"] = _name_tokens.map(lambda parts: parts[1])
shops.loc[shops.city == "!Якутск", "city"] = "Якутск"

My attempt to get an idea of what kind of shops we are dealing with produced some artifacts. It seems that some shop names do not contain any information about the shop category, and a few of the categories we get are too rare to be meaningful. For now I will use a single category 'OTHER' for all these cases.

In [None]:
# Number of shops per extracted category token.
shops.category.value_counts()

In [None]:
# Shop categories that occur fewer than four times are merged into 'OTHER'.
counts = shops['category'].value_counts()
idx = counts.loc[counts < 4].index

_is_rare = shops['category'].isin(idx)
shops.loc[_is_rare, 'category'] = 'OTHER'
# NOTE(review): this writes 'ТЦ' into the *city* column of the 'СергиевПосад'
# shop. Confirm this is intended; the value looks like a shop type, not a city.
shops.loc[shops.city == 'СергиевПосад', 'city'] = 'ТЦ'
shops.head()

In [None]:
# Distinct city and category values after the clean-up above.
shops.city.unique()
shops.category.unique()

In [None]:
# Preview the shops table with the derived `city` and `category` columns.
shops.head(3)

In [None]:
# Do train and test contain exactly the same set of item ids / shop ids?
set(train['item_id']) == set(test['item_id'])
set(train['shop_id']) == set(test['shop_id'])

In [None]:
# Boolean flags: does the row's item / shop also occur anywhere in the test set?
train['item_in_test'] = train['item_id'].isin(test['item_id'])
train['shop_in_test'] = train['shop_id'].isin(test['shop_id'])

In [None]:
# Total number of rows in the sales history.
train.shape[0]

In [None]:
# Number of train rows whose item / shop also occurs in the test set
# (sum of the boolean flags).
train['item_in_test'].sum()
train['shop_in_test'].sum()

In [None]:
# Preview the items table again before looking at item names.
items.head(3)

In [None]:
# Distinct item names.
items['item_name'].unique()

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import re

# NLTK stop-word lists for English and Russian, stored as sets for membership
# tests; `get_fdist` skips tokens found in either of them.
stop_words_en = set(stopwords.words('english'))
stop_words_ru = set(stopwords.words('russian'))

def get_fdist(df):
    """Count how often each word occurs across the distinct item names.

    Every unique value of ``df['item_name']`` is tokenised with
    ``word_tokenize``. Backticks, asterisks, apostrophes and exclamation
    marks are stripped from each token; tokens shorter than two characters
    and English/Russian stop words are ignored. Counting is case-insensitive
    (keys are lower-cased).

    Parameters
    ----------
    df : DataFrame with an ``item_name`` column.

    Returns
    -------
    nltk.probability.FreqDist mapping lower-cased word -> count.
    """
    excluded = stop_words_en | stop_words_ru
    counts = FreqDist()
    for name in df['item_name'].unique():
        for raw_token in word_tokenize(name):
            cleaned = re.sub(r"(`|\*|'|!)+", '', raw_token)
            lowered = cleaned.lower()
            if len(cleaned) >= 2 and lowered not in excluded:
                counts[lowered] += 1

    return counts
            
# Word frequencies over all distinct item names.
fdist = get_fdist(items)

In [None]:
# The ten most frequent words in item names.
fdist.most_common(10)

In [None]:
def broad_category(string):
    """Return the leading (broad) part of a category name.

    The name is cut at the first '-' if there is one, otherwise at the first
    '('; the part before the delimiter is returned with surrounding spaces
    removed. A name containing neither delimiter is returned unchanged.
    """
    for delimiter in ('-', '('):
        if delimiter in string:
            return string.split(delimiter)[0].strip(' ')
    return string

def narrow_category(string):
    """Return the trailing (narrow) part of a category name.

    * ``'<broad> - <narrow>'``  -> ``'<narrow>'``
    * ``'<broad> (<narrow>)'``  -> ``'<narrow>'`` (closing brackets removed)
    * anything else             -> ``'UNK'``

    The name is split on the *first* delimiter only, so a narrow part that
    itself contains a hyphen (e.g. ``'Blu-Ray 3D'``) is kept intact instead
    of being truncated to ``'Blu'``.
    """
    if '-' in string:
        return string.split('-', 1)[1].strip(' ')
    if '(' in string:
        return string.split('(', 1)[1].strip(' ').replace(')', '')
    return 'UNK'

# Derive the broad and the narrow category label from every category name.
_category_names = cats['item_category_name']
cats['super_cat'] = _category_names.map(broad_category)
cats['sub_cat'] = _category_names.map(narrow_category)

In [None]:
# label -> integer code, numbered from 1 in order of first appearance
super_cat_dict = {
    label: code
    for code, label in enumerate(cats['super_cat'].unique(), start=1)
}
sub_cat_dict = {
    label: code
    for code, label in enumerate(cats['sub_cat'].unique(), start=1)
}

# reverse lookups: integer code -> label
super_cat_index = dict(zip(super_cat_dict.values(), super_cat_dict.keys()))
sub_cat_index = dict(zip(sub_cat_dict.values(), sub_cat_dict.keys()))

In [None]:
# Items enriched with their category name and the derived super/sub category;
# the join key itself is not needed afterwards.
cats_m = items.merge(cats, on='item_category_id').drop(columns='item_category_id')
cats_m.head()

In [None]:
# Attach item/category information and shop information to every sales row
# and to every test row.
items_train = cats_m.merge(train, on='item_id').merge(shops, on='shop_id')
items_test = (
    cats_m.merge(test, on='item_id')
    .drop(columns='item_category_name')
    .merge(shops, on='shop_id')
)

# The joins must neither drop nor duplicate rows.
assert len(items_train) == len(train)
assert len(items_test) == len(test)
items_train.head(3)
items_test.head(3)

In [None]:
# Distinct broad categories present in the merged training data.
items_train.super_cat.unique()

In [None]:
# Price distribution: missing prices dropped (left) versus imputed with the
# median price (right).
fig, ax = plt.subplots(1, 2)
_prices = items_train['item_price']
sns.distplot(_prices.dropna(), ax=ax[0]);
sns.distplot(_prices.fillna(_prices.median()), ax=ax[1]);

In [None]:
# Same two views as above, on the natural-log scale of the price.
fig, ax = plt.subplots(1, 2)
_prices = items_train['item_price']
sns.distplot(np.log(_prices.dropna()), ax=ax[0]);
sns.distplot(np.log(_prices.fillna(_prices.median())), ax=ax[1]);

In [None]:
# for now what we want is less memory usage :)
# Free-text and date columns are not used by the aggregation below.
to_drop = ['item_name', 'date', 'shop_name', 'item_category_name']
items_train = items_train.drop(columns=to_drop)

In [None]:
# items_train = cudf.DataFrame.from_pandas(items_train)
# Group the sales rows by item, shop and `date_block_num` (presumably the
# month index; confirm against the data description). `as_index=False` keeps
# the grouping keys as ordinary columns in the aggregated result.
grouped = items_train.groupby(['item_id', 'shop_id', 'date_block_num'], as_index=False)

In [None]:
def mean_log(values):
    """Return the natural logarithm of the mean of ``values`` as a scalar.

    Used as a group-by aggregator for prices. The mean of a single value is
    the value itself, so no special case is needed for one-element groups;
    always returning a scalar avoids handing pandas a one-element Series.
    """
    return np.log(np.mean(values))
    
def to_int(values):
    """Cast ``values`` to int and return the most frequent value as a scalar.

    Used as a group-by aggregator for the boolean ``*_in_test`` flags. For a
    one-element group the mode is that element, so the same expression covers
    every group size and the result is always a scalar, never a Series.
    """
    return values.astype('int').mode()[0]
    
def encode_super_cats(string, codes=super_cat_dict):
    """Return the integer code of the first broad-category label in ``string``.

    ``string`` is the group of ``super_cat`` labels handed over by the
    group-by aggregation; only its first element is looked up in ``codes``.
    """
    first_label = string.tolist()[0]
    return codes[first_label]

def encode_sub_cats(string, codes=sub_cat_dict):
    """Return the integer code of the first narrow-category label in ``string``.

    ``string`` is the group of ``sub_cat`` labels handed over by the
    group-by aggregation; only its first element is looked up in ``codes``.
    """
    first_label = string.tolist()[0]
    return codes[first_label]
        

# Collapse the grouped sales rows (see `grouped` above) to one row per group:
# category labels become integer codes, the price becomes the log of the mean
# price, `item_cnt_day` is summed, and the in-test flags become 0/1 integers.
items_train = grouped.agg({'super_cat': encode_super_cats, 
             'sub_cat': encode_sub_cats, 
             'item_price': mean_log, 
             'item_cnt_day': np.sum,
            'item_in_test': to_int,
            'shop_in_test': to_int})

items_train.head()

In [None]:
# One row per (item_id, shop_id) pair of the test set, with the category
# labels replaced by their integer codes. Note that `grouped` is rebound here.
grouped = items_test.groupby(['item_id', 'shop_id'], as_index=False)
items_test = grouped.agg({'super_cat': encode_super_cats, 
             'sub_cat': encode_sub_cats,})

items_test.head()

In [None]:
def get_price_dict(i_df=items, ref_df=items_train):
    """Map every item id in ``i_df`` to its mean ``item_price`` in ``ref_df``.

    Parameters
    ----------
    i_df : DataFrame with an ``item_id`` column; its unique ids are the keys.
    ref_df : DataFrame with ``item_id`` and ``item_price`` columns.

    Returns
    -------
    dict : item id -> mean price, or ``None`` when the item has no row in
        ``ref_df``.

    Membership is decided on the ``item_id`` *values*. The ``in`` operator on
    a pandas Series tests the index labels instead, which would make the
    ``None`` case unreachable for most ids. The means are computed with a
    single group-by rather than one full-frame filter per item.
    """
    mean_prices = ref_df.groupby('item_id')['item_price'].mean()
    # Series.get returns None for ids that are absent from the group-by index.
    return {i: mean_prices.get(i) for i in i_df.item_id.unique()}

def get_cnt_dict(ref_df=items_train):
    """Map every item id in ``ref_df`` to the ``item_cnt_day`` of its first row.

    "First" means first in the row order of ``ref_df``. The rows are picked
    with one ``drop_duplicates`` pass instead of filtering the whole frame
    once per item.

    NOTE(review): only the first row per item is used, not an aggregate over
    all of the item's rows. Confirm that this is the intended feature.
    """
    first_rows = ref_df.drop_duplicates(subset='item_id', keep='first')
    return dict(zip(first_rows.item_id.to_numpy(),
                    first_rows.item_cnt_day.tolist()))

# Per-item lookup tables built from the aggregated training data; they are
# read by `get_cnt` and `get_price` below.
cnt_dict = get_cnt_dict()
price_dict = get_price_dict()

In [None]:
# price_dict = get_price_dict()
def get_price(row):
    """Return a price feature for one test row.

    Uses the per-item value stored in ``price_dict``; when that entry is
    ``None`` the mean ``item_price`` of all training rows with the same
    ``sub_cat`` is returned instead.
    """
    known_price = price_dict[row['item_id']]
    if known_price is None:
        same_sub_cat = items_train.sub_cat == row['sub_cat']
        return items_train[same_sub_cat].item_price.mean()
    return known_price

def get_cnt(row):
    """Return a sales-count feature for one test row.

    Uses the per-item value stored in ``cnt_dict``; for items missing from
    it, the most frequent ``item_cnt_day`` among training rows with the same
    ``sub_cat`` is returned instead.
    """
    try:
        return cnt_dict[row['item_id']]
    except KeyError:
        same_sub_cat = items_train.sub_cat == row['sub_cat']
        return items_train[same_sub_cat].item_cnt_day.mode()[0]

#assert items_train.shape[0]==train.shape[0]
#assert items_test.shape[0]==test.shape[0]
    
# Row-wise feature fill for the test frame using the lookup helpers above.
items_test.loc[:, 'item_price'] = items_test.apply(get_price, axis=1)
items_test.loc[:, 'item_cnt_day'] = items_test.apply(get_cnt, axis=1)

In [None]:
# Display the test frame with the filled-in feature columns.
items_test #.loc[items_test, 'item_price']

In [None]:
# Three random rows from each frame as a sanity check.
items_train.sample(3)
items_test.sample(3)

Let's first see if we can get some information from categories alone.

In [None]:
# Everything except the remaining (categorical) columns is removed from the
# feature matrix; the target is the summed item count.
_non_feature_columns = [
    'item_cnt_day',
    'date_block_num',
    'item_id',
    'item_in_test',
    'shop_in_test',
    'item_price',
]
X = items_train.drop(columns=_non_feature_columns)
#[['super_cat', 'sub_cat']]
y = items_train['item_cnt_day']
X
y

In [None]:
from catboost import CatBoostRegressor

In [None]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

# `kf` is only referenced by the disabled cross-validation loop kept in the
# string literal at the end of this cell.
kf = KFold(n_splits=5)
model = CatBoostRegressor(cat_features=['shop_id', 'super_cat', 'sub_cat'])

# Single 70/30 hold-out split, reproducible through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed)

model = model.fit(X_train, y_train)
preds = model.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` keyword of `mean_squared_error` is
# deprecated and no longer accepted by recent scikit-learn releases, while
# this form works on every version.
print('rmse:', np.sqrt(mean_squared_error(y_test, preds)))
"""
for train_index, test_index in kf.split(X):

    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    clf = model.fit(X_train, y_train)
    preds = clf.predict(X_test)
    print('rmse:', mean_squared_error(y_test, preds, squared=False))
"""

Bad, I know. But let's build a submission pipeline now so that we do not have to worry about it later.

In [None]:
# Item ids present in the aggregated test frame.
items_test.item_id

In [None]:
# Attach the model features (shop_id, super_cat, sub_cat) to every submission
# row.
#
# The `ID` column of the sample submission identifies a row of the test set;
# it is not an item id. Each ID is therefore resolved to its
# (shop_id, item_id) pair through `test`, and the category codes for that pair
# are taken from `items_test`. This replaces a per-row Python loop with a
# single join.
# NOTE(review): assumes `test` has an `ID` column matching `sub_df['ID']`,
# as in the competition's test.csv. Confirm against the data description.
_feature_cols = ['shop_id', 'super_cat', 'sub_cat']
_pair_codes = items_test[['shop_id', 'item_id', 'super_cat', 'sub_cat']].drop_duplicates(
    subset=['shop_id', 'item_id'])
_features_by_id = (
    test[['ID', 'shop_id', 'item_id']]
    .merge(_pair_codes, on=['shop_id', 'item_id'], how='left')
    .drop_duplicates(subset='ID')
    .set_index('ID')
)

# Align on the submission's own ID order; IDs without a match become NaN rows.
_matched = _features_by_id.reindex(sub_df['ID'])
for _col in _feature_cols:
    sub_df[_col] = _matched[_col].to_numpy()

# Submission rows for which no features were found fall back to the most
# frequent training values, and their IDs are reported in `error_ids`.
_unmatched = sub_df[_feature_cols].isna().any(axis=1)
error_ids = sub_df.loc[_unmatched, 'ID'].tolist()
if error_ids:
    for _col in _feature_cols:
        sub_df.loc[_unmatched, _col] = items_train[_col].mode()[0]

In [None]:
# Inspect the submission frame and the IDs that needed the fallback values.
sub_df
error_ids

In [None]:
# Training feature matrix, shown for comparison with the submission features.
X

In [None]:
# Raw feature values that will be passed to the model.
sub_df[['shop_id', 'super_cat', 'sub_cat']].values

In [None]:
# Predict for every submission row; the three feature columns are cast to int
# before being handed to the model.
preds = model.predict(sub_df[['shop_id', 'super_cat', 'sub_cat']].astype('int'))

In [None]:
# Item assignment always writes a DataFrame column. Attribute assignment only
# works if the column already exists and otherwise sets a plain Python
# attribute that never reaches the CSV.
sub_df['item_cnt_month'] = preds
sub_df

In [None]:
# Keep only the two columns written to the submission file and save it
# without the DataFrame index.
sub_df = sub_df[['ID', 'item_cnt_month']]
sub_df.to_csv('submission.csv', index=False)