# Final project

# Preprocessing

## Import packages

import textdistance to calculate Levenshtein distance

In [None]:
!pip install textdistance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from itertools import product
import textdistance
from xgboost import XGBRegressor
from xgboost import plot_importance
from datetime import datetime, date
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

## Useful functions

create a useful function to reduce data size

In [None]:
# downcast
def downcast(df):
    """Return a copy of ``df`` with numeric columns cast to a narrower dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same columns in the same order.
        * ``float64`` columns become ``float16`` when every finite value fits
          the float16 range, otherwise ``float32`` (casting a larger value to
          float16 would turn it into ``inf``).
        * ``int64`` / ``int32`` / ``int16`` columns become the narrowest of
          int8 / int16 / int32 / int64 that holds the column's min and max.
        * Every other dtype is kept as is.
    """
    float16_max = np.finfo(np.float16).max
    int_candidates = (np.int8, np.int16, np.int32)

    new_dtypes = []
    for idx, col_dtype in enumerate(df.dtypes):
        column = df.iloc[:, idx]
        if col_dtype == "float64":
            finite = column[np.isfinite(column)]
            # NaN / inf survive any float cast; only finite magnitudes decide the width
            fits_float16 = finite.empty or finite.abs().max() <= float16_max
            new_dtypes.append(np.float16 if fits_float16 else np.float32)
        elif col_dtype in ["int64", "int32", "int16"]:
            # min/max are computed once per column instead of once per comparison
            lowest, highest = column.min(), column.max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest >= limits.min and highest <= limits.max:
                    new_dtypes.append(candidate)
                    break
            else:
                new_dtypes.append(np.int64)
        else:
            new_dtypes.append(col_dtype)

    return df.astype(dict(zip(df.columns, new_dtypes)))

## Load data

I previously translated shops and item_categories data set by googletrans API.

In [None]:
# load data: raw competition tables (CSV)
train, test, items = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
    )
)

# item_categories and shops come from the previously translated pickles
# instead of the competition's item_categories.csv / shops.csv
item_categories, shops = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../input/translated/item_categories.pkl',
        '../input/translated/shops.pkl',
    )
)

### panda display option

Set display options to make DataFrame output easier to read.

In [None]:
# display settings: 6 digits of precision, floats rendered with two decimals
pd.set_option("display.precision", 6)
pd.set_option("display.float_format", '{:.2f}'.format)

In [None]:
# check: schema first, then a sample and the summary statistics
print("-----------------train-------------------------")
train.info()
for view in (train.head(), train.describe()):
    print(view)

In [None]:
# check: schema first, then a sample and the summary statistics
print("-----------------test-------------------------")
test.info()
for view in (test.head(), test.describe()):
    print(view)

In [None]:
# check: schema first, then a sample and the summary statistics
print("-----------------item_categories-------------------------")
item_categories.info()
for view in (item_categories.head(), item_categories.describe()):
    print(view)

In [None]:
# check: schema first, then a sample and the summary statistics
print("-----------------items-------------------------")
items.info()
for view in (items.head(), items.describe()):
    print(view)

In [None]:
# check: schema first, then a sample and the summary statistics
print("-----------------shops-------------------------")
shops.info()
for view in (shops.head(), shops.describe()):
    print(view)

# EDA

## Train


In [None]:
# check duplicated rows: duplicated() flags the repeats, so sum() gives how many there are
# (count() would only return the total number of rows)
print(train.duplicated().sum())

# delete duplicated rows, keeping the first occurrence of each
train = train.drop_duplicates()
train.info()

In [None]:
# plot
# date_block_num: share of train rows per month, in chronological order
month_share = train["date_block_num"].value_counts(normalize=True).sort_index()
month_share.plot.bar(figsize=(15, 5))

# 11 and 23 have a large number of sales

In [None]:
# shop_id: share of train rows per shop, largest first

shop_share = train["shop_id"].value_counts(normalize=True)
shop_share.plot.bar(figsize=(15, 5))

# Some shops have a large number of sales,

In [None]:
# item_id: histogram of the raw ids appearing in the train rows

train["item_id"].plot.hist(figsize=(15, 5))

# The distribution is not even. Some items seem to have more sales than others.

In [None]:
plt.figure(figsize=(15,5))
# Pass the vector by keyword: positionally, seaborn 0.11 reads it as `x` (with a FutureWarning)
# while seaborn >= 0.12 reads it as `data`, which may change how the box is drawn.
sns.boxplot(x=train["item_id"].value_counts())


In [None]:
# item_price: histogram of the prices in the train rows

train["item_price"].plot.hist(figsize=(15, 5))

# Maybe there are outliers

In [None]:
train["item_id"].value_counts().plot(kind="hist", figsize = (15,5))

In [None]:
# item_cnt_day: histogram of the daily counts in the train rows

train["item_cnt_day"].plot.hist(figsize=(15, 5))

# Maybe there are outliers

In [None]:
train["item_id"].value_counts().plot(kind="hist", figsize = (15,5))

In [None]:
train["item_id"].value_counts().sort_values(ascending=False)[:10]

### Outliers in train

#### shop_id  

In [None]:
# shop_id: spread of each shop's share of the train rows

plt.figure(figsize=(15,5))
# keyword `x=` keeps the plot identical across seaborn versions (a positional vector is read as `x` in 0.11, as `data` from 0.12)
sns.boxplot(x=train["shop_id"].value_counts(normalize=True))

In [None]:
train["shop_id"].value_counts(normalize=True).sort_values(ascending=False)[:10]



Take into account that the distribution of sales across shops is uneven.

#### item_id  

In [None]:
train["item_id"].value_counts().sort_values(ascending=False)[:10]


item_id 20949 seems to make a large number of sales

In [None]:
items.loc[items["item_id"]==20949]

What is that ?

In [None]:
item_categories.loc[item_categories["item_category_id"]==71]

plot item_id 20949 across date_block_num 

In [None]:
train.loc[train["item_id"]==20949]["date_block_num"].value_counts(normalize=True).sort_index().plot(kind="bar", figsize = (15,5))

Except for date_block_num 0, 1 and 2, this item has been sold across the entire period. It seems that this item was launched in date_block_num 3.   
It is strange that sales decrease towards date_block_num 24. Maybe it was substituted by another item or its price changed. 

#### item_price 

In [None]:
# item_price: spread of the prices in the train rows
plt.figure(figsize=(15,5))
# keyword `x=` keeps the plot identical across seaborn versions (a positional vector is read as `x` in 0.11, as `data` from 0.12)
sns.boxplot(x=train["item_price"])

In [None]:
train["item_price"].sort_values(ascending=False)[:10]

Apparently there are some outliers.

In [None]:
train[train['item_price'] == 307980]

In [None]:
items.loc[items["item_id"]==6066]

In [None]:
item_categories.loc[item_categories["item_category_id"]==75]

Radmin 3 - 522 лиц. seems to be a computer software and this item was sold only one day in the entire period with high price.  
Therefore, it is reasonable to delete this item from the train set.  
Make sure that the test set does not contain this item

In [None]:
test[test["item_id"]==6066].empty

Delete item_id 6066 from train set

In [None]:
# keep every row except those of item_id 6066
is_item_6066 = train["item_id"].eq(6066)
train = train.loc[~is_item_6066]
train.info()

In [None]:
train[train['item_price'] < 0]

item_price < 0 is weird. 

In [None]:
items.loc[items["item_id"]==2978]

In [None]:
item_categories.loc[item_categories["item_category_id"]==31]

This is a PC game (item_id 2978), but it is not clear why the price is -1. It is probably an error, so remove it from the train set.

In [None]:
# Remove only the erroneous rows (negative price) rather than every sale of the affected item:
# the rest of that item's history is valid, and filtering on the price does not depend on a hard-coded id.
train = train[~(train["item_price"] < 0)]

#### item_cnt_day

In [None]:
train["item_cnt_day"].sort_values(ascending=False)[:10]

Outliers ?

In [None]:
train[train["item_cnt_day"] > 999]

In [None]:
items.loc[items["item_id"]==11373]

What is that ?

In [None]:
item_categories.loc[item_categories["item_category_id"]==9]

Доставка до пункта выдачи (Boxberry) seems to be a delivery service.

plot item_cnt_day of item_id 11373

In [None]:

plt.figure(figsize=(15,5))
# keyword `x=` keeps the plot identical across seaborn versions (a positional vector is read as `x` in 0.11, as `data` from 0.12)
sns.boxplot(x=train[train["item_id"]==11373]["item_cnt_day"])

In [None]:
train[train["item_id"]==11373]["item_cnt_day"].sort_values(ascending=False)[:10]

As this value is quite extreme, it should be deleted from the train set.

In [None]:

# keep only rows whose daily count is below 1000
train = train.loc[train["item_cnt_day"].lt(1000)]

### test

In [None]:
# check duplicated rows: number of repeated (shop_id, item_id) pairs in the test set
print(test.duplicated(subset=["shop_id", "item_id"]).sum())

There is no duplicated row in the test set

#### shop_id

See the distribution

In [None]:
test["shop_id"].value_counts().sort_index().plot(kind="bar", figsize = (15,5))

In [None]:
test["shop_id"].value_counts().sort_values()[:10]


The distribution is really even ?

In [None]:
test["shop_id"].value_counts().sort_values()[-5:]

very interesting that all shops have same number of rows, 5100

#### item_id


See the distribution

In [None]:
plt.figure(figsize=(15,5))
# keyword `x=` keeps the plot identical across seaborn versions (a positional vector is read as `x` in 0.11, as `data` from 0.12)
sns.boxplot(x=test["item_id"].value_counts())

In [None]:
test["item_id"].value_counts().sort_values()[:10]

In [None]:
test["item_id"].value_counts().sort_values()[-10:]

interesting all items appear same number 42 times

## Shop

In [None]:
mat = np.zeros((shops.shape[0], shops.shape[0]))

calculate levenshtein distance

In [None]:
# fill the matrix with the normalized Levenshtein similarity of every ordered pair of shop names
shop_names = shops["shop_name"]
for row, col in product(range(shops.shape[0]), repeat=2):
    mat[row, col] = textdistance.levenshtein.normalized_similarity(shop_names[row], shop_names[col])
# if 1 set as 0 (a name compared with itself scores exactly 1)
mat = np.where(mat == 1 , 0, mat)

In [None]:
# colour scale starts at 0.7; every row/column tick is labelled
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(mat, vmin=0.7, xticklabels=1, yticklabels=1, ax=ax)

In [None]:
# 0.7 as threshold: (row, col) positions whose similarity exceeds it
similar_positions = np.argwhere(mat > 0.7)
shop_name_duplicated = pd.DataFrame(similar_positions)
# (a, b) and (b, a) describe the same pair: keep the first occurrence only
is_mirror = shop_name_duplicated.apply(frozenset, axis=1).duplicated()
shop_name_duplicated = shop_name_duplicated.loc[~is_mirror].reset_index(drop=True)
print(shop_name_duplicated)

In [None]:
# print both members of each candidate pair, one blank line between pairs
for index, row in shop_name_duplicated.iterrows():
    for shop in (row[0], row[1]):
        print("shop_id:", shop, "shop_name:", shops.loc[shops["shop_id"] == shop, "shop_name"].values[0])
    print()

Some shops seem to be the same.

plot of item_cnt_days across the entire period (date_block_num) by pairs of shops

In [None]:

# One panel per candidate pair. The grid is sized from the number of pairs
# (two panels per row, 5 inches of height per row) instead of a fixed 4x2 layout.
n_pairs = len(shop_name_duplicated)
n_cols = 2
n_rows = max(1, -(-n_pairs // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
panels = axes.ravel()

for position, (index, row) in enumerate(shop_name_duplicated.iterrows()):
    pair = [row[0], row[1]]
    # monthly item_cnt_day total of each of the two shops
    data = train.loc[train["shop_id"].isin(pair), ["date_block_num", "item_cnt_day", "shop_id"]]\
        .groupby(["shop_id", "date_block_num"]).sum().reset_index()
    # panels are addressed by position rather than through eval() on a variable name
    sns.lineplot(x="date_block_num", y="item_cnt_day", hue="shop_id", legend = "full", data=data, ax=panels[position],  palette = ["Blue", "Green"])

# hide the panel left over when the number of pairs is odd (or zero)
for unused in panels[n_pairs:]:
    unused.set_visible(False)

plt.show()

The plots show that the following shop names can be considered the same shop:  

- 0 and 57   
- 1 and 58  
- 10 and 11  
- 23 and 24  

To correct shop_id in the train and test set, check if the test set contains all these shop_id

In [None]:
# which of the paired shop ids actually occur in the test set
paired_shop_ids = [0,57,1,58,10,11,23,24]
test.loc[test["shop_id"].isin(paired_shop_ids), "shop_id"].drop_duplicates()

The right way to change shop_id is the following:

- 0 to 57 
- 1 to 58
- 23 to 24
- 11 to 10

shop_id 10 and 11

In [None]:
train.loc[train["shop_id"]==10].describe()

The two lines are overlapped.

In [None]:
train.loc[train["shop_id"]==11].describe()

The shop 11 contains only data of date_block_num 25. 

In [None]:
# plot: rows per month for shops 10 and 11
shops_10_11 = train[train["shop_id"].isin([10, 11])]
plt.figure(figsize=(15,5))
sns.countplot(x="date_block_num", hue="shop_id", data=shops_10_11)
plt.show()

The shop 11 might be the shop 10 in the date_block_num 25.

shop_id correction

In [None]:
# duplicate shop_id -> shop_id it is merged into (applied to both train and test)
shop_id_fixes = {0: 57, 1: 58, 23: 24, 11: 10}

for old_id, new_id in shop_id_fixes.items():
    for frame in (train, test):
        frame.loc[frame["shop_id"] == old_id, "shop_id"] = new_id

# Train and Test relationships

Check whether every combination of shop_id and item_id in the test data appears at least once in the train data.

## unique values and combinations

In [None]:
# number of distinct ids in train vs test, one line per id column
for column in ("shop_id", "item_id"):
    print(column + ":", "train:", len(train[column].unique()), "test:", len(test[column].unique()))

In [None]:
# distinct (shop_id, item_id) pairs present in train
pair_columns = ["shop_id", "item_id"]
train_comb = train.loc[:, pair_columns].drop_duplicates()
print(train_comb)

In [None]:
# distinct (shop_id, item_id) pairs present in test
pair_columns = ["shop_id", "item_id"]
test_comb = test.loc[:, pair_columns].drop_duplicates()
print(test_comb)

There is no duplicated combinations in the test set.

In [None]:
# ids that occur in the test pairs but in no train pair
shop_is_new = ~test_comb["shop_id"].isin(train_comb["shop_id"])
item_is_new = ~test_comb["item_id"].isin(train_comb["item_id"])
unseen_shops = test_comb.loc[shop_is_new, "shop_id"].drop_duplicates()
unseen_items = test_comb.loc[item_is_new, "item_id"].drop_duplicates()
print("In the test set:", unseen_shops.count(), "unseen shops","and", unseen_items.count(), "unseen items")

All shops appear at least once, but there are 363 never-seen items (363 items × 42 shops = 15246 test rows)!!

seen items

In [None]:
len(test["item_id"].unique()) - unseen_items.count()

unseen combinations

In [None]:
# Test (shop, item) pairs that never occur in train for that same shop.
# Items sold per shop are kept as sets so each membership test is O(1)
# instead of a scan over a numpy array.
train_items_by_shop = {
    shop: set(shop_items) for shop, shop_items in train_comb.groupby("shop_id")["item_id"]
}

unseen_combs = []
seen_others = []  # kept for compatibility; nothing in this cell fills it
for shop in test_comb["shop_id"].unique():
    test_items = test_comb.loc[test_comb["shop_id"] == shop, ["item_id"]].values.reshape(-1)
    # items sold in the same shop (empty when the shop never appears in train)
    train_items = train_items_by_shop.get(shop, set())
    unseen_combs.extend((shop, item) for item in test_items if item not in train_items)

to dataframe

In [None]:
unseen_combs_df = pd.DataFrame(unseen_combs, columns=["shop_id", "item_id"])
unseen_combs_df.info()

102697 out of 214200 combinations have not been seen in the train set.

In [None]:
unseen_comb_items = unseen_combs_df["item_id"].unique()
len(unseen_comb_items)

But actually only 363 items have never been seen; the other items have been seen at least once in another shop.

How many combinations in the test set have been seen ?

In [None]:
# test combinations minus the unseen ones, computed from the data instead of hard-coded totals
len(test_comb) - len(unseen_combs_df)

How many items have been seen but not its combination ?

In [None]:
# unseen combinations minus those of never-seen items (each never-seen item once per test shop),
# computed from the data instead of hard-coded totals
len(unseen_combs_df) - unseen_items.count() * test_comb["shop_id"].nunique()

seen combinations

In [None]:
# Test (shop, item) pairs that do occur in train for that same shop.
# Items sold per shop are kept as sets so each membership test is O(1)
# instead of a scan over a numpy array.
train_items_by_shop = {
    shop: set(shop_items) for shop, shop_items in train_comb.groupby("shop_id")["item_id"]
}

seen_combs = []
for shop in test_comb["shop_id"].unique():
    test_items = test_comb.loc[test_comb["shop_id"] == shop, ["item_id"]].values.reshape(-1)
    # items sold in the same shop (empty when the shop never appears in train)
    train_items = train_items_by_shop.get(shop, set())
    seen_combs.extend((shop, item) for item in test_items if item in train_items)

In [None]:
print(len(seen_combs))

## Outdated shop/item

As other kernel notebooks have shown, I also considered that items which have not sold for the last 6 months are outdated.  
And shops which have no sales for the last 6 months can be considered outdated.

#### outdated shops

date_block_num and corresponding year

- 0-11 2013
- 12-23 2014
- 24-33 2015

In [None]:
# total item_cnt_day per shop over the rows with date_block_num > 27
recent_rows = train[train["date_block_num"] > 27]
last_months_shops_sales = recent_rows.groupby("shop_id")["item_cnt_day"].sum()

print(last_months_shops_sales > 0) 

In [None]:
last_months_sales_shops = last_months_shops_sales.index.tolist()

In [None]:
# test shops that have no row at all in the recent-months aggregate
no_sales_shops = [
    shop for shop in test["shop_id"].unique() if shop not in last_months_sales_shops
]
print(no_sales_shops)

#### outdated items

In [None]:
# total item_cnt_day per item over the rows with date_block_num > 27, keeping positive totals only
recent_item_totals = train[train["date_block_num"] > 27].groupby("item_id")["item_cnt_day"].sum()
last_months_items_sales = recent_item_totals.loc[recent_item_totals > 0]
print(last_months_items_sales) 

In [None]:
last_months_sales_items = last_months_items_sales.index.tolist()

In [None]:
# test items without positive sales in the recent months;
# a set makes each membership test O(1) instead of a scan over the whole list
recently_sold_items = set(last_months_sales_items)
no_sales_items = [
    item for item in test["item_id"].unique() if item not in recently_sold_items
]
print(no_sales_items)

number of no_sales_items

In [None]:
len(no_sales_items)

Some of no_sales_items are unseen_items

In [None]:
len(set(no_sales_items) & set(unseen_items))

In [None]:
outdated_items = list(set(no_sales_items) - set(unseen_items))
len(outdated_items)

186 items of the train set seem to be outdated, and the test set contains these items.  
186 × 42 = 7812 combinations in the test set.

Combinations of outdated items in the test set

In [None]:
# (shop, item) test pairs whose item is in outdated_items;
# each shop's test items are held in a set so the membership test is O(1)
outdated_combs = []
for shop in test_comb["shop_id"].unique():
    test_items = set(test_comb.loc[test_comb["shop_id"] == shop, "item_id"])
    outdated_combs.extend((shop, item) for item in outdated_items if item in test_items)

In [None]:
len(outdated_combs)                    

### Resume before feature engineering

- Items
    - Never seen 363 items
    - Seen 4737 items, some of which were never sold in a given shop (that shop-item combination does not exist in the train set)
        - 186 items seem to be outdated as they have not made any sales in the last 6 months
        - 4551 items have been seen in the last 6 months

- Combinations
    - Never seen 15246 combinations
    - 87451 combinations of shop-item no exist, but items have been sold at least once
    - Seen 111503 combinations of which,
        - combinations exist in the test set but the item have not seen in the last 6 months (7812 combinations)
        - combinations exist and have been seen in the last 6 months (111503 - 7812 = 103691 combinations)


# Text analysis: item_categories/shops

## Item categories

In [None]:
# schema, first rows, then every distinct category name
item_categories.info()
print(item_categories.head())
print(item_categories["item_category_name"].unique())

As item_categories names are separated by "-", split it and generate new columns.  

In [None]:
# Derive big/sub category names and integer codes from item_category_name, then join them
# onto item_categories. NOTE(review): the cell rebinds item_categories, so re-running it
# would try to join the same columns a second time.

# split on the first two spaces only: at most three pieces, in columns 0, 1 and 2
no_split = item_categories.item_category_name.str.split(" ", expand=True, n = 2)

# replace hyphen and None as ""
# (DataFrame.apply hands each column to the lambda as a Series, so this is Series.replace:
#  only cells that are exactly "-" become "")
no_split = no_split.fillna("").apply(lambda x: x.replace("-", ""))
# first and second piece joined with a space form the big category
no_split[0] = no_split[0] + " " + no_split[1]

# create item category df: column 0 (big category) and column 2 (remainder of the name)
item_category_df = no_split.drop(columns = 1)
item_category_df.columns = ["big_category_name", "sub_category_name"]

# item_category_df big_category_name column, surrounding whitespace removed
big_category_name = item_category_df.big_category_name.str.strip()

# item_category_df sub_category_name column
# (Series.apply runs the lambda on each string, so this is str.replace: every hyphen is removed)
sub_category_name = item_category_df.sub_category_name.apply(lambda x: x.replace("-", "")).str.strip()

item_category_df["big_category_name"] = big_category_name
item_category_df["sub_category_name"] = sub_category_name

# item category encoding: integer codes of the categorical version of each name column
item_category_df["big_category_id"] = item_category_df.big_category_name.astype("category").cat.codes.to_frame(name = "big_category_id")
item_category_df["sub_category_id"] = item_category_df.sub_category_name.astype("category").cat.codes.to_frame(name = "sub_category_id")

# join to item_categories (index-aligned)
item_categories = item_categories.join(item_category_df)
print(item_categories)
item_categories = downcast(item_categories)

# free the intermediates
del no_split
del item_category_df

## Items

The item names cannot be translated with the googletrans API because of the data size.

In [None]:
# schema, first rows, then every distinct item name
print("items")
items.info()
print(items.head())
print(items["item_name"].unique())

## Shops

In [None]:
# schema, first rows, then every distinct shop name
print("shops")
shops.info()
print(shops.head())
print(shops["shop_name"].unique())

Same as for item_categories: split the name and generate new columns, extracting the city name and the shop category.

In [None]:
# Derive city_name / shop_category_name and their integer codes from shop_name, then join them
# onto shops. NOTE(review): the row positions 34, 35, 42, 43 and 46 are hard-coded, so this
# presumably depends on the exact row order of the translated shops table — confirm.

# start from the raw shop names
split = shops.shop_name

# delete exclamation marks
split = split.apply(lambda x:x.replace("!", ""))
#print(split)

# some names seem to be repeated

# split every name on whitespace, one word per column
without_split = split.str.split(expand=True)
#print(without_split)

# city name correction for index 34 35 42 43 and 46 
# (for these rows the first two words are joined into column 0 and the third word moves to column 1)
to_correct = without_split.iloc[[34, 35, 42, 43, 46]].drop(columns = [3,4,5,6,7])
to_correct[0] = to_correct[0] + " " + to_correct[1]
to_correct[1] = to_correct[2]
to_correct.drop(columns = 2, inplace=True)

# every other row keeps only its first two words
without_split_new = without_split.drop(index= [34, 35, 42, 43, 46]).drop(columns = [2,3,4,5,6,7])
# outer join on the index; the right-hand columns get the "_" suffix and missing cells become ""
without_split_new = to_correct.join(without_split_new, how="outer", rsuffix="_").fillna("")

# after the join the columns are addressed by the string labels "0", "1", "0_", "1_";
# each row is filled on one side only, so concatenating the two sides merges them
without_split_new["0"] = without_split_new["0"] + without_split_new["0_"]
without_split_new["1"] = without_split_new["1"] + without_split_new["1_"]
without_split_new.drop(columns = ["0_","1_"], inplace=True)
without_split_new.columns = ["city_name", "shop_category_name"]
without_split_new.city_name = without_split_new.city_name.str.strip()
without_split_new.shop_category_name = without_split_new.shop_category_name.str.strip()

# first column: city name encoding
city_name_ids = without_split_new.city_name.astype("category").cat.codes.to_frame(name = "city_id")
# second column: shop category encoding
shop_category_ids = without_split_new.shop_category_name.astype("category").cat.codes.to_frame(name = "shop_category_id")

# create shop_df: names plus their integer codes
shop_df = without_split_new.join(city_name_ids).join(shop_category_ids)

# and join to shops (index-aligned)
shops = shops.join(shop_df)
print(shops)
shops = downcast(shops)

# free the intermediates
del to_correct
del shop_df

# Data leakages

## Missing values

In [None]:


# tables to inspect, in report order
tables_to_check = {
    "train": train,
    "test": test,
    "item_categories": item_categories,
    "items": items,
    "shops": shops,
}

# NA
for table_name, table in tables_to_check.items():
    print(table_name)
    print(table.isna().sum())

# Null (isnull is pandas' alias of isna, so this repeats the report above)
for table_name, table in tables_to_check.items():
    print(table_name)
    print(table.isnull().sum())

# Feature engineering

## Train set extension

Compute item_cnt_day for all combinations of shop and item in each date_block_num.  
If a given combination of shop and item has no item_cnt_day in a given date_block_num, it is set to 0.

In [None]:
# for every month 0..33: all (month, shop, item) triples built from the shops and items sold that month
train_extended = []
for i in range(34):
    sales = train[train["date_block_num"] == i]
    month_grid = product([i], sales["shop_id"].unique(), sales["item_id"].unique())
    train_extended.append(np.array(list(month_grid)))

cols = ["date_block_num", "shop_id", "item_id"]
print(train_extended)

Using vstack and make a dataframe

In [None]:
train_extended = pd.DataFrame(np.vstack(train_extended), columns = cols)

In [None]:
train_extended.info()
print(train_extended.describe())

downcast

In [None]:
train_extended = downcast(train_extended)

## monthly sales (respect to item_cnt_days)

calculate revenue

In [None]:
# date 
# train["date"] = pd.to_datetime(train["date"], format="%d.%m.%Y")
# train['year'], train['month'] = train['date'].dt.year, train['date'].dt.month

# revenue = price x daily count, per row.
# assign() builds a new frame: `train` was produced by boolean filtering above, and writing a
# column into it directly can raise pandas' chained-assignment warning.
train = train.assign(revenue=train["item_price"] * train["item_cnt_day"])

The train set is grouped by date_block_num, shop_id and item_id to calculate the monthly sum of item_cnt_day.

In [None]:
# monthly total of item_cnt_day per (month, shop, item), stored as item_cnt_month
monthly_keys = ['date_block_num', "shop_id", "item_id"]
train_monthly = (
    train.groupby(monthly_keys, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)

train_monthly = downcast(train_monthly)

Clip target values

In [None]:
# bound the monthly target to the range [0, 20]
clipped_target = train_monthly["item_cnt_month"].clip(lower=0, upper=20)
train_monthly["item_cnt_month"] = clipped_target

# add year and month
# data_monthly = data_monthly.merge(train[['date_block_num',"year", "month"]].drop_duplicates(), on = 'date_block_num')

In [None]:
print(train_monthly.describe())

Merge train_monthly to train_extended

In [None]:
train_new = train_extended.merge(train_monthly, on=["date_block_num", "shop_id", "item_id"], how="left").fillna(0)

In [None]:
train_new = downcast(train_new)
train_new["item_cnt_month"] = train_new["item_cnt_month"].astype(np.int8)

In [None]:
train_new.info()

In [None]:
train_new.describe()

## Add test set to the train set

November 2015 corresponds to date_block_num 34.

In [None]:
# drop the submission id and tag every test row with month 34 as the first column
test = test.drop(columns="ID")
test.insert(loc=0, column="date_block_num", value=34)

In [None]:
test = downcast(test)

In [None]:
test.info()

Concatenate 

In [None]:
# Stack the monthly grid and the test rows (month 34) under a fresh 0..n-1 index.
# `keys` is not passed: it labels the concatenated objects (not columns), it was discarded by
# ignore_index=True anyway, and its four entries did not match the two frames being stacked.
train_new = pd.concat([train_new, test], ignore_index=True, sort=False)
train_new.describe()

In [None]:
train_new.fillna(0, inplace=True)

In [None]:
# downcast
train_new = downcast(train_new)

In [None]:
train_new.info()

In [None]:
train_new.describe()

## add other data frames

Add item_categories data 

In [None]:
# numeric columns only: item ids with their category ids (text columns dropped on both sides)
item_ids = items.drop(columns=["item_name"])
category_ids = item_categories.drop(columns=["item_category_name", "big_category_name", "sub_category_name"])
items_and_item_categories = item_ids.merge(category_ids)

items_and_item_categories.info()

In [None]:
# attach the item/category ids to both frames by item_id
train_new = pd.merge(train_new, items_and_item_categories, on=["item_id"])
test = pd.merge(test, items_and_item_categories, on=["item_id"])

Add shops data

In [None]:
# attach the numeric shop columns to both frames by shop_id (text columns dropped once)
shop_ids = shops.drop(columns=["shop_name", "city_name", "shop_category_name"])
train_new = pd.merge(train_new, shop_ids, on=["shop_id"])
test = pd.merge(test, shop_ids, on=["shop_id"])

In [None]:
train_new.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [None]:
train_new.describe()

In [None]:
test.describe()

In [None]:
train_new.info()

In [None]:
test.info()

## Shifted features of the target

In [None]:
# month offsets used for the lag features
lags = [1,2,3,6,12]

def lag_feature(df, lags, col):
    """Append one ``<col>_lag_<n>`` column to ``df`` for every ``n`` in ``lags``.

    Each new column holds the value ``col`` had for the same shop_id/item_id
    ``n`` date_block_num steps earlier; rows without such an earlier
    observation get NaN (left merge). ``df`` itself is not modified; the
    extended frame is returned.
    """
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    source = df[key_cols + [col]]
    for lag in lags:
        lagged = source.copy()
        lagged.columns = key_cols + [col + '_lag_' + str(lag)]
        # moving the month forward lines the old value up with the later row
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = df.merge(lagged, how='left', on=key_cols)
    return df

In [None]:
# Lagged copies of the target itself; months without history become 0.
train_new = downcast(lag_feature(train_new, lags, "item_cnt_month").fillna(0))

## target mean by date_block_num 

In [None]:
# Mean target per month over all (shop, item) rows.
group = (
    train_new.groupby("date_block_num")["item_cnt_month"]
    .mean()
    .reset_index(name="date_block_item_cnt_mean")
)

In [None]:
# Attach the per-month mean to every row. Only its lag is used as a feature,
# because the same-month value is all zero for the test month.
train_new = pd.merge(train_new, group, on="date_block_num").fillna(0)

In [None]:
# Previous month's overall mean; rows without a previous month get 0.
train_new = lag_feature(train_new, [1], "date_block_item_cnt_mean").fillna(0)

In [None]:
# Remove the same-month mean so that only its lag stays in the frame.
train_new = train_new.drop(columns="date_block_item_cnt_mean")

### target mean by date_block_num and shop_id 

In [None]:
# Mean target per (month, shop). All lags are generated because every shop of
# the test set also appears in the train set.
group = (
    train_new.groupby(["date_block_num", "shop_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="shop_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "shop_id"])
train_new = lag_feature(train_new, lags, "shop_item_cnt_mean").fillna(0)

In [None]:
# Remove the same-month shop mean so that only its lags stay in the frame.
train_new = train_new.drop(columns="shop_item_cnt_mean")

### target mean by date_block_num and item_id 

In [None]:
# Mean target per (month, item). Only lag 1 is generated because many items in
# the test set are unseen; the same-month mean is dropped again afterwards.
group = (
    train_new.groupby(["date_block_num", "item_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="item_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "item_id"])
train_new = lag_feature(train_new, [1], "item_item_cnt_mean")
train_new = train_new.drop(columns="item_item_cnt_mean").fillna(0)

### target mean by date_block_num and item_category_id

In [None]:
# Mean target per (month, item category). Only lag 1 is generated because many
# items in the test set are unseen; the same-month mean is dropped afterwards.
group = (
    train_new.groupby(["date_block_num", "item_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="item_category_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "item_category_id"])
train_new = lag_feature(train_new, [1], "item_category_item_cnt_mean")
train_new = train_new.drop(columns="item_category_item_cnt_mean").fillna(0)

### target mean by date_block_num and city_id 

In [None]:
# Mean target per (month, city); only the lag-1 value is kept as a feature.
group = (
    train_new.groupby(["date_block_num", "city_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="city_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "city_id"])
train_new = lag_feature(train_new, [1], "city_item_cnt_mean")
train_new = train_new.drop(columns="city_item_cnt_mean").fillna(0)

### target mean by date_block_num and big_category_id

In [None]:
# Mean target per (month, big category); only the lag-1 value is kept.
group = (
    train_new.groupby(["date_block_num", "big_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="big_category_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "big_category_id"])
train_new = lag_feature(train_new, [1], "big_category_item_cnt_mean")
train_new = train_new.drop(columns="big_category_item_cnt_mean").fillna(0)

### target mean by date_block_num and sub_category_id

In [None]:
# Mean target per (month, sub category); only the lag-1 value is kept.
group = (
    train_new.groupby(["date_block_num", "sub_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="sub_category_item_cnt_mean")
)
train_new = pd.merge(train_new, group, on=["date_block_num", "sub_category_id"])
train_new = lag_feature(train_new, [1], "sub_category_item_cnt_mean")
train_new = train_new.drop(columns="sub_category_item_cnt_mean").fillna(0)

### target mean by date_block_num, item_category_id and item_id (all lags)

In [None]:
# Mean target per (month, item category, item); every offset in `lags` is
# generated and the same-month mean is dropped afterwards.
group = (
    train_new.groupby(["date_block_num", "item_category_id", "item_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="item_category_item_item_cnt_mean")
)
train_new = pd.merge(
    train_new, group, on=["date_block_num", "item_category_id", "item_id"]
)
train_new = lag_feature(train_new, lags, "item_category_item_item_cnt_mean")
train_new = train_new.drop(columns="item_category_item_item_cnt_mean").fillna(0)

### target mean by date_block_num, shop_id and item_category_id (all lags)

In [None]:
# Mean target per (month, shop, item category); every offset in `lags` is
# generated and the same-month mean is dropped afterwards.
group = (
    train_new.groupby(["date_block_num", "shop_id", "item_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="shop_item_category_item_cnt_mean")
)
train_new = pd.merge(
    train_new, group, on=["date_block_num", "shop_id", "item_category_id"]
)
train_new = lag_feature(train_new, lags, "shop_item_category_item_cnt_mean")
train_new = train_new.drop(columns="shop_item_category_item_cnt_mean").fillna(0)

### target mean by date_block_num, shop_id and big_category_id

In [None]:
# Mean target per (month, shop, big category); only the lag-1 value is kept.
group = (
    train_new.groupby(["date_block_num", "shop_id", "big_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="shop_big_category_item_cnt_mean")
)
train_new = pd.merge(
    train_new, group, on=["date_block_num", "shop_id", "big_category_id"]
)
train_new = lag_feature(train_new, [1], "shop_big_category_item_cnt_mean")
train_new = train_new.drop(columns="shop_big_category_item_cnt_mean").fillna(0)

### target mean by date_block_num, shop_id and sub_category_id

In [None]:
# Mean target per (month, shop, sub category); only the lag-1 value is kept.
group = (
    train_new.groupby(["date_block_num", "shop_id", "sub_category_id"])["item_cnt_month"]
    .mean()
    .reset_index(name="shop_sub_category_item_cnt_mean")
)
train_new = pd.merge(
    train_new, group, on=["date_block_num", "shop_id", "sub_category_id"]
)
train_new = lag_feature(train_new, [1], "shop_sub_category_item_cnt_mean")
train_new = train_new.drop(columns="shop_sub_category_item_cnt_mean").fillna(0)

In [None]:
# Sanity-check the frame after all lag features were added.
# Note: DataFrame.info() prints its report itself and returns None, so the
# second print also emits a trailing "None".
print(train_new.describe())
print(train_new.info())

## other features

month

In [None]:
# Zero-based month of the year (0-11) derived from the running month index.
train_new["month"] = train_new["date_block_num"].mod(12)

holidays

In [None]:
# from http://www.timebie.com/calendar/russia2013.php
# One value per date_block_num 0..34 (35 entries), listed in month order with
# 12 months per full row.
holidays = [
    13, 8, 11, 8, 10, 11, 8, 9, 9, 8, 10, 9,
    12, 8, 10, 4, 11, 10, 8, 10, 8, 8, 12, 8,
    13, 9, 9, 8, 11, 9, 8, 10, 8, 9, 11,
]

# Position in the list is the date_block_num it belongs to.
holidays_dict = dict(enumerate(holidays))

print(holidays_dict)

train_new["holidays_cnt"] = train_new["date_block_num"].map(holidays_dict)

US Dollar per 1 Russian Ruble Monthly average rate from January 2013 to November 2015

In [None]:
# from https://www.x-rates.com/average/?from=RUB&to=USD&amount=1&year=2013
# Monthly average rate, one value per date_block_num 0..34 (35 entries).
rates = [
    0.033064, 0.033135, 0.032450, 0.031901, 0.031923, 0.030951,
    0.030517, 0.030310, 0.030674, 0.031174, 0.030592, 0.030425,
    0.029695, 0.028373, 0.027628, 0.028044, 0.028638, 0.029076,
    0.028831, 0.027667, 0.026349, 0.024501, 0.021618, 0.017868,
    0.015704, 0.015517, 0.016594, 0.018804, 0.019731, 0.018311,
    0.017496, 0.015301, 0.014936, 0.015841, 0.015365,
]

# Position in the list is the date_block_num it belongs to.
rates_dict = dict(enumerate(rates))

train_new["dollar_ruble_rate"] = train_new["date_block_num"].map(rates_dict)

number of days

In [None]:
# Calendar length of each month, indexed by the zero-based month number stored
# in train_new["month"] (0 = first month of the year, ..., 11 = last).
# February is fixed at 28 days.
days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# Build the lookup from `days` itself: 12 entries, keys 0..11.
days_dict = dict(enumerate(days))

train_new["days_cnt"] = train_new["month"].map(days_dict)

Drop the first 6 months (date_block_num 0–5) from the training set, because their lag features are mostly missing and were filled with 0.  
The data is split by time rather than randomly because the features are time-based; for example, some items become outdated.

In [None]:
# Re-read the raw test file; the left-merge below uses it as the left side, so
# the resulting test frame keeps the row order of test.csv.
test_sort = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test_sort.drop("ID", axis=1, inplace = True)

# Time-based split; months 0-5 are dropped from training.
train = downcast(train_new.query('date_block_num < 31 and date_block_num > 5')) # 6 - 30
val = downcast(train_new.query('date_block_num > 30 and date_block_num < 34')) # 31 - 33
test = downcast(test_sort.merge(train_new.query('date_block_num == 34'), on = ["shop_id", "item_id"], how="left")) # 34

In [None]:
# Downcast each split once more with the shared helper.
train, val, test = (downcast(part) for part in (train, val, test))

## Data split


In [None]:
# Separate features from the target; every frame gets a fresh 0..n-1 index.
# The targets stay single-column DataFrames.
y_train = train[["item_cnt_month"]].reset_index(drop=True)
y_val = val[["item_cnt_month"]].reset_index(drop=True)
X_train = train.drop(columns="item_cnt_month").reset_index(drop=True)
X_val = val.drop(columns="item_cnt_month").reset_index(drop=True)
X_test = test.drop(columns="item_cnt_month").reset_index(drop=True)

In [None]:
# Downcast all model inputs and targets with the shared helper.
X_train, X_val, y_train, y_val, X_test = (
    downcast(frame) for frame in (X_train, X_val, y_train, y_val, X_test)
)

## Delete data

In [None]:
# Release the large intermediate frames; only the X_*/y_* splits are used below.
del train_new, train, test, val, items, shops, item_categories

In [None]:
# Run a garbage-collection pass after deleting the large frames above.
gc.collect()

## modeling

Because of the kernel's limited capacity, I reduced the number of features. Nevertheless, the RMSE does not increase much.

In [None]:
# Columns fed to XGBoost. Commented-out names are candidate features that are
# deliberately switched off.
xgb_features = [
    'date_block_num',
    'shop_id',
    'item_id',
    'item_category_id',
    'big_category_id',
    'sub_category_id',
    'city_id',
    'shop_category_id',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    # 'item_cnt_month_lag_6',
    # 'item_cnt_month_lag_12',
    'date_block_item_cnt_mean_lag_1',
    'shop_item_cnt_mean_lag_1',
    'shop_item_cnt_mean_lag_2',
    'shop_item_cnt_mean_lag_3',
    # 'shop_item_cnt_mean_lag_6',
    # 'shop_item_cnt_mean_lag_12',
    'item_item_cnt_mean_lag_1',
    'item_category_item_cnt_mean_lag_1',
    # 'city_item_cnt_mean_lag_1',
    'big_category_item_cnt_mean_lag_1',
    'sub_category_item_cnt_mean_lag_1',
    # Disabled: no feature-engineering cell builds a (month, shop_category)
    # mean, so selecting this name with .loc raises KeyError.
    # 'shop_category_item_cnt_mean_lag_1',
    'item_category_item_item_cnt_mean_lag_1',
    'item_category_item_item_cnt_mean_lag_2',
    # 'item_category_item_item_cnt_mean_lag_3',
    # 'item_category_item_item_cnt_mean_lag_6',
    # 'item_category_item_item_cnt_mean_lag_12',
    'shop_item_category_item_cnt_mean_lag_1',
    'shop_item_category_item_cnt_mean_lag_2',
    # 'shop_item_category_item_cnt_mean_lag_3',
    # 'shop_item_category_item_cnt_mean_lag_6',
    # 'shop_item_category_item_cnt_mean_lag_12',
    'shop_big_category_item_cnt_mean_lag_1',
    'shop_sub_category_item_cnt_mean_lag_1',
    # 'shop_category_item_item_cnt_mean_lag_1',
    # 'shop_category_item_category_item_cnt_mean_lag_1',
    # 'item_cnt_rolling_min',
    # 'item_cnt_rolling_max',
    # 'item_cnt_rolling_mean',
    # 'item_cnt_rolling_std',
    'month',
    'holidays_cnt',
    'dollar_ruble_rate',
    'days_cnt',
]

In [None]:
# Keep only the XGBoost feature columns, in the order given by xgb_features.
X_train_xgb = X_train[xgb_features]
X_val_xgb = X_val[xgb_features]
X_test_xgb = X_test[xgb_features]

In [None]:
# Replace remaining missing feature values with 0.
X_train_xgb = X_train_xgb.fillna(0)
X_val_xgb = X_val_xgb.fillna(0)
X_test_xgb = X_test_xgb.fillna(0)

### Xgboost

In [None]:
# Gradient-boosted trees on the selected features.
xgb_model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,  # upper bound; early stopping below usually ends sooner
    min_child_weight=400,
    colsample_bytree=0.6,
    subsample=0.6,
    # Exactly one step-size setting: `eta` is XGBoost's alias for
    # `learning_rate`, so passing both with different values leaves the
    # effective rate ambiguous.
    learning_rate=0.1,
    seed=0,
    n_jobs=-1)

# RMSE is reported on both sets every 10 rounds. Early stopping watches the
# last entry of eval_set (the validation split) and stops after 10 rounds
# without improvement.
xgb_model.fit(
    X_train_xgb,
    y_train,
    eval_metric="rmse",
    eval_set=[(X_train_xgb, y_train), (X_val_xgb, y_val)],
    verbose=10,
    early_stopping_rounds=10)

In [None]:
# XGBoost predictions for each split; the validation and test predictions are
# used as inputs of the ensemble below.
xgb_train_pred = xgb_model.predict(X_train_xgb)
xgb_val_pred = xgb_model.predict(X_val_xgb)
xgb_test_pred = xgb_model.predict(X_test_xgb)

#### plot feature importance

In [None]:
# Tall figure so that every feature label of the importance chart is readable.
fig, ax = plt.subplots(figsize=(10, 15))
plot_importance(xgb_model,ax=ax)

### Random Forest

Here, I also selected only a subset of the features.

In [None]:

# Columns fed to the random forest. Commented-out names are candidate features
# that are deliberately switched off.
rf_features = [
    'date_block_num',
    'shop_id',
    'item_id',
    'item_category_id',
    'big_category_id',
    'sub_category_id',
    'city_id',
    'shop_category_id',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    # 'item_cnt_month_lag_6',
    # 'item_cnt_month_lag_12',
    'date_block_item_cnt_mean_lag_1',
    'shop_item_cnt_mean_lag_1',
    'shop_item_cnt_mean_lag_2',
    # 'shop_item_cnt_mean_lag_3',
    # 'shop_item_cnt_mean_lag_6',
    # 'shop_item_cnt_mean_lag_12',
    'item_item_cnt_mean_lag_1',
    'item_category_item_cnt_mean_lag_1',
    # 'city_item_cnt_mean_lag_1',
    'big_category_item_cnt_mean_lag_1',
    'sub_category_item_cnt_mean_lag_1',
    # Disabled: no feature-engineering cell builds a (month, shop_category)
    # mean, so selecting this name with .loc raises KeyError.
    # 'shop_category_item_cnt_mean_lag_1',
    'item_category_item_item_cnt_mean_lag_1',
    # 'item_category_item_item_cnt_mean_lag_2',
    # 'item_category_item_item_cnt_mean_lag_3',
    # 'item_category_item_item_cnt_mean_lag_6',
    # 'item_category_item_item_cnt_mean_lag_12',
    'shop_item_category_item_cnt_mean_lag_1',
    'shop_item_category_item_cnt_mean_lag_2',
    # 'shop_item_category_item_cnt_mean_lag_3',
    # 'shop_item_category_item_cnt_mean_lag_6',
    # 'shop_item_category_item_cnt_mean_lag_12',
    'shop_big_category_item_cnt_mean_lag_1',
    'shop_sub_category_item_cnt_mean_lag_1',
    # 'shop_category_item_item_cnt_mean_lag_1',
    # 'shop_category_item_category_item_cnt_mean_lag_1',
    # 'item_cnt_rolling_min',
    # 'item_cnt_rolling_max',
    # 'item_cnt_rolling_mean',
    # 'item_cnt_rolling_std',
    'month',
    'holidays_cnt',
    'dollar_ruble_rate',
    'days_cnt',
]

In [None]:
# Keep only the random-forest feature columns, in the order given by rf_features.
X_train_rf = X_train[rf_features]
X_val_rf = X_val[rf_features]
X_test_rf = X_test[rf_features]

In [None]:
# Replace remaining missing feature values with 0.
X_train_rf = X_train_rf.fillna(0)
X_val_rf = X_val_rf.fillna(0)
X_test_rf = X_test_rf.fillna(0)

In [None]:
# Shallow random forest as the second first-level model.
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
# y_train is a single-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects, so fit() does not emit a DataConversionWarning.
rf_model.fit(X_train_rf, y_train.values.ravel())

In [None]:
# Random-forest predictions for each split; the validation and test predictions
# are used as inputs of the ensemble below.
rf_train_pred = rf_model.predict(X_train_rf)
rf_val_pred = rf_model.predict(X_val_rf)
# fillna(0) here guards against any NaN left in the test features.
rf_test_pred = rf_model.predict(X_test_rf.fillna(0))

### Ensemble
Use linear model

In [None]:
# First-level predictions as meta-features: one column per base model.
# The validation predictions train the meta model; the test predictions are
# what it is applied to.
first_level = pd.DataFrame(xgb_val_pred, columns=["xgb"]).assign(rf=rf_val_pred)
first_level.info()

first_level_test = pd.DataFrame(xgb_test_pred, columns=["xgb"]).assign(rf=rf_test_pred)
first_level_test.info()

### Linear regression

In [None]:
# Meta model: linear blend of the two base models' validation predictions,
# fitted against the validation targets.
meta_model = LinearRegression(n_jobs=-1)
meta_model.fit(first_level, y_val)

prediction

In [None]:
# Blend the base models' test predictions with the fitted meta model.
test_prediction = meta_model.predict(first_level_test)

## Submission

In [None]:
# Build the submission table: predictions limited to [0, 20], with the row
# position (0..n-1) exposed as the ID column.
submission = (
    pd.DataFrame(test_prediction, columns=["item_cnt_month"])
    .clip(lower=0, upper=20)
    .rename_axis("ID")
    .reset_index()
)
print(submission)

In [None]:
# Write the submission file without the DataFrame index (ID is already a column).
submission.to_csv("submission.csv", index=False)