In [None]:
!pip install nb_black watermark

In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns

# from sklearnex import patch_sklearn
# patch_sklearn()


%matplotlib inline
%load_ext autoreload
%autoreload 2
%load_ext lab_black
%load_ext watermark

sns.set()

%watermark -v -m -p numpy,scipy,pandas,matplotlib,statsmodels,sklearn,catboost,xgboost,lightgbm,tensorflow -g

# Goal

The goal is to predict the total number of products sold in the next month for every given (shop, product) pair. This is a regression problem.

We also have no features in the test set except shop&item ids

Item counts should be clipped into range [0, 20]

## Feature space
1. ID - an Id that represents a (Shop, Item) tuple within the test set
1. shop_id - unique identifier of a shop
1. item_id - unique identifier of a product
1. item_category_id - unique identifier of item category
1. item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
1. item_price - current price of an item
1. date - date in format dd.mm.yyyy
1. date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
1. item_name - name of item
1. shop_name - name of shop
1. item_category_name - name of item category

## Shared variables

In [None]:
from tqdm.notebook import tqdm
import os


RANDOM_STATE = 42
NUM_SPLITS = 5
target_feature = "item_cnt_month"
DATA_DIR = os.path.join("..", "input", "competitive-data-science-predict-future-sales")

# Step 1. First look at the data

In [None]:
sales = pd.read_csv(os.path.join(DATA_DIR, "sales_train.csv"))
print(sales.shape)
sales.head()

It contains sales per day, need to aggregate it to monthly sales

In [None]:
items = pd.read_csv(os.path.join(DATA_DIR, "items.csv"))
print(items.shape)
items.head()

In [None]:
item_categories = pd.read_csv(os.path.join(DATA_DIR, "item_categories.csv"))
print(item_categories.shape)
item_categories.head()

In [None]:
shops = pd.read_csv(os.path.join(DATA_DIR, "shops.csv"))
print(shops.shape)
shops.head()

There are 2935849 rows in the sales, 22170 items, 84 categories and 60 shops.

In [None]:
print("Unique shops:", sales["shop_id"].nunique())
print("Unique items:", sales["item_id"].nunique())
print("Unique categories in items:", items["item_category_id"].nunique())

In [None]:
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
print(test.shape)
test.head()

In [None]:
set(test.shop_id.value_counts().index).difference(set(shops.shop_id))

In [None]:
set(test.item_id.value_counts().index).difference(set(items.item_id))

In [None]:
len(set(test.item_id.value_counts().index).difference(set(sales.item_id)))

There are some items that do not exist in the training sample

In [None]:
sales.date = pd.to_datetime(sales.date, format="%d.%m.%Y").dt.date

print("Total days:", len(sales.date.value_counts()))
print("First:", sales.date.min())
print("Last:", sales.date.max())

sales.date.max() - sales.date.min()

We have two values to aggregate: price and count.

Now we will check if the price can change in one month or it's constant for every item

In [None]:
g_indexed = (
    sales.groupby(["shop_id", "item_id", "date_block_num", "item_price"])
    .agg(["count"])
    .reset_index()
    .set_index(["shop_id", "item_id", "date_block_num"])
)
g_indexed[g_indexed.index.duplicated(keep=False)]

## Check for outliers

### item_price

In [None]:
sales["item_price"].plot(kind="box")

In [None]:
sales["item_price"].value_counts().sort_index()

In [None]:
sales.loc[sales["item_price"].idxmax()]

that's clearly an outlier, can drop it

In [None]:
items.loc[sales.loc[sales["item_price"].idxmax()].item_id]

How many outliers are there?

In [None]:
series = sales["item_price"]

Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1

sales[((series < (Q1 - 1.5 * IQR)) | (series > (Q3 + 1.5 * IQR)))].shape

That's a lot

In [None]:
low = series.quantile(0.01)
high = series.quantile(0.99)

sales[(series < low) | (series > high)].shape

In [None]:
sales["item_price"].plot(kind="box", showfliers=False)

In [None]:
with pd.option_context("display.float_format", lambda x: "%.2f" % x):
    print(sales["item_price"].describe())

It could be enough to just drop the negative prices and the one very large sale

### item_cnt_day

In [None]:
sales["item_cnt_day"].plot(kind="box")

In [None]:
sales["item_cnt_day"].plot(kind="box", showfliers=False)

In [None]:
with pd.option_context("display.float_format", lambda x: "%.2f" % x):
    print(sales["item_cnt_day"].describe())

In [None]:
sales["item_cnt_day"].value_counts().sort_index()

In [None]:
sales.loc[sales["item_cnt_day"].idxmax()]

In [None]:
items.loc[sales.loc[sales["item_cnt_day"].idxmax()].item_id]

Someone has bought 2169 item deliveries

In [None]:
shops.loc[sales.loc[sales["item_cnt_day"].idxmax()].shop_id]

In [None]:
sales[sales["item_id"] == sales.loc[sales["item_cnt_day"].idxmax()].item_id][
    "item_cnt_day"
].mean()

That's weird, definitely an outlier. What about 1000?

In [None]:
sales[sales["item_cnt_day"] == 1000]

In [None]:
items.loc[20949]

Someone wanted to make пакет с пакетами

### Dropping outliers

In [None]:
print("Before:", sales.shape)
sales = sales[
    (sales["item_cnt_day"] < 1000)
    & (sales["item_price"] > 0)
    & (sales["item_price"] < 60000)
].copy()
print("After:", sales.shape)

## Features construction

The following features are statistics based on historical data and some characteristics from time-series analysis

The price can change, it might be related to sales or whatever, that is affecting target value too.

We need to add some feature there - something like price gradient that will let us know how was the price changed.

In [None]:
def max_grad(series):
    """Return the largest change between consecutive values of ``series``.

    The leading NaN produced by ``diff`` is skipped by ``max``; a series with
    fewer than two values yields NaN.
    """
    steps = series.diff()
    return steps.max()


def min_grad(series):
    """Return the smallest (most negative) change between consecutive values.

    The leading NaN produced by ``diff`` is skipped by ``min``; a series with
    fewer than two values yields NaN.
    """
    steps = series.diff()
    return steps.min()


def negative_sum(series):
    """Sum only the strictly negative values of ``series`` (0 if there are none)."""
    is_negative = series.lt(0)
    return series.loc[is_negative].sum()


def positive_sum(series):
    """Sum only the strictly positive values of ``series`` (0 if there are none)."""
    is_positive = series.gt(0)
    return series.loc[is_positive].sum()


def monthly_trend(series):
    """Return mean(second half) - mean(first half) of ``series``.

    The split is positional, so the series is expected to be in chronological
    order. For an odd length the extra element goes to the second half; with
    a single value the first half is empty and the result is NaN.
    """
    midpoint = len(series) // 2
    earlier = series.iloc[:midpoint]
    later = series.iloc[midpoint:]
    return later.mean() - earlier.mean()


common_aggregates = [
    "sum",
    "min",
    "max",
    "mean",
    "median",
    "std",
    max_grad,
    min_grad,
    monthly_trend,
]

# sort frame before group by to correctly calculate monthly trend
sales_ex = (
    sales.sort_values(by=["shop_id", "item_id", "date_block_num", "date"])
    .groupby(["shop_id", "item_id", "date_block_num"])
    .agg(
        {
            "item_cnt_day": common_aggregates + [negative_sum, positive_sum],
            "item_price": common_aggregates + ["count"],
        }
    )
    .reset_index()
)
print(sales_ex.shape)
sales_ex.head()

In [None]:
sales_ex.columns = [
    "_".join(col).strip().rstrip("_") for col in sales_ex.columns.values
]
sales_ex.head()

In [None]:
sales_ex.columns.tolist()

In [None]:
sales_ex.rename(
    columns={
        "item_cnt_day_sum": "item_cnt_month",
        "item_price_count": "num_transactions",
    },
    inplace=True,
)

In [None]:
# clip to match conditions
sales_ex["item_cnt_month"] = sales_ex["item_cnt_month"].clip(0, 20)

In [None]:
sales_ex.columns.tolist()

In [None]:
numeric_features = {
    "date_block_num",
    "item_cnt_day_min",
    "item_cnt_day_max",
    "item_cnt_day_mean",
    "item_cnt_day_median",
    "item_cnt_day_std",
    "item_cnt_day_max_grad",
    "item_cnt_day_min_grad",
    "item_cnt_day_negative_sum",
    "item_cnt_day_positive_sum",
    "item_price_sum",
    "item_price_min",
    "item_price_max",
    "item_price_mean",
    "item_price_median",
    "item_price_std",
    "item_price_max_grad",
    "item_price_min_grad",
    "num_transactions",
    "item_cnt_day_monthly_trend",
    "item_price_monthly_trend",
}

cat_features = set()

Soon it will take a lot more space in the memory

In [None]:
sales_ex.info()

In [None]:
def downcast_numeric(df):
    """Downcast the float and integer columns of ``df`` in place.

    Each selected column is converted with ``pd.to_numeric(..., downcast=...)``
    to the smallest dtype of its kind that can hold its values. Non-numeric
    columns are left untouched. Returns ``None``.
    """
    for kind in ("float", "integer"):
        selected = df.select_dtypes(kind).columns
        shrunk = df[selected].apply(pd.to_numeric, downcast=kind)
        df[selected] = shrunk

In [None]:
downcast_numeric(sales_ex)

In [None]:
sales_ex["item_income_month"] = sales_ex["item_cnt_month"] * sales_ex["item_price_mean"]

In [None]:
sales_ex.info()

In [None]:
sales_ex.to_pickle("sales_ex.pkl")

In [None]:
sales_ex = pd.read_pickle("sales_ex.pkl")

## Previous value benchmark

In [None]:
last_month_agg = (
    sales[sales.date_block_num == sales.date_block_num.max()]
    .groupby(["shop_id", "item_id"])
    .agg(
        {
            "item_cnt_day": ["sum"],
        }
    )
    .reset_index()
)
print(last_month_agg.shape)
last_month_agg.head()

In [None]:
last_month_agg.columns = [
    "_".join(col).strip().rstrip("_") for col in last_month_agg.columns.values
]

In [None]:
test_enriched = pd.merge(
    test,
    last_month_agg,
    left_on=["shop_id", "item_id"],
    right_on=["shop_id", "item_id"],
    how="left",
)
print(test_enriched.shape)
test_enriched.head()
test_enriched.rename(columns={"item_cnt_day_sum": "item_cnt_month"}, inplace=True)

In [None]:
test_enriched.item_cnt_month.value_counts(dropna=False)

In [None]:
# Assign back instead of `df.col.fillna(..., inplace=True)`: the chained in-place form
# may act on a temporary copy and leave `test_enriched` unchanged (pandas Copy-on-Write).
test_enriched["item_cnt_month"] = test_enriched["item_cnt_month"].fillna(0)

In [None]:
# Assign back instead of clipping in place through attribute access (chained mutation).
test_enriched["item_cnt_month"] = test_enriched["item_cnt_month"].clip(0, 20)

In [None]:
test_enriched.to_csv(
    "prev_month_clip.csv", index_label="ID", columns=["item_cnt_month"]
)

In [None]:
clip = 15

# Assign back instead of clipping in place through attribute access (chained mutation).
test_enriched["item_cnt_month"] = test_enriched["item_cnt_month"].clip(0, clip)
test_enriched.to_csv(
    f"prev_month_clip_{clip}.csv", index_label="ID", columns=["item_cnt_month"]
)

Clipped 20 values with zeroed NaNs = 1.16777 kaggle

Zeroed NaNs & No clip = around 8.5 kaggle

Zeroed NaNs & clip 25 = 1.202418 coursera

Zeroed NaNs & clip 19 = 1.161152 coursera

Zeroed NaNs & clip 18 = 1.155025 coursera

Zeroed NaNs & clip 17 = 1.149288 coursera

Zeroed NaNs & clip 16 = 1.143893 coursera

Zeroed NaNs & clip 15 = 1.138738 coursera

Zeroed NaNs & clip 10 = 1.123685 coursera

Zeroed NaNs & clip 5 = 1.135693 coursera

Ones & Clip 20 give 1.39120 kaggle

In [None]:
last_year_agg = (
    sales[sales.date_block_num == sales.date_block_num.max() - 11]
    .groupby(["shop_id", "item_id"])
    .agg(
        {
            "item_cnt_day": ["sum"],
        }
    )
    .reset_index()
)
print(last_year_agg.shape)
last_year_agg.head()

In [None]:
last_year_agg.columns = [
    "_".join(col).strip().rstrip("_") for col in last_year_agg.columns.values
]

In [None]:
test_enriched = pd.merge(
    test,
    last_year_agg,
    left_on=["shop_id", "item_id"],
    right_on=["shop_id", "item_id"],
    how="left",
)
print(test_enriched.shape)
test_enriched.head()
test_enriched.rename(columns={"item_cnt_day_sum": "item_cnt_month"}, inplace=True)

In [None]:
test_enriched.item_cnt_month.value_counts(dropna=False)

In [None]:
# Assign back instead of `df.col.fillna(..., inplace=True)`: the chained in-place form
# may act on a temporary copy and leave `test_enriched` unchanged (pandas Copy-on-Write).
test_enriched["item_cnt_month"] = test_enriched["item_cnt_month"].fillna(0)

In [None]:
clip = 10

# Assign back instead of clipping in place through attribute access (chained mutation).
test_enriched["item_cnt_month"] = test_enriched["item_cnt_month"].clip(0, clip)
test_enriched.to_csv(
    f"last_year_clip_{clip}.csv", index_label="ID", columns=["item_cnt_month"]
)

^ This gives 1.411213 public score on coursera

## Making big dataset

In the test set we have (shop_id, item_id) pairs. We need to reformat the training data to match that representation

In [None]:
from itertools import product

# For every month, build the full cartesian grid of the shops and items seen in that month
index_cols = ["shop_id", "item_id", "date_block_num"]
monthly_grids = []
for block_num in sales_ex["date_block_num"].unique():
    month_rows = sales_ex[sales_ex["date_block_num"] == block_num]
    cur_shops = month_rows["shop_id"].unique()
    cur_items = month_rows["item_id"].unique()
    monthly_grids.append(list(product(cur_shops, cur_items, [block_num])))

grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols)

# Pairs with no sales in a month get NaN from the left merge; their target is set to 0
sales_enriched = grid.merge(sales_ex, how="left", on=index_cols)
sales_enriched["item_cnt_month"] = sales_enriched["item_cnt_month"].fillna(0)

print(sales_enriched.shape)
sales_enriched.head()

In [None]:
# replace NaNs with zeros
sales_enriched[list(numeric_features)] = sales_enriched[list(numeric_features)].fillna(
    0
)

In [None]:
test_block = sales_enriched.date_block_num.max() + 1

In [None]:
test["date_block_num"] = test_block

combined = pd.concat([sales_enriched, test], axis=0)

In [None]:
combined.reset_index(inplace=True)

In [None]:
combined.drop(columns=["index"], inplace=True)

those features won't be used in the test set

In [None]:
combined["num_transactions"] = pd.to_numeric(
    combined["num_transactions"], downcast="integer"
)

In [None]:
downcast_numeric(combined)

In [None]:
combined.to_pickle("combined.pkl")

In [None]:
combined = pd.read_pickle("combined.pkl")

Converting numeric columns into sparse series gives tremendous change in file size, but also increases computation time

https://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html

In [None]:
# speed vs memory
# for col in numeric_features.difference({"date_block_num"}):
#     combined[col] = combined[col].astype(pd.SparseDtype("float32", 0))

# Step 2. Feature engineering

Here we will plot some data and try to derive new features from the existing ones

During the plot analysis we will be looking for monthly trends in items, categories. Also need to check shop popularity, category popularity.
Assuming there are no dates in the test set, trends info might not be useful at all.

## Adding total sales features

In [None]:
total_shop_sales = (
    sales_ex.groupby(["shop_id", "date_block_num"])
    .agg(
        {
            "item_cnt_month": ["sum"],
        }
    )
    .reset_index()
)
total_shop_sales.columns = [
    "_".join(col).strip().rstrip("_") for col in total_shop_sales.columns.values
]
total_shop_sales.rename(
    columns={"item_cnt_month_sum": "total_monthly_shop_sales"}, inplace=True
)
print(total_shop_sales.shape)
total_shop_sales.head()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

for key, group in total_shop_sales.groupby(["shop_id"]):
    ax.plot(group["date_block_num"], group["total_monthly_shop_sales"], label=key)

plt.show()

In [None]:
total_shop_sales.loc[total_shop_sales.total_monthly_shop_sales.idxmax()]

In [None]:
numeric_features.add("total_monthly_shop_sales")

In [None]:
total_item_sales = (
    sales_ex.groupby(["item_id", "date_block_num"])
    .agg(
        {
            "item_cnt_month": ["sum"],
        }
    )
    .reset_index()
)
total_item_sales.columns = [
    "_".join(col).strip().rstrip("_") for col in total_item_sales.columns.values
]
total_item_sales.rename(
    columns={"item_cnt_month_sum": "total_monthly_item_sales"}, inplace=True
)
print(total_item_sales.shape)
total_item_sales.head()

In [None]:
total_item_sales.total_monthly_item_sales.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

for key, group in total_item_sales.groupby(["item_id"]):
    ax.plot(group["date_block_num"], group["total_monthly_item_sales"], label=key)

plt.show()

In [None]:
numeric_features.add("total_monthly_item_sales")

It's messy, can see some trends

In [None]:
sales[sales.date_block_num == 11].date

A lot of sales happens in December.

Data is seasonal.

In [None]:
combined = pd.merge(
    combined, total_item_sales, on=["item_id", "date_block_num"], how="left"
)
combined = pd.merge(
    combined, total_shop_sales, on=["shop_id", "date_block_num"], how="left"
)

In [None]:
combined.tail()

Same for item sales, NaNs are in the test set only

In [None]:
combined["total_monthly_shop_sales"].value_counts(dropna=False).sort_index()

In [None]:
combined.info()

## Processing categorical features

There are two categorical features:
1. shop name, we can extract some features from it
2. item category

In [None]:
items.head()

In [None]:
item_categories.head()

In [None]:
combined = pd.merge(combined, items, on="item_id", how="left")

In [None]:
combined = pd.merge(combined, item_categories, on="item_category_id", how="left")

In [None]:
combined = pd.merge(combined, shops, on="shop_id", how="left")

In [None]:
combined.columns

In [None]:
cat_info = (
    combined.groupby("item_category_name")
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(cat_info)

In [None]:
combined.groupby("item_category_name").agg(
    {"item_cnt_month": ["sum"]}
).unstack().plot.bar()

Some item categories are quite unpopular

We will merge them to make less categories and remove small categories. 1-3 rows make no sense

In [None]:
def make_cat_name(value):
    """Return the global category: the text before the first " - " separator.

    Names without the separator are returned unchanged.
    """
    prefix, separator, _ = value.partition(" - ")
    return prefix if separator else value


combined["global_item_category_name"] = combined["item_category_name"].apply(
    make_cat_name
)

In [None]:
cat_info = (
    combined.groupby("global_item_category_name")
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(cat_info)

Better, there are still some small categories that can be merged

In [None]:
def make_cat_name(value):
    """Return the global category for an item category name.

    A handful of small categories are mapped explicitly to a larger one;
    every other name is reduced to the text before its first " - " separator
    (or kept as is when it has no separator).
    """
    forced_names = {
        "PC - Гарнитуры/Наушники": "Аксессуары",
        "Игры MAC - Цифра": "Игры",
        "Игры Android - Цифра": "Игры",
        "Чистые носители (шпиль)": "Чистые носители",
        "Чистые носители (штучные)": "Чистые носители",
    }

    forced = forced_names.get(value)
    if forced is not None:
        return forced

    # With maxsplit=1 the first piece is the whole string when no separator is present
    return value.split(" - ", 1)[0]


combined["global_item_category_name"] = combined["item_category_name"].apply(
    make_cat_name
)

In [None]:
cat_info = (
    combined.groupby("global_item_category_name")
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(cat_info)

In [None]:
cat_info.unstack().plot.bar()

Now it's much better

In [None]:
cat_features.add("global_item_category_name")

In [None]:
combined.drop(columns=["item_category_name"], inplace=True)

In [None]:
shop_info = (
    combined.groupby("shop_name")
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(shop_info)

Shop name consists of city and address

Same thing, some shops are unpopular compared to others. That's related to their region

Moreover, if we don't correctly predict for unpopular shops it almost won't affect score

What features can be extracted from this data?
1. City
2. Type of shop: shopping center, shop, mall, etc

And based on city and address we can calculate even more features including human activity, shop popularity

In [None]:
def get_shop_type(value):
    """Classify a shop by its name.

    Exact matches for the online and special shops are checked first; after
    that the name is searched for a venue abbreviation. Anything that matches
    nothing is a plain "Shop".
    """
    if value in {"Интернет-магазин ЧС", "Цифровой склад 1С-Онлайн"}:
        return "Online"

    if value in {"Выездная Торговля", 'Москва "Распродажа"'}:
        return "Other"

    # Checked in this order; the first abbreviation found in the name wins
    venue_markers = (
        ("ТРЦ", "Shopping and entertaiment center"),
        ("ТЦ", "Shopping center"),
        ("ТРК", "Retail and entertaiment complex"),
        ("ТК", "Retail complex"),
    )
    return next(
        (label for marker, label in venue_markers if marker in value),
        "Shop",
    )


combined["shop_type"] = combined["shop_name"].apply(get_shop_type)

In [None]:
shop_info = (
    combined.groupby(["shop_type"])
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(shop_info)

In [None]:
shop_info.unstack().plot.bar()

In [None]:
cat_features.add("shop_type")

In [None]:
def get_city(value):
    """Extract the city (first whitespace-separated token) from a shop name.

    Returns "Unknown" for the shops that have no physical city and for empty
    or blank names (the original raised IndexError on those).
    """
    unknown_city = {
        "Выездная Торговля",
        "Интернет-магазин ЧС",
        "Цифровой склад 1С-Онлайн",
    }

    if value in unknown_city:
        return "Unknown"

    tokens = value.split()
    if not tokens:
        return "Unknown"

    # NOTE(review): some shop names in shops.csv appear to carry a leading "!"
    # marker (e.g. "!Якутск ..."); strip it so such shops share the city of their
    # unmarked counterparts. Confirm against the data.
    city = tokens[0].lstrip("!")
    return city or "Unknown"


combined["shop_city"] = combined["shop_name"].apply(get_city)

In [None]:
shop_info = (
    combined.groupby(["shop_city"])
    .agg({"item_cnt_month": ["sum"]})
    .sort_values(
        ("item_cnt_month", "sum"),
    )
)

with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(shop_info)

In [None]:
shop_info.unstack().plot.bar()

The data looks quite unbalanced, there are a lot of sales in Moscow compared to other cities

In the case of not so good algorithms we can go further and extract features like mall traffic. 

In [None]:
cat_features.add("shop_city")

In [None]:
combined.drop(columns=["shop_name"], inplace=True)

In [None]:
for f in cat_features:
    combined[f] = combined[f].astype("category")

In [None]:
for f in ["item_category_id", "shop_id", "item_id"]:
    combined[f] = combined[f].astype("category")

In [None]:
combined.drop(columns="item_category_id", inplace=True)

In [None]:
combined.info()

## Processing text features

In [None]:
combined.item_name.value_counts()

Next feature is item_name:

```
Feature extraction from text and images

Features from text are extracted
Special preprocessings for text are utilized (TF-IDF, stemming, levenshtening...)
```

In [None]:
MAX_TF_IDF = 20

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


tf_idf = TfidfVectorizer(ngram_range=(1, 3), max_features=MAX_TF_IDF)
tf_idf_mat = tf_idf.fit_transform(items["item_name"]).toarray()
items_text_features = pd.DataFrame(tf_idf_mat)

for i in range(MAX_TF_IDF):
    feature_name = f"tfidf_item_name_{i}"
    items[feature_name] = items_text_features[items_text_features.columns[i]]

In [None]:
# Terms ordered by their column index in the TF-IDF matrix. Reading `vocabulary_`
# works on every scikit-learn version (`get_feature_names()` was removed in 1.2).
sorted(tf_idf.vocabulary_, key=tf_idf.vocabulary_.get)

Feature names are mostly music/games CDs, russian versions, etc

In [None]:
tfidf_features = [f"tfidf_item_name_{i}" for i in range(MAX_TF_IDF)]

In [None]:
items.head()

In [None]:
combined = pd.merge(
    combined, items[["item_id"] + tfidf_features], on="item_id", how="left"
)

In [None]:
combined.drop(columns=["item_name"], inplace=True)

In [None]:
combined.info()

In [None]:
downcast_numeric(combined)

In [None]:
combined.info()

In [None]:
import gc

gc.collect()

In [None]:
combined["item_id"] = combined["item_id"].astype("category")

## Mean encodings

Here we will encode categorical features

In [None]:
cat_features

In [None]:
from sklearn.model_selection import KFold


means_features = set()
# Row mask of the month to predict; computed once and reused for every feature
is_test_block = combined["date_block_num"] == test_block
train_alias = combined[~is_test_block]
global_mean = train_alias[target_feature].mean()
y_tr = train_alias[target_feature].values

for col in tqdm(cat_features):
    col_tr = train_alias[[col, target_feature]].copy()

    feature_name = col + "_cnt_month_mean_expanding"
    means_features.add(feature_name)

    # Expanding mean scheme: each row is encoded with the mean target of the
    # preceding rows of the same category (its own target is excluded)
    grouped_target = col_tr.groupby(col)[target_feature]
    cumsum = grouped_target.cumsum() - col_tr[target_feature]
    cumcount = grouped_target.cumcount()

    # The first row of every category has no history (0 / 0 -> NaN): use the global mean.
    # Assign the result back rather than calling fillna(inplace=True) on a column selection.
    col_tr[feature_name] = (cumsum / cumcount).fillna(global_mean)

    combined.loc[~is_test_block, feature_name] = col_tr[feature_name]

    # cumulative means is for training, test is transforming just by means
    #     https://stackoverflow.com/questions/60266373/how-to-use-target-encoding-expanding-mean-on-the-test-set
    mapper = grouped_target.mean()
    combined.loc[is_test_block, feature_name] = (
        combined.loc[is_test_block, col].map(mapper).astype("float")
    )

    # Categories never seen in training map to NaN: fall back to the global mean
    combined[feature_name] = combined[feature_name].fillna(global_mean)

In [None]:
# sorted(): iterating the `cat_features` set directly gives an order that can change
# between kernel restarts, which would reshuffle the model's input columns.
means_features = [col + "_cnt_month_mean_expanding" for col in sorted(cat_features)]

In [None]:
combined.info()

In [None]:
downcast_numeric(combined)

In [None]:
combined.to_pickle("mean_encoded_clip.pkl")

In [None]:
combined = pd.read_pickle("mean_encoded_clip.pkl")

## Lag features

as the lag features we will use some statistics from previous sales for given periods

date_block_num needs to be excluded from past features

In [None]:
numeric_features

In [None]:
end_month = combined.date_block_num.max()

In [None]:
import gc


lookback_range = [1, 2, 3, 4, 6, 12]
# sorted(): set iteration order can change between kernel restarts; a fixed order keeps
# the lag columns (and therefore the model inputs) reproducible.
to_look = sorted(numeric_features.difference({"date_block_num"}))

lookback_features = {}

for diff in tqdm(lookback_range):
    # Take every month that still has a target month `diff` blocks ahead and shift it
    # forward, so that its statistics line up with the month they will describe
    to_future = combined[combined["date_block_num"] + diff <= end_month][
        ["shop_id", "item_id", "date_block_num"] + to_look
    ].copy()
    to_future.date_block_num += diff

    name_map = {f: f"prev_{diff}_{f}" for f in to_look}

    to_future.rename(columns=name_map, inplace=True)
    lookback_features[diff] = list(name_map.values())

    combined = pd.merge(
        combined, to_future, on=["shop_id", "item_id", "date_block_num"], how="left"
    )
    gc.collect()

In [None]:
from itertools import chain


lookback_features_list = list(chain(*lookback_features.values()))

In [None]:
combined[lookback_features_list] = combined[lookback_features_list].fillna(0)

In [None]:
downcast_numeric(combined)

## Check result feature space

In [None]:
combined[combined["date_block_num"] < end_month].columns.duplicated().sum()

In [None]:
combined.info()

In [None]:
num_algo_features = (
    ["date_block_num"] + lookback_features_list + tfidf_features + means_features
)

# sorted(): `cat_features` is a set, so list() would give an order that can change
# between kernel restarts and end up in all-features.json and the model inputs.
num_algo_cat_features = sorted(cat_features)

Current month columns won't be used to build algorithms

In [None]:
set(combined.columns.tolist()).difference(
    set(num_algo_features + num_algo_cat_features)
)

In [None]:
combined.to_pickle("encoded_with_lags_clip.pkl")

In [None]:
import json

with open("all-features.json", "w") as f:
    json.dump({"numeric": num_algo_features, "categorical": num_algo_cat_features}, f)

In [None]:
combined = pd.read_pickle("encoded_with_lags_clip.pkl")

In [None]:
end_month = combined["date_block_num"].max()

We will use a hold-out approach. The train set contains the date blocks below 33 (i.e. 0–32), the validation set is date block 33 and the test set is date block 34

In [None]:
print("train:", len(combined[combined["date_block_num"] < end_month - 1]))
print("validation:", len(combined[combined["date_block_num"] == end_month - 1]))
print("test:", len(combined[combined["date_block_num"] == end_month]))

In [None]:
combined[combined["date_block_num"] < end_month - 1].to_pickle("train_clip.pkl")
combined[combined["date_block_num"] == end_month - 1].to_pickle("validation_clip.pkl")
combined[combined["date_block_num"] == end_month].to_pickle("test_clip.pkl")

In [None]:
train_shifted = combined[
    (combined["date_block_num"] > combined["date_block_num"].min() + 12)
    & (combined["date_block_num"] < end_month - 1)
]

print("train shifted:", train_shifted.shape)
train_shifted.to_pickle("train_shifted.pkl")

The size is halved if we remove blocks from the first year

In [None]:
del combined, train_shifted

gc.collect()

# Step 3. Building Models

### Load data

In [None]:
train = pd.read_pickle("train_clip.pkl")
train_x, train_y = (
    train[num_algo_features + num_algo_cat_features],
    train[target_feature],
)

# The eval set must come from the held-out month. It was previously sliced from
# `train`, so the reported eval metric was just the training error.
validation = pd.read_pickle("validation_clip.pkl")
validation_x, validation_y = (
    validation[num_algo_features + num_algo_cat_features],
    validation[target_feature],
)

In [None]:
num_cols = train_x.columns.get_indexer(num_algo_features)
cat_cols = train_x.columns.get_indexer(num_algo_cat_features)

In [None]:
test = pd.read_pickle("test_clip.pkl")
test.set_index("ID", inplace=True)

## Catboost

### Full

In [None]:
from catboost import CatBoostRegressor


model = CatBoostRegressor(
    random_seed=RANDOM_STATE,
    depth=11,
    iterations=1000,
    learning_rate=0.01,
    boosting_type="Plain",
    max_ctr_complexity=1,
    task_type="GPU",
    devices="0:1",
    verbose=3,
)

In [None]:
model.fit(
    train_x,
    train_y,
    cat_features=num_algo_cat_features,
    eval_set=(validation_x, validation_y),
    # logging_level="Silent",
    # plot=True, not working in 0.25.1
)

In [None]:
preds = model.predict(test[num_algo_features + num_algo_cat_features]).clip(0, 20)
preds_df = pd.DataFrame(preds, columns=["item_cnt_month"])
preds_df["ID"] = test.index.astype(int)
preds_df.set_index("ID", inplace=True)
preds_df.to_csv("test_preds_catboost.csv")

This model achieves 0.954772 public and 0.952248 private score

### Shifted

In [None]:
train = pd.read_pickle("train_shifted.pkl")
train_x, train_y = (
    train[num_algo_features + num_algo_cat_features],
    train[target_feature],
)

# The eval set must come from the held-out month. It was previously sliced from
# `train`, so the reported eval metric was just the training error.
validation = pd.read_pickle("validation_clip.pkl")
validation_x, validation_y = (
    validation[num_algo_features + num_algo_cat_features],
    validation[target_feature],
)

test = pd.read_pickle("test_clip.pkl")
test.set_index("ID", inplace=True)

In [None]:
from catboost import CatBoostRegressor


model = CatBoostRegressor(
    random_seed=RANDOM_STATE,
    depth=11,
    iterations=1000,
    learning_rate=0.01,
    boosting_type="Plain",
    max_ctr_complexity=1,
    task_type="GPU",
    devices="0:1",
    verbose=3,
)

In [None]:
model.fit(
    train_x,
    train_y,
    cat_features=num_algo_cat_features,
    eval_set=(validation_x, validation_y),
    # logging_level="Silent",
    # plot=True, not working in 0.25.1
)

In [None]:
preds = model.predict(test[num_algo_features + num_algo_cat_features]).clip(0, 20)
preds_df = pd.DataFrame(preds, columns=["item_cnt_month"])
preds_df["ID"] = test.index.astype(int)
preds_df.set_index("ID", inplace=True)
preds_df.to_csv("test_preds_catboost_shifted.csv")

Public and private LB scores are: 0.951926 and 0.949549

In [None]:
import gc


del train, train_x, train_y, validation, validation_x, validation_y, test
gc.collect()

## Converting data

For the next steps I'll use the shifted data and one-hot encode the categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder

all_features = num_algo_features + num_algo_cat_features

train = pd.read_pickle("train_shifted.pkl")
train_x, train_y = train[all_features], train[target_feature]
validation = pd.read_pickle("validation_clip.pkl")
validation_x, validation_y = validation[all_features], validation[target_feature]

test = pd.read_pickle("test_clip.pkl")
test.set_index("ID", inplace=True)


encoder = OneHotEncoder().fit(train[num_algo_cat_features])


def convert_frame(df):
    """Replace the categorical columns of ``df`` with sparse one-hot columns.

    Uses the module-level fitted ``encoder`` together with ``all_features`` and
    ``num_algo_cat_features``. Returns a new frame with the remaining columns
    of ``all_features`` followed by one ``x<i>_<category>`` column per encoded
    category; ``df`` itself is not modified.
    """
    # Build the column names from the fitted categories instead of calling
    # `encoder.get_feature_names()`, which was removed in scikit-learn 1.2. The
    # `x<i>_<category>` pattern is the one that method produced, and the later
    # `startswith("x")` split of categorical columns depends on it.
    onehot_names = [
        f"x{i}_{category}"
        for i, categories in enumerate(encoder.categories_)
        for category in categories
    ]
    onehot = pd.DataFrame.sparse.from_spmatrix(
        encoder.transform(df[num_algo_cat_features]),
        index=df.index,
        columns=onehot_names,
    )
    return pd.concat([df[all_features], onehot], axis=1).drop(
        columns=num_algo_cat_features
    )


train_x = convert_frame(train_x)
validation_x = convert_frame(validation_x)
test = convert_frame(test)

In [None]:
train_x.to_pickle("train_encoded_x.pkl")
train_y.to_pickle("train_y.pkl")

validation_x.to_pickle("validation_encoded_x.pkl")
validation_y.to_pickle("validation_encoded_y.pkl")

test.to_pickle("test_encoded.pkl")

In [None]:
train_x = pd.read_pickle("train_encoded_x.pkl")
train_y = pd.read_pickle("train_y.pkl")

validation_x = pd.read_pickle("validation_encoded_x.pkl")
validation_y = pd.read_pickle("validation_encoded_y.pkl")

test = pd.read_pickle("test_encoded.pkl")

## XGBoost

In [None]:
from xgboost import XGBRegressor


# Use the documented scikit-learn-wrapper parameter names: `num_round` is not one of
# them (the number of trees is `n_estimators`, 100 as before), and `eval_metric` belongs
# on the estimator because passing it to `fit()` was removed in XGBoost 2.x.
model = XGBRegressor(
    max_depth=7,
    learning_rate=0.2,
    n_estimators=100,
    random_state=RANDOM_STATE,
    eval_metric="rmse",
)
model.fit(
    train_x,
    train_y,
    eval_set=[(validation_x, validation_y)],
    verbose=True,
)

In [None]:
preds = model.predict(test).clip(0, 20)
preds_df = pd.DataFrame(preds, columns=["item_cnt_month"])
preds_df["ID"] = test.index.astype(int)
preds_df.set_index("ID", inplace=True)
preds_df.to_csv("test_preds_xgb.csv")

This model achieves 0.946026 public and 0.934927 private score

## Catboost onehot

In [None]:
from catboost import CatBoostRegressor


# CatBoost regressor trained on the already one-hot-encoded frame
# (no categorical feature list is passed to CatBoost itself).
model = CatBoostRegressor(
    random_seed=RANDOM_STATE,
    depth=7,
    iterations=1000,
    learning_rate=0.01,
    boosting_type="Plain",
    max_ctr_complexity=1,
    task_type="GPU",
    devices="0:1",  # NOTE(review): GPU device selection — confirm it matches the machine
    verbose=3,
)

# The validation split is passed as the evaluation set.
model.fit(
    train_x,
    train_y,
    eval_set=(validation_x, validation_y),
    # logging_level="Silent",
    # plot=True, not working in 0.25.1
)

In [None]:
# Predict, clip to the competition's [0, 20] range and save keyed by test ID.
preds = model.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_catboost_onehot.csv")

Not good: public and private LB scores are 0.969629 and 0.970380

## LightGBM

### Fix column names to avoid errors

In [None]:
import re
from transliterate import translit


# Transliterate Cyrillic column names to Latin and strip every character that is
# not a letter, digit or underscore. The same rule is applied to all three frames.
# NOTE(review): distinct names could collapse to the same sanitised name — verify.
for frame in (train_x, validation_x, test):
    frame.columns = [
        re.sub("[^A-Za-z0-9_]+", "", translit(name, "ru", reversed=True))
        for name in frame.columns.tolist()
    ]

In [None]:
# Overwrite the cached splits so the files on disk carry the sanitised column names.
train_x.to_pickle("train_encoded_x.pkl")
train_y.to_pickle("train_y.pkl")

validation_x.to_pickle("validation_encoded_x.pkl")
validation_y.to_pickle("validation_encoded_y.pkl")

test.to_pickle("test_encoded.pkl")

In [None]:
# Split the column names by prefix: names starting with "x" are treated as
# one-hot (categorical) columns, everything else as numeric.
# NOTE(review): presumably relies on the encoder naming its outputs "x<i>_<category>";
# a numeric feature whose name starts with "x" would be misfiled — verify.
algo_cat_cols = [x for x in train_x.columns.tolist() if x.startswith("x")]
algo_num_cols = [x for x in train_x.columns.tolist() if not x.startswith("x")]

### Train LGB

In [None]:
import lightgbm as lgb


# Hand-picked LightGBM settings for the baseline run.
params = {
    "feature_fraction": 0.75,
    "metric": "rmse",
    "min_data_in_leaf": 2 ** 7,  # 128
    "bagging_fraction": 0.75,
    "learning_rate": 0.03,
    "objective": "mse",
    "bagging_seed": 2 ** 7,
    "num_leaves": 2 ** 7,  # 128
    "bagging_freq": 1,
    "verbose": 1,
}

# The validation Dataset is created with the training Dataset as its reference.
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(validation_x, validation_y, reference=lgb_train)

# At most 100 boosting rounds, with early stopping (10 rounds) on the validation set.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_eval,
    early_stopping_rounds=10,
)

In [None]:
# Predict with the best iteration, clip to [0, 20] and save keyed by test ID.
preds = model.predict(test, num_iteration=model.best_iteration).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_lgbm.csv")

Public and private LB scores are 0.936040 and 0.929004

## SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

# Seed the regressor: SGD shuffles the training data between epochs, so without a
# fixed random_state the fitted model (and the printed score) changes on every run.
pipe = make_pipeline(
    preprocessor, SGDRegressor(verbose=1, random_state=RANDOM_STATE)
)

pipe.fit(train_x, train_y)
preds = pipe.predict(validation_x)
# RMSE on the validation split (predictions are not clipped here).
print(mean_squared_error(validation_y, preds, squared=False))

In [None]:
# Predict, clip to the competition's [0, 20] range and save keyed by test ID.
preds = pipe.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_sgd.csv")

Public and private LB scores are: 1.022322 and 1.021003

## Neural Network Tensorflow

In [None]:
import tensorflow as tf

# Sanity check: report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices("GPU")))

In [None]:
from tensorflow.keras import layers, models
from tensorflow.keras import backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error


def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared difference of the two tensors.

    The mean is taken over all elements of the tensors passed in.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))


def make_model():
    """Build and compile a 128-64-1 fully connected regression network.

    The input width is the number of numeric plus one-hot columns, read from the
    notebook-level ``algo_num_cols`` / ``algo_cat_cols``. Both hidden layers use
    ReLU, the output layer is linear, and the loss is ``root_mean_squared_error``
    with the Adam optimizer.
    """
    n_inputs = len(algo_num_cols) + len(algo_cat_cols)
    model = models.Sequential(
        [
            layers.Dense(128, input_dim=n_inputs, activation="relu"),
            layers.Dense(64, activation="relu"),
            layers.Dense(1),
        ]
    )
    model.compile(loss=root_mean_squared_error, optimizer="adam")
    return model


# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

# The validation data is handed straight to the Keras wrapper (see validation_data
# below), so it has to be transformed by hand with a preprocessor fitted on train_x.
validation_scaled = preprocessor.fit(train_x).transform(validation_x)

# Pipeline: preprocessing followed by the Keras regressor built by make_model.
pipe = make_pipeline(
    preprocessor,
    KerasRegressor(
        build_fn=make_model,
        epochs=100,
        batch_size=102400,
        validation_data=(validation_scaled, validation_y),
        verbose=1,
    ),
)

pipe.fit(train_x, train_y)

In [None]:
# Predict, clip to the competition's [0, 20] range and save keyed by test ID.
preds = pipe.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_nn.csv")

Public and private LB scores are 0.996570 and 0.994184

# Step 4. Tuning

At this step we will tune 3 models: LightGBM, SGDRegressor and Neural Network using one-hot-encoded train dataset

## Load Data

In [None]:
# Reload the cached encoded splits for the tuning step.
train_x = pd.read_pickle("train_encoded_x.pkl")
train_y = pd.read_pickle("train_y.pkl")

validation_x = pd.read_pickle("validation_encoded_x.pkl")
validation_y = pd.read_pickle("validation_encoded_y.pkl")

test = pd.read_pickle("test_encoded.pkl")

# Column names starting with "x" are treated as one-hot columns, the rest as numeric.
algo_cat_cols = [x for x in train_x.columns.tolist() if x.startswith("x")]
algo_num_cols = [x for x in train_x.columns.tolist() if not x.startswith("x")]

## Tune LightGBM model

In [None]:
from sklearn.model_selection import KFold, RandomizedSearchCV
import lightgbm as lgb


# Number of CV folds and number of parameter sets sampled by the search.
NUM_SPLITS = 5
NUM_ITER = 20

# Candidate values for the randomized search; single-element lists are fixed settings.
param_grid = {
    "max_depth": [6, 7, 11],
    "learning_rate": [0.1, 0.01, 0.03],
    "metric": ["rmse"],
    "random_state": [RANDOM_STATE],
    "bagging_freq": [1],
    "feature_fraction": [0.75, 1],
    "bagging_fraction": [0.75, 1],
    "min_data_in_leaf": [100],
    "num_leaves": [100, 150],
    "num_iterations": [20],
    "verbose": [-1],
}

# Unshuffled K-fold over the training rows.
cv = KFold(n_splits=NUM_SPLITS, shuffle=False)
clf = lgb.LGBMRegressor(verbose=-1)

# Randomized search scored by negative RMSE (higher is better).
grid = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    cv=cv,
    verbose=3,
    scoring="neg_root_mean_squared_error",
    n_iter=NUM_ITER,
    random_state=RANDOM_STATE,
)
grid.fit(train_x, train_y)

print(grid.best_params_, grid.best_score_)

In [None]:
# Parameter set used for the retrain below — presumably copied by hand from the
# randomized-search output; every value is one of the candidates in param_grid.
best_lgb_params = {
    "verbose": -1,
    "random_state": 42,
    "num_leaves": 150,
    "num_iterations": 20,
    "min_data_in_leaf": 100,
    "metric": "rmse",
    "max_depth": 11,
    "learning_rate": 0.1,
    "feature_fraction": 0.75,
    "bagging_freq": 1,
    "bagging_fraction": 0.75,
}

### Retrain & predict

In [None]:
# Refit on the full training split with best_lgb_params. The validation split is
# only evaluated during training; no early stopping is requested here.
lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(validation_x, validation_y, reference=lgb_train)
model = lgb.train(best_lgb_params, lgb_train, valid_sets=lgb_eval)

# Predict, clip to [0, 20] and save keyed by test ID.
preds = model.predict(test, num_iteration=model.best_iteration).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_lgbm_tuned.csv")

Public and private LB scores are 0.942457 and 0.938335

The untuned LightGBM model scored better than this tuned one, so later steps reuse the parameters of the simple LightGBM run.

## SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, RandomizedSearchCV


# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

pipe = make_pipeline(preprocessor, SGDRegressor(verbose=1))
# Number of CV folds and number of parameter sets sampled by the search.
NUM_SPLITS = 5
NUM_ITER = 20

# Only alpha is actually searched (30 evenly spaced values in [0.0001, 2]);
# the "sgdregressor__" prefix routes each parameter to the pipeline's SGD step.
param_grid = {
    "sgdregressor__alpha": np.linspace(0.0001, 2, 30),
    "sgdregressor__random_state": [RANDOM_STATE],
    "sgdregressor__verbose": [-1],
}

# Unshuffled K-fold over the training rows.
cv = KFold(n_splits=NUM_SPLITS, shuffle=False)

# Randomized search scored by negative RMSE (higher is better), 3 parallel jobs.
grid = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    cv=cv,
    verbose=3,
    scoring="neg_root_mean_squared_error",
    n_iter=NUM_ITER,
    random_state=RANDOM_STATE,
    n_jobs=3,
)
grid.fit(train_x, train_y)

print(grid.best_params_, grid.best_score_)

### Retrain and predict

In [None]:
# SGDRegressor settings used for the retrain and, later, for the stacking base model.
# alpha equals the smallest value of the searched range (np.linspace(0.0001, 2, 30));
# presumably copied by hand from the search output.
best_sgd_params = {
    "verbose": -1,
    "random_state": 42,
    "alpha": 0.0001,
}

In [None]:
# Refit the scaling + SGD pipeline on the full training split with best_sgd_params.
pipe = make_pipeline(preprocessor, SGDRegressor(**best_sgd_params))
pipe.fit(train_x, train_y)

# Predict, clip to [0, 20] and save keyed by test ID.
preds = pipe.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_sgd_tuned.csv")

Public and private LB scores are 1.022547 and 1.021522

## Neural Network

We will compare 3 network architectures using K-fold cross-validation

In [None]:
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer


def root_mean_squared_error(y_true, y_pred):
    """Keras loss: RMSE between ``y_pred`` and ``y_true`` over all tensor elements."""
    mean_squared = K.mean(K.square(y_pred - y_true))
    return K.sqrt(mean_squared)


def make_model_1():
    """Candidate 1: a single 128-unit ReLU hidden layer and a linear output unit.

    The input width is the number of numeric plus one-hot columns
    (``algo_num_cols`` / ``algo_cat_cols``). Compiled with
    ``root_mean_squared_error`` as loss and the Adam optimizer.
    """
    n_inputs = len(algo_num_cols) + len(algo_cat_cols)
    model = models.Sequential(
        [
            layers.Dense(128, input_dim=n_inputs, activation="relu"),
            layers.Dense(1),
        ]
    )
    model.compile(loss=root_mean_squared_error, optimizer="adam")
    return model


def make_model_2():
    """Candidate 2: 64-unit ReLU layer, 32-unit layer, linear output unit.

    The input width is the number of numeric plus one-hot columns
    (``algo_num_cols`` / ``algo_cat_cols``). Compiled with
    ``root_mean_squared_error`` as loss and the Adam optimizer.

    NOTE(review): the 32-unit layer has no activation, so it is linear and
    composes with the output layer into a single linear map — confirm intended.
    """
    n_inputs = len(algo_num_cols) + len(algo_cat_cols)
    model = models.Sequential(
        [
            layers.Dense(64, input_dim=n_inputs, activation="relu"),
            layers.Dense(32),
            layers.Dense(1),
        ]
    )
    model.compile(loss=root_mean_squared_error, optimizer="adam")
    return model


def make_model_3():
    """Candidate 3: 128-unit ReLU layer, 10% dropout, 64-unit layer, linear output.

    The input width is the number of numeric plus one-hot columns
    (``algo_num_cols`` / ``algo_cat_cols``). Compiled with
    ``root_mean_squared_error`` as loss and SGD with momentum 0.1.

    NOTE(review): the 64-unit layer has no activation (linear) — confirm intended.
    """
    n_inputs = len(algo_num_cols) + len(algo_cat_cols)
    model = models.Sequential(
        [
            layers.Dense(128, input_dim=n_inputs, activation="relu"),
            layers.Dropout(0.1),
            layers.Dense(64),
            layers.Dense(1),
        ]
    )
    sgd_optimizer = optimizers.SGD(momentum=0.1)
    model.compile(loss=root_mean_squared_error, optimizer=sgd_optimizer)
    return model


# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

# Validation data goes straight to Keras, so it is transformed by hand
# with a preprocessor fitted on the whole of train_x.
validation_scaled = preprocessor.fit(train_x).transform(validation_x)

# One pipeline per candidate architecture; all of them share the same
# preprocessor object and the same training settings.
networks = [
    make_pipeline(
        preprocessor,
        KerasRegressor(
            build_fn=build_fn,
            epochs=100,
            batch_size=102400,
            validation_data=(validation_scaled, validation_y),
            verbose=1,
        ),
    )
    for build_fn in [make_model_1, make_model_2, make_model_3]
]

In [None]:
import gc

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


cv = KFold(n_splits=NUM_SPLITS, shuffle=False)


# scores[i] collects one (train_rmse, holdout_rmse) pair per fold for networks[i].
scores = [[] for _ in networks]
num_cv = 0
for train_index, test_index in cv.split(train_x, train_y):
    gc.collect()
    print("split:", num_cv)
    num_cv += 1

    # Materialise the fold once and reuse it for every candidate network.
    fold_train_x, fold_train_y = train_x.iloc[train_index], train_y.iloc[train_index]
    fold_test_x, fold_test_y = train_x.iloc[test_index], train_y.iloc[test_index]

    for idx, model in tqdm(list(enumerate(networks))):
        model.fit(fold_train_x, fold_train_y)

        # Predictions are clipped to [0, 20] before scoring.
        train_preds = model.predict(fold_train_x).clip(0, 20)
        test_preds = model.predict(fold_test_x).clip(0, 20)
        fold_scores = (
            mean_squared_error(train_preds, fold_train_y, squared=False),
            mean_squared_error(test_preds, fold_test_y, squared=False),
        )
        scores[idx].append(fold_scores)

In [None]:
def to_rows(label, scores):
    """Summarise per-fold scores of one model as two plot-ready records.

    Args:
        label: Name attached to both records.
        scores: Sequence of ``(train_score, validation_score)`` pairs.

    Returns:
        A list with two dicts (keys ``label``, ``score``, ``sample``): the mean
        train score tagged ``"train"`` and the mean validation score tagged
        ``"validation"``.
    """
    train_scores, val_scores = zip(*scores)
    mean_by_sample = {
        "train": np.array(train_scores).mean(),
        "validation": np.array(val_scores).mean(),
    }
    return [
        {"label": label, "score": mean_score, "sample": sample}
        for sample, mean_score in mean_by_sample.items()
    ]


# scores holds one list per network, each containing one (train, validation) pair
# per fold, so every model's list is passed to to_rows directly. Slicing with
# scores[::3] wrapped that list in another list, which to_rows cannot unpack.
rows = []
for model_number, model_scores in enumerate(scores, start=1):
    rows.extend(to_rows(f"model{model_number}", model_scores))

In [None]:
# One bar per model and sample (train vs. validation) showing the mean CV score.
scores_df = pd.DataFrame.from_records(rows)
sns.catplot(x="label", y="score", hue="sample", data=scores_df, kind="bar", height=6)

In [None]:
scores_df

Model 1 performs better on validation set during CV

### Retrain and predict

In [None]:
# Retrain the architecture chosen by CV (make_model_1) on the full training split.
nn_regressor = KerasRegressor(
    build_fn=make_model_1,
    epochs=100,
    batch_size=102400,
    validation_data=(validation_scaled, validation_y),
    verbose=1,
)
nn_pipe = make_pipeline(preprocessor, nn_regressor)
nn_pipe.fit(train_x, train_y)

# Predict, clip to [0, 20] and save keyed by test ID.
preds = nn_pipe.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_nn_tuned.csv")

Public and private LB scores are 0.935379 and 0.937202

## Random Forest

Average parameters

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Single random forest with fixed, hand-picked settings (no search is run here).
rf = RandomForestRegressor(n_estimators=200, random_state=RANDOM_STATE, max_depth=7)
rf.fit(train_x, train_y)

val_preds = rf.predict(validation_x).clip(0, 20)
# Report RMSE (squared=False) so the number is comparable with the other models
# in this notebook; the bare call returned MSE instead.
print("score:", mean_squared_error(validation_y, val_preds, squared=False))

In [None]:
# Predict, clip to the competition's [0, 20] range and save keyed by test ID.
preds = rf.predict(test).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_rf_tuned.csv")

Public and private LB scores are 1.000256 and 1.003907

# Step 5. Stacking

Simple holdout scheme

1. Split train data into three parts: partA and partB and partC.
2. Fit N diverse models on partA, predict for partB, partC, test_data getting meta-features partB_meta, partC_meta and test_meta respectively.
3. Fit a metamodel to partB_meta while validating its hyperparameters on partC_meta.
4. When the metamodel is validated, fit it to [partB_meta, partC_meta] and predict for test_meta.

In [None]:
max_train_block = train_x["date_block_num"].max()

In [None]:
# Part A: every month before the last training month; part B: the last month itself.
part_a = train_x[train_x.date_block_num < max_train_block]
part_b = train_x[train_x.date_block_num == max_train_block]

In [None]:
# Targets split with the same month masks as the feature frames.
part_a_y = train_y[train_x.date_block_num < max_train_block]
part_b_y = train_y[train_x.date_block_num == max_train_block]

In [None]:
# Empty meta-feature frames: part B rows followed by the validation rows (which
# play the role of part C in the scheme above), and one row per test ID.
# NOTE(review): the two index lists are simply concatenated, so labels may repeat
# if part_b and validation_x share index values — verify.
meta_df = pd.DataFrame(index=part_b.index.tolist() + validation_x.index.tolist())
test_meta_df = pd.DataFrame(index=test.index)

In [None]:
meta_df.shape[0] == part_b.shape[0] + validation_x.shape[0]

In [None]:
# Keep the month number so part B and part C rows can be told apart later;
# stacked in the same order as the index (part B first, then validation).
meta_df["date_block_num"] = np.hstack(
    [part_b.date_block_num, validation_x.date_block_num]
)

## Diverse models

### Light gbm

In [None]:
# Rebinds best_lgb_params to the same values as the untuned LightGBM run
# (the `params` dict of the baseline), replacing the randomized-search result.
best_lgb_params = {
    "feature_fraction": 0.75,
    "metric": "rmse",
    "min_data_in_leaf": 2 ** 7,  # 128
    "bagging_fraction": 0.75,
    "learning_rate": 0.03,
    "objective": "mse",
    "bagging_seed": 2 ** 7,
    "num_leaves": 2 ** 7,  # 128
    "bagging_freq": 1,
    "verbose": 1,
}

In [None]:
import lightgbm as lgb


# LightGBM datasets for the three parts. Only lgb_part_a is used for training
# below; lgb_part_b / lgb_part_c are not referenced by the cells shown here
# (predictions are made on the raw frames instead).
lgb_part_a = lgb.Dataset(part_a, part_a_y)
lgb_part_b = lgb.Dataset(part_b, part_b_y)
lgb_part_c = lgb.Dataset(validation_x, validation_y)


# Base model trained on part A only; no validation set or early stopping is passed.
lgb_model = lgb.train(best_lgb_params, lgb_part_a)

In [None]:
# Meta-features from the LightGBM base model for part B, part C (validation)
# and the test set, each clipped to [0, 20].
preds_b = lgb_model.predict(part_b, num_iteration=lgb_model.best_iteration).clip(0, 20)
preds_c = lgb_model.predict(validation_x, num_iteration=lgb_model.best_iteration).clip(
    0, 20
)
preds_test = lgb_model.predict(test, num_iteration=lgb_model.best_iteration).clip(0, 20)

In [None]:
# Store in the same row order as meta_df's index: part B first, then validation.
meta_df["lgb"] = np.hstack([preds_b, preds_c])
test_meta_df["lgb"] = preds_test

In [None]:
lgb_model.save_model("lgb_classifier.txt", num_iteration=lgb_model.best_iteration)

In [None]:
# lgb_model = lgb.Booster(model_file="lgb_classifier.txt")

### SGDRegressor

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer


# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

# SGD base model trained on part A only, using best_sgd_params from the tuning step.
sgd_model = make_pipeline(preprocessor, SGDRegressor(**best_sgd_params))
sgd_model.fit(part_a, part_a_y)

In [None]:
# Meta-features from the SGD base model for part B, part C (validation) and test,
# each clipped to [0, 20].
preds_b = sgd_model.predict(part_b).clip(0, 20)
preds_c = sgd_model.predict(validation_x).clip(0, 20)
preds_test = sgd_model.predict(test).clip(0, 20)

In [None]:
# Store in the same row order as meta_df's index: part B first, then validation.
meta_df["sgd"] = np.hstack([preds_b, preds_c])
test_meta_df["sgd"] = preds_test

In [None]:
import joblib

joblib.dump(sgd_model, "sgd_model.joblib")

In [None]:
import joblib


sgd_model = joblib.load("sgd_model.joblib")

### Neural Network

In [None]:
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer


def root_mean_squared_error(y_true, y_pred):
    """Keras loss: square root of the mean squared residual over all elements."""
    residual = y_pred - y_true
    return K.sqrt(K.mean(K.square(residual)))


def make_model_1():
    """Build the network used as the stacking base model.

    One 128-unit ReLU hidden layer followed by a linear output unit; the input
    width is the number of numeric plus one-hot columns (``algo_num_cols`` /
    ``algo_cat_cols``). Compiled with ``root_mean_squared_error`` and Adam.
    """
    n_inputs = len(algo_num_cols) + len(algo_cat_cols)
    hidden = layers.Dense(128, input_dim=n_inputs, activation="relu")
    output = layers.Dense(1)
    model = models.Sequential([hidden, output])
    model.compile(loss=root_mean_squared_error, optimizer="adam")
    return model


# Min-max scale the numeric columns; one-hot columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), algo_num_cols),
        ("cat", "passthrough", algo_cat_cols),
    ]
)

# The monitoring data is handed straight to Keras, so it must be scaled exactly as
# the pipeline scales its training input. nn_model.fit(part_a, ...) refits this
# preprocessor on part_a, so fit on part_a here as well; fitting on train_x gave
# different min/max ranges (e.g. for date_block_num) than the ones used in training.
validation_scaled = preprocessor.fit(part_a).transform(validation_x)

# Base model: preprocessing followed by the Keras regressor built by make_model_1.
nn_model = make_pipeline(
    preprocessor,
    KerasRegressor(
        build_fn=make_model_1,
        epochs=100,
        batch_size=102400,
        validation_data=(validation_scaled, validation_y),
        verbose=1,
    ),
)

# Trained on part A only, like the other base models.
nn_model.fit(part_a, part_a_y)

In [None]:
# Meta-features from the neural network for part B, part C (validation) and test,
# each clipped to [0, 20].
preds_b = nn_model.predict(part_b).clip(0, 20)
preds_c = nn_model.predict(validation_x).clip(0, 20)
preds_test = nn_model.predict(test).clip(0, 20)

In [None]:
# Store in the same row order as meta_df's index: part B first, then validation.
meta_df["nn"] = np.hstack([preds_b, preds_c])
test_meta_df["nn"] = preds_test

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Random forest base model trained on part A only, with the same settings
# as the standalone random forest run.
rf_model = RandomForestRegressor(
    n_estimators=200, random_state=RANDOM_STATE, max_depth=7
)
rf_model.fit(part_a, part_a_y)

In [None]:
# Meta-features from the random forest for part B, part C (validation) and test,
# each clipped to [0, 20].
preds_b = rf_model.predict(part_b).clip(0, 20)
preds_c = rf_model.predict(validation_x).clip(0, 20)
preds_test = rf_model.predict(test).clip(0, 20)

In [None]:
# Store in the same row order as meta_df's index: part B first, then validation.
meta_df["rf"] = np.hstack([preds_b, preds_c])
test_meta_df["rf"] = preds_test

In [None]:
import joblib

joblib.dump(rf_model, "rf_model.joblib")

In [None]:
import joblib


rf_model = joblib.load("rf_model.joblib")

## Save predictions

In [None]:
meta_df.head()

In [None]:
test_meta_df.head()

In [None]:
meta_df.to_pickle("meta_df.pkl")

In [None]:
test_meta_df.to_pickle("test_meta_df.pkl")

## Meta model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


meta_features = ["lgb", "sgd", "nn", "rf"]
# Rows of month 32 are used to fit the meta-model, rows of month 33 to validate it
# (presumably part B = last training month and part C = the validation month).
part_b_meta = meta_df[meta_df.date_block_num == 32][meta_features]
part_c_meta = meta_df[meta_df.date_block_num == 33][meta_features]


alphas = np.linspace(0.01, 2, 30)
# RMSE is an error, so lower is better: start from +inf and keep the minimum.
# (Starting at 0 and keeping the maximum selected the worst alpha.)
best_score = np.inf
best_alpha = alphas[0]

for alpha in tqdm(alphas):
    clf = Ridge(alpha=alpha).fit(part_b_meta, part_b_y)
    preds = clf.predict(part_c_meta).clip(0, 20)
    score = mean_squared_error(validation_y, preds, squared=False)
    if score < best_score:
        best_score = score
        best_alpha = alpha

print("Score:", best_score)
print("Alpha:", best_alpha)

In [None]:
# Final meta-model: refit with the selected alpha on part B and part C together;
# targets are stacked in the same order as meta_df's rows (part B, then validation).
meta_clf = Ridge(alpha=best_alpha).fit(
    meta_df[meta_features], np.hstack([part_b_y, validation_y])
)

In [None]:
# Predict from the test meta-features, clip to [0, 20] and save keyed by test ID.
preds = meta_clf.predict(test_meta_df[meta_features]).clip(0, 20)
preds_df = (
    pd.DataFrame(preds, columns=["item_cnt_month"])
    .assign(ID=test.index.astype(int))
    .set_index("ID")
)
preds_df.to_csv("test_preds_stacking.csv")

Public and private LB scores are 0.933830 and 0.932091