In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import os

## Basic data preprocessing
This notebook does basic cleaning and deduplication on the competition data.
Text is also translated for subsequent usage.


### Changelog
- `v3` add shops preprocessing
- `v3` add "month num" to train data for convenience
- `v4` removing item id 6066: seems an outlier in terms of price and is not included in the test set
- `v4` added plotting to this notebook
- `v5` added revenue (just for convenience)
- `v5` saving also a monthly train dataset
- `v7` also saving the "extended" monthly dataset
- `v8` adding future shop-item pairs to the extended dataset
- `v9` introduced variable days_no_sales_beginning
- `v11` added encodings at the shop-category level
- `v15` add even more encodings
- `v16` add shops clustering
- `v19` improving basic processing + clustering based on means from "train_monthly" instead of "train_monthly_extended"
- `v21` add text features processing by Truncated SVD. ([ref](https://scikit-learn.org/stable/modules/decomposition.html#truncated-singular-value-decomposition-and-latent-semantic-analysis))

In [None]:
# Raw competition tables, read straight from the Kaggle input folder.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
cats = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# "ID" is used as the index right away, so it never has to be dropped later
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv',
                   index_col='ID')

# Text normalisation shared by every name column: lower-case, blank out
# special symbols, trim the ends.
def str_clean(x):
    """Return ``x`` lower-cased with each non-word character replaced by a blank.

    Every non-word character becomes exactly one space (runs of blanks are
    not collapsed); leading and trailing whitespace is stripped.
    """
    lowered = x.lower()
    blanked = re.sub(r'\W', ' ', lowered)
    return blanked.strip()

# Normalise the name column of each lookup table in place.
for frame, text_col in ((shops, 'shop_name'),
                        (items, 'item_name'),
                        (cats, 'item_category_name')):
    frame[text_col] = frame[text_col].map(str_clean)

In [None]:
# Add translations: each lookup table gets an extra "*_en" column taken from
# the English-translated companion dataset (joined on the table's id).
cats_translated = pd.read_csv('../input/predict-future-sales-translated-to-english/item_categories.csv')
cats_translated = cats_translated.rename(columns={"item_category_name":"item_category_name_en"})
cats = pd.merge(cats, cats_translated, on="item_category_id")

items_translated = pd.read_csv('../input/predict-future-sales-translated-to-english/items.csv', encoding='cp1252')
items_translated = items_translated.rename(columns={"item_name":"item_name_en"})
# only the id and the translated name are kept for items
items_translated = items_translated[["item_id","item_name_en"]]
items = pd.merge(items, items_translated, on="item_id")

shops_translated = pd.read_csv('../input/predict-future-sales-translated-to-english/shops.csv', encoding='cp1252')
shops_translated = shops_translated.rename(columns={"shop_name":"shop_name_en"})
shops = pd.merge(shops, shops_translated, on="shop_id")

In [None]:
# Cleaned shop names are expected to be unique
assert not shops.shop_name.duplicated().any()

# "shops" cleaning: map each duplicated shop id (key) onto the id that is kept (value)
# 58:1 is a good catch from https://www.kaggle.com/alafan/lightgbm-without-tuning
substitute_dupes = {11: 10, 40:39, 1:58, 0:57}
train["shop_id"] = train["shop_id"].replace(substitute_dupes)

# just double check: none of the replaced ids may survive in train or appear in test
for replaced_shop_id in substitute_dupes:
    assert not (test.shop_id == replaced_shop_id).any()
    assert not (train.shop_id == replaced_shop_id).any()

In [None]:
# originally seen on: https://www.kaggle.com/alafan/lightgbm-without-tuning
from sklearn.preprocessing import LabelEncoder

# The first whitespace-separated token of the cleaned shop name is used as the city
shops['city'] = [name.split()[0] for name in shops.shop_name]

# Integer code for each distinct city
le = LabelEncoder()
le.fit(shops['city'])
shops['shop_city_id'] = le.transform(shops['city'])

# is it an internet shop? (shop ids 12 and 55 are flagged)
shops['shop_is_internet'] = shops['shop_id'].isin([12,55]).astype("int64")

In [None]:
assert test.shape[0] == test.merge(shops, on="shop_id").shape[0], "Mismatch in shops preprocessing step"

In [None]:
# Most probably this was related to a previous version of the data
# Items sharing the same (cleaned name, category) are duplicate candidates.
# .copy() makes `dupes` an independent frame, so adding a column below does not
# write into a slice of `items` (avoids SettingWithCopyWarning).
dupes = items[items.duplicated(subset=['item_name','item_category_id'], keep=False)].copy()
dupes['in_test'] = dupes.item_id.isin(test.item_id.unique())
# One row per duplicated name: first/last item id plus whether each is in the test set
dupes = dupes.groupby('item_name').agg({'item_id':['first','last',"nunique"],'in_test':['first','last']})


In [None]:
# Map every duplicated item id that should disappear (key) to the id that is kept (value).
substitute_dupes = {}
if not dupes.empty:
    print("running")
    for _, dupe in dupes.iterrows():
        first_in_test = dupe.in_test["first"]
        last_in_test = dupe.in_test["last"]

        if last_in_test and first_in_test: # bad edge case!
            print("Found edge case for item, doing nothing:")
            print(dupe)
            # Skip: without this, the assignment below would run with `keep`/`repl`
            # undefined (first iteration) or left over from the previous row.
            continue

        if last_in_test:
            # only the last id is in test -> keep it
            keep = dupe.item_id["last"]
            repl = dupe.item_id["first"]
        else:
            # the first id is in test, or neither is (then the first is kept arbitrarily)
            keep = dupe.item_id["first"]
            repl = dupe.item_id["last"]

        substitute_dupes[repl] = keep
        

# Apply the item substitutions computed above (only when duplicates were found).
if not dupes.empty:
    
    # Drop the rows of the item ids that are going to be replaced
    new_items = items.set_index("item_id").drop(index=list(substitute_dupes.keys())).reset_index()
    # Safety check: every item id of the test set must still exist in the reduced item table
    check = test.groupby(["item_id"]).first().merge(new_items, on="item_id")
    assert len(test["item_id"].unique()) == check.shape[0]
    
    items = new_items
    # Re-point the replaced ids to the kept ones, in both the item table and the sales
    items["item_id"] = items["item_id"].replace(to_replace=substitute_dupes)
    train["item_id"] = train["item_id"].replace(to_replace=substitute_dupes)
    
    # Recompute duplicates: exactly one pair (2 rows) is expected to remain
    dupes = items[(items.duplicated(subset=['item_name','item_category_id'],keep=False))]
    assert dupes.shape[0] == 2 # there is exactly one edge case 


In [None]:
items.query("item_id==13011") # sanity check: this is in test

In [None]:
print("playstation stuff seems to be quite appreciated - let's add a 1hot encoded variable")
print("prefixing with 'man' to indicate that the feature has been extracted manually ")
# Match on the lower-cased English name: the tokenized column
# ("item_name_en_tokenized") is only created further down the notebook, so
# reading it here fails with a KeyError on a fresh Restart & Run All.
items["mantxt_playstation"] = (
    items["item_name_en"].str.lower().str.contains("plays", regex=False).astype(int)
)

print(f"matched {items['mantxt_playstation'].sum()} playstation items")

In [None]:
train["date"] = \
                pd.to_datetime(train["date"], format='%d.%m.%Y')

In [None]:
train["revenue"] = train["item_price"] * train["item_cnt_day"]

In [None]:
# Add month num for convenience: a 1..12 value derived from date_block_num
# (block 0 maps to 1, block 12 maps to 1 again, and so on)
train["month_num"] = 1 + train.date_block_num % 12
train.sample(3)

In [None]:
train.tail(1000).sample(10)

## A first look at sales

In [None]:
import plotly.express as px
# Sum only the plotted column: summing the whole frame also tries to add up the
# datetime "date" column, which pandas >= 2.0 rejects with a TypeError.
px.line(train.groupby("date_block_num")["item_cnt_day"].sum().reset_index(),
        x="date_block_num",
        y="item_cnt_day",
        title="Items sold - overall" )

In [None]:
print("Will start training from date block num 13, as the first 10 months are required to calculate statistics in a 1yr lookback window")
# Sum only the plotted column: summing the whole frame also tries to add up the
# datetime "date" column, which pandas >= 2.0 rejects with a TypeError.
px.line(train.groupby("date_block_num")["revenue"].sum().reset_index(),
        x="date_block_num",
        y="revenue",
        title="Revenues - overall" )

In [None]:
print("How will the target distribution of month 34 be like? It will probably be something in between month 22 (1 yr before) and month 33")
print(" - only a few items are in the shops and have 0 sales at all. We also have a pretty long tail to the right, let's check it in the next cell")
# Monthly count per (block, shop, item). Only "item_cnt_day" is summed: the other
# columns are not used here and the datetime "date" column cannot be summed on pandas >= 2.0.
check = (train.query("date_block_num in(33,22)")
              .groupby(["date_block_num","shop_id","item_id"])[["item_cnt_day"]]
              .sum()
              .reset_index())
check["item_cnt_month"] = check["item_cnt_day"].clip(0,20) # has been grouped
fig = px.histogram(check, x="item_cnt_month", color="date_block_num", title="Conjectures about target distributions", barmode="overlay")
fig.show()

# release the temporary frame
check=None

## Text features from items descriptions - preprocessing

In [None]:
assert items.duplicated(subset="item_id").sum()==0

In [None]:
print("Tokenizing here. - an alternative approach would be using ngrams (not covered).")
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Hand-picked stop words. They are compared against lower-cased tokens in
# manipulate_str, so the capitalised entry "I" can never match.
stop_words = ["per","I", "me", "the", "what", "which", "having", "for", "with", "of", "about", "but", "if", "both", "each", "any", "a"] # https://gist.github.com/sebleier/554280
stemmer = SnowballStemmer("english") # Choose a language
# Tokens are maximal runs of word characters
custom_tokenizer = RegexpTokenizer(r'\w+')

def manipulate_str(a):
    """Tokenize, stem and de-duplicate the words of ``a``.

    The text is lower-cased and split with ``custom_tokenizer``; each token is
    stemmed with ``stemmer``. A stem is kept when the original token is not in
    ``stop_words`` and the stem is longer than two characters.

    Returns the distinct stems joined by single blanks, in order of first
    appearance. (Joining a ``set`` made the order depend on Python's string
    hash seed, so the saved text changed from run to run.)
    """
    a = a.lower()
    word_list = custom_tokenizer.tokenize(a)

    stemmed_words = list()
    for w in word_list:
        sw = stemmer.stem(w)

        # the stop-word test is done on the raw token, the length test on the stem
        if w not in stop_words and len(sw) > 2:
            stemmed_words.append(sw)

    # dict.fromkeys removes duplicates while keeping a deterministic order
    return ' '.join(dict.fromkeys(stemmed_words))

items["item_name_en_tokenized"] = items.item_name_en.apply(lambda x: manipulate_str(x))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the tokenized English item names.
vectorizer = CountVectorizer(min_df=5, max_features=9000, stop_words="english") # would be 6000+ (up to 12k)
# learn the vocabulary and build the sparse count matrix in one pass
text_features = vectorizer.fit_transform(items["item_name_en_tokenized"])

text_features.shape


In [None]:
# stuff are super sparse here
text_features.sum(axis=1).mean()

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import numpy as np
np.random.seed(0)

In [None]:
# Fit a TruncatedSVD for 2, 4, ..., 20 components and report the summed
# explained-variance ratio of each fit.
# NOTE: `svd` is deliberately left bound to the LAST fit (20 components);
# the following cells reuse it for plotting and for transforming the features.
print("20 components account for about 25% of the variance -> we'll keep only those")
comps = list(range(2,22,2))
var_ratio = []  # one summed explained-variance ratio per entry of `comps`
for n_comp in comps:
    svd = TruncatedSVD(n_components=n_comp, n_iter=10, random_state=2022)
    svd.fit(text_features)
    exp_var_ratio = svd.explained_variance_ratio_.sum()
    var_ratio.append(exp_var_ratio)
    print(f"With {n_comp} components, explained var ratio is {exp_var_ratio:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Indices of the SVD components to inspect (the last one is relative to the sweep size)
components = [0, 3, 6, 11, comps[-1]-3]

f, axs = plt.subplots(len(components),1, figsize=(12, 4*len(components)))

# x-axis labels: one per vocabulary term, identical for every component
term_labels = [f"txt_{term}" for term in vectorizer.get_feature_names_out()]

for ax, component_to_viz in zip(axs, components):
    loadings = pd.DataFrame({
        "magnitude": svd.components_[component_to_viz] * 100,
        "text": term_labels,
    })
    # the 16 terms with the largest loading on this component
    strongest = loadings.sort_values("magnitude", ascending=False).head(16)
    strongest.plot.bar(x="text", y="magnitude", title=f"component {component_to_viz}", ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Project the sparse counts onto the SVD components kept by the previous sweep.
transformed_text_features = svd.transform(text_features) * 100 # finding it easier to read.
n_feats_txt = transformed_text_features.shape[1]
# The message names the target table; interpolating the `items` object itself
# dumped the whole DataFrame repr into the log line.
print(f"Merging {n_feats_txt} into items")
col_names = [f"txt_{c}" for c in range(1,n_feats_txt+1)]

# `text_features` is rebound from the sparse count matrix to a dense frame of SVD features
if not isinstance(text_features, pd.DataFrame):
    text_features = pd.DataFrame(transformed_text_features, columns=col_names)
    text_features[col_names] = text_features[col_names].astype(np.float16)

In [None]:
# Append the text-feature columns to `items`. A column-wise concat aligns rows
# on the index, so this assumes `items` carries the same 0..n-1 index as
# `text_features` (TODO confirm). Re-running this cell appends the columns again.
items = \
    pd.concat([items, text_features], axis=1)

# Plotting

Only plotting - minor data cleaning before saving

In [None]:
import seaborn as sns
import plotly.express as px

# One row per sale, enriched with the item attributes
plot_df = train.merge(items, on="item_id")
plot_df["revenues"] = plot_df.item_cnt_day * plot_df.item_price

print("Grouping by item - 'month_of_sales' describes for how long a given item has been sold")
# output column -> (source column, aggregation)
per_item_aggregations = {
    "mean_price": ("item_price", "mean"),
    "month_of_sales": ("date_block_num", "nunique"),
    "shops_distribution": ("shop_id", "nunique"),
    "revenues": ("revenues", "sum"),
    "item_category_id": ("item_category_id", "first"),
    "item_cnt_total": ("item_cnt_day", "sum"),
    "item_name_en": ("item_name_en", "first"),
}
plot_df = plot_df.groupby("item_id").agg(**per_item_aggregations)

# enforce plain numeric dtypes on the columns used for colour / size / ratios
for column, dtype in (("item_category_id", int), ("month_of_sales", int), ("revenues", float)):
    plot_df[column] = plot_df[column].astype(dtype)
plot_df["avg_items_per_month"] = plot_df["item_cnt_total"]/plot_df["month_of_sales"]


In [None]:
print("Playstations seems to be quite appreciated")
# items with non-negative revenue, excluding item 6066
bubble_data = plot_df.query("revenues>=0 and item_id != 6066").reset_index()
scatter_options = dict(x="mean_price",
                       y="month_of_sales",
                       color="item_category_id",
                       size="revenues",
                       hover_data=["item_id","item_name_en"])
f = px.scatter(bubble_data, **scatter_options)
f.show()

In [None]:
print("So, avg items per month can literally go off the chart")
# same filter as before, restricted to items with more than 30 units in total
frequent_sellers = plot_df.query("revenues>=0 and item_id != 6066 and item_cnt_total>30").reset_index()
scatter_options = dict(x="avg_items_per_month",
                       y="month_of_sales",
                       color="item_category_id",
                       size="revenues",
                       hover_data=["item_id","item_name_en"],
                       title="A few items are very big sellers - in terms of count")
f = px.scatter(frequent_sellers, **scatter_options)
f.show()

In [None]:
print("The shape is quite interesting - do items tend to perform better in early months?")
print("Batteries are definitely a best seller - along with some video games (and The Hobbit!)")
cp = plot_df.query("revenues>=0 and item_id != 6066 and item_cnt_total>30").reset_index()
# cap the x axis at 40 and size the bubbles by 1/price
cp = cp.assign(avg_items_per_month=cp["avg_items_per_month"].clip(0,40),
               price_inverse=1/cp["mean_price"])
print("So, avg items per month can literally go off the chart")
scatter_options = dict(x="avg_items_per_month",
                       y="month_of_sales",
                       size="price_inverse",
                       hover_data=["item_id","item_name_en","avg_items_per_month"],
                       title="A few items are very big sellers - in terms of count")
f = px.scatter(cp, **scatter_options)
f.show()
# release the temporary frame
cp = None

In [None]:
plot_df.query("avg_items_per_month > 750")

In [None]:
# Monthly mean price and units sold for a handful of selected items
selected_sales = train.query("item_id in(3351, 3731, 10201, 20949,9242)")
plot_series = (
    selected_sales.groupby(["item_id","date_block_num"])
                  .agg(item_price=("item_price", "mean"),
                       item_cnt_month=("item_cnt_day", "sum"))
                  .reset_index()
                  .sort_values("date_block_num")
)

px.line(plot_series, x="date_block_num", y="item_cnt_month", color="item_id", hover_data=["item_price"])

In [None]:
print("Again - playstation items (and games in general) generate a lot of volumes - while books and music are comparatively low sellers")
# Per-category summary of the per-item statistics
per_category = (
    plot_df.reset_index()
           .merge(cats, on="item_category_id")
           .groupby("item_category_id")
           .agg(avg_items_per_month=("avg_items_per_month","mean"),
                category_items=("item_id", "size"),
                item_category_name_en=("item_category_name_en","first"))
)
# keep categories with more than 10 items, best sellers first
plot_hist = per_category.query("category_items>10").sort_values("avg_items_per_month", ascending=False)

fig = px.histogram(plot_hist, x="item_category_name_en", y="avg_items_per_month")
fig.show()

In [None]:
print("But notice that the most populated categories (in terms of offering) are DVDs, BluRays and CDs")
fig = px.histogram(plot_hist, x="item_category_name_en", y="category_items")
fig.show()

# Save basic info ("train" dataset)

In [None]:
train = train.query("item_id != 6066")

In [None]:
# Write the cleaned tables to the working directory.
# `items` also contains the text features (see section above).
for frame, filename in ((items, "items.csv"),
                        (shops, "shops.csv"),
                        (cats, "item_categories.csv"),
                        (train, "sales_train.csv")):
    frame.to_csv(filename, index=False)
# the test index (ID) is written out as well
test.to_csv("test.csv")
# Suggestion: downcast everything upon reloading

In [None]:
items.sample(3)

In [None]:
shops.sample(3)

In [None]:
cats.sample(3)

In [None]:
train.sample(5)

In [None]:
test.sample(5)

# Also add monthly stats
With (`train_monthly_extended`) and without (`train_monthly`) cross-product.

Notice that `train` dataset contains only products that have been sold (or are in stock in a given shop).
However, the test set is made by the cross product of `shops` x `items`: we are asked to predict sales across all couples!

For cross validation purposes it can be convenient to replicate the "test" configuration in the train dataset, I'll call this new dataset `train_monthly_extended`.

In [None]:
print("Resampling train data to monthly")
# One row per (month block, shop, item) with monthly totals and price statistics.
train_monthly = train.groupby(["date_block_num","shop_id","item_id"])\
                     .agg(item_cnt_month=("item_cnt_day", "sum"),
                          revenue_month=("revenue","sum"),
                          days_with_sales=("date_block_num", "count"),
                          first_day_with_sales=("date", "min"),
                          last_day_with_sales=("date", "max"),
                          item_price_avg=("item_price", "mean"),
                          # was aggregated with "mean", which made it a copy of item_price_avg
                          item_price_max=("item_price", "max"),
                          item_price_min=("item_price", "min"),
                          ).reset_index()

# Earliest sale date of ANY item in the same shop and month ...
train_monthly["first_day_with_sales_in_shop"] = train_monthly.groupby(["date_block_num","shop_id"])["first_day_with_sales"].transform("min")
# ... and how many days after it this item recorded its first sale (>= 0)
train_monthly["days_no_sales_beginning"] = -(train_monthly["first_day_with_sales_in_shop"] - train_monthly["first_day_with_sales"]).dt.days

train_monthly.sample(3)

In [None]:
train_monthly

In [None]:
train_monthly.to_csv("train_monthly.csv", index=False)


In [None]:
train_monthly.sample(3)

In [None]:
print("Working on the cross-product")
from itertools import product

# For every month block: all (shop, item) combinations of the shops and items
# seen in that block or in the following one.
df_product = [] # list of np arrays
for block_num in train_monthly['date_block_num'].unique():

    # rows of the current and the next block (filtered once, used for both id sets)
    current_and_next = train_monthly.query(f"date_block_num in ({block_num}, {block_num+1})")
    cur_shops = current_and_next["shop_id"].unique()
    cur_items = current_and_next["item_id"].unique()

    if block_num == 33:
        print("Adding also future shops-item pairs")
        cur_shops = np.unique(np.concatenate( (test.shop_id.unique(), cur_shops) ))
        cur_items = np.unique(np.concatenate( (test.item_id.unique(), cur_items) ))

    # product takes iterables (size 1 permitted) as args
    block_grid = list(product(cur_shops, cur_items, [block_num]))
    df_product.append(np.array(block_grid))

df_product = pd.DataFrame(np.vstack(df_product), columns=['shop_id', 'item_id', 'date_block_num'])
df_product.head()

In [None]:
df_product.query("shop_id==5 and item_id==5320")

In [None]:
print(f"Entries in product monthly dataframe: {df_product.shape[0]}")
print(f"Difference: {df_product.shape[0] - train_monthly.shape[0]}")

In [None]:
train_monthly_extended = \
    df_product.merge(train_monthly, on=["shop_id","item_id","date_block_num"], how="left")
train_monthly_extended["item_cnt_month"] = train_monthly_extended["item_cnt_month"].fillna(0)

train_monthly_extended.sample(4)

In [None]:
train_monthly_extended.query("item_id==5320").shape

### Cluster shops according to their assortment

Will then generate "encoding" features based on shop similarities.
`n_clusters` has been chosen heuristically: shops are clustered twice, first with a low number of clusters (5: coarse information) and then with a higher one (9: fine grained, when there is data available).


In [None]:
import warnings
warnings.filterwarnings("ignore")

# Last month block included in each clustering; every clustering looks back 12 blocks.
clustering_data_to = [18, 22, 26, 30, 33]
cats["short_name"] = cats["item_category_name_en"].apply(lambda x: x.split(" ")[0])

#https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer

# num_clusters[n_clusters][val] -> frame with columns (shop_id, shop_cluster);
# shops without data in the window get cluster -1.
num_clusters = {}
for n_clusters in [5,9]:
    clustering_validated = {}
    for i, val in enumerate(clustering_data_to):
        build_shop_stats = train.merge(items, on="item_id").query(f"date_block_num<={val} and date_block_num>={val-12}").merge(cats, on="item_category_id")
        # per (shop, category): assortment size, mean daily count, mean revenue
        build_shop_stats = build_shop_stats.groupby(["shop_id","item_category_id"]).agg(
                                  num_items_unique=("item_id","nunique"),
                                  item_cnt_mean=("item_cnt_day","mean"),
                                  revenue_mean=("revenue","mean"))
        # rescale by the overall mean so both features are on a comparable scale
        build_shop_stats["num_items_unique"] = build_shop_stats["num_items_unique"]/(build_shop_stats["num_items_unique"].mean())
        build_shop_stats["revenue_mean"] = build_shop_stats["revenue_mean"]/(build_shop_stats["revenue_mean"].mean())

        # one row per shop, one column per (statistic, category); missing pairs become 0
        train_shop_dset = \
                    build_shop_stats.reset_index().pivot(index="shop_id",columns=["item_category_id"]).fillna(0)

        if i == 1:
            # Diagnostic only: elbow plot for choosing k.
            # Its labels must NOT be stored in train_shop_dset: the previous
            # version wrote them to a "shop_cluster" column here, which was then
            # fed as an extra input feature to the real KMeans fit below.
            model = KMeans()
            visualizer = KElbowVisualizer(model, k=(3,16))
            visualizer.fit(train_shop_dset)        # Fit the data to the visualizer
            visualizer.show()        # Finalize and render the figure

        print("True model is fitted here")
        model = KMeans(n_clusters=n_clusters) # good compromise :)
        model.fit(train_shop_dset)  
        train_shop_dset["shop_cluster"] = model.predict(train_shop_dset)  
        clustered = shops.merge(train_shop_dset["shop_cluster"], on="shop_id", how="left")[["shop_id","shop_cluster"]]
        clustered["shop_cluster"] = clustered["shop_cluster"].fillna(-1)

        clustering_validated[val] = clustered.copy(deep=True)

        print(f"{val} done ---")
        num_clusters[n_clusters] = clustering_validated

In [None]:
train_monthly = \
                train_monthly.merge(items[["item_id","item_category_id"]], on="item_id", how="inner")

In [None]:
print("Applying clustering")
# Each clustering (fitted on data up to block `val`) labels the blocks
# (previous val, val]; the first one labels blocks 0..18.
for n_clusters in [5,9]:
    train_monthly[f"shop_cluster_{n_clusters}"] = np.nan
    for i, val in enumerate(clustering_data_to):
        clustered = num_clusters[n_clusters][val].set_index("shop_id")
        # Start right AFTER the previous window's last block. Starting AT it made
        # the windows overlap, so blocks 18/22/26/30 were silently overwritten
        # with the clustering fitted on later data.
        start_from = 0 if i == 0 else clustering_data_to[i-1] + 1
        indexes = train_monthly.query(f"date_block_num>={start_from} and date_block_num<={val}").index
        train_monthly.loc[indexes, f"shop_cluster_{n_clusters}"] = clustered.loc[train_monthly.loc[indexes].shop_id].shop_cluster.to_numpy()

In [None]:
train_monthly.query("date_block_num==33")[["shop_cluster_5","shop_cluster_9"]].sample(10)

In [None]:
match_shops_to_clusters = \
    train_monthly.groupby(["shop_id","date_block_num"]).agg(shop_cluster_9=("shop_cluster_9","first"),
                                                            shop_cluster_5=("shop_cluster_5","first"))

len_before = train_monthly_extended.shape[0]
train_monthly_extended = train_monthly_extended.merge(match_shops_to_clusters, on=["shop_id","date_block_num"], how="left")
assert train_monthly_extended.shape[0] == len_before

In [None]:
assert "shop_cluster_9" in train_monthly_extended.columns
assert "shop_cluster_5" in train_monthly_extended.columns

train_monthly_extended[["shop_cluster_9","shop_cluster_5"]] = \
            train_monthly_extended[["shop_cluster_9","shop_cluster_5"]].fillna(-1)

In [None]:
len_before = train_monthly_extended.shape[0]
train_monthly_extended = \
                train_monthly_extended.merge(items[["item_id","item_category_id"]], on="item_id", how="inner")
assert train_monthly_extended.shape[0] == len_before

In [None]:
# Mean monthly count per (block, cluster), (block, item, cluster) and
# (block, category, cluster), computed on train_monthly and joined onto the
# extended grid. Names of the created columns are collected in `new_features`.
new_features = []
for c in ["shop_cluster_5","shop_cluster_9"]:
    print(f"Processing {c}")
    for c_name, grp_keys in zip([f"{c}_encoded",f"{c}_item_encoded",f"{c}_cat_encoded"],
                                [["date_block_num", c], ["date_block_num","item_id", c], ["date_block_num","item_category_id", c]]):

        # plain .mean(): the first positional argument of GroupBy.mean is
        # `numeric_only`, so the former .mean("mean") passed a string as that flag
        a = train_monthly.groupby(grp_keys)[["item_cnt_month"]]\
                         .mean().astype(np.float16).rename(columns={"item_cnt_month": c_name})

        len_before = train_monthly_extended.shape[0]
        train_monthly_extended = train_monthly_extended.merge(a.reset_index(), on=grp_keys, how="left")

        # grid rows without a matching group get the sentinel -1
        num_nans = train_monthly_extended[c_name].isna().sum()
        if num_nans > 0:
            print(f"Filling {num_nans} entries for feature {c_name}")
            train_monthly_extended[c_name] = train_monthly_extended[c_name].fillna(-1)
        # a left join on unique group keys must not add rows
        assert len_before == train_monthly_extended.shape[0]
        new_features.append(c_name)

In [None]:
sns.histplot(
             train_monthly_extended[["shop_cluster_9_cat_encoded","shop_cluster_5_cat_encoded"]].sample(8000).clip(0,10)
            )

In [None]:
print("Feature corr")
sns.pairplot(
            train_monthly_extended.query("date_block_num>20")[["shop_cluster_5_encoded","shop_cluster_5_cat_encoded","shop_cluster_9_cat_encoded","item_cnt_month","date_block_num"]].sample(6000).clip(0,34),
            hue="date_block_num"
            )

In [None]:
# Fail loudly when no encoding was produced. A bare `raise` outside an `except`
# block only yields "RuntimeError: No active exception to reraise".
if len(new_features) == 0:
    raise ValueError("No cluster-encoding features were generated")

# Top-level last expression, so the notebook actually displays the sample
# (an expression nested inside an `if` block is not rendered).
train_monthly_extended[new_features].sample(10)

In [None]:
print("Check if the clustering is plausible")
train_monthly.query("date_block_num==33").groupby("shop_id")[["shop_cluster_9"]].first().merge(train.groupby("shop_id")["revenue"].mean(), on="shop_id").sort_values("revenue", ascending=True)

### Add `item_cnt_month` encodings for categories
"item_cnt_month" is probably the most precious information we have. Let's try to make the most out of it by creating different types of encodings.

In [None]:
print("I don't like target encoding as it can easily generate leakages, let's encode 'item_cnt_month' instead")
cats["item_category_name_splitted"] = cats.item_category_name_en.apply(lambda x: x.split("-")[0])
cats["item_category_name_short"] = cats.item_category_name_en.apply(lambda x: x[:7])

train_monthly_cat_stats = train_monthly.copy()

In [None]:
def downcast_feats(df, float_dtype="float16", int_dtype="int32"):
    """Downcast the 64-bit numeric columns of ``df`` to smaller dtypes.

    ``float64`` columns are cast to ``float_dtype`` and ``int64`` columns to
    ``int_dtype``; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE and also returned.
    float_dtype : str, default "float16"
        Target dtype for float64 columns. float16 cannot represent magnitudes
        above 65504 (they become ``inf``); pass "float32" for such columns.
    int_dtype : str, default "int32"
        Target dtype for int64 columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for c in df.columns:
        if df[c].dtype=="float64":
            df[c] = df[c].astype(float_dtype)
        elif df[c].dtype=="int64":
            df[c] = df[c].astype(int_dtype)
    return df

In [None]:
if "item_category_id" not in train_monthly_cat_stats.columns:
    print("Also merging items")
    train_monthly_cat_stats = train_monthly_cat_stats.merge(items, on="item_id").merge(cats, on="item_category_id")
else:
    train_monthly_cat_stats = train_monthly_cat_stats.merge(cats, on="item_category_id")

In [None]:
train_monthly_cat_stats = train_monthly_cat_stats[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month','item_category_name_splitted','revenue_month','item_category_name_short']]

In [None]:
train_monthly_cat_stats = train_monthly_cat_stats.set_index(['date_block_num', 'shop_id', 'item_id'])

In [None]:
def _window_means(stats, group_keys, blocks, time_window, col_name):
    """Mean ``item_cnt_month`` per group over a trailing window, for each block in ``blocks``."""
    per_block = []
    for d in blocks:
        # trailing window (d - time_window, d]: the current block d is included
        window = stats.query(f"date_block_num<={d} and date_block_num>{d-time_window}")
        means = (window.reset_index()
                       .groupby(group_keys[1:])["item_cnt_month"]
                       .mean()
                       .rename(col_name)
                       .reset_index())
        means["date_block_num"] = d
        per_block.append(means)
    return pd.concat(per_block, axis=0).set_index(group_keys)[col_name]


def get_stats(train_monthly_cat_stats, feature_prefix="category_shop", group_keys = ["date_block_num", "item_category_name_short", "shop_id"], time_windows=[12,5,1]):
    """
    Rolling mean of ``item_cnt_month`` per group, for several trailing windows.

    For every month block ``d`` found in the data and every window ``w`` in
    ``time_windows``, the mean is taken over the blocks ``d-w+1 .. d``. The
    current block ``d`` is INCLUDED, so lag the features before using them as
    predictors for block ``d``.

    Parameters
    ----------
    train_monthly_cat_stats : pandas.DataFrame
        Monthly data exposing ``date_block_num``, ``item_cnt_month`` and the
        grouping keys, either as columns or as index levels.
    feature_prefix : str
        Inserted in the output column names:
        ``mean_item_cnt_{feature_prefix}_prev_{w}mo``.
    group_keys : list of str
        Grouping keys; the first one must be ``"date_block_num"``.
    time_windows : list of int
        Window lengths in blocks. The first window defines the rows of the
        result, so it must be the widest one. Groups missing from a narrower
        window get NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_keys``, one column per time window (no other columns).

    Raises
    ------
    ValueError
        If ``time_windows`` is empty or its first entry is not the largest.
    """
    assert group_keys[0] == "date_block_num"
    if len(time_windows) == 0:
        raise ValueError("time_windows must contain at least one window")
    if time_windows[0] != max(time_windows):
        raise ValueError("the first entry of time_windows must be the widest window")

    # Every block present in the data; nothing is assumed about the last block number
    blocks = train_monthly_cat_stats.reset_index().date_block_num.unique()

    df = None
    for i,time_window in enumerate(time_windows):
        print(f"Processing i={i}, tw={time_window}")
        col_name = f"mean_item_cnt_{feature_prefix}_prev_{time_window}mo"
        means = _window_means(train_monthly_cat_stats, group_keys, blocks, time_window, col_name)
        if df is None:
            # the first (widest) window fixes the rows of the result
            df = means.to_frame()
        else:
            # aligned on the index: groups absent from this window become NaN
            df[col_name] = means
    return df




In [None]:
stat_cat_shop = get_stats(train_monthly_cat_stats,
                          feature_prefix="category_shop",
                          group_keys = ["date_block_num", "item_category_name_short", "shop_id"],
                          time_windows=[12,5,1])
stat_cat_shop =  downcast_feats(stat_cat_shop.reset_index())

In [None]:
sns.histplot(stat_cat_shop[["mean_item_cnt_category_shop_prev_5mo","mean_item_cnt_category_shop_prev_1mo"]].clip(0,50))

In [None]:
stat_cat = get_stats(train_monthly_cat_stats, feature_prefix="category", group_keys = ["date_block_num", "item_category_name_short"], time_windows=[3])
stat_cat =  downcast_feats(stat_cat.reset_index())

sns.histplot(stat_cat[["mean_item_cnt_category_prev_3mo"]].clip(0,50))

In [None]:
train_monthly_cat_stats

In [None]:
item_stats = get_stats(train_monthly_cat_stats, feature_prefix="item", group_keys = ["date_block_num", "item_id"], time_windows=[12,3,1])
item_stats =  downcast_feats(item_stats.reset_index())


sns.histplot(item_stats[["mean_item_cnt_item_prev_12mo","mean_item_cnt_item_prev_3mo","mean_item_cnt_item_prev_1mo"]].clip(0,7))

In [None]:
shop_stats = get_stats(train_monthly_cat_stats, feature_prefix="shop", group_keys = ["date_block_num", "shop_id"], time_windows=[6,3,1]) # would probably be better to build median
shop_stats =  downcast_feats(shop_stats.reset_index())

#### Merge category encodings

In [None]:
# item_id -> short category name, used to expand category-level stats to item level
lookup = items.merge(cats, on="item_category_id")[["item_id","item_category_name_short"]]
# Also drop the unnamed helper column "b" that get_stats may leave behind:
# carried through the successive merges it ends up as duplicated b_x / b_y columns.
right = stat_cat_shop.merge(lookup, on="item_category_name_short")\
                     .drop(columns=["item_category_name_short", "b"], errors="ignore")

size_before = train_monthly_extended.shape[0]
train_monthly_extended = \
train_monthly_extended.merge(right,
                             on=["date_block_num", "item_id", "shop_id"], how="left")
# a left join on unique keys must not add rows
assert size_before == train_monthly_extended.shape[0]
right = None

In [None]:
right = stat_cat_shop.merge(lookup, on="item_category_name_short")
right

In [None]:
# item_id -> short category name, used to expand category-level stats to item level
lookup = items.merge(cats, on="item_category_id")[["item_id","item_category_name_short"]]
# Also drop the unnamed helper column "b" that get_stats may leave behind:
# carried through the successive merges it ends up as duplicated b_x / b_y columns.
right = stat_cat.merge(lookup, on="item_category_name_short")\
                .drop(columns=["item_category_name_short", "b"], errors="ignore")

size_before = train_monthly_extended.shape[0]
train_monthly_extended = \
train_monthly_extended.merge(right,
                             on=["date_block_num", "item_id"], how="left")
# a left join on unique keys must not add rows
assert size_before == train_monthly_extended.shape[0]
right = None

In [None]:
# Also drop the unnamed helper column "b" that get_stats may leave behind:
# carried through the successive merges it ends up as duplicated b_x / b_y columns.
right = item_stats.merge(lookup, on="item_id")\
                  .drop(columns=["item_category_name_short", "b"], errors="ignore")
size_before = train_monthly_extended.shape[0]

train_monthly_extended = \
train_monthly_extended.merge(right,
                             on=["date_block_num", "item_id"], how="left")
# a left join on unique keys must not add rows
assert size_before == train_monthly_extended.shape[0]
right = None

In [None]:
# Drop the unnamed helper column "b" that get_stats may leave behind: merging it
# once more collides with the b / b_x / b_y columns created by the previous
# merges (pandas >= 2.0 raises a MergeError on such duplicated suffixed columns).
right = shop_stats.drop(columns=["b"], errors="ignore")
size_before = train_monthly_extended.shape[0]

train_monthly_extended = \
train_monthly_extended.merge(right,
                             on=["date_block_num", "shop_id"], how="left")
# a left join on unique keys must not add rows
assert size_before == train_monthly_extended.shape[0]
right = None

In [None]:
train_monthly_extended.columns

# Save the results, ready for training

These results are potentially ready as is, but you may want to add time lags to some of the generated features.
I found it convenient to add them only in a second notebook.

In [None]:
train_monthly_extended.sample(6)

In [None]:
cols = ["item_cnt_month", "revenue_month", "days_with_sales", "days_no_sales_beginning"]
train_monthly_extended[cols] = train_monthly_extended[cols].fillna(0)

In [None]:
train_monthly_extended.columns

In [None]:
train_monthly_extended.to_csv("train_monthly_extended.csv", index=False)
