In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
import plotly.express as px
import time
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
mse = mean_squared_error

def downcast(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink the 64-bit numeric columns of ``df`` in place to save memory.

    ``float64`` columns become ``float32``. ``int64`` columns become ``int16``
    when every value fits; otherwise ``int32`` is tried, and the column is left
    as ``int64`` if that is still too narrow. A blind cast to ``int16`` would
    silently wrap values outside [-32768, 32767].

    Args:
        df: Frame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, so the call can be chained or re-assigned.
    """
    for name in list(df.columns):
        column = df[name]
        if column.dtype == "float64":
            df[name] = column.astype("float32")
        elif column.dtype == "int64":
            if column.empty:
                # Nothing to overflow; use the narrowest type.
                df[name] = column.astype("int16")
                continue
            lowest, highest = column.min(), column.max()
            # Pick the narrowest signed type that can hold the observed range.
            for candidate in (np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[name] = column.astype(candidate)
                    break
    return df

def lag_feature(df: pd.DataFrame, lag: int, col: str, merge_cols, fill_value=-10, suffix=""):
    """Attach the group mean of ``col`` from ``lag`` periods earlier as a new column.

    The mean of ``col`` is computed per ``merge_cols`` group, the group's
    ``date_block_num`` is shifted forward by ``lag`` and the result is
    left-joined back onto ``df``. A row at block ``t`` therefore receives the
    value observed at block ``t - lag``; a negative ``lag`` pulls in a future
    value instead.

    Args:
        df: Source frame; must contain ``col`` and every column in ``merge_cols``.
        lag: Number of ``date_block_num`` steps to shift by (may be negative).
        col: Column whose lagged group mean is added.
        merge_cols: Grouping / join keys. Any iterable of column names is
            accepted; it must include ``'date_block_num'``.
        fill_value: Replacement for rows that have no lagged counterpart.
        suffix: Inserted between ``col`` and ``_lag{lag}`` in the new column name.

    Returns:
        Tuple ``(df, new_col_name)``. ``df`` is a new merged frame when the
        column had to be created; if the column already existed, the input
        frame is reused and only the fill/cast step is applied to it.

    Raises:
        KeyError: If ``'date_block_num'`` is not one of ``merge_cols``.
    """
    merge_cols = list(merge_cols)  # tolerate tuples / Index objects
    if 'date_block_num' not in merge_cols:
        raise KeyError("merge_cols must contain 'date_block_num', the column that is shifted by `lag`")

    new_col_name = f'{col}{suffix}_lag{lag}'

    # Only pay for the group-by and merge when the feature does not exist yet.
    if new_col_name not in df.columns:
        lagged = (
            df[merge_cols + [col]]
            .groupby(merge_cols)
            .agg({col: 'mean'})
            .reset_index()
        )
        lagged.columns = merge_cols + [new_col_name]
        lagged['date_block_num'] += lag
        df = pd.merge(df, lagged, on=merge_cols, how='left')

    df[new_col_name] = df[new_col_name].fillna(fill_value).astype('float32')

    return df, new_col_name

In [None]:
# Basic preprocessing is done in the corresponding notebook; this cell only loads its outputs.
train = pd.read_csv('../input/data-preprocessing/sales_train.csv')
shops = pd.read_csv('../input/data-preprocessing/shops.csv')
items = pd.read_csv('../input/data-preprocessing/items.csv')
cats = pd.read_csv('../input/data-preprocessing/item_categories.csv')
test  = pd.read_csv('../input/data-preprocessing/test.csv').set_index("ID")

# downcast() rewrites the columns of the frame it is given, so the module-level
# names above see the smaller dtypes without having to be re-bound.
dataframes = [train, shops, items, cats]
for frame in dataframes:
    downcast(frame)

In [None]:
# Quick look at a few raw rows; seeded so the displayed rows are the same on every run.
train.sample(3, random_state=2022)

In [None]:
# tme stands for "Train Monthly Extended", i.e. the cross product of date_block_num x shop_id x item_id
tme  = pd.read_csv('../input/data-preprocessing/train_monthly_extended.csv')
tme = downcast(tme)

# Force a plain integer dtype on the count / index columns.
# astype(int) raises if any of them contains NaN.
cast_cols = ["item_cnt_month","days_with_sales","date_block_num"]
tme[cast_cols] = tme[cast_cols].astype(int)
# Cap the monthly count to the [0, 50] range.
tme["item_cnt_month"] =  tme["item_cnt_month"].clip(0,50)

# Seeded so the preview shows the same rows on every run.
tme.sample(4, random_state=2022)

In [None]:
# do this after lagging
# Days with sales vs. monthly count on a 7000-row sample; seeded so the figure is reproducible.
sns.scatterplot(data=tme.query("item_cnt_month>0 and days_with_sales>=0").sample(7000, random_state=2022),
                x="days_with_sales", y="item_cnt_month", alpha=.5)

In [None]:
print("> Building features based on the lags of item_cnt_month")
cols = [] # stores the new features
for lag in [1,2,3,4,6,12]:
    t = time.process_time()

    print(f"Processing lag {lag} - filling strategy is for decision trees")
    # Missing lags stay NaN (fill_value=np.nan) instead of receiving a sentinel value.
    tme, new_col = lag_feature(tme, lag, "item_cnt_month", ['date_block_num', 'shop_id', 'item_id'], fill_value=np.nan)
    elapsed_time = time.process_time() - t  # CPU time, not wall-clock time
    print(f"   -- {new_col} took {round(elapsed_time,1)}")
    cols.append(new_col)


print("> Building (renaming) target")
# Remove a target left over from an earlier run of this cell; otherwise the rename
# below would produce a second column that is also called "target".
tme = tme.drop(columns=["target"], errors="ignore")
# lag=-1 pulls in the following block's item_cnt_month for the same shop/item.
# Rows with no following block get a target of 0 (fill_value).
tme, new_col = lag_feature(tme, -1, "item_cnt_month", ['date_block_num', 'shop_id', 'item_id'], fill_value=0)
tme = tme.rename(columns={new_col: "target"})


In [None]:
# Shop-level lags: mean item_cnt_month per (date_block_num, shop_id); columns carry the "s" suffix.
shop_keys = ['date_block_num', 'shop_id']
for lag in (1, 2, 3, 4, 12):
    started = time.process_time()
    print(f"Processing lag {lag} - filling strategy is for decision trees")
    tme, new_col = lag_feature(tme, lag, "item_cnt_month", shop_keys, fill_value=np.nan, suffix="s")
    cpu_seconds = time.process_time() - started
    print(f"   -- {new_col} took {round(cpu_seconds,1)}")
    cols.append(new_col)

In [None]:
# Item-level lags: mean item_cnt_month per (date_block_num, item_id); columns carry the "i" suffix.
item_keys = ['date_block_num', 'item_id']
for lag in (1, 2, 3, 4, 12):
    started = time.process_time()
    print(f"Processing lag {lag} - filling strategy is for decision trees")
    tme, new_col = lag_feature(tme, lag, "item_cnt_month", item_keys, fill_value=np.nan, suffix="i")
    cpu_seconds = time.process_time() - started
    print(f"   -- {new_col} took {round(cpu_seconds,1)}")
    cols.append(new_col)

In [None]:
# Level and trend features derived from the item-shop lags of item_cnt_month.
tme["avg_item_3mo"] = ((tme["item_cnt_month_lag1"] + tme["item_cnt_month_lag2"] + tme["item_cnt_month_lag3"])/3).astype(np.float16)
tme["diff_1yr"] = ((tme["item_cnt_month_lag1"] - tme["item_cnt_month_lag12"])).astype(np.float16)

# Ratios between lags ("roc_a_b" = lag a / lag b). A zero denominator gives inf
# (or NaN for 0/0) and a missing lag gives NaN.
tme["roc_1_2"] = (tme["item_cnt_month_lag1"]/tme["item_cnt_month_lag2"])
# Previously computed as lag1/lag3, which did not match the column name.
tme["roc_2_3"] = (tme["item_cnt_month_lag2"]/tme["item_cnt_month_lag3"])
tme["roc_1_4"] = (tme["item_cnt_month_lag1"]/tme["item_cnt_month_lag4"])
tme["roc_1_12"] = (tme["item_cnt_month_lag1"]/tme["item_cnt_month_lag12"])

# Last two months minus the two months before them.
tme["diff_12_34"] = tme["item_cnt_month_lag1"] + tme["item_cnt_month_lag2"] \
                    -tme["item_cnt_month_lag3"] - tme["item_cnt_month_lag4"]

# 1..12 position of the block within a 12-block cycle.
tme["month_num"] = ((1 + (tme["date_block_num"]) % 12)).astype(np.uint8)
# Signed dtype: the difference can be negative, and an unsigned cast would wrap it
# into a large positive number.
tme["daydiff"] = (tme["days_no_sales_beginning"] - tme["days_with_sales"]).astype(np.int16)

In [None]:
print("How long does a given item stay in a shop?")
# Blocks elapsed since the item first appears anywhere in tme, capped to [0, 25].
first_seen = tme.groupby('item_id')['date_block_num'].transform('min')
tme['item_age'] = (tme['date_block_num'] - first_seen).astype('int8').clip(0, 25)
# Same, counted from the item's first appearance in this particular shop.
first_seen_in_shop = tme.groupby(['item_id',"shop_id"])['date_block_num'].transform('min')
tme['item_age_in_shop'] = (tme['date_block_num'] - first_seen_in_shop).astype('int8').clip(0, 25)

# NOTE(review): despite the "80" in the name, this flag is a fixed threshold of 1000 on item_price_avg.
tme["price_greater_than80"] = (tme["item_price_avg"]>1000).astype(int)
# Price range within the month relative to the average price.
tme["price_var_within_month"] = (tme["item_price_max"] - tme["item_price_min"])/tme["item_price_avg"]
tme["price_max_avg_within_month"] = (tme["item_price_max"] - tme["item_price_avg"])

# Item-level lags of the relative price range; columns carry the "p" suffix.
for lag in [1,2,3]:
    t = time.process_time()
    print(f"Processing lag {lag} - filling strategy is for decision trees")
    tme, new_col = lag_feature(tme, lag, "price_var_within_month", ['date_block_num', 'item_id'], fill_value=np.nan, suffix="p")
    elapsed_time = time.process_time() - t
    print(f"   -- {new_col} took {round(elapsed_time,1)}")
    cols.append(new_col)

# Seeded so the preview shows the same rows on every run.
tme.sample(4, random_state=2022)

In [None]:
# Checkpoint: write tme as it stands at this point to a pickle in the current working directory.
tme.to_pickle("tme_with_feature_eng.pkl")

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Hand-picked stop words, checked against the lower-cased tokens in manipulate_str.
# https://gist.github.com/sebleier/554280
stop_words = [
    "per", "I", "me", "the", "what", "which", "having", "for", "with",
    "of", "about", "but", "if", "both", "each", "any", "a",
]
# Choose a language
stemmer = SnowballStemmer("english")
# Tokens are maximal runs of word characters.
custom_tokenizer = RegexpTokenizer(r'\w+')

def manipulate_str(a):
    """Turn a text into a space-separated string of its unique stems.

    The text is lower-cased and split with ``custom_tokenizer``; each token is
    stemmed with ``stemmer``. A stem is kept when the un-stemmed token is not
    in ``stop_words`` and the stem is longer than two characters.

    Duplicates are removed while keeping first-occurrence order, so the result
    is identical on every run. Joining a ``set`` gave an order that could
    change between interpreter sessions.

    Args:
        a: Text to normalise; must support ``.lower()``.

    Returns:
        The kept stems joined by single spaces (empty string if none is kept).
    """
    tokens = custom_tokenizer.tokenize(a.lower())

    unique_stems = {}  # dict used as an insertion-ordered set
    for token in tokens:
        if token in stop_words:
            continue
        stem = stemmer.stem(token)
        if len(stem) > 2:
            unique_stems[stem] = None

    return ' '.join(unique_stems)

# Stemmed, de-duplicated version of every English item name.
items["item_name_en_tokenized"] = items["item_name_en"].apply(manipulate_str)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words over the stemmed item names, limited to 40 terms that occur in at least 15 names.
vectorizer = CountVectorizer(
    min_df=15,
    max_features=40,  # would be 6000+ (up to 12k)
    stop_words="english",
)
# Learn the vocabulary and build the item x term count matrix in one step.
text_features = vectorizer.fit_transform(items["item_name_en_tokenized"])

text_features.shape

In [None]:
# One "txt_<term>" column per vocabulary term.
col_names = ["txt_" + str(term) for term in vectorizer.get_feature_names_out()]
# Convert only once: on a re-run text_features is already a DataFrame.
already_converted = type(text_features) is pd.DataFrame
if not already_converted:
    text_features = pd.DataFrame.sparse.from_spmatrix(text_features, columns=col_names)



In [None]:
# Attach the term-count columns to items. Columns with the same names left over from
# an earlier run of this cell are dropped first, so re-running does not duplicate them.
# Rows are matched on the index of the two frames.
items = pd.concat(
    [items.drop(columns=text_features.columns, errors="ignore"), text_features],
    axis=1,
)

In [None]:
# Column means of the term-count frame, i.e. the average count of each kept term per item name.
text_features.mean()

In [None]:

# 1 when the English item name contains "delivery" (case-insensitive), else 0.
items["txt_delivery"] = (
    items["item_name_en"]
    .str.lower()
    .str.contains("delivery", regex=False)
    .astype(int)
)


In [None]:
# From here on `cols` holds the txt_* column names (it no longer lists the lag features).
cols = [name for name in items.columns if name.startswith("txt")]
# Bring the txt_* columns of each item onto every tme row of that item.
tme = pd.merge(tme, items[[*cols, "item_id"]], how="left", on="item_id")

In [None]:
# Overwrite each txt_* column with the mean item_cnt_month of the rows sharing its
# (date_block_num, column value) pair. The column is replaced, so running this
# cell twice groups by the already-encoded values.
for c in cols:
    print(f"Processing {c}")
    group_mean = tme.groupby(["date_block_num", c])["item_cnt_month"].transform("mean")
    tme[c] = group_mean.astype(np.float16)

In [None]:
# Display the frame (pandas truncates the output) to eyeball the result of the steps above.
tme

In [None]:
# Checkpoint: write tme, now including the txt_* columns, to a pickle in the current working directory.
tme.to_pickle("tme_with_feature_eng_and_text.pkl")

In [None]:
# List every column currently in tme (handy when choosing the entries of use_feats).
print(tme.columns)

# Test run below

In [None]:
# Columns fed to the model. dict.fromkeys() removes the repeated entries
# ('item_cnt_month_lag6', 'price_greater_than80') while keeping the written order,
# so the feature order is the same on every run. list(set(...)) gave an order
# that could change between kernel sessions.
use_feats = list(dict.fromkeys([
    "price_var_within_month", 'mean_item_cnt_shop_prev_3mo', 'mean_item_cnt_shop_prev_1mo',
    'mean_item_cnt_shop_prev_6mo',
    'item_cnt_month_lag1', 'item_cnt_month_lag2', 'item_cnt_month_lag3',
    'item_cnt_month_lag6', 'item_cnt_month_lag12',
    'item_cnt_months_lag1', 'item_cnt_months_lag2', 'item_cnt_months_lag3', "item_cnt_month_lag4", "item_cnt_month_lag6",
    'item_cnt_monthi_lag1', 'item_cnt_monthi_lag2', 'item_cnt_monthi_lag3', 'item_cnt_monthi_lag4', 'item_cnt_monthi_lag12',
    'avg_item_3mo', 'diff_1yr', 'roc_1_2', 'roc_2_3', 'month_num',
    'daydiff', 'item_age', 'item_age_in_shop', 'price_greater_than80',
    'mean_item_cnt_item_prev_12mo', 'mean_item_cnt_category_shop_prev_12mo',
    'item_cnt_month', 'mean_item_cnt_category_shop_prev_5mo',
    'mean_item_cnt_category_shop_prev_1mo',
    'price_max_avg_within_month', 'price_var_within_monthp_lag1',
    'price_var_within_monthp_lag2', 'price_var_within_monthp_lag3',
    'roc_1_4', 'roc_1_12', 'diff_12_34', 'price_greater_than80',
]))

In [None]:
# Chained validation: for every cut-off in `val`, train on blocks [start_train, cut-off)
# and predict the blocks from the cut-off up to just before the next cut-off.
val = [18, 25, 27, 30, 32, tme.date_block_num.max()]
scores = {}  # cut-off block -> {"train": rmse, "val": rmse, "test": rmse}
start_train = 13
tme["est_sales"] = np.nan
num_boost_round = 100
# NOTE(review): 'n_estimators' is a LightGBM alias for the number of boosting iterations
# and may take precedence over num_boost_round below - confirm which value is intended.
params = \
        {'lr': 0.12, 'max_depth': 10, 'n_estimators': 200, 'bagging_fraction': 0.8,
         'min_data_in_leaf': 20, 'max_bin': 200, 'initial_sales_clipping': 50,
         'num_leaves': 1022, 'bagging_freq': 5, 'objective': 'mse'}

params["num_leaves"] = min(2 ** params["max_depth"] - 2, 4096)
params["bagging_freq"] = 5
params["objective"] = "mse"
params["learning_rate"] = params["lr"]
import lightgbm as lgb

# Disabled by default; switch the guard to 1 to run the training loop.
if 0:
    last_block = tme.date_block_num.max()
    for i, validate_to in enumerate(val):
        train_rows = tme.query(f"date_block_num>={start_train} and date_block_num < {validate_to}")
        X_train = train_rows[use_feats].copy(deep=True)
        y_train = train_rows["target"].copy(deep=True).clip(0,50) # arbitrary

        X_val = X_train.sample(int(X_train.shape[0]*.15), random_state=2022) # used for early stopping
        y_val = y_train.loc[X_val.index]

        X_train = X_train.drop(X_val.index)
        y_train = y_train.drop(y_val.index)

        lgb_train_dataset = lgb.Dataset(X_train, y_train)
        lgb_eval_dataset = lgb.Dataset(X_val, y_val)

        # NOTE(review): the verbose_eval / early_stopping_rounds keywords are only accepted by
        # older LightGBM releases; newer ones expect callbacks - confirm the installed version.
        model = lgb.train(
                          params, 
                          lgb_train_dataset,
                          num_boost_round=num_boost_round,
                          valid_sets=(lgb_train_dataset, lgb_eval_dataset), 
                          feature_name = use_feats,
                          verbose_eval=5, 
                          early_stopping_rounds = 10
                         )

        if validate_to == last_block or (i+1)==len(val):
            next_val = last_block
        else:
            next_val = val[i+1] - 1

        print(f"Generating test predictions from {validate_to} to {next_val}")
        condition = f"date_block_num >= {validate_to} and date_block_num<={next_val}"
        test_rows = tme.query(condition)
        X_test = test_rows[use_feats]
        y_test = test_rows["target"].copy(deep=True)

        tme.loc[X_test.index, "est_sales"] = model.predict(X_test) # not clipping

        train_rmse = mean_squared_error(model.predict(X_train).clip(0,20), y_train.clip(0,20)) ** 0.5
        val_rmse = mean_squared_error(model.predict(X_val).clip(0,20), y_val.clip(0,20)) ** 0.5
        test_rmse = mean_squared_error(model.predict(X_test).clip(0,20), y_test.clip(0,20)) ** 0.5
        scores[validate_to] = {"train": train_rmse, "val": val_rmse, "test": test_rmse}

        # The message used to print num_boost_round instead of the cut-off block.
        print(f"Validating to {validate_to}")
        print(f"Train rmse: {train_rmse:.3f}")
        print(f"Val   rmse: {val_rmse:.3f}")
        # Skipped for the last block (previously hard-coded as 33).
        if validate_to != last_block:
            print(f"Test  rmse: {test_rmse:.3f}")



In [None]:
# Disabled by default; needs `model` and est_sales from the training loop.
if 0:
    chained_cols = ["date_block_num","item_id","shop_id","est_sales","target"]
    tme[chained_cols].to_pickle("chained_predictions.pkl")
    # Printed as "<importance>: <feature name>".
    for feature, importance in zip(model.feature_name(), model.feature_importance()):
        print(f"{importance}: {feature}")

In [None]:
# Relative price range vs. target on a 10000-row sample; seeded so the figure is reproducible.
tme.sample(10000, random_state=2022).plot(x="price_var_within_month", y="target", kind="scatter", alpha=.4)

# Submission section

In [None]:
# Disabled by default; needs est_sales filled in by the training loop.
if 0:
    last_month = tme.query("date_block_num==33")
    print(last_month[["est_sales"]].clip(0,20).mean())
    print(last_month[["item_cnt_month"]].clip(0,20).mean())

    # assign() builds a new frame, so nothing is written into a slice of tme
    # (the earlier in-place column assignment triggered SettingWithCopyWarning).
    preds = last_month[["shop_id","item_id","est_sales"]].assign(
        est_sales=lambda frame: frame["est_sales"].clip(0, 20)
    )

    print(preds.mean())
    len_before = test.shape[0]

    sample_submission = (
        test.reset_index()
        .merge(preds, on=["shop_id","item_id"])
        .set_index("ID")
        .rename(columns={"est_sales":"item_cnt_month"})
    )

    # The inner merge drops test rows without a prediction; fail loudly if that happens.
    assert sample_submission.shape[0] == len_before

    # A bare expression inside `if` is never displayed, so print it explicitly.
    print(sample_submission.item_cnt_month.mean())
    sample_submission[["item_cnt_month"]].to_csv('../working/submission.csv')