# Load data

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
from hyperopt import fmin, tpe, hp, STATUS_OK
from functools import partial
import numpy as np
np.set_printoptions(suppress=True)
import cudf
from datetime import datetime, date
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Official competition files: daily sales history, the (shop, item) pairs to
# predict, and the submission template.
train = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
sub = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")

# Supplementary tables: calendar, item categories, USD/RUB rates and
# translated shop metadata.
calendar = pd.read_csv("../input/predict-future-sales-supplementary/calendar.csv")
item_cat = pd.read_csv("../input/predict-future-sales-supplementary/item_category.csv")
currency = pd.read_csv("../input/predict-future-sales-supplementary/usd-rub.csv")
shops = pd.read_csv("../input/predict-future-sales-supplementary/shops-translated.csv")

# Display the raw training frame.
train

# Sanity check

In [None]:
# Remove exact duplicate rows first, then keep only rows with a strictly
# positive daily count and a strictly positive price.
train = train.drop_duplicates()
has_positive_count = train["item_cnt_day"] > 0
has_positive_price = train["item_price"] > 0
train = train[has_positive_count & has_positive_price]

Detect duplicate shops

In [None]:
# Print the names of each suspected duplicate pair so they can be compared by eye.
for suspected_pair in ([0, 57], [1, 58], [40, 39]):
    print(shops[shops.shop_id.isin(suspected_pair)]['Name'])

In [None]:
# Map each duplicate shop id onto the id it is merged into, in both the
# training and the test frame.
duplicate_shop_ids = {0: 57, 1: 58, 40: 39}
for duplicate_id, canonical_id in duplicate_shop_ids.items():
    for frame in (train, test):
        frame.loc[frame.shop_id == duplicate_id, 'shop_id'] = canonical_id

In [None]:
# Switches read by the feature-engineering cells below; each key turns one
# optional step on or off.
feature_generation = {
    # Row filtering / outlier handling
    "use_2014": False,
    "ignore_price_outlier": True,
    "ignore_item_cnt_outlier": True,
    "last_n_month_sales": False,
    "clip_price": False,
    "clip_item_cnt_day": False,
    "clip_item_cnt_month": True,
    # Derived features
    "month_quarter_season_sin_cos": True,
    "add_mul_decompose": True,
    "item_cat": True,
    "shop_cat": True,
    "freq_encoding": False,
    "is_month_holiday": True,
    "most_popular_item": True,
    "last_price_number": True,
    "item_price_stats": True,
}

# Columns handed to CatBoost as categorical; later cells append to this list.
cat_features = ["shop_id", "item_id"]
# Category columns collected by later cells.
encode_cols = []

# Feature Engineering

## Use from 2014 only
Intuition: ignoring old data may help

In [None]:
# Optionally discard the first 12 monthly blocks (date_block_num 0-11).
if feature_generation.get("use_2014"):
    train = train.loc[train["date_block_num"] >= 12]

## Outlier Removal/Winsorization
Intuition: extreme prices and daily counts are rare and can distort the model, so drop or clip them.

In [None]:
# Price: either drop rows above the 90th percentile, or (only if dropping is
# disabled) cap prices at that percentile. The 10th percentile is reported
# but not used for filtering.
if feature_generation["ignore_price_outlier"]:
    price_bounds = train["item_price"].quantile([0.1, 0.9])
    min_price, max_price = price_bounds
    train = train.loc[train["item_price"] <= max_price]
    print("Ignore item price outlier")
    print(f"Min price: {min_price}, Max price: {max_price}")
elif feature_generation["clip_price"]:
    price_bounds = train["item_price"].quantile([0.1, 0.9])
    min_price, max_price = price_bounds
    train["item_price"] = train["item_price"].clip(upper=max_price)
    print("Clip price")
    print(f"Min price: {min_price}, Max price: {max_price}")

# Daily count: clipping takes precedence here; dropping rows above the 90th
# percentile is only done when clipping is disabled.
if feature_generation["clip_item_cnt_day"]:
    max_cnt = train["item_cnt_day"].quantile(0.9)
    train["item_cnt_day"] = train["item_cnt_day"].clip(lower=0, upper=max_cnt)
    print("Clip item_cnt_day")
    print(f"Max item cnt day: {max_cnt}")
elif feature_generation["ignore_item_cnt_outlier"]:
    max_cnt = train["item_cnt_day"].quantile(0.9)
    train = train.loc[train["item_cnt_day"] <= max_cnt]
    print("Ignore item_cnt outlier")
    print(f"Max item cnt day: {max_cnt}")

## Monthly Groupby

In [None]:
# Keep only (shop, item) pairs that also appear in the test set; the test
# "ID" column that comes along with the join is not needed.
train = train.merge(test, on=["shop_id", "item_id"], how="inner").drop("ID", axis=1)

# Aggregate on the GPU to one row per (month, shop, item): mean price,
# summed count and the smallest "date" value of the group.
monthly_keys = ["date_block_num", "shop_id", "item_id"]
monthly_aggregations = {"item_price": "mean", "item_cnt_day": "sum", "date": "min"}
train_gpu = cudf.from_pandas(train)
train_gpu = train_gpu.groupby(monthly_keys).agg(monthly_aggregations).reset_index()
train = train_gpu.to_pandas()

# The summed daily count is the monthly target.
train = train.rename(columns={"item_cnt_day": "item_cnt_month"})
train

## Clip item_cnt_month in range [0, 20]

In [None]:
# Bound the monthly target to the [0, 20] range named in the heading.
if feature_generation.get("clip_item_cnt_month"):
    train["item_cnt_month"] = train["item_cnt_month"].clip(lower=0, upper=20)
train

## Holiday features
Intuition: people tend to buy more during holidays (and actually January sees peak sales)

In [None]:
# Attach the number of holidays in each month to train and test.
if feature_generation.get("is_month_holiday"):
    calendar = pd.read_csv("../input/predict-future-sales-supplementary/calendar.csv")
    calendar["date"] = pd.to_datetime(calendar.date, format = "%Y-%m-%d")
    # Sort by date BEFORE factorizing. The previous code factorized a sorted
    # copy and assigned the codes positionally to the unsorted rows, which
    # mislabels months whenever the file is not already in date order.
    calendar = calendar.sort_values("date", kind="mergesort").reset_index(drop=True)
    calendar['month'] = pd.DatetimeIndex(calendar['date']).month
    # Consecutive month index in chronological order.
    # NOTE(review): assumes the calendar starts in the month that is
    # date_block_num 0 in the sales data - confirm against calendar.csv.
    calendar['date_block_num'] = calendar['date'].dt.to_period('M').factorize()[0]

    # Holidays per monthly block, computed once and reused for both frames.
    monthly_holidays = calendar.groupby("date_block_num")["holiday"].sum().reset_index()

    train = train.merge(monthly_holidays, on="date_block_num")
    # The test rows are treated as block 34 for the lookup only.
    test["date_block_num"] = 34
    test = test.merge(monthly_holidays, on="date_block_num")
    test = test.drop("date_block_num", axis=1)
train

## Fill item_price in test set with the corresponding item_id/shop_id pair in train set. Fill NaNs with mean price of all items.

In [None]:
# Impute item_price for the test rows, then append them to the training rows.
#
# Priority: mean price of the (shop, item) pair -> mean price of the item
# across shops -> mean of the per-item mean prices.
# The previous version filled missing values with 0 and ADDED the pair mean
# and the item mean, so every pair present in train got roughly twice its
# real price.
train_gpu = cudf.from_pandas(train)
shop_item_price = train_gpu.groupby(["shop_id", "item_id"]).agg({"item_price": "mean"}).reset_index()
shop_item_price = shop_item_price.to_pandas()
item_price = train_gpu.groupby(["item_id"]).agg({"item_price": "mean"}).reset_index()
item_price = item_price.to_pandas()
del train_gpu

# Left joins keep one output row per test row because the right-hand keys
# are unique (they come from a groupby).
tr = pd.merge(
    test,
    shop_item_price.rename(columns={"item_price": "shop_item_price"}),
    how="left",
    on=["shop_id", "item_id"],
)
tr = pd.merge(
    tr,
    item_price.rename(columns={"item_price": "item_mean_price"}),
    how="left",
    on="item_id",
)
fallback_price = item_price["item_price"].mean()
imputed_price = tr["shop_item_price"].fillna(tr["item_mean_price"]).fillna(fallback_price)
# Assign by position so the result does not depend on test's index labels.
test["item_price"] = imputed_price.to_numpy()

test = test.drop("ID", axis=1)
# Test rows are block 34; the date string uses the same dd.mm.YYYY format
# that the cyclic-feature cell parses.
test["date_block_num"] = 34
test["date"] = "01.11.2015"
train_new = pd.concat([train.sort_values("date"), test], axis=0, ignore_index=True)
del train
del tr
del item_price
del shop_item_price
train_new

## Lagged features
Intuition: a (shop, item) pair's sales in the previous months may help predict the current month

In [None]:
# Add last_{p}_month_sales: the pair's item_cnt_month exactly p monthly
# blocks earlier (NaN when the pair has no row for that month).
#
# The previous version grouped by (date_block_num, shop_id, item_id), which
# has a single row per group after the monthly aggregation, so shift() always
# produced NaN. A self-join on a shifted block number gives a true p-month
# lag even when a pair has gaps in its history.
if feature_generation.get("last_n_month_sales"):
    lag_keys = ["date_block_num", "shop_id", "item_id"]
    periods = [1, 2]
    # Only rows with a known target can serve as a lag source.
    monthly_sales = train_new.loc[train_new["item_cnt_month"].notna(), lag_keys + ["item_cnt_month"]]
    for p in periods:
        lagged = monthly_sales.rename(columns={"item_cnt_month": f"last_{p}_month_sales"})
        # Sales observed in block b become the lag-p feature of block b + p.
        lagged["date_block_num"] = lagged["date_block_num"] + p
        train_new = train_new.merge(lagged, how="left", on=lag_keys)
train_new

## Cyclic features: Month, Quarter, Season
Intuition: the sudden change in month from Dec to Jan (from 12 to 1) may confuse the model

In [None]:
# Calendar features plus sin/cos encodings of month, quarter and season.
if feature_generation.get("month_quarter_season_sin_cos"):
    train_new["date"] = pd.to_datetime(train_new.date, format = "%d.%m.%Y")
    train_new["year"] = pd.DatetimeIndex(train_new["date"]).year
    train_new['month'] = pd.DatetimeIndex(train_new['date']).month
    train_new['quarter'] = pd.DatetimeIndex(train_new['date']).quarter
    # Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
    train_new['season'] = train_new.month % 12 // 3 + 1

    # (source column, length of its cycle)
    cycles = (("month", 12), ("quarter", 4), ("season", 4))
    for name, cycle_length in cycles:
        angle = 2 * np.pi * train_new[name] / cycle_length
        for suffix, func in (("sin", np.sin), ("cos", np.cos)):
            # sin/cos live in [-1, 1]. The previous clip to [0, 1] zeroed
            # every negative value, making half of the cycle
            # indistinguishable; clip only guards against rounding noise.
            encoded = np.clip(func(angle).values, -1.0, 1.0)
            train_new[f"{name}_{suffix}"] = encoded.astype("float16")

train_new

## Seasonal decompose features


In [None]:
# Additive and multiplicative trend/seasonal components of item_price,
# computed over the rows in their current order with a period of 52.
if feature_generation.get("add_mul_decompose"):
    from statsmodels.tsa.seasonal import seasonal_decompose

    decompose_cols =  ["item_price"]

    for col in decompose_cols:
        for model_name, suffix in (("additive", "add"), ("multiplicative", "mul")):
            # NOTE(review): newer statsmodels releases name this argument
            # `period` instead of `freq` - confirm against the installed version.
            decomp = seasonal_decompose(train_new[col], freq=52, model=model_name, extrapolate_trend='freq')
            train_new[f"{col}_trend_{suffix}"] = decomp.trend
            train_new[f"{col}_seasonal_{suffix}"] = decomp.seasonal

    # Use a STABLE sort: every test row carries the same date, and the
    # submission is later filled by position, so rows with equal dates must
    # keep their relative order. The default sort kind is not stable.
    train_new = train_new.sort_values("date", kind="mergesort").reset_index(drop=True)
train_new

## More info about Shop and Item features
Intuition: shop type, item category might help

In [None]:
# Join item category and shop metadata columns and register them as
# categorical features.
if feature_generation.get("item_cat"):
    train_new = train_new.merge(item_cat.drop("item_name_translated", axis=1), how="inner", on="item_id").reset_index(drop=True)
    for col in ("item_cat1", "item_cat2"):
        cat_features.append(col)
        encode_cols.append(col)
    # Missing item_cat2 values get the most frequent item_cat2.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form may act on a temporary copy
    # and leave the frame unchanged.
    most_common_cat2 = train_new["item_cat2"].value_counts().idxmax()
    train_new["item_cat2"] = train_new["item_cat2"].fillna(most_common_cat2)
if feature_generation["shop_cat"]:
    train_new = train_new.merge(shops.drop("Name", axis=1), how="inner", on="shop_id").reset_index(drop=True)
    for col in ("City", "Type"):
        cat_features.append(col)
        encode_cols.append(col)
train_new

## Jan seems to have peak sales among all years
Intuition: January shows the highest sales in every year, so rows that fall in January get their own flag

In [None]:
# Binary flag: 1 for rows whose month is January, 0 otherwise; the flag is
# also registered as a categorical feature.
if feature_generation.get("is_month_holiday"):
    is_january = train_new["month"] == 1
    train_new["is_month_holiday"] = 0
    train_new.loc[is_january, "is_month_holiday"] = 1
    cat_features.append("is_month_holiday")
train_new

## Most popular ItemID of itemCat1, itemCat2, ShopID

In [None]:
# For every item_cat1, item_cat2 and shop_id, record the item_id of the row
# with the highest item_cnt_month in that group, and register each new column
# as categorical.
if feature_generation.get("most_popular_item"):
    # Rows without a target (the test rows) cannot define popularity.
    known_sales = train_new[train_new["item_cnt_month"].notna()]
    # Fallback for groups that have no row with a known target: the item of
    # the single best-selling row overall. The previous fallback used
    # groupby(...).count().idxmax(), which returns a category/shop label and
    # therefore wrote a non-item value into an item-id column.
    fallback_item_id = known_sales.loc[known_sales["item_cnt_month"].idxmax(), "item_id"]

    # (grouping column, name of the feature derived from it)
    popularity_features = (
        ("item_cat1", "most_popular_itemCat1_itemid"),
        ("item_cat2", "most_popular_itemCat2_itemid"),
        ("shop_id", "most_popular_shop_item"),
    )
    for group_col, feature_name in popularity_features:
        best_row_labels = known_sales.groupby(group_col)["item_cnt_month"].idxmax()
        best_items = known_sales.loc[best_row_labels, [group_col, "item_id"]]
        best_items = best_items.rename(columns={"item_id": feature_name})
        train_new = train_new.merge(best_items, how="left", on=group_col)
        # Assign back rather than using a chained in-place fillna, which may
        # operate on a temporary copy.
        train_new[feature_name] = train_new[feature_name].fillna(fallback_item_id)
        cat_features.append(feature_name)

train_new

## item_price statistics features using groupby of various column combinations

In [None]:
# Broadcast min/max/mean/median/std of item_price within several groupings
# back onto every row, as columns named "<grouping columns joined by _>_<stat>".
if feature_generation.get("item_price_stats"):
    stats = ["min", "max", "mean", "median", "std"]
    groupby_methods = [
        ["shop_id", "item_id"], ["item_cat1", "item_id"], ["item_cat2", "item_id"], ["item_cat1", "item_cat2", "item_id"],
        ["shop_id", "item_cat1", "item_id"], ["shop_id", "item_cat2", "item_id"], ["shop_id", "item_cat1", "item_cat2", "item_id"]]
    for method in groupby_methods:
        # The name and the grouped prices do not depend on the statistic,
        # so build them once per grouping.
        method_name = "_".join(method)
        grouped_price = train_new.groupby(method)["item_price"]
        for stat in stats:
            values = grouped_price.transform(stat)
            # Missing statistics (e.g. std of a single-row group) get the
            # column mean. Assign the filled series directly: a chained
            # fillna(inplace=True) may act on a temporary copy.
            train_new[f"{method_name}_" + stat] = values.fillna(values.mean())
train_new
        

## item_price 99, 49 features
Intuition: prices often end in "psychological" digits (e.g. 1099, 109, 499), so exposing the trailing digits of the price may give the model a useful signal.

In [None]:
# last_price_number: the price with its leading digit stripped, rounded to
# an integer - e.g. 1099 -> 99, 499 -> 99, 109 -> 9. Prices below 10 are kept
# as they are, and prices of 10000 or more are reduced modulo 10000.
#
# Vectorized replacement for the former row-by-row loop, which relied on
# Series.iteritems (no longer available in current pandas).
# Prices must be non-null here; the integer cast raises on NaN.
if feature_generation.get("last_price_number"):
    price = train_new["item_price"]
    # Modulus per row, chosen by the price's order of magnitude. Rows below
    # 10 also match the first condition but are excluded by where() below.
    modulus = np.select(
        [price < 100, price < 1000, price < 10000],
        [10, 100, 1000],
        default=10000,
    )
    trailing = price.where(price < 10, price % modulus)
    # np.rint rounds halves to the nearest even integer, like round().
    train_new["last_price_number"] = np.rint(trailing).astype("int64")
train_new

## Frequency Encoding

In [None]:
# Replace each listed categorical column by "<col>_FreqEnc": the share of
# rows in train_new that carry the same value.
if feature_generation.get("freq_encoding"):
    encoded_cols = ["shop_id", "item_id",
                    "item_cat1", "item_cat2", "City", "Type", 
                    "most_popular_itemCat1_itemid","most_popular_itemCat2_itemid", 
                    "most_popular_shop_item"]
    # The row count does not change inside the loop.
    n_rows = len(train_new)
    for col in encoded_cols:
        # Some of these columns only exist when their own feature flag is
        # enabled; skip the ones that were never created instead of raising.
        if col not in train_new.columns:
            continue
        encoding = train_new.groupby(col).size() / n_rows
        train_new[col + "_FreqEnc"] = train_new[col].map(encoding)
        train_new = train_new.drop(col, axis=1)
        # The raw column is gone, so it must no longer be declared categorical.
        if col in cat_features:
            cat_features.remove(col)
train_new

# Training Part

In [None]:
# Helper columns that must not reach the model.
cols_to_drop = ["date", "month", "quarter", "season"]

# Blocks 0-33 are the labelled history; block 34 holds the rows to predict.
# errors="ignore" because month/quarter/season only exist when the
# cyclic-feature step is enabled; without it the drop raised KeyError.
tr = train_new[train_new.date_block_num <= 33].drop(cols_to_drop, axis=1, errors="ignore")
test = train_new[train_new.date_block_num == 34].drop(cols_to_drop, axis=1, errors="ignore")

del train_new

In [None]:
# Print the column summary of the training frame.
tr.info()

In [None]:
# Show the columns that will be passed to CatBoost as categorical.
cat_features

In [None]:
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error

# run_train trains on date_block_num < MONTHS and validates on >= MONTHS.
MONTHS = 33

# Upper bound on boosting iterations for the tuning runs and the final model.
ITERATIONS = 20000
# Number of hyperopt trials.
MAX_EVALS = 75
# Passed as early_stopping_rounds: one tenth of the iteration budget.
EARLY_STOP = ITERATIONS // 10
# Seed for the hyperopt search.
rstate = np.random.RandomState(42) 

def run_train(df, df_test, cat_features):
    """Tune a CatBoost regressor with hyperopt, train it and predict the test rows.

    Rows of ``df`` with ``date_block_num < MONTHS`` are used for fitting and
    rows with ``date_block_num >= MONTHS`` for validation / early stopping.

    Fixes compared with the previous version:

    * the grow-policy check used ``'LossGuide'`` while the search space
      yields ``'Lossguide'``, so ``max_leaves`` was never passed while tuning;
    * the test pool kept ``date_block_num``, a column the training pool
      drops, so train and test feature sets did not match;
    * the "fine-tuning" model was given itself as ``init_model`` and then
      refit from scratch on the validation month alone, so predictions came
      from a model that had only seen one month. The prediction model is now
      refit on training + validation rows for the number of iterations
      selected by early stopping;
    * the caller's frames are no longer modified by the string casts.

    Args:
        df: Labelled frame containing ``date_block_num``, ``item_cnt_month``
            and the feature columns.
        df_test: Frame to predict, with the same columns as ``df``.
        cat_features: Names of the columns CatBoost should treat as categorical.

    Returns:
        Tuple ``(df_test, model)``: the test frame (index reset,
        ``item_cnt_month`` removed) with a ``predictions`` column, and the
        early-stopped model fitted on the training rows.
    """
    # reset_index returns new frames, so the casts below stay local.
    df = df.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    for cat in cat_features:
        df[cat] = df[cat].astype(str)
        df_test[cat] = df_test[cat].astype(str)

    non_feature_cols = ["date_block_num", "item_cnt_month"]
    is_train = df.date_block_num < MONTHS
    is_val = df.date_block_num >= MONTHS
    X_tr = df[is_train].drop(non_feature_cols, axis=1)
    y_tr = df.loc[is_train, "item_cnt_month"]
    X_val = df[is_val].drop(non_feature_cols, axis=1)
    y_val = df.loc[is_val, "item_cnt_month"]

    # Features for prediction must match the training columns exactly.
    X_test = df_test.drop(non_feature_cols, axis=1)
    # The returned frame keeps date_block_num, as before.
    df_test = df_test.drop(["item_cnt_month"], axis=1)

    train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)
    test_pool = Pool(X_test, cat_features=cat_features)
    integer_params = ['depth', 'min_data_in_leaf', 'max_bin']

    def objective_func(params, train_pool, val_pool):
        """Fit one candidate configuration and report its validation RMSE."""
        # hyperopt's quniform yields floats; CatBoost expects integers here.
        for param in integer_params:
            params[param] = int(params[param])
        # The grow-policy choice is a nested dict that may carry max_leaves.
        policy = params['grow_policy']
        if policy['grow_policy'] == 'Lossguide':
            params['max_leaves'] = int(policy['max_leaves'])
        params['grow_policy'] = policy['grow_policy']

        params['fold_len_multiplier'] = max(params['fold_len_multiplier'], 1)

        print(params)
        model = CatBoostRegressor(iterations=ITERATIONS, loss_function="RMSE", task_type="GPU", devices='0:1',
                                  eval_metric="RMSE", verbose=False, **params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=EARLY_STOP)
        loss = model.get_best_score()["validation"]["RMSE"]
        del model
        print(loss)
        return {"loss": loss, "status": STATUS_OK}

    CB_MAX_DEPTH = 11
    LEB = ['No', 'AnyImprovement', 'Armijo']
    grow_policy = [{'grow_policy': 'SymmetricTree'},
                   {'grow_policy': 'Depthwise'},
                   {'grow_policy': 'Lossguide',
                    'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
    space = {
        'depth': hp.quniform('depth', 7, CB_MAX_DEPTH, 1),
        'max_bin': hp.quniform('max_bin', 1, 35, 1),
        # NOTE(review): hp.loguniform takes its bounds in log space, so this
        # samples from [e^1, e^10]; fold_len_multiplier below passes
        # np.log(...) bounds instead - confirm which range is intended.
        'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 1, 10),
        # NOTE(review): check that CatBoost accepts min_data_in_leaf together
        # with the SymmetricTree policy.
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 25, 1),
        'learning_rate': hp.uniform('learning_rate', 0.05, 1),
        'leaf_estimation_backtracking': hp.choice('leaf_estimation_backtracking', LEB),
        'grow_policy': hp.choice('grow_policy', grow_policy),
        'fold_len_multiplier': hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
    }
    fn = partial(objective_func, train_pool=train_pool, val_pool=val_pool)
    print("Tuning hyperparams...")
    best_params = fmin(fn=fn, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, rstate=rstate)

    # fmin reports hp.choice parameters as indices into their option lists.
    best_params['grow_policy'] = grow_policy[best_params['grow_policy']]['grow_policy']
    best_params['leaf_estimation_backtracking'] = LEB[best_params['leaf_estimation_backtracking']]

    for param in integer_params:
        best_params[param] = int(best_params[param])
    # Only present when the best trial used the Lossguide policy.
    if 'max_leaves' in best_params:
        best_params['max_leaves'] = int(best_params['max_leaves'])

    # Settings shared by the validated model and the final prediction model.
    final_settings = dict(loss_function="RMSE", custom_metric="RMSE", task_type="GPU", devices='0:1',
                          verbose=1000)

    model = CatBoostRegressor(iterations=ITERATIONS, **final_settings, **best_params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=EARLY_STOP, plot=True)

    # Refit on training + validation rows so the validation month also
    # informs the predictions, using the iteration count picked by early
    # stopping (there is no held-out set left to stop on).
    best_iteration = model.get_best_iteration()
    final_iterations = ITERATIONS if best_iteration is None else best_iteration + 1
    full_pool = Pool(pd.concat([X_tr, X_val]), pd.concat([y_tr, y_val]), cat_features=cat_features)
    model2 = CatBoostRegressor(iterations=final_iterations, **final_settings, **best_params)
    model2.fit(full_pool, plot=True)

    df_test["predictions"] = model2.predict(test_pool)

    return df_test, model

In [None]:
# Tune, train and predict; df_test comes back with a "predictions" column.
df_test, model = run_train(tr, test, cat_features)
df_test

In [None]:
# Write the submission file with predictions clipped to [0, 20].
clipped_predictions = df_test["predictions"].clip(0, 20).to_numpy()
# Predictions are matched to submission rows by position. Fail loudly on a
# row-count mismatch instead of letting index alignment write NaN rows.
if len(clipped_predictions) != len(sub):
    raise ValueError(
        f"Expected {len(sub)} predictions for the submission, got {len(clipped_predictions)}"
    )
sub["item_cnt_month"] = clipped_predictions
sub.to_csv("submission.csv", index=False)
sub.head(30)