In [1]:
import gc
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.tseries.offsets import Day, MonthBegin, MonthEnd
import json
import zipfile
import os
from sklearn.preprocessing import OrdinalEncoder
import warnings
import lightgbm as lgbm
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings("ignore")

In [None]:
api_token = {"username":"watsons","key":"dc7da47ca9aa5e696b65f97c16fd627b"}
if not os.path.exists("/root/.kaggle"):
    os.makedirs("/root/.kaggle")
 
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json
 
if not os.path.exists("/kaggle"):
    os.makedirs("/kaggle")
os.chdir('/kaggle')
!kaggle competitions download -c competitive-data-science-predict-future-sales
!unzip \*.zip  && rm *.zip

In [3]:
def reduce_mem_usage(df, silent=True, allow_categorical=True, float_dtype="float32"):
  """Downcast the columns of a DataFrame (or a single Series) to smaller dtypes.

  Numeric data is downcast to the smallest integer dtype that holds it; data
  that remains floating point is cast to ``float_dtype``. Sparse and datetime
  data is returned untouched.

  Args:
    df: DataFrame or Series to shrink. A DataFrame is modified in place and
      also returned; a Series is not modified, a new Series is returned.
    silent: when exactly ``False``, print memory usage before and after.
    allow_categorical: when true, non-numeric columns are left as they are;
      otherwise they are replaced by their integer ``factorize`` codes.
    float_dtype: dtype used for columns that stay floating point.

  Returns:
    The downcast DataFrame or Series.
  """
  def _downcast_numeric(series, allow_categorical=allow_categorical):
    """Return ``series`` downcast to the smallest suitable dtype."""
    if isinstance(series.dtype, pd.SparseDtype):
      return series
    if not pd.api.types.is_numeric_dtype(series.dtype):
      if pd.api.types.is_datetime64_any_dtype(series.dtype) or allow_categorical:
        return series
      # Replace the values by integer codes, then shrink those codes.
      codes, _ = series.factorize()
      return _downcast_numeric(pd.Series(data=codes, index=series.index))
    series = pd.to_numeric(series, downcast="integer")
    if pd.api.types.is_float_dtype(series.dtype):
      series = series.astype(float_dtype)
    return series

  verbose = silent is False
  if verbose:
    start_mem = np.sum(df.memory_usage()) / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
  if df.ndim == 1:
    df = _downcast_numeric(df)
  else:
    for col in df.columns:
      # Assign through df[col]: `df.loc[:, col] = ...` writes into the existing
      # column and keeps its old (wide) dtype on pandas >= 2.0, which would
      # silently discard the downcast.
      df[col] = _downcast_numeric(df[col])
  if verbose:
    end_mem = np.sum(df.memory_usage()) / 1024 ** 2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
  return df

def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=False):
  """Downcast only the columns of ``matrix`` that are not listed in ``oldcols``.

  Args:
    matrix: DataFrame to shrink; its columns are replaced in place.
    oldcols: column labels that were already downcast, or ``None`` to process
      every column.
    allow_categorical: forwarded to ``reduce_mem_usage``.

  Returns:
    ``(matrix, oldcols)`` where ``oldcols`` is the full column index after the
    call, to be passed back in on the next invocation.
  """
  if oldcols is not None:
    newcols = matrix.columns.difference(oldcols)
  else:
    newcols = matrix.columns
  # Replace each column individually: a block assignment through
  # `matrix.loc[:, newcols] = ...` writes into the existing columns and keeps
  # their original dtypes on pandas >= 2.0, so nothing would actually shrink.
  for col in newcols:
    matrix[col] = reduce_mem_usage(matrix[col], allow_categorical=allow_categorical)
  oldcols = matrix.columns  # This is used to track which columns have already been downcast
  return matrix, oldcols

In [4]:
# Read the raw competition tables from the current working directory.
items, shops, train, test = map(
    pd.read_csv,
    ("items.csv", "shops.csv", "sales_train.csv", "test.csv"),
)

In [6]:
# Parse the day.month.year date strings, then keep only rows that
#  - belong to a shop that appears in the test set,
#  - have a price strictly between 0 and 50000,
#  - have a daily count below 1000.
train["date"] = pd.to_datetime(train["date"], format="%d.%m.%Y")
in_test_shops = train["shop_id"].isin(test["shop_id"].unique())
sane_price = train["item_price"].gt(0) & train["item_price"].lt(50000)
sane_count = train["item_cnt_day"].lt(1000)
train = train.loc[in_test_shops & sane_price & sane_count, :]

In [7]:
def create_testlike_train(sales_train, test=None):
  """Build the monthly (date_block_num, shop_id, item_id) training grid.

  For every month the grid holds the cartesian product of the shops and the
  items that have at least one row in that month. Monthly aggregates of the
  daily sales are joined on; combinations without sales get 0.

  Args:
    sales_train: daily sales with columns ``date_block_num``, ``shop_id``,
      ``item_id``, ``item_price`` and ``item_cnt_day``. An ``item_revenue_day``
      column (price * count) is added to this frame in place.
    test: optional frame with ``shop_id`` and ``item_id`` columns. Its rows are
      appended as month 34 with all aggregates set to 0. It is not modified.

  Returns:
    DataFrame with the three key columns plus ``item_cnt_month``,
    ``item_revenue_month``, ``item_cnt_std`` and ``item_cnt_count``, on a fresh
    unique index.
  """
  keys = ["date_block_num", "shop_id", "item_id"]
  indexlist = []
  for i in sales_train.date_block_num.unique():
    month_sales = sales_train.loc[sales_train.date_block_num == i]
    x = itertools.product([i], month_sales.shop_id.unique(), month_sales.item_id.unique())
    indexlist.append(np.array(list(x)))
  df = pd.DataFrame(data=np.concatenate(indexlist, axis=0), columns=keys)

  # Deliberately written to the caller's frame, as before: code outside this
  # function may read the column from it.
  sales_train["item_revenue_day"] = sales_train["item_price"] * sales_train["item_cnt_day"]
  sales_train_grouped = sales_train.groupby(keys).agg(
    item_cnt_month=pd.NamedAgg(column="item_cnt_day", aggfunc="sum"),
    item_revenue_month=pd.NamedAgg(column="item_revenue_day", aggfunc="sum"),
    # "std" is the sample standard deviation (ddof=1); spelling it out avoids
    # depending on how pandas dispatches a raw numpy callable.
    item_cnt_std=pd.NamedAgg(column="item_cnt_day", aggfunc="std"),
    item_cnt_count=pd.NamedAgg(column="item_cnt_day", aggfunc="count"),
  )

  df = df.merge(sales_train_grouped, how="left", on=keys)

  if test is not None:
    # Work on a derived frame so the caller's test table stays untouched.
    test_rows = test[["shop_id", "item_id"]].assign(date_block_num=34)[keys]
    # ignore_index prevents the test rows from re-using index labels 0..n-1.
    df = pd.concat([df, test_rows], ignore_index=True)

  agg_cols = ["item_cnt_month", "item_revenue_month", "item_cnt_std", "item_cnt_count"]
  df[agg_cols] = df[agg_cols].fillna(0)
  return df

In [None]:
# Assemble the monthly grid (train months plus the test month) and attach each
# item's category.
item_categories = items[['item_id', 'item_category_id']]
matrix = create_testlike_train(train, test).merge(item_categories, how='left', on='item_id')
# Remember which columns exist before downcasting so later calls to
# shrink_mem_new_cols only touch columns added afterwards.
oldcols = matrix.columns
matrix = reduce_mem_usage(matrix, silent=False)

In [9]:
def add_time_features(m, train):
  """Add calendar and recency features to the monthly matrix.

  Columns added to the returned frame: ``month_length``, ``item_amt``,
  ``shop_item_amt``, ``last_shop_item_sale_date``, ``last_shop_item_sale_days``,
  ``first_buy``, ``month``, ``year``, ``item_age`` and ``new_item``.

  Args:
    m: monthly matrix with ``date_block_num``, ``shop_id`` and ``item_id``.
    train: daily sales with a datetime ``date`` column and the same key columns.
      It is not modified; month-34 placeholder rows are appended to a local copy.

  Returns:
    A new DataFrame with the feature columns added.
  """
  # Give month 34 (the test month) a date so its calendar features can be derived.
  dummies = m.loc[m.date_block_num == 34, ["date_block_num", "shop_id", "item_id"]]
  dummies = dummies.assign(date=pd.to_datetime("2015-11-30"), item_price=1, item_cnt_day=0, item_revenue_day=0,)
  train = pd.concat([train, dummies])
  del dummies

  # Snap the observed first/last sale date of each month to the calendar month bounds.
  month_last_day = train.groupby("date_block_num").date.max().rename("month_last_day")
  month_last_day[~month_last_day.dt.is_month_end] = (month_last_day[~month_last_day.dt.is_month_end] + MonthEnd())
  month_first_day = train.groupby("date_block_num").date.min().rename("month_first_day")
  month_first_day[~month_first_day.dt.is_month_start] = (month_first_day[~month_first_day.dt.is_month_start] - MonthBegin())
  # +1 day because both the first and the last day count towards the length.
  month_length = (month_last_day - month_first_day + pd.Timedelta(days=1)).rename("month_length")
  m = m.merge(month_length, left_on="date_block_num", right_index=True, how="left")
  m = m.merge(month_first_day, left_on="date_block_num", right_index=True, how="left")

  def last_sale_days(matrix):
    """Attach the date of, and days since, the last earlier sale of each shop/item pair."""
    last_shop_item_dates = []
    for dbn in range(1, 35):
      # Only strictly earlier months are used, so the feature never sees the target month.
      lsid_temp = (train.query(f"date_block_num<{dbn}").groupby(["shop_id", "item_id"]).date.max()
      .rename("last_shop_item_sale_date").reset_index())
      lsid_temp["date_block_num"] = dbn
      last_shop_item_dates.append(lsid_temp)

    last_shop_item_dates = pd.concat(last_shop_item_dates)
    matrix = matrix.merge(last_shop_item_dates, on=["date_block_num", "shop_id", "item_id"], how="left")

    def days_since_last_feat(m, feat_name, date_feat_name, missingval):
      """Days from the month start back to ``date_feat_name``; flags rows without one as first_buy."""
      m[feat_name] = (m["month_first_day"] - m[date_feat_name]).dt.days
      m.loc[m[feat_name] > 2000, feat_name] = missingval
      m.loc[m[feat_name].isna(), feat_name] = missingval
      m['first_buy'] = 0
      m.loc[m[feat_name] == missingval, 'first_buy'] = 1
      return m

    matrix = days_since_last_feat(matrix, "last_shop_item_sale_days", "last_shop_item_sale_date", 9999)

    return matrix

  # Per month: number of distinct items and of distinct shop/item pairs with a row in train.
  savelist = []
  for dbn in range(35):
    month_rows = train[train['date_block_num'] == dbn]
    item_amt = len(month_rows.groupby(['item_id']))
    shop_item_amt = len(month_rows.groupby(['shop_id', 'item_id']))
    savelist.append([dbn, item_amt, shop_item_amt])
  df_right = pd.DataFrame(np.array(savelist), columns=['date_block_num', 'item_amt', 'shop_item_amt'])

  m = m.merge(df_right, how='left', on='date_block_num')
  m = last_sale_days(m)

  m["month"] = m["month_first_day"].dt.month
  m["year"] = m["month_first_day"].dt.year
  m = m.drop(columns=['month_first_day'])
  # Months elapsed since the item first appears anywhere in the matrix.
  m["item_age"] = m.groupby("item_id")["date_block_num"].transform(lambda x: x - x.min())
  m["new_item"] = m["item_age"] == 0
  m["new_item"] = m["new_item"].astype("int8")
  # Replace the 9999 "never sold" sentinel by the mean of the real values.
  # Assign the result back: an in-place replace on `m[col]` acts on a temporary
  # under pandas copy-on-write and would leave the sentinel in the frame.
  replace_val = m['last_shop_item_sale_days'][m['last_shop_item_sale_days'] != 9999].mean()
  m['last_shop_item_sale_days'] = m['last_shop_item_sale_days'].replace(9999, replace_val)

  return m

In [10]:
# Add the calendar and recency columns produced by add_time_features.
matrix = add_time_features(matrix, train)

In [11]:
def add_price_features(matrix, train):
  """Attach each item's most recent earlier price features to the matrix.

  Two columns are added: ``last_item_price`` and ``last_norm_diff_cat_price``.
  For month ``dbn`` they hold the last available monthly mean price of the item
  from months strictly before ``dbn`` and that price's relative deviation
  (rounded to 3 decimals) from the mean price of its category in that month.

  Reads the module-level ``items`` table for the item -> category mapping.
  """
  monthly_price = train.groupby(["date_block_num", "item_id"])["item_price"].mean().reset_index()
  monthly_price = monthly_price.merge(items[["item_id", "item_category_id"]], how="left", on="item_id")

  def _relative_to_group_mean(prices):
    # Deviation from the category/month mean, as a fraction of that mean.
    group_mean = prices.mean()
    return ((prices - group_mean) / group_mean).round(3)

  by_month_category = monthly_price.groupby(["date_block_num", "item_category_id"])["item_price"]
  monthly_price["norm_diff_cat_price"] = by_month_category.transform(_relative_to_group_mean)
  monthly_price = monthly_price[["date_block_num", "item_id", "item_price", "norm_diff_cat_price",]]

  price_cols = ["item_price", "norm_diff_cat_price"]
  last_of_each = {col: "last" for col in price_cols}
  prefixed = {col: "last_" + col for col in price_cols}

  # One table per target month, built from strictly earlier months only.
  per_month = [
    monthly_price[monthly_price["date_block_num"] < dbn]
    .groupby("item_id")
    .agg(last_of_each)
    .rename(columns=prefixed)
    .assign(date_block_num=dbn)
    for dbn in range(1, 35)
  ]
  last_prices = pd.concat(per_month).reset_index()
  return matrix.merge(last_prices, on=["date_block_num", "item_id"], how="left")

In [None]:
# Add the lagged price features; the raw last-sale date is no longer needed
# once the day-count feature derived from it exists.
matrix = add_price_features(matrix, train).drop(columns=['last_shop_item_sale_date'])
# Encode month length and year as ordinal codes.
ordinal_cols = ['month_length', 'year']
matrix[ordinal_cols] = OrdinalEncoder().fit_transform(matrix[ordinal_cols])
# The raw tables are not used below this point; release them.
del train, test, items, shops
matrix, oldcols = shrink_mem_new_cols(matrix, oldcols)
gc.collect()

In [13]:
def add_pct_change(matrix, group_feats, target="item_cnt_month", aggfunc="mean", periods=[1], lag=1, clip_value=None):
  """Add lagged percentage-change features of an aggregated target.

  ``target`` is aggregated with ``aggfunc`` per (``group_feats``, month). For
  each entry of ``periods`` the relative change against the value ``period``
  months earlier is computed within each group, optionally clipped, shifted
  forward by ``lag`` months and merged onto ``matrix``.

  Args:
    matrix: monthly matrix containing ``date_block_num``, ``group_feats`` and ``target``.
    group_feats: list of column names defining the groups.
    target: column to aggregate.
    aggfunc: aggregation name passed to ``pivot_table``; also part of the feature name.
    periods: iterable of month offsets; one feature column is added per entry.
    lag: number of months the feature is shifted forward before merging.
    clip_value: if given, features are clipped to ``[-clip_value, clip_value]``.

  Returns:
    ``matrix`` with one new column per period, named
    ``<group_feats>_<target>_<aggfunc>_delta_<period>_lag_<lag>``.
  """
  periods = list(dict.fromkeys(periods))  # de-duplicate, keep order
  if not periods:
    return matrix

  dat = matrix.pivot_table(index=group_feats + ["date_block_num"], values=target, aggfunc=aggfunc, fill_value=0, dropna=False,).astype("float32")
  # Blank out months before a group value first appears in the matrix, so a
  # change is not computed against the 0 that pivot_table filled in there.
  for g in group_feats:
    firsts = matrix.groupby(g).date_block_num.min().rename("firsts")
    dat = dat.merge(firsts, left_on=g, right_index=True, how="left")
    dat.loc[dat.index.get_level_values("date_block_num") < dat["firsts"], target] = float("nan")
    del dat["firsts"]

  # Every period is computed from the same aggregated table. Overwriting `dat`
  # inside the loop would leave a Series behind and break the next iteration.
  deltas = []
  for period in periods:
    feat_name = "_".join(group_feats + [target] + [aggfunc] + ["delta"] + [str(period)] + [f"lag_{lag}"])
    # Explicit forward fill followed by fill_method=None is what
    # pct_change(fill_method="pad") does, without the deprecated argument.
    delta = (dat.groupby(group_feats)[target]
      .transform(lambda x, p=period: x.ffill().pct_change(periods=p, fill_method=None))
      .rename(feat_name))
    if clip_value is not None:
      delta = delta.clip(lower=-clip_value, upper=clip_value)
    deltas.append(delta)

  feats = pd.concat(deltas, axis=1).reset_index()
  feats["date_block_num"] += lag
  matrix = matrix.merge(feats, on=["date_block_num"] + group_feats, how="left")
  for delta in deltas:
    matrix[delta.name] = reduce_mem_usage(matrix[delta.name])
  return matrix

In [14]:
# Month-over-month change of the monthly sales count at several aggregation levels.
pct_change_groups = (
  ["item_id"],
  ["item_category_id"],
  ["shop_id"],
  ["item_category_id", 'shop_id'],
  ["shop_id", 'item_id'],
)
for group in pct_change_groups:
  matrix = add_pct_change(matrix, group, "item_cnt_month", clip_value=3)

# Disabled experiment: the same features computed on "item_cnt_count".
# matrix = add_pct_change(matrix, ["item_id"], "item_cnt_count", clip_value=3)
# matrix = add_pct_change(matrix, ["item_category_id"], "item_cnt_count", clip_value=3)
# matrix = add_pct_change(matrix, ["shop_id"], "item_cnt_count", clip_value=3)
# matrix = add_pct_change(matrix, ["item_category_id",'shop_id'], "item_cnt_count", clip_value=3)
# matrix = add_pct_change(matrix, ["shop_id",'item_id'], "item_cnt_count", clip_value=3)

# The price columns are already lagged, so no additional shift is applied (lag=0).
matrix = add_pct_change(matrix, ['item_id', 'shop_id'], "last_item_price", clip_value=3, lag=0)
matrix = add_pct_change(matrix, ["item_id"], "last_norm_diff_cat_price", clip_value=3, lag=0)

In [None]:
# Downcast only the columns added since the previous shrink, then ask the
# garbage collector to run.
matrix, oldcols= shrink_mem_new_cols(matrix, oldcols)
gc.collect()

In [16]:
def rolling_stat(matrix, feats, window, argfeat="item_cnt_month", aggfunc="mean"):
  """Add a trailing-window aggregate of ``argfeat`` grouped by ``feats``.

  For every month 2..34 the statistic is computed over the rows of the
  ``window`` preceding months (the month itself is excluded) and merged onto
  the rows of that month. Rows without a matching group get NaN.

  Args:
    matrix: monthly matrix with ``date_block_num``, ``feats`` and ``argfeat``.
    feats: list of grouping columns.
    window: number of preceding months to aggregate over.
    argfeat: column being aggregated.
    aggfunc: aggregation name understood by ``GroupBy.agg``.

  Returns:
    ``matrix`` with one extra float64 column named
    ``<feats>_<argfeat>_<window>_<aggfunc>_<argfeat>``.
  """
  feat_name = '_'.join([*feats, argfeat, str(window), aggfunc, argfeat])
  blocks = matrix.date_block_num
  per_month = []
  for target_block in range(2, 35):
    first_block = max([target_block - window, 0])
    in_window = blocks.isin(range(first_block, target_block))
    stat = matrix[in_window].groupby(feats)[argfeat].agg(aggfunc)
    stat = stat.astype('float64').rename(feat_name).reset_index()
    stat["date_block_num"] = target_block
    per_month.append(stat)
  return matrix.merge(pd.concat(per_month), on=feats + ["date_block_num"], how="left")

In [17]:
# Trailing-window means over the previous month and the previous 12 months.
windowlist = [1,12]
rolling_groups = (
  ["shop_id", 'item_id'],
  ["shop_id", 'item_category_id'],
  ["shop_id"],
  ['item_category_id'],
  ['item_id'],
)
rolling_targets = ("item_cnt_month", "item_revenue_month", "item_cnt_count")
for window in windowlist:
  for rolling_target in rolling_targets:
    for rolling_group in rolling_groups:
      matrix = rolling_stat(matrix, rolling_group, window=window, argfeat=rolling_target)

  # The within-month standard deviation is only rolled at shop/item level.
  matrix = rolling_stat(matrix, ["shop_id", 'item_id'], window=window, argfeat="item_cnt_std")


In [18]:
# Columns that must keep their raw values: the target, identifiers and
# categorical/calendar features. Every other column is min-max scaled.
catlist = ['item_cnt_month', 'month_length', 'first_buy', 'month', 'year', 'new_item', "shop_id", "item_id", "item_category_id", 'date_block_num']
unscaled = set(catlist)
conlist = [col for col in matrix.columns if col not in unscaled]

for col in conlist:
  column_values = matrix[col].to_numpy().reshape(-1, 1)
  matrix[col] = MinMaxScaler().fit_transform(column_values)
gc.collect()
matrix, oldcols = shrink_mem_new_cols(matrix, oldcols)
# Persist the finished feature matrix so the modelling cells can reload it.
matrix.to_pickle("checkpoint_final.pkl")

In [36]:
def fit_booster(X_train, y_train, X_test=None, y_test=None,categoricals=[], test_run=False, early_stopping=True):
    """Fit a LightGBM regressor with the notebook's fixed hyper-parameters.

    Args:
        X_train, y_train: training features and target.
        X_test, y_test: hold-out features and target. They are added to the
            evaluation sets unless ``test_run`` is true or ``X_test`` is None.
        categoricals: column names passed as ``categorical_feature``.
        test_run: when true, only the training data is evaluated.
        early_stopping: when truthy, stop after 50 rounds without improvement.

    Returns:
        The fitted ``lgbm.LGBMRegressor``.
    """
    params = {
    "min_child_samples": 30,
    "learning_rate": 0.01,
    "max_depth":4,
    "colsample_bytree": 0.6,
    "subsample": 0.6,
    "n_estimators": 200,
    }

    eval_set = [(X_train, y_train)]
    # Never put a (None, None) pair into eval_set when no hold-out was given.
    if not test_run and X_test is not None:
        eval_set.append((X_test, y_test))

    # Logging and early stopping are configured through callbacks: the
    # `verbose` and `early_stopping_rounds` keyword arguments of fit() were
    # removed in LightGBM 4.
    callbacks = [lgbm.log_evaluation(period=100)]
    if early_stopping:
        callbacks.append(lgbm.early_stopping(stopping_rounds=50))

    booster = lgbm.LGBMRegressor(**params)

    booster.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        eval_metric=["rmse"],
        categorical_feature=categoricals,
        callbacks=callbacks,
    )

    return booster

In [None]:
matrix = pd.read_pickle("checkpoint_final.pkl")
categoricals = ['month_length', 'first_buy', 'month', 'year', 'new_item', "item_category_id", 'date_block_num']
keep_from_month = 2 
test_month = 33
dropcols = ["shop_id", "item_id", 'item_cnt_std', 'item_cnt_count', 'item_revenue_month']  # The features are dropped to reduce overfitting

# Drop the unused columns once instead of once per split, and release the full
# matrix as soon as the reduced copy exists.
features = matrix.drop(columns=dropcols)
del matrix

# Month `test_month` is held out for validation; training uses the earlier
# months starting at keep_from_month + 4.
valid = features.loc[features.date_block_num == test_month, : ]
train = features.loc[features.date_block_num < test_month, : ]
train = train[train.date_block_num >= keep_from_month + 4]
del features
X_train = train.drop(columns="item_cnt_month")
y_train = train.item_cnt_month
X_valid = valid.drop(columns="item_cnt_month")
y_valid = valid.item_cnt_month

# Release the intermediates first, then collect.
del (valid, train)
gc.collect()
lgbooster = fit_booster(X_train,y_train,X_valid,y_valid,categoricals=categoricals)


In [22]:
#_ = lgbm.plot_importance(lgbooster, figsize=(10,50), height=0.7, importance_type="gain", max_num_features=50)

In [None]:
# Refit on every usable month before the test month, predict month 34 and
# write the submission file. Relies on dropcols, keep_from_month and
# categoricals defined in the validation cell.
matrix = pd.read_pickle("checkpoint_final.pkl")

test_month = 34
is_history = matrix.date_block_num < test_month
is_test_month = matrix.date_block_num == test_month

train = matrix.drop(columns=dropcols).loc[is_history, : ]
train = train[train.date_block_num >= keep_from_month + 4]
X_train = train.drop(columns="item_cnt_month")
y_train = train.item_cnt_month
# X_test keeps shop_id / item_id: they are needed to join predictions to test.csv.
X_test = matrix.loc[is_test_month, : ].drop(columns="item_cnt_month")
del train,matrix

lgbooster = fit_booster(X_train,y_train,categoricals=categoricals,test_run=True, early_stopping=None)
predictions = lgbooster.predict(X_test.drop(columns=dropcols))
X_test["item_cnt_month"] = predictions.clip(0,20)

test_orig = pd.read_csv("test.csv")
prediction_cols = ["shop_id", "item_id", "item_cnt_month"]
test = test_orig.merge(
    X_test[prediction_cols],
    on=["shop_id", "item_id"],
    how="inner",
    copy=True,
)
# Every test row must have received exactly one prediction, in the original order.
assert test_orig.equals(test[["ID", "shop_id", "item_id"]])
test[["ID", "item_cnt_month"]].to_csv("submission.csv", index=False)

In [40]:
# Upload submission.csv to the competition through the Kaggle CLI.
# NOTE(review): the output recorded below this cell ends with "400 - Bad Request",
# i.e. this upload was rejected; the cause is not visible here — TODO confirm.
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f submission.csv -m "Message"

100% 5.34M/5.34M [00:00<00:00, 12.0MB/s]
400 - Bad Request
