In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
from tqdm import tqdm
import time
import pickle
from itertools import product
from lightgbm import LGBMRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import RepeatedKFold, cross_val_score, RandomizedSearchCV
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set_theme()

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Root folder of the competition data; also reused by later cells.
path = r'/kaggle/input/competitive-data-science-predict-future-sales/'

# Read the four reference/training tables and the test set in one pass.
csv_suffixes = ['/items.csv', '/item_categories.csv', '/sales_train.csv',
                '/shops.csv', '/test.csv']
items, item_categories, sales_train, shops, test = (
    pd.read_csv(path + suffix) for suffix in csv_suffixes)

In [None]:
# item name cleaning

# Split each item name once at the first "[" and once at the first "(".
# expand=True returns the pieces as columns 0 and 1; reindex guarantees that
# column 1 exists even when no name contains the delimiter.
# (Unpacking the `.str` accessor into a tuple and passing `n` positionally are
# no longer supported by recent pandas releases.)
bracket_parts = items.item_name.str.split("[", n=1, expand=True).reindex(columns=[0, 1])
paren_parts = items.item_name.str.split("(", n=1, expand=True).reindex(columns=[0, 1])

items["name1"] = paren_parts[0]    # text before the first "("
items["name2"] = bracket_parts[1]  # text after the first "[" (missing if there is none)
items["name3"] = paren_parts[1]    # text after the first "(" (missing if there is none)

# Replace runs of special characters with a space and lower-case the result.
# regex=True is explicit because newer pandas treats the pattern as a literal
# string by default, which would leave the names untouched.
items["name2"] = items.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()
items["name3"] = items.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ", regex=True).str.lower()

# fill nulls with '0'
import re
def name_correction(x):
    """Normalise an item name.

    Lower-cases the text, keeps only the part before the first '[' and the
    first '(', replaces every run of characters outside Latin letters, digits
    and А-Я/а-я with a single space, and trims surrounding whitespace.
    """
    # Everything from the first square bracket or parenthesis onwards is dropped.
    head = x.lower().split('[', 1)[0].split('(', 1)[0]
    # Collapse special characters into single spaces.
    cleaned = re.sub('[^A-Za-z0-9А-Яа-я]+', ' ', head)
    return cleaned.replace('  ', ' ').strip()

# Fill nulls with '0' so that every name column holds a string.
items = items.fillna('0')
items["item_name"] = items["item_name"].apply(name_correction)

# Drop the last character of name2 (left over from the closing bracket) unless it is the "0" filler.
items["name2"] = items["name2"].apply(lambda x: "0" if x == "0" else x[:-1])

# Derive a coarse item type from name2: its first word, or the first eight
# characters when that first word is "xbox".
def _first_word_or_xbox_prefix(x):
    first_word = x.split(" ")[0]
    return x[0:8] if first_word == "xbox" else first_word

items["type"] = items["name2"].apply(_first_word_or_xbox_prefix)
items.loc[items["type"].isin(["x360", "xbox360", "xbox 360"]), "type"] = "xbox 360"
items.loc[items["type"] == "", "type"] = "mac"
items["type"] = items["type"].map(lambda x: x.replace(" ", ""))
# The literals below include look-alike Cyrillic letters; they are kept exactly as written.
items.loc[items["type"].isin(['pc', 'pс', "pc"]), "type"] = "pc"
items.loc[items["type"] == 'рs3', "type"] = "ps3"

# Collect the types that occur for fewer than 40 items.
group_sum = items.groupby(["type"]).agg({"item_id": "count"}).reset_index()
drop_cols = group_sum.loc[group_sum["item_id"] < 40, "type"].tolist()

# name2 values that match one of those rare types are replaced by "other".
items["name2"] = items["name2"].apply(lambda x: "other" if x in drop_cols else x)
items = items.drop(["type"], axis=1)

# Label-encode the two bracket-derived name columns.
items["name2"] = LabelEncoder().fit_transform(items["name2"])
items["name3"] = LabelEncoder().fit_transform(items["name3"])

items = items.drop(["item_name", "name1"], axis=1)
items.head(6)

In [None]:
# Cleaning item category: derive a coarse type and a subtype, both label-encoded.

# The coarse type is the first word of the category name.
item_categories["category_type"] = item_categories.item_category_name.apply(
    lambda x: x.split(" ")[0]).astype(str)

# Relabel the "Игровые" and "Аксессуары" types as "Игры". The relabel must be
# written to category_type itself: writing it to a separate "category" column
# has no effect, because that column is dropped by the selection at the end
# of this cell.
item_categories.loc[
    item_categories.category_type.isin(["Игровые", "Аксессуары"]),
    "category_type"] = "Игры"

# Keep a type only if at least 5 categories share it; the rest become "etc".
type_counts = item_categories.category_type.value_counts()
category = [cat for cat in item_categories.category_type.unique()
            if type_counts[cat] >= 5]
item_categories["category_type"] = item_categories.category_type.apply(
    lambda x: x if (x in category) else "etc")

# Label Encoding
item_categories["category_type"] = LabelEncoder().fit_transform(item_categories.category_type)

# The subtype is the text after the first "-" in the category name, or the
# whole name when there is no "-".
name_parts = item_categories.item_category_name.apply(lambda x: x.split("-"))
item_categories["category_subtype"] = name_parts.apply(
    lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_categories["category_subtype"] = LabelEncoder().fit_transform(
    item_categories["category_subtype"])

item_categories = item_categories[["item_category_id", "category_subtype", "category_type"]]

item_categories.head(6)

In [None]:
# Clean up shop names and add label-encoded 'city' and 'category' columns to shops.

# Shop ids that are treated as duplicates of another id; each is rewritten to
# its counterpart in both the test set and the sales history.
duplicate_shop_ids = {
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" mall
    10: 11,  # Zhukovsky, Chkalova st. 39m²
}
for old_id, new_id in duplicate_shop_ids.items():
    test.loc[test.shop_id == old_id, 'shop_id'] = new_id
    sales_train.loc[sales_train.shop_id == old_id, 'shop_id'] = new_id

# Join the two-word city name so that the first space-separated token is the whole city.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"',
          "shop_name"] = 'СергиевПосад ТЦ "7Я"'

# First token of the shop name -> city, second token -> shop category.
name_tokens = shops.shop_name.str.split(" ")
shops["city"] = name_tokens.map(lambda tokens: tokens[0])
shops["category"] = name_tokens.map(lambda tokens: tokens[1])
shops.loc[shops.city == "!Якутск", "city"] = "Якутск"

# Only keep a shop category if 5 or more shops share it; the rest are grouped as "other".
category = [cat for cat in shops["category"].unique()
            if (shops["category"] == cat).sum() >= 5]
shops["category"] = shops["category"].apply(
    lambda x: x if (x in category) else "other")

# Label encoding
shops["shop_category"] = LabelEncoder().fit_transform(shops["category"])
shops["shop_city"] = LabelEncoder().fit_transform(shops["city"])
shops = shops[["shop_id", "shop_category", "shop_city"]]

shops.head(6)

In [None]:
# Add a month feature: the second dot-separated field of the date string.
sales_train['month'] = sales_train['date'].str.split('.').str[1].astype('int64')

# One row per distinct (date_block_num, month) pair, plus a row for block 34 with month 11.
block_month = (
    sales_train[['date_block_num', 'month']]
    .drop_duplicates()
    .reset_index(drop=True)
)
block_month.loc[34] = [34, 11]

In [None]:
# Check outliers: scatter item_cnt_day against shop_id, item_id and item_price.
# The fourth panel of the 2x2 grid is left empty.

fig, ax = plt.subplots(figsize=(20, 14), nrows=2, ncols=2)

panels = [
    (ax[0][0], 'shop_id'),
    (ax[0][1], 'item_id'),
    (ax[1][0], 'item_price'),
]
for panel, feature in panels:
    panel.scatter(data=sales_train, x=feature, y='item_cnt_day')
    panel.set_xlabel(feature)
    panel.set_ylabel('item_cnt_day')

In [None]:
# Findings from the scatter plots:
#   shop_id x item_cnt_day    --> two outliers (2000+, 1000+)
#   item_id x item_cnt_day    --> two outliers > 1000
#   item_price x item_cnt_day --> two outliers > 1000, one item price > 300000, a few prices < 0

# Both filters have to be applied to the same frame. Filtering sales_train
# twice into df_train would let the second assignment overwrite the first and
# silently discard the item_cnt_day filter.
valid_count = (sales_train['item_cnt_day'] < 1000) & (sales_train['item_cnt_day'] > 0)
valid_price = (sales_train['item_price'] < 150000) & (sales_train['item_price'] > 0)

df_train = sales_train[valid_count & valid_price]

In [None]:
# Calculate monthly sales: sum item_cnt_day per (date_block_num, shop_id, item_id).

monthly_keys = ['date_block_num', 'shop_id', 'item_id']
df_train = (
    df_train
    .groupby(monthly_keys, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

In [None]:
# Combine df_train and test: the test rows become block 34 with a placeholder count of 0.

test = test.drop(['ID'], axis=1).assign(date_block_num=34, item_cnt_month=0)

df_train_test = pd.concat([df_train, test], axis=0)
df_train_test.head()

In [None]:
# Build the full grid: for every month (date_block_num), the Cartesian product
# of the shop_ids and item_ids that occur in that month.

grid_parts = []
for block in df_train_test['date_block_num'].unique():
    month_rows = df_train_test[df_train_test['date_block_num'] == block]
    combos = product([block], month_rows['shop_id'].unique(), month_rows['item_id'].unique())
    grid_parts.append(np.array(list(combos), dtype='int16'))

# Turn the grid into a dataframe
grid_keys = ['date_block_num', 'shop_id', 'item_id']
matrix = pd.DataFrame(np.vstack(grid_parts), columns=grid_keys, dtype=np.int16)

# Attach the monthly counts (pairs without sales get 0), then the shop, item
# and item-category features.
matrix = (
    matrix
    .merge(df_train_test, how='left', on=grid_keys)
    .fillna(0)
    .merge(shops, how='left', on='shop_id')
    .merge(items[['item_id', 'item_category_id']], how='left', on='item_id')
    .merge(item_categories, how='left', on='item_category_id')
)

# Month feature: date_block_num modulo 12.
matrix['month'] = matrix['date_block_num'] % 12
# Clip counts
matrix['item_cnt_month'] = matrix['item_cnt_month'].clip(0, 20)

In [None]:
# Reduce the size of each column to avoid memory errors.

# Free the source tables first; they are already merged into matrix.
del items
del item_categories
del shops
del sales_train

# Smallest integer type used for each column.
compact_dtypes = {
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
    'month': np.int8,
    'item_cnt_month': np.int32,
    'shop_category': np.int8,
    'shop_city': np.int8,
    'item_category_id': np.int8,
    'category_type': np.int8,
    'category_subtype': np.int8,
}

# astype already returns a new frame, so no separate full-size copy of the
# matrix is made before converting.
df_train_test = matrix.astype(compact_dtypes)
del matrix

In [None]:
# Report the frame's memory footprint (index excluded) in MiB.
total_bytes = df_train_test.memory_usage(index=False, deep=True).sum()
print('{0:.2f}'.format(total_bytes/(2**20)), 'MB')

In [None]:
# prep lagged features

# Lags (in months) to compute. A longer list ([1, 2, 3, 4, 5, 12]) is left out
# because memory is not enough on Kaggle.
lag_period = [1, 2, 3, 12]


def make_block_cnt_lags(frame, key, cnt_name, lags):
    """Sum item_cnt_month per (date_block_num, key) and build lagged copies.

    Parameters
    ----------
    frame : DataFrame with 'date_block_num', `key` and 'item_cnt_month' columns.
    key : name of the grouping column (e.g. 'shop_id').
    cnt_name : name given to the summed count column; each lag column is named
        cnt_name + '_lag_' + str(lag).
    lags : iterable of month offsets.

    Returns
    -------
    (totals, lagged)
        totals has columns [key, 'date_block_num', cnt_name].
        lagged has columns [key, 'date_block_num', <one column per lag>]; the
        lag-N value of a row is the total of the same key N blocks earlier,
        or NaN when that block does not exist.
    """
    # Only item_cnt_month is summed; the other columns are not needed.
    totals = (
        frame.groupby(['date_block_num', key], as_index=False)['item_cnt_month']
        .sum()
        .rename(columns={'item_cnt_month': cnt_name})
    )[[key, 'date_block_num', cnt_name]]

    lagged = totals[[key, 'date_block_num']].copy()
    for lag in lags:
        # Moving a block's total forward by `lag` months makes it line up with
        # the month that should see it as a lag feature.
        shifted = totals.rename(columns={cnt_name: cnt_name + '_lag_' + str(lag)})
        shifted['date_block_num'] += lag
        lagged = lagged.merge(shifted, on=[key, 'date_block_num'], how='left')
    return totals, lagged


# shop_category X date_block_num X item_cnt_month
shop_category_block_cnt, shop_category_block_cnt_lag = make_block_cnt_lags(
    df_train_test, 'shop_category', 'shop_category_block_cnt', lag_period)

# shop_id X date_block_num X item_cnt_month
shop_block_cnt, shop_block_cnt_lag = make_block_cnt_lags(
    df_train_test, 'shop_id', 'shop_block_cnt', lag_period)

# item_id X date_block_num X item_cnt_month
item_block_cnt, item_block_cnt_lag = make_block_cnt_lags(
    df_train_test, 'item_id', 'item_block_cnt', lag_period)

# category_type X date_block_num X item_cnt_month
category_type_block_cnt, category_type_block_cnt_lag = make_block_cnt_lags(
    df_train_test, 'category_type', 'category_type_block_cnt', lag_period)

# A later cleanup cell runs `del temp`; keep that name bound so it still works.
temp = None

In [None]:
# Merge the four lag frames into a copy of df_train_test.

df_merged = df_train_test.copy()

lag_sources = [
    (shop_category_block_cnt_lag, 'shop_category'),
    (shop_block_cnt_lag, 'shop_id'),
    (item_block_cnt_lag, 'item_id'),
    (category_type_block_cnt_lag, 'category_type'),
]
for lag_frame, join_key in lag_sources:
    df_merged = df_merged.merge(lag_frame, on=[join_key, 'date_block_num'], how='left')

# The lag frames are no longer needed once merged.
del lag_sources, lag_frame, join_key
del shop_category_block_cnt_lag
del shop_block_cnt_lag
del item_block_cnt_lag
del category_type_block_cnt_lag

# Lags with no matching earlier block come out of the left joins as NaN; treat them as 0.
df_merged.fillna(0, inplace=True)
df_merged['item_cnt_month'] = df_merged['item_cnt_month'].clip(0, 20)

In [None]:
# Mean-encode the label columns: replace each code by the min-max scaled mean
# of item_cnt_month for that code.

enc_cols = ['item_category_id', 'category_type', 'category_subtype',
            'shop_category', 'shop_city']

# Means are computed on the training months only, so the test block
# (date_block_num 34) does not leak into the encoding.
train_rows = df_merged['date_block_num'] < 34

for col in enc_cols:
    # Mean target per category code; the index holds the actual codes.
    gb = (df_merged.loc[train_rows, [col, 'item_cnt_month']]
          .groupby(col)['item_cnt_month'].mean())

    col_mean_enc = MinMaxScaler().fit_transform(gb.to_frame())

    # Key the map by the real category code rather than by row position, so
    # the encoding stays aligned even when some code never occurs in the
    # training months. Codes that appear only in the test block map to NaN.
    enc_map = dict(zip(gb.index, col_mean_enc[:, 0]))
    df_merged[col] = df_merged[col].map(enc_map)

del enc_map

In [None]:
# Add item-name text features (TF-IDF); trying this improved the score.

items_tfidf = pd.read_csv(path +'/items.csv')

feature_count = 25
tfidf = TfidfVectorizer(max_features=feature_count)

# One column per retained term. The frame shares the index of items_tfidf so
# that the rows line up when the item_id column is attached below.
items_df_item_name_text_features = pd.DataFrame(
    tfidf.fit_transform(items_tfidf['item_name']).toarray(),
    index=items_tfidf.index)
# Name the columns by position. The vectorizer may return fewer than
# feature_count terms, so the count comes from the result itself.
items_df_item_name_text_features.columns = [
    'item_name_tfidf_' + str(i)
    for i in range(items_df_item_name_text_features.shape[1])]

# Build a fresh frame instead of adding columns to (and dropping columns from)
# a slice of items_tfidf, which pandas flags as a chained assignment.
items_subset = pd.concat(
    [items_tfidf[['item_id']], items_df_item_name_text_features], axis=1)

df_merged = df_merged.merge(items_subset, on='item_id', how='left')


In [None]:
# Drop intermediates that are no longer needed.
del items_tfidf
del tfidf
del items_df_item_name_text_features
del items_subset
del df_train
del df_train_test
del gb
del temp
del block_month

In [None]:
# Remove the first 12 blocks (2013), then renumber the rows from 0.

keep_rows = df_merged['date_block_num'] >= 12
df_merged = df_merged[keep_rows]
df_merged.index = pd.RangeIndex(len(df_merged))

In [None]:
# Independent and target variables.

non_feature_cols = ['date_block_num', 'item_cnt_month']
train_rows = (df_merged['date_block_num'] >= 12) & (df_merged['date_block_num'] < 34)
test_rows = df_merged['date_block_num'] == 34

# Blocks 12-33 are used for training; block 34 holds the rows to predict.
X_train = df_merged[train_rows].drop(non_feature_cols, axis=1)
y_train = df_merged.loc[train_rows, 'item_cnt_month']

X_test = df_merged[test_rows].drop(non_feature_cols, axis=1)

print(X_train.shape)
print(y_train.shape)


In [None]:
# Submission skeleton: the test file without its shop_id and item_id columns.
test_sub = pd.read_csv(path + '/test.csv').drop(['shop_id', 'item_id'], axis=1)

# Memory is not enough to keep the merged frame around.
del df_merged

In [None]:
# Plain LightGBM model: cross-validate, fit on all training rows, predict the
# test block and write the submission file.

# random_state / verbose=-1 replace the `seed` alias and the `silent` flag,
# which newer LightGBM releases no longer accept as a constructor argument.
models_candidates = [
    LGBMRegressor(random_state=55, verbose=-1, metric='rmse'),
] # lgbm competes over xgb and cat

for model_reg in tqdm(models_candidates):

    model = model_reg
    # Short tag taken from the estimator's repr, e.g. 'LGB'.
    model_tag = str(model_reg)[:3]

    start = time.time()
    # cv=2 due to insufficient memory.
    scores = cross_val_score(
        model, X_train, y_train,
        scoring='neg_root_mean_squared_error', cv=2)
    end = time.time()

    # The scorer is negated RMSE, so the printed mean score is negative.
    print('model type: %s' % model_tag)
    print('training time: %.2f s' % (end - start))
    print('mean score: %.3f' % scores.mean())
    print('score std: %.3f\n' % scores.std())

    model.fit(X_train, y_train)
    # Clip predictions to [0, 20], the same range the monthly counts were clipped to.
    pred = np.clip(model.predict(X_test), 0, 20)

    # Assign the array directly: a length mismatch with test_sub then raises
    # instead of silently producing NaN rows through index alignment.
    test_sub['item_cnt_month'] = pred

    # A repr that starts with '<ca' carries a leading '<', so its tag is taken
    # one character further in.
    if model_tag == '<ca':
        model_name = str(model_reg)[1:4] + '_final.sav'
    else:
        model_name = model_tag + '_final.sav'
    filename = 'submission.csv'
    test_sub.to_csv(filename, index=False)

    # save models for later use
    # pickle.dump(model, open(model_name, 'wb'))