In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports and data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

# Every competition CSV lives in the same input directory; keep that path in
# one constant so it only has to be edited in one place.
INPUT_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

train = pd.read_csv(f'{INPUT_DIR}/sales_train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
shop = pd.read_csv(f'{INPUT_DIR}/shops.csv')
item = pd.read_csv(f'{INPUT_DIR}/items.csv')
item_category = pd.read_csv(f'{INPUT_DIR}/item_categories.csv')
sample_submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')


## Data preprocessing

In [None]:
# Parse the 'dd.mm.yyyy' sale date and expose calendar year / month on `train`.
train['pd_date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['year'] = train['pd_date'].dt.year
train['month'] = train['pd_date'].dt.month

# Key that identifies one (shop, item, month) observation everywhere below.
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Drop outlier rows (extreme price or daily count), then sum the daily counts
# into a single monthly count per shop / item / month.
is_inlier = (train['item_price'] < 100000) & (train['item_cnt_day'] <= 900)
train_grouped = (
    train.loc[is_inlier]
    .groupby(index_cols)
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Test rows are assigned block 34; later cells treat block 33 as validation
# and block 34 as the prediction target.
test['date_block_num'] = 34

# Stack train and test so lag features can be computed for the test block too.
# `keys` is intentionally not passed: it has no effect together with
# ignore_index=True, and a keys list whose length differs from the number of
# frames is deprecated in recent pandas releases.
train_grouped = pd.concat(
    [train_grouped, test.drop(columns=['ID'])],
    axis=0, ignore_index=True
)
print(train_grouped.shape, '\n', train_grouped.head())

## Lag features

In [None]:
# define lag_feature
def lag_feature(data, lags, column, keys=None):
    """Append lagged copies of ``column`` to ``data``.

    For every ``lag`` in ``lags`` a new float16 column named
    ``f"{column}_lag_{lag}"`` is added. Its value on a row is the value that
    ``column`` had on the row with the same key columns but a
    ``date_block_num`` that is ``lag`` blocks earlier; rows with no such
    earlier row get NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in ``keys`` plus ``column``.
    lags : iterable of int
        Offsets, in ``date_block_num`` units, to look back.
    column : str
        Name of the column to lag.
    keys : list of str, optional
        Columns identifying a row. Must include ``'date_block_num'``.
        Defaults to the notebook-level ``index_cols`` so existing
        three-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with one extra column per lag.
        Row order follows ``data`` because a left merge is used.
    """
    if keys is None:
        # Fall back to the notebook global, looked up at call time.
        keys = index_cols
    keys = list(keys)

    base = data[keys + [column]]
    for lag in lags:
        lag_col = f'{column}_lag_{lag}'
        shifted = base.copy()
        shifted.columns = keys + [lag_col]
        # Moving the source rows forward by `lag` blocks makes them line up
        # with the rows that should receive the lagged value.
        shifted['date_block_num'] += lag
        data = pd.merge(data, shifted, on=keys, how='left')
        data[lag_col] = data[lag_col].astype('float16')
    return data

# Add the previous three months of sales, replace every missing value with 0,
# and keep only the rows whose block number is greater than 2.
lagged_sales = lag_feature(train_grouped, [1, 2, 3], 'item_cnt_month').fillna(0)
train_lagged = lagged_sales.loc[lagged_sales['date_block_num'] > 2]
train_lagged.head()

## Shop feature

In [None]:
# The city is the first whitespace-separated token of the shop name, lower-cased.
shop['city'] = shop['shop_name'].map(lambda name: name.split()[0].lower())
# One spelling carries a leading '!'; fold it into the plain city name.
shop.loc[shop['city'].eq('!якутск'), 'city'] = 'якутск'
shop['city_code'] = LabelEncoder().fit_transform(shop['city'])

# Per-city tuple: (first coordinate, second coordinate, country-part code).
# Entries with (0, 0, 0) carry no location information.
coords = {
    'якутск': (62.028098, 129.732555, 4),
    'адыгея': (44.609764, 40.100516, 3),
    'балашиха': (55.8094500, 37.9580600, 1),
    'волжский': (53.4305800, 50.1190000, 3),
    'вологда': (59.2239000, 39.8839800, 2),
    'воронеж': (51.6720400, 39.1843000, 3),
    'выездная': (0, 0, 0),
    'жуковский': (55.5952800, 38.1202800, 1),
    'интернет-магазин': (0, 0, 0),
    'казань': (55.7887400, 49.1221400, 4),
    'калуга': (54.5293000, 36.2754200, 4),
    'коломна': (55.0794400, 38.7783300, 4),
    'красноярск': (56.0183900, 92.8671700, 4),
    'курск': (51.7373300, 36.1873500, 3),
    'москва': (55.7522200, 37.6155600, 1),
    'мытищи': (55.9116300, 37.7307600, 1),
    'н.новгород': (56.3286700, 44.0020500, 4),
    'новосибирск': (55.0415000, 82.9346000, 4),
    'омск': (54.9924400, 73.3685900, 4),
    'ростовнадону': (47.2313500, 39.7232800, 3),
    'спб': (59.9386300, 30.3141300, 2),
    'самара': (53.2000700, 50.1500000, 4),
    'сергиев': (56.3000000, 38.1333300, 4),
    'сургут': (61.2500000, 73.4166700, 4),
    'томск': (56.4977100, 84.9743700, 4),
    'тюмень': (57.1522200, 65.5272200, 4),
    'уфа': (54.7430600, 55.9677900, 4),
    'химки': (55.8970400, 37.4296900, 1),
    'цифровой': (0, 0, 0),
    'чехов': (55.1477000, 37.4772800, 4),
    'ярославль': (57.6298700, 39.8736800, 2),
}

# Unpack the tuple into one column per component; an unknown city raises KeyError.
for position, feature in enumerate(['city_coord_1', 'city_coord_2', 'country_part']):
    shop[feature] = [coords[city][position] for city in shop['city']]

shop.head()

## Item feature

In [None]:
# Broad category = text before the first '-' in the category name, with the
# surrounding whitespace removed. Stripping matters: without it a prefix taken
# from "X - Y" keeps a trailing space ("X "), so it can never equal a prefix
# that had no dash, nor a replacement value written without the space.
# `cat_map` then merges a few prefixes into an existing broad category; its
# keys and values are the stripped forms.
cat_map = {
    'Чистые носители (штучные)': 'Чистые носители',
    'Чистые носители (шпиль)': 'Чистые носители',
    # 'PC - ...' names are grouped with the 'Аксессуары - ...' names.
    'PC': 'Аксессуары',
}
item_category['item_category'] = item_category['item_category_name'].apply(
    lambda name: cat_map.get(name.split('-')[0].strip(), name.split('-')[0].strip())
)
# Integer code of the broad category, used later as a categorical feature.
item_category['item_category_common'] = LabelEncoder().fit_transform(
    item_category['item_category']
)

In [None]:
def value_reduction(data):
    """Downcast numeric columns of ``data`` in place to save memory.

    * ``float64`` columns become ``float32``.
    * ``int64`` / ``int32`` columns become ``int16`` when every value fits in
      the int16 range, limits included (-32768 .. 32767).

    Columns of any other dtype, and integer columns that are empty or exceed
    the int16 range, are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient chaining.
    """
    int16_limits = np.iinfo(np.int16)
    for column in data.columns:
        dtype = data[column].dtype
        if dtype == 'float64':
            data[column] = data[column].astype(np.float32)
        elif dtype == 'int64' or dtype == 'int32':
            lowest, highest = data[column].min(), data[column].max()
            # Inclusive bounds: the limits themselves are valid int16 values.
            # For an empty column min/max are NaN, both comparisons are False
            # and the column is skipped.
            if int16_limits.min <= lowest and highest <= int16_limits.max:
                data[column] = data[column].astype(np.int16)
    return data


# Join with item and shop
# Lookup tables, each indexed by the id it is joined on.
item_lookup = item.set_index('item_id')
category_lookup = item_category.drop(
    columns=['item_category_name', 'item_category']
).set_index('item_category_id')
shop_lookup = shop.drop(columns=['shop_name', 'city']).set_index('shop_id')

# Enrich the lagged sales rows step by step: item attributes (minus the
# free-text name), then category codes, then shop/city attributes.
all_data = train_lagged.join(item_lookup, on='item_id').drop(columns=['item_name'])
all_data = all_data.join(category_lookup, on='item_category_id')
all_data = all_data.join(shop_lookup, on='shop_id')

## Target encoding

In [None]:
def mean_encoding(data, groupby_list, col_list):
    """Add lagged mean-target features for several groupings.

    For the i-th grouping in ``groupby_list`` the mean of ``item_cnt_month``
    is computed per ``date_block_num`` + grouping columns, stored under the
    name ``col_list[i]`` (missing values become 0, dtype float16), replaced by
    its lag 1, 2 and 3 versions via ``lag_feature``, and the un-lagged column
    is then removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; the means are always computed from this frame.
    groupby_list : list of list of str
        Grouping columns for each encoding (``date_block_num`` is prepended).
    col_list : list of str
        Name of the encoding produced for the grouping at the same position.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself when ``groupby_list`` is empty, otherwise a new frame
        with three lag columns per encoding.
    """
    encoded = data
    for position, group_cols in enumerate(groupby_list):
        enc_name = col_list[position]
        group_keys = ['date_block_num'] + group_cols

        # Mean target per month and group, computed on the original input.
        monthly_mean = data.groupby(group_keys).agg({'item_cnt_month': 'mean'})
        monthly_mean = monthly_mean.reset_index().rename(
            columns={"item_cnt_month": enc_name}, errors="raise"
        )

        encoded = encoded.join(monthly_mean.set_index(group_keys), on=group_keys)
        encoded[enc_name] = encoded[enc_name].fillna(0).astype(np.float16)

        # Only the lagged versions are kept as features.
        encoded = lag_feature(encoded, [1, 2, 3], enc_name)
        encoded = encoded.drop(columns=[enc_name])
    return encoded

# Encoding name -> columns the monthly target mean is grouped by.
target_encodings = {
    'item_target_enc': ['item_id'],
    'item_loc_target_enc': ['item_id', 'city_code'],
    'item_shop_target_enc': ['item_id', 'shop_id'],
}
mean_encoded = mean_encoding(
    all_data,
    list(target_encodings.values()),
    list(target_encodings.keys()),
)
mean_encoded.head()

## Add the following columns
* item_first_sold: 1 if this is the earliest month in which the item appears in the data (in any shop), otherwise 0
* shop_item_sold_before: 1 if the item already appears in the same shop in an earlier month, otherwise 0

In [None]:
# item_first_sold and shop_item_sold_before
#
# NOTE(review): `mean_encoded` descends from `train_lagged`, which was
# restricted to date_block_num > 2, so "first" below means first within that
# window and the `> 0` filter does not remove any row - confirm this is intended.

# Earliest block in which each item appears; joining on (item_id,
# date_block_num) flags exactly the rows belonging to that earliest block.
first_item_block = (
    mean_encoded.groupby(['item_id'])
    .agg({'date_block_num': 'min'})
    .reset_index()
)
first_item_block['item_first_sold'] = 1

# Earliest block in which each (shop, item) pair appears.
first_shop_item_buy_block = (
    mean_encoded[mean_encoded['date_block_num'] > 0]
    .groupby(['shop_id', 'item_id'])
    .agg({'date_block_num': 'min'})
    .reset_index()
    .rename(columns={"date_block_num": "first_date_block_num"}, errors="raise")
)

with_interaction = mean_encoded.join(
    first_item_block.set_index(['item_id', 'date_block_num']),
    on=['item_id', 'date_block_num']
).join(
    first_shop_item_buy_block.set_index(['item_id', 'shop_id']),
    on=['item_id', 'shop_id']
)

# Results are assigned back to the frame rather than using
# `frame[col].fillna(..., inplace=True)`: that form is chained assignment,
# which warns in pandas 2.x and leaves the frame unchanged under Copy-on-Write.

# Rows that did not match the "first block" table are not a first sale.
with_interaction['item_first_sold'] = (
    with_interaction['item_first_sold'].fillna(0).astype('int8')
)

# A pair with no recorded first block gets the sentinel 100, which is never
# smaller than a real block number, so the flag becomes 0.
with_interaction['shop_item_sold_before'] = (
    with_interaction['first_date_block_num'].fillna(100)
    < with_interaction['date_block_num']
).astype('int8')
with_interaction = with_interaction.drop(columns=['first_date_block_num'])

# add avg category for new features: mean monthly count of first-sold items,
# per block and item category.
item_id_target_mean = with_interaction[
    with_interaction['item_first_sold'] == 1
].groupby(
    ['date_block_num', 'item_category_id']
).agg(
    {'item_cnt_month': 'mean'}
).reset_index().rename(
    columns={'item_cnt_month': 'new_item_cat_avg'}, errors='raise'
)

with_interaction = with_interaction.join(
    item_id_target_mean.set_index(['date_block_num', 'item_category_id']),
    on=['date_block_num', 'item_category_id']
)

with_interaction['new_item_cat_avg'] = (
    with_interaction['new_item_cat_avg'].fillna(0).astype(np.float16)
)

# Keep only the lagged versions of the category average as features.
with_interaction = lag_feature(with_interaction, [1, 2, 3], 'new_item_cat_avg')
with_interaction = with_interaction.drop(columns=['new_item_cat_avg'])

print(
    first_item_block.head(), '\n',
    first_shop_item_buy_block.head(), '\n',
    item_id_target_mean.head(), '\n',
    with_interaction.head()
)

## Prepare train, valid, test

In [None]:
# Number of missing values per column before the final zero-fill.
with_interaction.isnull().sum()

In [None]:
# Zero-fill whatever is still missing, then split by month block:
# blocks below 33 train the models, block 33 validates, block 34 is predicted.
prepared_data = with_interaction.fillna(0)

block = prepared_data['date_block_num']
train_rows = prepared_data.loc[block < 33]
valid_rows = prepared_data.loc[block == 33]
test_rows = prepared_data.loc[block == 34]

# Features are every column except the target; targets stay one-column frames.
X_train = train_rows.drop(columns=['item_cnt_month'])
y_train = train_rows.filter(items=['item_cnt_month'])
X_valid = valid_rows.drop(columns=['item_cnt_month'])
y_valid = valid_rows.filter(items=['item_cnt_month'])
X_test = test_rows.drop(columns=['item_cnt_month'])

# Columns the boosting models should treat as categorical.
cat_features = [
    'item_category_id', 'item_category_common',
    'city_code', 'country_part'
]
print(X_train.head(), '\n', X_test.head())

## CatBoost model

In [None]:
model_cat = CatBoostRegressor(
    random_state=1,
    verbose=50, depth=4,
    learning_rate=0.01, l2_leaf_reg=7,
    max_leaves=2047, min_data_in_leaf=1,
    subsample=0.7,
    loss_function='RMSE', eval_metric='RMSE',
    early_stopping_rounds=30,
    grow_policy='Lossguide',
    cat_features=cat_features,
)
# `early_stopping_rounds` only takes effect when an evaluation set is given,
# so the validation month is passed here (as the LightGBM cell also does).
# The validation RMSE printed below is therefore measured on the same data
# that selected the stopping point.
model_cat.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
)
print(
    'Train rmse for CatBoost:',
    np.sqrt(MSE(y_train, model_cat.predict(X_train)))
)
print(
    'Validation rmse for CatBoost:',
    np.sqrt(MSE(y_valid, model_cat.predict(X_valid)))
)

## LGBM

In [None]:
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1023,
    'min_data_in_leaf': 10,
    'feature_fraction': 0.7,
    'learning_rate': 0.01,
    'num_rounds': 2000,
    'early_stopping_rounds': 30,
    'seed': 1
}

# Build each Dataset once. Passing the training Dataset object itself in
# `valid_sets` lets LightGBM recognise it as the training data, so it is
# reported but not used for early stopping.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

model_lgb = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    categorical_feature=cat_features,
    # Periodic metric logging goes through a callback; the `verbose_eval`
    # keyword is not accepted by `lgb.train` in LightGBM 4.
    callbacks=[lgb.log_evaluation(period=50)],
)
print(
    'Train rmse for LGBM:',
    np.sqrt(MSE(y_train, model_lgb.predict(X_train)))
)
print(
    'Validation rmse for LGBM:',
    np.sqrt(MSE(y_valid, model_lgb.predict(X_valid)))
)

## Random forest

In [None]:
model_rf = RandomForestRegressor(
    random_state=1, max_depth=10, max_features='sqrt',
    min_samples_leaf=7, min_samples_split=11, n_estimators=75
)
# scikit-learn expects a 1-D target; flattening the one-column target avoids
# the column-vector DataConversionWarning and works for a Series as well.
model_rf.fit(X_train, np.ravel(y_train))
print(
    'Train rmse for RandomForest:',
    np.sqrt(MSE(y_train, model_rf.predict(X_train)))
)
print(
    'Validation rmse for RandomForest:',
    np.sqrt(MSE(y_valid, model_rf.predict(X_valid)))
)

## Predict

In [None]:
# Write one submission file per fitted model: submission_<name>.csv
models = {
    'cat': model_cat,
    'rf': model_rf,
    'lgb': model_lgb,
}
for name, model in models.items():
    # The competition scores against targets clipped to [0, 20], so the
    # predictions are clipped to the same range before being written.
    preds_test = np.clip(model.predict(X_test), 0, 20)
    output = pd.DataFrame({'ID': test['ID'],
                           'item_cnt_month': preds_test})
    output.to_csv('submission_' + name + '.csv', index=False)