In [None]:
import numpy as np
import pandas as pd 
from itertools import product
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor
import lightgbm as lgb


pd.set_option('display.max_columns', 30)

# **Datasets research**

**Dataset items**

In [None]:
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items.head()

In [None]:
items.info()

In [None]:
items.describe()

In [None]:
print('Unique values of product names: {}'.format(items.item_name.unique()))
print()
print('Number of unique values of products: {}'.format(items.item_name.nunique()))

In [None]:
def simple_hist(data, x, bins, title, xlabel, xmin, xmax):
    """Plot the distribution of one column as a histogram with a density curve.

    Parameters
    ----------
    data : DataFrame that holds the column to plot.
    x : name of the column in ``data``.
    bins : number of histogram bins, forwarded to seaborn.
    title : title of the figure.
    xlabel : label of the x axis.
    xmin, xmax : limits of the visible x range.
    """
    plt.figure(figsize = (12, 8))
    sns.set()
    # forward `bins`: it is part of the signature and every caller passes it,
    # but it had no effect on the plot before
    # NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot
    # are the suggested replacements - confirm against the installed version
    sns.distplot(data[x], bins = bins, color = 'lightcoral')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.xlim(xmin, xmax)
    plt.show()

simple_hist(items, 'item_category_id', 10, 
            'Distribution of item categories in the item dataframe', 'item_categories_id', -2, 85)

**Dataset item_categories**

In [None]:
item_categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
item_categories.head()

In [None]:
item_categories.info()

In [None]:
item_categories.describe()

In [None]:
print('Unique values of product identifiers: {}'.format(item_categories.item_category_id.unique()))
print()
print('Number of unique values of product identifiers: {}'.format(item_categories.item_category_id.nunique()))

**Dataset shops**

In [None]:
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
shops.head()

In [None]:
shops.info()

In [None]:
shops.describe()

In [None]:
print('Unique meanings of store names: {}'.format(shops.shop_name.unique()))
print()
print('Number of unique store values: {}'.format(shops.shop_name.nunique()))

**Dataset train**

In [None]:
train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
train.head()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
simple_hist(train, 'shop_id', 10, 
            'Distribution of stores in the train dataframe', 'shop_id', -5, 65)

In [None]:
simple_hist(train, 'item_id', 30, 
            'Distribution of items in the train dataframe', 'item_id', -1000, 25000)

In [None]:
simple_hist(train, 'item_price', 1000, 
            'Distribution of the price of items in the train dataframe', 'item_price', -100, 10000)

In [None]:
plt.figure(figsize = (12, 8))
sns.boxplot(y=train['item_price'])
plt.ylim(0, 10000)
plt.grid()
plt.title('Boxplot for the price of goods in the range from 0 to 10000 rubles')
plt.ylabel('item_price')

In [None]:
# the histogram is uninformative - it is better to look at the numbers
train.item_cnt_day.value_counts()

**Dataset test**

In [None]:
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test.head()

In [None]:
test.info()

In [None]:
test.describe()

In [None]:
simple_hist(test, 'shop_id', 10, 
            'Shops distributions in the test dataframe', 'id shops', 0, 70)
# the distributions of the training and test datasets are different in this feature

In [None]:
simple_hist(test, 'item_id', 30, 
            'Distribution of items in the test dataframe', 'id items', 0, 25000)
# the distributions of the training and test datasets are similar in this feature

**Dataset submission**

In [None]:
sample_submission = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')

In [None]:
sample_submission.head()

**Data Leakage**

About 42% of the training rows contain a shop_id/item_id pair that is also present in the test set, but I don't exploit this.

In [None]:
# share of training rows whose (item_id, shop_id) pair also occurs in the test set
pair_cols = ['item_id', 'shop_id']
# MultiIndex objects avoid materialising millions of Python tuples in a list
df_temp = pd.MultiIndex.from_frame(train[pair_cols])
test_iter_temp = pd.MultiIndex.from_frame(test[pair_cols])
# format the ratio directly: round(x, 2) * 100 can print float artefacts
# such as 28.999999999999996
print('{:.0%}'.format(df_temp.isin(test_iter_temp).mean()))

In [None]:
#test_shop_ids = test['shop_id'].unique()
#test_item_ids = test['item_id'].unique()
# Only shops that exist in test set.
#train = train[train['shop_id'].isin(test_shop_ids)]
# Only items that exist in test set.
#train = train[train['item_id'].isin(test_item_ids)]

# **Prepare and feature engineering data**

In [None]:
# drop extreme rows: very high prices and very large daily sales counts
has_regular_price = train['item_price'] < 100000
has_regular_count = train['item_cnt_day'] <= 900
train = train[has_regular_price & has_regular_count]

**Aggregate train data**

Since the test data is generated as combinations of shops and items, we have to restructure the train data to match the way the test data was generated.

In [None]:
# key columns that identify one row of the monthly grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# for every month, take the cartesian product of the shops and the items seen in that month
monthly_grids = []
for block_num in train['date_block_num'].unique():
    in_month = train['date_block_num'] == block_num
    month_shops = train.loc[in_month, 'shop_id'].unique()
    month_items = train.loc[in_month, 'item_id'].unique()
    combinations = list(product(month_shops, month_items, [block_num]))
    monthly_grids.append(np.array(combinations, dtype='int32'))
grid = pd.DataFrame(np.vstack(monthly_grids), columns = index_cols, dtype=np.int32)

In [None]:
# total number of units sold per (month, shop, item)
train_merge = (
    train
    .groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

In [None]:
# left-join the monthly sales onto the full grid; combinations without sales get 0
train_merge = grid.merge(train_merge, how='left', on=index_cols).fillna(0)
# limit the monthly target to the range [0, 40]
train_merge['item_cnt_month'] = train_merge['item_cnt_month'].clip(lower=0, upper=40)

**Merge train and item datasets**

In [None]:
# items joined with their category rows, then attached to every grid row by item_id
items_prepare = items.merge(item_categories, on='item_category_id')
train_merge = train_merge.merge(items_prepare, how='left', on=['item_id'])

**Prepare test**

In [None]:
# give the test rows the same key columns as the grid: no ID, and month number 34
test_temp = test.drop('ID', axis=1).assign(date_block_num=34)

In [None]:
# attach category id and category name to the test rows; item_name is not kept
test_temp = (
    test_temp
    .merge(items, how='left', on='item_id')
    .merge(item_categories, how='left', on='item_category_id')
    .drop('item_name', axis=1)
)

In [None]:
# stack the test rows under the training grid with a fresh 0..n-1 index.
# `keys` is not passed: it has no effect together with ignore_index=True, and a
# keys list whose length differs from the number of objects is deprecated in pandas.
train_merge = pd.concat([train_merge, test_temp], axis=0, ignore_index=True)
# columns that exist only on one side (e.g. the target for test rows) are filled with 0
train_merge = train_merge.fillna(0)

**Prepare items features**

Categorization of products.

In [None]:
# prefixes that should be merged into (or renamed to) another common category label
map_dict = {
            'Чистые носители (штучные)': 'Чистые носители',
            'Чистые носители (шпиль)' : 'Чистые носители',
            'PC ': 'Аксессуары',
            'Служебные': 'Служебные '
            }


def _common_category(category_name):
    """Return the text before the first '-' of a category name, remapped through map_dict."""
    prefix = category_name.split('-')[0]
    return map_dict.get(prefix, prefix)


# extract common categories
train_merge['item_category'] = train_merge['item_category_name'].apply(_common_category)
# encoding common categories
category_encoder = LabelEncoder()
train_merge['item_category_common'] = category_encoder.fit_transform(train_merge['item_category'])

**Prepare shops features**

Extract and encode the names of cities. Add new features - coordinates of cities and parts of the country.

In [None]:
# Build per-shop location features: an integer city code, two coordinates and a
# "country part" code. The city is the first whitespace-separated word of shop_name.
# NOTE: this cell overwrites `shops` with a frame that no longer has `shop_name`,
# so it cannot be re-run without reloading shops.csv first.
# extract and encode cities
shops['city'] = shops['shop_name'].apply(lambda x: x.split()[0].lower())
# one spelling carries a leading '!' - map it onto the regular city name
shops.loc[shops.city == '!якутск', 'city'] = 'якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
# add coordinates of cities
# each entry is (city_coord_1, city_coord_2, country_part); the first two values look
# like latitude / longitude - TODO confirm. The meaning of the country_part codes 0-4
# is not documented in this notebook.
coords = dict()
coords['якутск'] = (62.028098, 129.732555, 4)
coords['адыгея'] = (44.609764, 40.100516, 3)
coords['балашиха'] = (55.8094500, 37.9580600, 1)
coords['волжский'] = (53.4305800, 50.1190000, 3)
coords['вологда'] = (59.2239000, 39.8839800, 2)
coords['воронеж'] = (51.6720400, 39.1843000, 3)
# 'выездная' (outbound trade) is not a city: placeholder (0, 0, 0)
coords['выездная'] = (0, 0, 0)
coords['жуковский'] = (55.5952800, 38.1202800, 1)
# 'интернет-магазин' (online shop) is not a city: placeholder (0, 0, 0)
coords['интернет-магазин'] = (0, 0, 0)
coords['казань'] = (55.7887400, 49.1221400, 4)
coords['калуга'] = (54.5293000, 36.2754200, 4)
coords['коломна'] = (55.0794400, 38.7783300, 4)
coords['красноярск'] = (56.0183900, 92.8671700, 4)
coords['курск'] = (51.7373300, 36.1873500, 3)
coords['москва'] = (55.7522200, 37.6155600, 1)
coords['мытищи'] = (55.9116300, 37.7307600, 1)
coords['н.новгород'] = (56.3286700, 44.0020500, 4)
coords['новосибирск'] = (55.0415000, 82.9346000, 4)
coords['омск'] = (54.9924400, 73.3685900, 4)
coords['ростовнадону'] = (47.2313500, 39.7232800, 3)
coords['спб'] = (59.9386300, 30.3141300, 2)
coords['самара'] = (53.2000700, 50.1500000, 4)
coords['сергиев'] = (56.3000000, 38.1333300, 4)
coords['сургут'] = (61.2500000, 73.4166700, 4)
coords['томск'] = (56.4977100, 84.9743700, 4)
coords['тюмень'] = (57.1522200, 65.5272200, 4)
coords['уфа'] = (54.7430600, 55.9677900, 4)
coords['химки'] = (55.8970400, 37.4296900, 1)
# 'цифровой' (digital) is not a city: placeholder (0, 0, 0)
coords['цифровой'] = (0, 0, 0)
coords['чехов'] = (55.1477000, 37.4772800, 4)
coords['ярославль'] = (57.6298700, 39.8736800, 2) 

# a city missing from `coords` raises KeyError in the lookups below
shops['city_coord_1'] = shops['city'].apply(lambda x: coords[x][0])
shops['city_coord_2'] = shops['city'].apply(lambda x: coords[x][1])
shops['country_part'] = shops['city'].apply(lambda x: coords[x][2])

# keep only the join key and the numeric features
shops = shops[['shop_id', 'city_code', 'city_coord_1', 'city_coord_2', 'country_part']]

In [None]:
train_merge = pd.merge(train_merge, shops, on = ['shop_id'], how='left')

In [None]:
train_merge.drop(['item_name', 'item_category_name', 'item_category'], axis=1, inplace=True)

In [None]:
train_merge.head()

**Generate lag feature and mean encoding**

In [None]:
# define lag_feature
def lag_feature(data, lags, column):
    """Append lagged copies of ``column`` to ``data``.

    For every ``lag`` in ``lags`` a column ``<column>_lag_<lag>`` is added. It holds
    the value ``column`` had for the same (shop_id, item_id) pair ``lag`` months
    earlier, stored as float16. Rows without a matching earlier month get NaN.

    Parameters
    ----------
    data : DataFrame with 'date_block_num', 'shop_id', 'item_id' and ``column``.
    lags : iterable of month offsets.
    column : name of the column to lag.

    Returns
    -------
    A new DataFrame with the lag columns appended; the input frame is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = data[keys + [column]]
    for lag in lags:
        lag_name = '{}_lag_{}'.format(column, lag)
        lagged = base.rename(columns={column: lag_name})
        # moving the source rows forward by `lag` months lines them up with the target rows
        lagged['date_block_num'] += lag
        data = data.merge(lagged, how='left', on=keys)
        data[lag_name] = data[lag_name].astype('float16')
    return data

In [None]:
# add sales lags for last 3 months
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_cnt_month')

In [None]:
train_merge.info()

In [None]:
# value reduction
def value_reduction(data):
    """Downcast numeric columns of ``data`` in place to reduce memory usage.

    * ``float64`` columns become ``float32``.
    * ``int64`` / ``int32`` columns without missing values whose values all fit
      into the int16 range (both bounds included) become ``int16``.

    All other columns are left untouched.

    Parameters
    ----------
    data : DataFrame to shrink; it is modified in place.

    Returns
    -------
    The same DataFrame object, so the call can be used in an assignment.
    """
    int16_limits = np.iinfo(np.int16)
    for column in data.columns:
        series = data[column]
        if series.dtype == 'float64':
            data[column] = series.astype(np.float32)
        elif series.dtype == 'int64' or series.dtype == 'int32':
            # inclusive bounds: -32768 and 32767 themselves are representable as int16
            fits_int16 = series.min() >= int16_limits.min and series.max() <= int16_limits.max
            if fits_int16 and series.isnull().sum() == 0:
                data[column] = series.astype(np.int16)
    return data

train_merge = value_reduction(train_merge)

In [None]:
# mean monthly sales of every item over all rows of the same month,
# kept only as lags of 1, 2 and 3 months
item_id_target_mean = (
    train_merge
    .groupby(['date_block_num','item_id'])['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
)
train_merge = train_merge.merge(item_id_target_mean, how='left', on=['date_block_num','item_id'])
train_merge['item_target_enc'] = train_merge['item_target_enc'].fillna(0).astype(np.float16)

# the same-month value is computed from the target itself, so only the lagged copies are kept
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_target_enc')
train_merge = train_merge.drop(columns=['item_target_enc'])

In [None]:
# mean monthly sales of every item within each city, kept only as lags of 1, 2 and 3 months
item_city_keys = ['date_block_num','item_id', 'city_code']
item_id_target_mean = (
    train_merge
    .groupby(item_city_keys)['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
)
train_merge = train_merge.merge(item_id_target_mean, how='left', on=item_city_keys)
train_merge['item_loc_target_enc'] = train_merge['item_loc_target_enc'].fillna(0).astype(np.float16)

# the same-month value is computed from the target itself, so only the lagged copies are kept
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_loc_target_enc')
train_merge = train_merge.drop(columns=['item_loc_target_enc'])

In [None]:
# add target encoding for item/shop for the last 3 months:
# mean of item_cnt_month per (month, item, shop), kept only as lags of 1, 2 and 3 months
# NOTE(review): if every (date_block_num, item_id, shop_id) key occurs exactly once in
# train_merge, this mean equals item_cnt_month itself and the lags below duplicate
# item_cnt_month_lag_1..3 - confirm whether that is intended
item_id_target_mean = train_merge.groupby(['date_block_num','item_id', 'shop_id'])['item_cnt_month'].mean().reset_index().rename(columns={
    "item_cnt_month": "item_shop_target_enc"}, errors="raise")

train_merge = pd.merge(train_merge, item_id_target_mean, on=['date_block_num','item_id', 'shop_id'], how='left')

# keys without a group mean get 0; float16 keeps the frame small
train_merge['item_shop_target_enc'] = (train_merge['item_shop_target_enc']
                                .fillna(0)
                                .astype(np.float16))

train_merge = lag_feature(train_merge, [1, 2, 3], 'item_shop_target_enc')
# the same-month encoding is dropped; only the lag columns stay in the frame
train_merge.drop(['item_shop_target_enc'], axis=1, inplace=True)

In [None]:
# interaction features
# first month in which each item appears in the frame; the flag marks that (item, month) row
first_item_block = train_merge.groupby(['item_id'])['date_block_num'].min().reset_index()
first_item_block['item_first_interaction'] = 1

# first month in which each (shop, item) pair was actually sold.
# The filter is on sales (item_cnt_month > 0): filtering on date_block_num > 0 instead
# would return the first month the pair merely appears in the grid, sold or not.
first_shop_item_buy_block = train_merge[train_merge['item_cnt_month'] > 0].groupby(['shop_id', 'item_id'])['date_block_num'].min().reset_index()
first_shop_item_buy_block['first_date_block_num'] = first_shop_item_buy_block['date_block_num']

In [None]:
# the first-appearance flag is matched on (item, month); the pair's first month on (item, shop)
item_debut = first_item_block[['item_id', 'date_block_num', 'item_first_interaction']]
train_merge = train_merge.merge(item_debut, how='left', on=['item_id', 'date_block_num'])
pair_first_month = first_shop_item_buy_block[['item_id', 'shop_id', 'first_date_block_num']]
train_merge = train_merge.merge(pair_first_month, how='left', on=['item_id', 'shop_id'])

In [None]:
# fillna and change type
# Columns are reassigned instead of calling fillna(inplace=True) on a column selection:
# the chained in-place form is deprecated in pandas and may not update the frame.
# pairs without a first month get a sentinel larger than any month number,
# so the comparison below is False for them
train_merge['first_date_block_num'] = train_merge['first_date_block_num'].fillna(100)
train_merge['shop_item_sold_before'] = (train_merge['first_date_block_num'] < train_merge['date_block_num']).astype('int8')
train_merge = train_merge.drop(['first_date_block_num'], axis=1)

# rows that are not an item's first month have no flag after the left merge -> 0
# (shop_item_sold_before needs no fillna: a boolean comparison never yields NaN)
train_merge['item_first_interaction'] = train_merge['item_first_interaction'].fillna(0).astype('int8')

In [None]:
# average monthly sales, per category, of the rows flagged item_first_interaction == 1,
# kept only as lags of 1, 2 and 3 months
# NOTE(review): lag_feature joins on (date_block_num, shop_id, item_id), so a row gets a lag
# value only if the same shop/item pair has a row in an earlier month - confirm that this
# is the intended way to lag a category-level statistic
item_id_target_mean = train_merge[train_merge['item_first_interaction'] == 1].groupby(['date_block_num','item_category_id'])['item_cnt_month'].mean().reset_index().rename(columns={'item_cnt_month': 'new_item_cat_avg'}, errors='raise')

train_merge = pd.merge(train_merge, item_id_target_mean, on=['date_block_num','item_category_id'], how='left')

# (month, category) groups without flagged rows get 0
train_merge['new_item_cat_avg'] = (train_merge['new_item_cat_avg']
                                .fillna(0)
                                .astype(np.float16))

train_merge = lag_feature(train_merge, [1, 2, 3], 'new_item_cat_avg')
# the same-month average is dropped; only the lag columns stay in the frame
train_merge.drop(['new_item_cat_avg'], axis=1, inplace=True)

In [None]:
train_merge.isna().sum()

In [None]:
# remaining NaN values (e.g. lags without a matching earlier month) become 0
train_merge = train_merge.fillna(0)
# the largest lag is 3 months, so only months from 3 onwards are kept
keep_rows = train_merge['date_block_num'] > 2
train_merge = train_merge[keep_rows]

In [None]:
# save finished dataset to pickle
train_merge.to_pickle('train_merge.pkl')

In [None]:
# time-based split: months before 33 -> train, month 33 -> validation, month 34 -> test
block = train_merge['date_block_num']
X_train = train_merge.loc[block < 33].drop(columns=['item_cnt_month'])
y_train = train_merge.loc[block < 33, 'item_cnt_month']
X_valid = train_merge.loc[block == 33].drop(columns=['item_cnt_month'])
y_valid = train_merge.loc[block == 33, 'item_cnt_month']
X_test = train_merge.loc[block == 34].drop(columns=['item_cnt_month'])
# each shape is followed by one empty line, except the last one
print('Shape X_train: {}'.format(X_train.shape), end='\n\n')
print('Shape y_train: {}'.format(y_train.shape), end='\n\n')
print('Shape X_valid: {}'.format(X_valid.shape), end='\n\n')
print('Shape y_valid: {}'.format(y_valid.shape), end='\n\n')
print('Shape X_test: {}'.format(X_test.shape))

# **Fit models**

**Catboost**

In [None]:
# columns treated as categorical by CatBoost (the same list is passed to LightGBM later on)
cat_features = ['country_part', 
                'item_category_common',
                'item_category_id', 
                'city_code']

# CatBoost regressor optimising RMSE; task_type='GPU' means the cell needs a GPU runtime
# NOTE(review): a tree of depth 4 has at most 2**4 = 16 leaves, so max_leaves = 2047 can
# never be reached - confirm the intended depth / max_leaves combination
# NOTE(review): early_stopping_rounds presumably only takes effect when fit() receives an
# eval_set - confirm against the fit call
catboost = CatBoostRegressor(random_state=1, 
                             iterations=2000, verbose=200, depth = 4, 
                             learning_rate=0.01, l2_leaf_reg=7,
                             max_leaves = 2047, min_data_in_leaf = 1,
                             subsample = 0.7,
                             loss_function='RMSE', eval_metric='RMSE',
                             task_type='GPU',early_stopping_rounds=30,
                             grow_policy='Lossguide', bootstrap_type='Poisson',
                            cat_features=cat_features)

In [None]:
# pass the validation month as eval_set: the model is configured with
# early_stopping_rounds, which needs an evaluation set to monitor
catboost.fit(X_train, y_train, eval_set=(X_valid, y_valid))

In [None]:
# save catboost model; the context manager closes the file handle once the dump is done
with open('catboost.sav', 'wb') as model_file:
    pickle.dump(catboost, model_file)

In [None]:
predict_cb_train = catboost.predict(X_train)
predict_cb_valid = catboost.predict(X_valid)
predict_cb_test = catboost.predict(X_test)
print('Train rmse for Catboost:', np.sqrt(mean_squared_error(y_train, predict_cb_train)))
print('Validation rmse for Catboost:', np.sqrt(mean_squared_error(y_valid, predict_cb_valid)))

**LinearRegression**

In [None]:
# lag features used by the linear model
# NOTE(review): item_target_enc_lag_3 is not in the list - confirm this is deliberate
lr_features = ['item_target_enc_lag_1', 'item_target_enc_lag_2',
              'item_loc_target_enc_lag_1', 'item_loc_target_enc_lag_2', 'item_loc_target_enc_lag_3', 
               'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']
lr_val = X_valid[lr_features]

# the scaler learns min/max from the training rows only and is then applied to all splits
lr_scaler = MinMaxScaler()
lr_train = lr_scaler.fit_transform(X_train[lr_features])
lr_valid = lr_scaler.transform(lr_val)
lr_test = lr_scaler.transform(X_test[lr_features])

lr_level1 = LinearRegression()
lr_level1.fit(lr_train, y_train)

In [None]:
# save linear regression model; the context manager closes the file handle once the dump is done
with open('lr_level1.sav', 'wb') as model_file:
    pickle.dump(lr_level1, model_file)

In [None]:
predict_lr_train = lr_level1.predict(lr_train)
predict_lr_valid = lr_level1.predict(lr_valid)
predict_lr_test = lr_level1.predict(lr_test)
print('Train rmse for LinearRegression:', np.sqrt(mean_squared_error(y_train, predict_lr_train)))
print('Validation rmse for LinearRegression:', np.sqrt(mean_squared_error(y_valid, predict_lr_valid)))

**LightGBM**

In [None]:
# define build model function
def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    """Train a LightGBM booster, evaluating on the training and the validation data.

    Parameters
    ----------
    params : dict of LightGBM parameters, forwarded to ``lgb.train``.
    X_train, y_train : training features and target.
    X_val, y_val : validation features and target.
    cat_features : columns to treat as categorical.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    train_kwargs = dict(params=params, train_set=lgb_train, valid_sets=[lgb_train, lgb_val],
                        categorical_feature=cat_features)
    # `verbose_eval` no longer exists in LightGBM 4.x; releases that provide the
    # log_evaluation callback print the metrics every 50 rounds through it instead
    if hasattr(lgb, 'log_evaluation'):
        train_kwargs['callbacks'] = [lgb.log_evaluation(period=50)]
    else:
        train_kwargs['verbose_eval'] = 50
    model = lgb.train(**train_kwargs)
    return model

# define parameters
params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'num_leaves': 1023,
    'min_data_in_leaf':10,
    'feature_fraction': 0.7,
    'learning_rate': 0.01,
    'num_rounds': 2000,
    'early_stopping_rounds': 30,
    'seed': 1
}
# fit model
lgb_model = build_lgb_model(params, X_train, X_valid, y_train, y_valid, cat_features)

In [None]:
# save lightgbm model; the context manager closes the file handle once the dump is done
with open('lgb_1.sav', 'wb') as model_file:
    pickle.dump(lgb_model, model_file)

In [None]:
predict_lgb_train = lgb_model.predict(X_train)
predict_lgb_valid = lgb_model.predict(X_valid)
predict_lgb_test = lgb_model.predict(X_test)
print('Train rmse for LightGBM:', np.sqrt(mean_squared_error(y_train, predict_lgb_train)))
print('Validation rmse for LightGBM:', np.sqrt(mean_squared_error(y_valid, predict_lgb_valid)))

**RandomForest**

In [None]:
rf = RandomForestRegressor(random_state = 1, max_depth=10, max_features='sqrt', min_samples_leaf=7,
                      min_samples_split=11, n_estimators=75)

In [None]:
rf.fit(X_train, y_train)

In [None]:
# save random forest model; the context manager closes the file handle once the dump is done
with open('rf.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [None]:
predict_rf_train = rf.predict(X_train)
predict_rf_valid = rf.predict(X_valid)
predict_rf_test = rf.predict(X_test)
print('Train rmse for RandomForest:', np.sqrt(mean_squared_error(y_train, predict_rf_train)))
print('Validation rmse for RandomForest:', np.sqrt(mean_squared_error(y_valid, predict_rf_valid)))

**Ensemble model**

Ensemble architecture:
1st level:
* Catboost
* LightGBM
* Random forest
* Linear Regression

2nd level:
* Linear Regression

In [None]:
# training frame of the 2nd-level model: the 1st-level predictions for the
# validation month plus the true target of that month
first_level = pd.DataFrame({
    'catboost': predict_cb_valid,
    'lightgbm': predict_lgb_valid,
    'random_forest': predict_rf_valid,
    'linear_regression': predict_lr_valid,
    'label': y_valid.values,
})
first_level.head(5)

In [None]:
# test frame of the 2nd-level model: the 1st-level predictions for the test month,
# with the same column order as the training frame
first_level_test = pd.DataFrame({
    'catboost': predict_cb_test,
    'lightgbm': predict_lgb_test,
    'random_forest': predict_rf_test,
    'linear_regression': predict_lr_test,
})
first_level_test.head()

In [None]:
meta_model = LinearRegression(n_jobs=-1)

In [None]:
X_first_level = first_level.drop('label', axis=1)
y_first_level = first_level['label']

In [None]:
# trained on validation set using the 1st level models predictions as features
meta_model.fit(X_first_level, y_first_level)

In [None]:
# make predictions on test set using the 1st level models predictions as feature
ensemble_pred_test = meta_model.predict(first_level_test).clip(0, 20)

In [None]:
# save ensemble model; the context manager closes the file handle once the dump is done
with open('meta_model.sav', 'wb') as model_file:
    pickle.dump(meta_model, model_file)

**Predict and submit task**

In [None]:
# IDs are read from the test file's own ID column, so the submission does not
# depend on the DataFrame index happening to equal the ID
submission_ids = test['ID'].values

# LightGBM-only submission: reuse the test predictions computed earlier instead of
# predicting again; clipped to [0, 20]
lgb_submission = pd.DataFrame({
    'ID': submission_ids,
    'item_cnt_month': predict_lgb_test.clip(0, 20)
})
lgb_submission.to_csv('lgb_submission.csv', index=False)
print(lgb_submission)

# stacked-ensemble submission (ensemble_pred_test is already clipped to [0, 20])
ensemble_submission = pd.DataFrame({
    'ID': submission_ids,
    'item_cnt_month': ensemble_pred_test
})
ensemble_submission.to_csv('ensemble_submission.csv', index=False)
print(ensemble_submission)

**Total score**
* LightGBM model (public score) - 0.8981
* LightGBM model (private score) - 0.9120
* Ensemble model (public score) - 0.9050
* Ensemble model (private score) - 0.9070