In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [None]:
# Read the training sales records (sales_train.csv) and preview the first rows.
sales_train_csv = '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
df_train = pd.read_csv(sales_train_csv)
df_train.head()

In [None]:
# Read the rows to predict (test.csv) and preview the first rows.
test_csv = "/kaggle/input/competitive-data-science-predict-future-sales/test.csv"
df_test = pd.read_csv(test_csv)
df_test.head()

In [None]:
# Read the item lookup table (items.csv) and preview the first rows.
items_csv = '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
df_items = pd.read_csv(items_csv)
df_items.head()

In [None]:
# Read the shop lookup table (shops.csv) and preview the first rows.
shops_csv = "/kaggle/input/competitive-data-science-predict-future-sales/shops.csv"
df_shops = pd.read_csv(shops_csv)
df_shops.head()

In [None]:
# Read the item-category lookup table (item_categories.csv) and preview the first rows.
item_categories_csv = "/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv"
df_item_categories = pd.read_csv(item_categories_csv)
df_item_categories.head()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame's columns.
df_train.describe()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Number of missing values per column of the training frame. The call
# parentheses matter: without them the cell displays the bound method object
# instead of the counts.
df_train.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
df_test.info()

In [None]:
# Summary statistics of the test frame's columns.
df_test.describe()

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Column names, dtypes and non-null counts of the shops table.
df_shops.info()

In [None]:
# Summary statistics of the shops table's columns.
df_shops.describe()

In [None]:
# Number of missing values per column of the shops table. The call
# parentheses matter: without them the cell displays the bound method object
# instead of the counts.
df_shops.isnull().sum()

In [None]:
# Column names, dtypes and non-null counts of the item-category table.
df_item_categories.info()

In [None]:
# Summary statistics of the item-category table's columns.
df_item_categories.describe()

In [None]:
# Keep only rows whose price is below 100000 and whose daily count is at most 900;
# rows outside these limits are discarded as extreme values.
price_within_limit = df_train.item_price < 100000
count_within_limit = df_train.item_cnt_day <= 900
df_train = df_train[price_within_limit & count_within_limit]

In [None]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month (date_block_num) build the cartesian product of the shops and
# the items that appear in that month, so shop/item pairs without a sales row
# still get a grid row.
grid_parts = []
for block_num in df_train['date_block_num'].unique():
    block_sales = df_train.loc[df_train['date_block_num'] == block_num]
    block_shops = block_sales['shop_id'].unique()
    block_items = block_sales['item_id'].unique()
    combos = list(product(block_shops, block_items, [block_num]))
    grid_parts.append(np.array(combos, dtype='int32'))

# Stack the per-month arrays into a single frame with the index columns.
grid = pd.DataFrame(np.vstack(grid_parts), columns=index_cols, dtype=np.int32)

In [None]:
# Sum the daily counts into one monthly total per (month, shop, item).
monthly_sales = (
    df_train
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
    .reset_index()
)

# Attach the totals to the full grid; grid rows without sales get 0.
train_merge = grid.merge(monthly_sales, on=index_cols, how='left').fillna(0)

# Limit the monthly target to the range [0, 40].
train_merge['item_cnt_month'] = train_merge['item_cnt_month'].clip(lower=0, upper=40)

In [None]:
# Item table enriched with its category columns, then joined onto every
# training row by item_id.
items_prepare = df_items.merge(df_item_categories, on='item_category_id')
train_merge = train_merge.merge(items_prepare, on=['item_id'], how='left')

In [None]:
# Shape the test rows like the training rows: tag them as month 34, drop the
# submission ID, and attach item and item-category columns (item_name excluded).
test_temp = (
    df_test
    .assign(date_block_num=34)
    .drop(columns='ID')
    .merge(df_items, how='left', on='item_id')
    .merge(df_item_categories, how='left', on='item_category_id')
    .drop(columns='item_name')
)

In [None]:
# Append the test rows below the training rows with a fresh 0..n-1 index.
# No `keys` argument is passed: the previous `keys=index_cols` had no effect
# together with `ignore_index=True`, and its three labels did not match the two
# frames being concatenated.
# Columns that test_temp does not carry (e.g. item_cnt_month, item_name) come
# out as NaN for the test rows and are filled with 0.
train_merge = pd.concat([train_merge, test_temp], axis=0, ignore_index=True).fillna(0)

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
# Prefixes (text before the first '-') that are folded into another prefix.
# Trailing spaces are significant: the prefixes are compared exactly as split.
map_dict = {
    'Чистые носители (штучные)': 'Чистые носители',
    'Чистые носители (шпиль)' : 'Чистые носители',
    'PC ': 'Аксессуары',
    'Служебные': 'Служебные ',
}

# Common category = prefix of item_category_name, remapped through map_dict
# when the prefix is listed there and kept unchanged otherwise.
train_merge['item_category'] = (
    train_merge['item_category_name']
    .map(lambda name: name.split('-')[0])
    .map(lambda prefix: map_dict.get(prefix, prefix))
)

# Integer code for each common category.
train_merge['item_category_common'] = LabelEncoder().fit_transform(train_merge['item_category'])

In [None]:
# City = first whitespace-separated token of the shop name, lower-cased; the
# '!якутск' spelling is folded into 'якутск'.
df_shops['city'] = df_shops['shop_name'].map(lambda name: name.split()[0].lower())
df_shops['city'] = df_shops['city'].replace('!якутск', 'якутск')
df_shops['city_code'] = LabelEncoder().fit_transform(df_shops['city'])

# Per-city tuple: (city_coord_1, city_coord_2, country_part).
# The first two entries are presumably latitude and longitude; (0, 0, 0) is used
# for the tokens 'выездная', 'интернет-магазин' and 'цифровой'.
coords = {
    'якутск': (62.028098, 129.732555, 4),
    'адыгея': (44.609764, 40.100516, 3),
    'балашиха': (55.8094500, 37.9580600, 1),
    'волжский': (53.4305800, 50.1190000, 3),
    'вологда': (59.2239000, 39.8839800, 2),
    'воронеж': (51.6720400, 39.1843000, 3),
    'выездная': (0, 0, 0),
    'жуковский': (55.5952800, 38.1202800, 1),
    'интернет-магазин': (0, 0, 0),
    'казань': (55.7887400, 49.1221400, 4),
    'калуга': (54.5293000, 36.2754200, 4),
    'коломна': (55.0794400, 38.7783300, 4),
    'красноярск': (56.0183900, 92.8671700, 4),
    'курск': (51.7373300, 36.1873500, 3),
    'москва': (55.7522200, 37.6155600, 1),
    'мытищи': (55.9116300, 37.7307600, 1),
    'н.новгород': (56.3286700, 44.0020500, 4),
    'новосибирск': (55.0415000, 82.9346000, 4),
    'омск': (54.9924400, 73.3685900, 4),
    'ростовнадону': (47.2313500, 39.7232800, 3),
    'спб': (59.9386300, 30.3141300, 2),
    'самара': (53.2000700, 50.1500000, 4),
    'сергиев': (56.3000000, 38.1333300, 4),
    'сургут': (61.2500000, 73.4166700, 4),
    'томск': (56.4977100, 84.9743700, 4),
    'тюмень': (57.1522200, 65.5272200, 4),
    'уфа': (54.7430600, 55.9677900, 4),
    'химки': (55.8970400, 37.4296900, 1),
    'цифровой': (0, 0, 0),
    'чехов': (55.1477000, 37.4772800, 4),
    'ярославль': (57.6298700, 39.8736800, 2),
}

# Unpack the tuple into one column per position; a city missing from `coords`
# raises KeyError.
for position, feature in enumerate(['city_coord_1', 'city_coord_2', 'country_part']):
    df_shops[feature] = df_shops['city'].apply(lambda city, i=position: coords[city][i])

# Keep only the join key and the engineered shop features.
df_shops = df_shops[['shop_id', 'city_code', 'city_coord_1', 'city_coord_2', 'country_part']]

In [None]:
# Join the shop features onto every row, then drop the text columns that have
# already been used (item_category was encoded into item_category_common).
train_merge = train_merge.merge(df_shops, on=['shop_id'], how='left')
train_merge = train_merge.drop(columns=['item_name', 'item_category_name', 'item_category'])
train_merge.head()

In [None]:
def lag_feature(data, lags, column):
    """Append lagged copies of `column` to `data`.

    For every `lag` in `lags`, a column named ``<column>_lag_<lag>`` is added
    that holds the value `column` had for the same (shop_id, item_id) pair
    `lag` date blocks earlier. Rows with no matching earlier block get NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id', 'item_id' and `column`.
    lags : iterable of int
        Offsets, in date blocks, to look back.
    column : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of successive left merges) with one extra
        float16 column per lag. The input frame is not modified.
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    source = data[join_keys + [column]]
    for lag in lags:
        lag_column = column + '_lag_' + str(lag)
        shifted = source.copy()
        shifted.columns = join_keys + [lag_column]
        # Moving the block number forward by `lag` makes a row of block t line
        # up with block t + lag in the merge below.
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        data = data.merge(shifted, on=join_keys, how='left')
        data[lag_column] = data[lag_column].astype('float16')
    return data
# Add the monthly sales count of the previous 1, 2 and 3 months as features.
train_merge = lag_feature(data=train_merge, lags=[1, 2, 3], column='item_cnt_month')


In [None]:
# Column names, dtypes and non-null counts of the merged feature frame.
train_merge.info()

In [None]:
def value_reduction(data):
    """Downcast numeric columns of `data` to smaller dtypes.

    * float64 columns become float32.
    * int64 / int32 columns become int16 when every value lies strictly
      between -32768 and 32767 and the column has no missing values.

    The frame is modified in place and the same object is returned.
    """
    for name in data.columns:
        values = data[name]
        kind = values.dtype
        if kind == 'float64':
            data[name] = values.astype(np.float32)
        elif kind == 'int64' or kind == 'int32':
            # Bounds are exclusive: a column touching 32767 or -32768 is left as is.
            inside_int16 = values.max() < 32767 and values.min() > -32768
            if inside_int16 and values.isnull().sum() == 0:
                data[name] = values.astype(np.int16)
    return data

# Shrink numeric dtypes before adding more feature columns.
train_merge = value_reduction(train_merge)

# Mean encoding per (month, item): the average item_cnt_month over all rows of
# that item in that month.
item_id_target_mean = (
    train_merge
    .groupby(['date_block_num', 'item_id'])['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_target_enc"}, errors="raise")
)
train_merge = train_merge.merge(item_id_target_mean, on=['date_block_num', 'item_id'], how='left')
train_merge['item_target_enc'] = train_merge['item_target_enc'].fillna(0).astype(np.float16)

# Only the 1-3 month lags are kept; the same-month encoding is dropped.
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_target_enc')
train_merge = train_merge.drop(columns=['item_target_enc'])

In [None]:
# Mean encoding per (month, item, city): the average item_cnt_month over all
# rows of that item in that city and month.
city_item_keys = ['date_block_num', 'item_id', 'city_code']
item_id_target_mean = (
    train_merge
    .groupby(city_item_keys)['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_loc_target_enc"}, errors="raise")
)
train_merge = train_merge.merge(item_id_target_mean, on=city_item_keys, how='left')
train_merge['item_loc_target_enc'] = train_merge['item_loc_target_enc'].fillna(0).astype(np.float16)

# Only the 1-3 month lags are kept; the same-month encoding is dropped.
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_loc_target_enc')
train_merge = train_merge.drop(columns=['item_loc_target_enc'])

In [None]:
# Mean encoding per (month, item, shop).
# NOTE(review): if (date_block_num, shop_id, item_id) identifies a single row,
# as the grid construction suggests, this mean equals item_cnt_month itself and
# the lags below duplicate the item_cnt_month lags - confirm this is intended.
shop_item_keys = ['date_block_num', 'item_id', 'shop_id']
item_id_target_mean = (
    train_merge
    .groupby(shop_item_keys)['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={"item_cnt_month": "item_shop_target_enc"}, errors="raise")
)
train_merge = train_merge.merge(item_id_target_mean, on=shop_item_keys, how='left')
train_merge['item_shop_target_enc'] = train_merge['item_shop_target_enc'].fillna(0).astype(np.float16)

# Only the 1-3 month lags are kept; the same-month encoding is dropped.
train_merge = lag_feature(train_merge, [1, 2, 3], 'item_shop_target_enc')
train_merge = train_merge.drop(columns=['item_shop_target_enc'])
# Interaction features.
# First month in which each item appears at all, flagged with a constant 1 so a
# later merge on (item_id, date_block_num) marks exactly those rows.
first_item_block = (
    train_merge
    .groupby(['item_id'])['date_block_num']
    .min()
    .reset_index()
    .assign(item_first_interaction=1)
)

# First month (among months > 0) in which each shop/item pair has a row.
# NOTE(review): the filter is on date_block_num, not on item_cnt_month, so this
# is the first month the pair is present rather than the first month with
# sales - confirm that matches the intent of "first buy".
after_first_month = train_merge[train_merge['date_block_num'] > 0]
first_shop_item_buy_block = (
    after_first_month
    .groupby(['shop_id', 'item_id'])['date_block_num']
    .min()
    .reset_index()
    .assign(first_date_block_num=lambda frame: frame['date_block_num'])
)

In [None]:
# Flag rows that fall in the first month of their item (NaN elsewhere).
item_first_cols = ['item_id', 'date_block_num', 'item_first_interaction']
train_merge = train_merge.merge(first_item_block[item_first_cols], on=['item_id', 'date_block_num'], how='left')

# Attach the first month of every shop/item pair to all rows of that pair.
pair_first_cols = ['item_id', 'shop_id', 'first_date_block_num']
train_merge = train_merge.merge(first_shop_item_buy_block[pair_first_cols], on=['item_id', 'shop_id'], how='left')

In [None]:
# Pairs without a first block get the sentinel 100, which is larger than every
# date_block_num used in this notebook, so the comparison below is False for them.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `train_merge[col]` is a chained in-place update that pandas deprecates and
# that has no effect on the frame under Copy-on-Write.
train_merge['first_date_block_num'] = train_merge['first_date_block_num'].fillna(100)

# 1 when the shop/item pair already appeared in an earlier month, else 0.
# A boolean comparison never yields NaN, so no fillna is needed before the cast.
train_merge['shop_item_sold_before'] = (
    train_merge['first_date_block_num'] < train_merge['date_block_num']
).astype('int8')
train_merge = train_merge.drop(columns=['first_date_block_num'])

# Rows that did not match first_item_block carry NaN; they become 0 so the flag
# can be stored as int8.
train_merge['item_first_interaction'] = (
    train_merge['item_first_interaction'].fillna(0).astype('int8')
)

In [None]:
# Average item_cnt_month of first-appearance rows (item_first_interaction == 1),
# per (month, item category).
new_item_rows = train_merge[train_merge['item_first_interaction'] == 1]
item_id_target_mean = (
    new_item_rows
    .groupby(['date_block_num', 'item_category_id'])['item_cnt_month']
    .mean()
    .reset_index()
    .rename(columns={'item_cnt_month': 'new_item_cat_avg'}, errors='raise')
)
train_merge = train_merge.merge(item_id_target_mean, on=['date_block_num', 'item_category_id'], how='left')
train_merge['new_item_cat_avg'] = train_merge['new_item_cat_avg'].fillna(0).astype(np.float16)

# Only the 1-3 month lags are kept; the same-month average is dropped.
train_merge = lag_feature(train_merge, [1, 2, 3], 'new_item_cat_avg')
train_merge = train_merge.drop(columns=['new_item_cat_avg'])

In [None]:
# Number of missing values per column of the merged feature frame.
train_merge.isna().sum()

In [None]:
# Replace the remaining NaN values (e.g. lags without an earlier month) with 0.
train_merge = train_merge.fillna(0)

# Keep blocks 3 and later: with a maximum lag of 3, earlier blocks cannot have
# all of their lag features.
train_merge = train_merge.loc[lambda frame: frame['date_block_num'] > 2]

# Save the finished dataset to the working directory.
train_merge.to_pickle('train_merge.pkl')

In [None]:
# Time-based split: blocks before 33 train the model, block 33 validates it and
# block 34 (the test rows) is what gets predicted.
is_train = train_merge.date_block_num < 33
is_valid = train_merge.date_block_num == 33
is_test = train_merge.date_block_num == 34

X_train = train_merge[is_train].drop(columns=['item_cnt_month'])
y_train = train_merge.loc[is_train, 'item_cnt_month']
X_valid = train_merge[is_valid].drop(columns=['item_cnt_month'])
y_valid = train_merge.loc[is_valid, 'item_cnt_month']
X_test = train_merge[is_test].drop(columns=['item_cnt_month'])

# Report the shape of every split, separated by blank lines.
shape_reports = [
    ('Shape X_train: {}', X_train),
    ('Shape y_train: {}', y_train),
    ('Shape X_valid: {}', X_valid),
    ('Shape y_valid: {}', y_valid),
    ('Shape X_test: {}', X_test),
]
for position, (template, part) in enumerate(shape_reports):
    if position:
        print()
    print(template.format(part.shape))

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with a fixed random_state, fitted on the training split.
rf = RandomForestRegressor(
    n_estimators=75,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=11,
    min_samples_leaf=7,
    random_state=1,
)
rf.fit(X_train, y_train)

In [None]:
import pickle
# Persist the fitted model. The context manager closes the file handle even if
# pickling fails; the previous bare open() call never closed it.
with open('rf.sav', 'wb') as model_file:
    pickle.dump(rf, model_file)

# Predictions for every split; the test predictions are kept for later use.
predict_rf_train = rf.predict(X_train)
predict_rf_valid = rf.predict(X_valid)
predict_rf_test = rf.predict(X_test)

# RMSE = square root of the mean squared error, on the training and validation splits.
print('Train rmse for RandomForest:', np.sqrt(mean_squared_error(y_train, predict_rf_train)))
print('Validation rmse for RandomForest:', np.sqrt(mean_squared_error(y_valid, predict_rf_valid)))