In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
INPUT_DIR = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read every competition table in one pass; the file stems are listed in the
# same order as the target names on the left-hand side.
(
    sales_train_df,
    item_categories_df,
    items_df,
    shops_df,
    test_df,
    submission_df,
) = (
    pd.read_csv(f'{INPUT_DIR}/{table}.csv')
    for table in (
        'sales_train',
        'item_categories',
        'items',
        'shops',
        'test',
        'sample_submission',
    )
)

# Sales train

In [None]:
# Column dtypes and non-null counts, followed by a preview of the raw sales table.
sales_train_df.info()
sales_train_df

In [None]:
# Summary statistics for the raw sales table.
sales_train_df.describe()

In [None]:
# Number of missing values per column
sales_train_df.isna().sum()

# Item Categories

In [None]:
# Column dtypes and non-null counts of the item-category table.
item_categories_df.info()

In [None]:
# Number of missing values per column
item_categories_df.isna().sum()


# Items

In [None]:
# Column dtypes and non-null counts of the items table.
items_df.info()

In [None]:
# Number of missing values per column
items_df.isna().sum()

# Shops

In [None]:
# Column dtypes and non-null counts, then the first rows of the shops table.
shops_df.info()
shops_df.head()

In [None]:
# Number of missing values per column
shops_df.isna().sum()

# Test

In [None]:
# Column dtypes and non-null counts of the test set.
test_df.info()
# NOTE(review): a bare expression that is not the last statement of a cell is
# not rendered by Jupyter by default, so this line displays nothing.
test_df

# Working copy: the later shop-id fixes and dtype casts are applied to `test`,
# leaving test_df as loaded.
test = test_df.copy()

# Outliers

In [None]:
# Box plot of the daily unit counts, with the x-range pinned to [-100, 3000].
count_fig, count_ax = plt.subplots(figsize=(10, 4))
count_ax.set_xlim(-100, 3000)
sns.boxplot(x=sales_train_df.item_cnt_day, ax=count_ax)

# Box plot of item prices, spanning the minimum price up to 110% of the maximum.
price_fig, price_ax = plt.subplots(figsize=(10, 4))
price_ax.set_xlim(sales_train_df.item_price.min(), sales_train_df.item_price.max()*1.1)
sns.boxplot(x=sales_train_df.item_price, ax=price_ax)

In [None]:
# Keep only rows under both outlier thresholds: price below 100000 and daily
# count below 1001. sales_train_df itself is left unmodified.
within_limits = (sales_train_df.item_price < 100000) & (sales_train_df.item_cnt_day < 1001)
train = sales_train_df[within_limits].copy()

# Price below zero

In [None]:
# Replace negative prices with the median positive price seen for shop 32,
# item 2973 in month block 4.
# NOTE(review): presumably that shop/item/month is where the negative price
# occurs — confirm against the data.
same_slot = (train.shop_id == 32) & (train.item_id == 2973) & (train.date_block_num == 4)
median = train.loc[same_slot & (train.item_price > 0), 'item_price'].median()
train.loc[train.item_price < 0, 'item_price'] = median

# Duplicated shops

In [None]:
# Show the shops table to look for duplicated shop entries.
shops_df

In [None]:
# Fold duplicated shop ids into a single id, in both train and test:
#   0  -> 57  (Yakutsk, Ordzhonikidze 56)
#   1  -> 58  (Yakutsk, "Central" shopping mall)
#   10 -> 11  (Zhukovsky, Chkalova St. 39m²)
for duplicate_id, canonical_id in ((0, 57), (1, 58), (10, 11)):
    for frame in (train, test):
        frame.loc[frame.shop_id == duplicate_id, 'shop_id'] = canonical_id

# Cities and item types

In [None]:
# Working copy of the shops table for the city feature built next.
shops = shops_df.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Rewrite the one shop name whose city part contains a space, so that the first
# space-separated token of every shop name can serve as its city.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
first_token = shops['shop_name'].str.split(' ').map(lambda tokens: tokens[0])
# Map the '!'-prefixed spelling onto the plain one so both share a city label.
shops['city'] = first_token.replace({'!Якутск': 'Якутск'})
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])
# Only the id and the integer city code are needed downstream.
shops = shops[['shop_id','city_code']]

In [None]:
# First rows of the reduced shops table (shop_id, city_code).
shops.head()

In [None]:
# Working copies of the category and item tables; the originals stay as loaded.
cats = item_categories_df.copy()
items = items_df.copy()
cats.head()

In [None]:
# Split each category name on '-': the first piece is the type, the second
# piece (when there is one) is the subtype.
name_parts = cats['item_category_name'].str.split('-')
cats['split'] = name_parts
cats['type'] = name_parts.map(lambda parts: parts[0].strip())
# A name with no second piece reuses its type as the subtype.
cats['subtype'] = name_parts.map(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)
# Integer-encode both labels with independent encoders.
for label in ('type', 'subtype'):
    cats[f'{label}_code'] = LabelEncoder().fit_transform(cats[label])
cats = cats[['item_category_id','type_code', 'subtype_code']]

# The free-text item name is not used as a feature.
items = items.drop(columns=['item_name'])

In [None]:
# Show the encoded category table (item_category_id, type_code, subtype_code).
cats

# Generate matrix

In [None]:
from itertools import product
cols = ['date_block_num','shop_id','item_id']

# One grid per month block 0..33: every shop that appears in that month's sales
# crossed with every item that appears in that month's sales.
monthly_grids = []
for block in range(34):
    block_sales = train[train.date_block_num == block]
    pairs = product([block], block_sales.shop_id.unique(), block_sales.item_id.unique())
    monthly_grids.append(np.array(list(pairs), dtype='int16'))

# Stack the monthly grids, downcast the key columns and order rows by the keys.
matrix = (
    pd.DataFrame(np.vstack(monthly_grids), columns=cols)
    .astype({'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16})
    .sort_values(cols)
)

In [None]:
# Preview the (date_block_num, shop_id, item_id) grid.
matrix

In [None]:
# Revenue of each sales record: price multiplied by the number of units sold.
train['revenue'] = train['item_price'] *  train['item_cnt_day']

In [None]:
# Total units sold per (month, shop, item).
group = (
    train.groupby(['date_block_num','shop_id','item_id'])
    .agg(item_cnt_month=('item_cnt_day', 'sum'))
    .reset_index()
)

# Attach the monthly totals to the grid. Pairs with no sales that month get 0,
# and the target is clipped to the [0, 20] range before being stored as float16.
matrix = matrix.merge(group, on=cols, how='left')
monthly_counts = matrix['item_cnt_month'].fillna(0)
matrix['item_cnt_month'] = monthly_counts.clip(0, 20).astype(np.float16)

In [None]:
# Preview the grid with the clipped monthly target attached.
matrix

# Adding Test Set

In [None]:
# Tag the test rows as month block 34 (the block right after the training
# range 0..33) and give the key columns the same compact dtypes as the grid.
test = test.assign(date_block_num=34).astype(
    {'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16}
)

In [None]:
# Append the test month below the training grid so that lag features can be
# built for it through the same code path.
# The former `keys=cols` argument was removed: it is ignored when
# ignore_index=True, and its length (3) did not match the two frames being
# concatenated, which newer pandas releases deprecate.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# Columns present in only one of the two frames are NaN after the concat
# (e.g. item_cnt_month on the test rows); fill those gaps with 0.
matrix = matrix.fillna(0)
# NOTE(review): any test-only column (such as an ID column, if test.csv has one)
# stays in `matrix` and is not dropped in the cells shown here, so it would end
# up among the model features — confirm this is intended.

In [None]:
# Preview after appending the test rows (date_block_num 34).
matrix

# Merge with shops/items/categories

In [None]:
# Join the static lookups onto every grid row (city per shop, category per
# item, type/subtype per category), then downcast the new codes to int8.
matrix = (
    matrix
    .merge(shops, on=['shop_id'], how='left')
    .merge(items, on=['item_id'], how='left')
    .merge(cats, on=['item_category_id'], how='left')
    .astype({
        'city_code': np.int8,
        'item_category_id': np.int8,
        'type_code': np.int8,
        'subtype_code': np.int8,
    })
)

In [None]:
# Preview after joining the city and category codes.
matrix

# Adding lags

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of ``col`` to ``df``.

    For each lag ``k`` in ``lags`` a column ``<col>_lag_<k>`` is added holding
    the value ``col`` had for the same (shop_id, item_id) pair ``k`` month
    blocks earlier. Rows with no matching earlier record get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id', 'item_id' and ``col``.
    lags : iterable of int
        Offsets, in month blocks, to look back.
    col : str
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus one extra column per lag, in ``lags`` order.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        # Moving the month forward by `lag` makes a record from month t line up
        # with the row for month t + lag in the left join below.
        lagged = (
            base
            .rename(columns={col: col + '_lag_' + str(lag)})
            .assign(date_block_num=lambda frame: frame['date_block_num'] + lag)
        )
        df = pd.merge(df, lagged, on=keys, how='left')
    return df

In [None]:
# Target lags: item_cnt_month from 1, 2, 3, 6, 9 and 12 month blocks earlier.
matrix = lag_feature(matrix, [1,2,3,6,9,12], 'item_cnt_month')

# Adding mean

In [None]:
# Mean monthly count over all rows of each month block. Only its 1-month lag is
# kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num'])
    .agg(date_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num'], how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')
matrix = matrix.drop(columns=['date_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, item). Only its lags of 1, 2, 3, 6, 9 and 12
# months are kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'item_id'])
    .agg(date_item_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1,2,3,6,9,12], 'date_item_avg_item_cnt')
matrix = matrix.drop(columns=['date_item_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, shop). Only its lags of 1, 2, 3, 6, 9 and 12
# months are kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'shop_id'])
    .agg(date_shop_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1,2,3,6,9,12], 'date_shop_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, item category). Only its 1-month lag is kept;
# the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'item_category_id'])
    .agg(date_cat_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
matrix = matrix.drop(columns=['date_cat_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, shop, item category). Only its 1-month lag is
# kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'shop_id', 'item_category_id'])
    .agg(date_shop_cat_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_cat_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, shop, category type). Only its 1-month lag is
# kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'shop_id', 'type_code'])
    .agg(date_shop_type_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'type_code'], how='left')
matrix['date_shop_type_avg_item_cnt'] = matrix['date_shop_type_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_shop_type_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_type_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, shop, category subtype). Only its 1-month lag
# is kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'shop_id', 'subtype_code'])
    .agg(date_shop_subtype_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'subtype_code'], how='left')
matrix['date_shop_subtype_avg_item_cnt'] = matrix['date_shop_subtype_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_shop_subtype_avg_item_cnt')
matrix = matrix.drop(columns=['date_shop_subtype_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, city). Only its 1-month lag is kept; the
# same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'city_code'])
    .agg(date_city_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'city_code'], how='left')
matrix['date_city_avg_item_cnt'] = matrix['date_city_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_city_avg_item_cnt')
matrix = matrix.drop(columns=['date_city_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, item, city). Only its 1-month lag is kept; the
# same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'item_id', 'city_code'])
    .agg(date_item_city_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'item_id', 'city_code'], how='left')
matrix['date_item_city_avg_item_cnt'] = matrix['date_item_city_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_item_city_avg_item_cnt')
matrix = matrix.drop(columns=['date_item_city_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, category type). Only its 1-month lag is kept;
# the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'type_code'])
    .agg(date_type_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'type_code'], how='left')
matrix['date_type_avg_item_cnt'] = matrix['date_type_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_type_avg_item_cnt')
matrix = matrix.drop(columns=['date_type_avg_item_cnt'])

In [None]:
# Mean monthly count per (month, category subtype). Only its 1-month lag is
# kept; the same-month column is dropped again at the end.
group = (
    matrix.groupby(['date_block_num', 'subtype_code'])
    .agg(date_subtype_avg_item_cnt=('item_cnt_month', 'mean'))
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'subtype_code'], how='left')
matrix['date_subtype_avg_item_cnt'] = matrix['date_subtype_avg_item_cnt'].astype('float16')
matrix = lag_feature(matrix, [1], 'date_subtype_avg_item_cnt')
matrix = matrix.drop(columns=['date_subtype_avg_item_cnt'])

In [None]:
# Drop month blocks 0-11: the longest lag built above is 12 months, so these
# rows cannot have a complete set of lag features.
matrix = matrix[matrix.date_block_num > 11]

In [None]:
def fill_na(df):
    """Replace missing values with 0 in every lagged item-count column.

    A column is touched only when its name contains both '_lag_' and
    'item_cnt'; every other column is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Matching columns are overwritten on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the caller can reassign it.
    """
    for col in df.columns:
        if '_lag_' in col and 'item_cnt' in col and df[col].isnull().any():
            # Assign the filled column back instead of calling
            # fillna(inplace=True) on df[col]: the in-place form acts on a
            # temporary Series and is not guaranteed to reach df (it is a
            # no-op under pandas copy-on-write).
            df[col] = df[col].fillna(0)
    return df

# Zero-fill the missing lagged item-count features.
matrix = fill_na(matrix)

In [None]:
# List the columns of the finished feature matrix.
matrix.columns

# train & test

In [None]:
# Independent copy of the feature matrix, used for the splits below.
data = matrix.copy()

In [None]:
# Time-based partitions, keyed on the month block:
#   up to 33 -> full labelled set (X, y)
#   below 33 -> training part
#   33       -> validation month
#   34       -> test month (features only)
month = data.date_block_num
labelled_rows, train_rows, valid_rows, test_rows = month <= 33, month < 33, month == 33, month == 34

X = data[labelled_rows].drop(columns=['item_cnt_month'])
y = data.loc[labelled_rows, 'item_cnt_month']
X_train = data[train_rows].drop(columns=['item_cnt_month'])
y_train = data.loc[train_rows, 'item_cnt_month']
X_valid = data[valid_rows].drop(columns=['item_cnt_month'])
y_valid = data.loc[valid_rows, 'item_cnt_month']
X_test = data[test_rows].drop(columns=['item_cnt_month'])

# Delete data that is not needed

In [None]:
# Release the intermediate frames; only `data`, `test`, `submission_df` and the
# split arrays are needed from here on.
del (
    matrix,
    sales_train_df,
    train,
    shops_df,
    shops,
    cats,
    item_categories_df,
    items_df,
    items,
)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline, fitted on the training months and scored on
# the validation month.
reg = LinearRegression()
reg.fit(X_train,y_train)

# RMSE is computed as the square root of the MSE instead of passing
# squared=False: that keyword is deprecated and then removed in newer
# scikit-learn releases, while this form works on every version.
linear_rmse = np.sqrt(mean_squared_error(y_valid, reg.predict(X_valid)))
print('Linear Regression RMSE:', linear_rmse)

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [None]:
# Disabled grid search over the Ridge `alpha` penalty. It is kept as a string
# literal, so nothing below executes; the cell only displays the text.
'''parameters = {'alpha':[0, 1, 10, 100, 1000, 10000, 100000, 1000000]}

model = Ridge()
Ridge_reg= GridSearchCV(model, parameters, scoring='neg_root_mean_squared_error',cv=5)

Ridge_reg.fit(X,y)
print(Ridge_reg.best_estimator_)'''

In [None]:
# Ridge regression with a fixed penalty, fitted on the training months and
# scored on the validation month.
reg_ridge = Ridge(alpha = 1000)
reg_ridge.fit(X_train,y_train)

# RMSE is computed as the square root of the MSE instead of passing
# squared=False: that keyword is deprecated and then removed in newer
# scikit-learn releases, while this form works on every version.
ridge_rmse = np.sqrt(mean_squared_error(y_valid, reg_ridge.predict(X_valid)))
print('Ridge RMSE:', ridge_rmse)

In [None]:
# Refit on every labelled month, then predict the test month with the output
# limited to the same [0, 20] range used for the target.
reg_ridge.fit(X, y)
Y_test = np.clip(reg_ridge.predict(X_test), 0, 20)

submissionRidge = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_test})

# LightGBM

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from lightgbm import LGBMRegressor

In [None]:
import lightgbm

# Gradient-boosted trees, early-stopped on a random hold-out split.
lgbm_model = LGBMRegressor(
    n_jobs=-1,
    learning_rate=0.2,
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    # The former `eta=0.3` was removed: `eta` is a LightGBM alias of
    # `learning_rate`, so it conflicted with learning_rate=0.2 above.
    seed=42
)

# NOTE(review): this random split replaces the month-based X_train / X_valid
# defined earlier, so validation rows are drawn from the same months as the
# training rows — confirm that this is intended for early stopping.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=7)
eval_set = [(X_valid, y_valid)]

# Early stopping and per-iteration logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose` arguments of fit() are not accepted by
# LightGBM 4. `log_evaluation` is only added where the installed release has it.
fit_callbacks = [lightgbm.early_stopping(stopping_rounds=3)]
if hasattr(lightgbm, 'log_evaluation'):
    fit_callbacks.append(lightgbm.log_evaluation(period=1))

lgbm_model.fit(
    X_train,
    y_train,
    eval_metric="rmse",
    eval_set=eval_set,
    callbacks=fit_callbacks)

In [None]:
# Predict the test month with the boosted model, limited to the [0, 20] range.
Y_test = np.clip(lgbm_model.predict(X_test), 0, 20)

submissionLGBM = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_test})

# LSTM

In [None]:
# Random 67/33 split of the labelled rows with a fixed seed; it rebinds
# X_train, X_valid, y_train and y_valid.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=7)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential

In [None]:
tf.keras.backend.clear_session()

# Each sample is fed as a sequence with one step per feature column and a
# single value per step. The sequence length is read from X_train rather than
# hard-coded (it was 35), so the model follows the feature set.
n_features = X_train.shape[1]

model = Sequential()
model.add(LSTM(units=64, input_shape=(n_features, 1)))
model.add(Dropout(0.3))
model.add(Dense(1))

model.compile(loss='mse',
              optimizer='adam',
              metrics=['mean_squared_error'])
model.summary()

In [None]:
# Convert each split to a NumPy array and add a trailing axis of length 1,
# giving shape (rows, features, 1).
X_train = X_train.values[:, :, np.newaxis]
X_test = X_test.values[:, :, np.newaxis]
X_valid = X_valid.values[:, :, np.newaxis]

print(X_train.shape, X_test.shape)

In [None]:
# Disabled LSTM training step. It is kept as a string literal, so nothing
# below executes; the cell only displays the text.
# NOTE(review): this code reshapes the inputs to (rows, 1, features), whereas
# the model cell declares one value per timestep (input_shape=(..., 1)) —
# reconcile the two before re-enabling.
'''X_train = X.values
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

X_valid = np.reshape(X_valid.values, (X_valid.shape[0], 1, X_valid.shape[1]))

eval_set = (X_valid, y_valid)


history = model.fit(X_train, y_train, batch_size=2048, epochs=5, validation_data=eval_set)
'''

In [None]:
from keras.utils import plot_model

In [None]:
# Disabled LSTM prediction step. It is kept as a string literal, so nothing
# below executes and submissionLSTM is never created.
'''Y_test = model.predict(X_test).clip(0, 20)
print(Y_test)

submissionLSTM = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": Y_test.ravel()
})'''

In [None]:
#plt.plot(history.history['val_loss'], label = 'Training Loss')
#plt.legend(loc = 'best', shadow = True)

In [None]:
# Final prediction: 10% Ridge + 90% LightGBM, written in the layout of the
# sample submission.
ridge_part = 0.1*submissionRidge['item_cnt_month']
lgbm_part = 0.9*submissionLGBM['item_cnt_month']
submission_df['item_cnt_month'] = ridge_part + lgbm_part
submission_df.to_csv('submission.csv', index=False)
print('output file :','submission','saved')