In [17]:
import pandas as pd

# Load the raw daily sales records.
train_df = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')

# Collapse the daily rows into one summed count per (month block, shop, item).
group_keys = ['date_block_num', 'shop_id', 'item_id']
train_monthly_df = (
    train_df
    .groupby(group_keys)
    .agg(item_cnt_month=('item_cnt_day', 'sum'))
    .reset_index()
)

# Composite "<shop_id>-<item_id>" key identifying each shop/item pair.
shop_part = train_monthly_df['shop_id'].astype('string')
item_part = train_monthly_df['item_id'].astype('string')
train_monthly_df['shop_item'] = shop_part + '-' + item_part

# Distinct shop/item keys (pd.unique returns them in order of appearance).
shop_item_list = pd.unique(train_monthly_df['shop_item'])

In [28]:
import numpy as np

# Length of every per-pair sequence. ``date_block_num`` is used directly as the
# position inside the sequence, so every value must lie in [0, n_months).
n_months = 34
month_col = train_monthly_df['date_block_num']
assert ((month_col >= 0) & (month_col < n_months)).all(), \
    'date_block_num outside [0, n_months): it cannot be used as a sequence index'

# get the sequence of each shop_item
# Every pair starts as all zeros; months without a row stay at 0.0.
data_seq_dict = {shop_item: [0.0] * n_months for shop_item in shop_item_list}

# Walk the columns as plain Python lists instead of DataFrame.iterrows():
# this avoids building a Series per row, and it reuses the precomputed
# ``shop_item`` key (the same column ``shop_item_list`` was derived from)
# rather than re-deriving it with str() on each row's values.
row_keys = train_monthly_df['shop_item'].tolist()
row_months = train_monthly_df['date_block_num'].tolist()
row_counts = train_monthly_df['item_cnt_month'].tolist()
for shop_item, month, count in zip(row_keys, row_months, row_counts):
    data_seq_dict[shop_item][int(month)] = count

# get the mean of each shop_item
# The mean is taken over all n_months positions, zeros included.
data_mean_dict = {
    shop_item: sum(seq) / len(seq)
    for shop_item, seq in data_seq_dict.items()
}

# get the norm sequence of each shop_item
# Each sequence is divided by its own mean; a sequence whose mean is exactly
# 0.0 is kept unchanged to avoid dividing by zero.
data_norm_seq_dict = {}
for shop_item, seq in data_seq_dict.items():
    seq_mean = data_mean_dict[shop_item]
    if seq_mean != 0.0:
        data_norm_seq_dict[shop_item] = [e / seq_mean for e in seq]
    else:
        data_norm_seq_dict[shop_item] = seq

# One row per shop_item, one column per month position.
data_norm_seq_array = np.array(list(data_norm_seq_dict.values()))

In [None]:
# TODO (unfinished sketch): get the normalized sequence of cold items
# item_list = pd.unique(train_monthly_df['item_id'])
# for item in item_list:
    

In [29]:
from sklearn.model_selection import train_test_split

# Next-step setup: the input is every position except the last, and the target
# is the same sequence shifted one position ahead. A trailing axis of size 1 is
# added so both arrays are (samples, timesteps, 1).
X = data_norm_seq_array[:, :-1, np.newaxis]
Y = data_norm_seq_array[:, 1:, np.newaxis]

# Hold out 30% of the sequences for validation. The fixed random_state makes
# the split identical on every Restart & Run All.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.3, random_state=42)

In [35]:
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference of the inputs.

    The mean is taken over every element of ``y_pred - y_true`` (no axis is
    passed to ``K.mean``), so the result is a single scalar tensor.
    """
    residual = y_pred - y_true
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)

# Single-unit LSTM that emits one value per input timestep
# (return_sequences=True), trained to reproduce the one-step-shifted targets.
#
# The time dimension is declared as None rather than X_train.shape[1]: the
# model is trained on windows of X_train.shape[1] steps, but predict() is later
# called on the full sequences, which are one step longer. A variable-length
# time axis makes both calls valid against the declared input shape.
model = Sequential()
model.add(LSTM(1, input_shape=(None, X_train.shape[2]), return_sequences=True))
model.compile(loss=root_mean_squared_error, optimizer=Adam(learning_rate=0.1), metrics=[RootMeanSquaredError()])
model.summary()

model.fit(x=X_train, y=Y_train, validation_data=(X_val, Y_val), epochs=10, batch_size=1024*256)

In [47]:
test_df = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv').set_index('ID')
test_df['shop_item'] = test_df['shop_id'].astype('string') + '-' + test_df['item_id'].astype('string')

# Sequence length is taken from the training array so the fallback below always
# matches the sequences stored in data_norm_seq_dict.
seq_len = data_norm_seq_array.shape[1]
zero_seq = [0.0] * seq_len

# Look up each test row by its precomputed shop_item key (no iterrows(), no
# re-deriving the key with str()). Pairs never seen in training get an all-zero
# sequence and a mean of 0.0.
test_keys = test_df['shop_item'].tolist()
X_test = np.asarray([data_norm_seq_dict.get(key, zero_seq) for key in test_keys])
means = [data_mean_dict.get(key, 0.0) for key in test_keys]

# Add the trailing feature axis: (samples, timesteps, 1).
X_test = X_test.reshape((-1, seq_len, 1))

In [48]:
pred = model.predict(X_test)

# Keep only the output at the last timestep and flatten to one value per row.
pred = pred[:, -1:, :]
pred = pred.reshape((pred.shape[0]))
assert len(pred) == len(means)

# Undo the per-pair mean normalization, then bound the result to the target
# range used by the competition metric ([0, 20]); this also removes negative
# monthly counts.
TARGET_MIN, TARGET_MAX = 0.0, 20.0
pred = np.clip(pred * np.asarray(means), TARGET_MIN, TARGET_MAX)

In [49]:
# Index the predictions by the test frame's own ID index instead of relying on
# an implicit 0..N-1 RangeIndex happening to coincide with the test IDs.
# The constructor raises if len(pred) does not match the number of test rows.
submission_df = pd.DataFrame(pred, columns=['item_cnt_month'], index=test_df.index)
submission_df.index.name = 'ID'
submission_df.head()

In [50]:
# Write the submission; the index (named 'ID') is written as the first column.
submission_path = '/kaggle/working/submission.csv'
submission_df.to_csv(submission_path)