# 1. Import related modules

In [None]:
import numpy as np
import pandas as pd

In [None]:
import warnings
# Suppress every warning for the rest of the session so library notices do not clutter the notebook output.
warnings.filterwarnings('ignore')

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout ,Activation

* **Sequential**：Sequential model. It is the simplest model type: a linear stack of network layers.  
* **LSTM**：Recurrent neural network layer.  
* **Dense**：Fully connected layer.  
* **Dropout**：Randomly deactivates a fraction of neurons during training to prevent overfitting.  
* **Activation**：Activation function layer. It transforms the output of the previous layer with a function such as 'relu' or 'softmax'.

Import the plotting module.

In [None]:
import matplotlib.pyplot as plt

# 2. Import and clean the data set.

### Meaning of each data set  
1. **sales_train.csv**: Training set. Daily historical data from Jan 2013 to Oct 2015.  
2. **test.csv**: Test set. It is necessary to predict the sales of these stores and products in Nov 2015.  
3. **shops.csv**: Supplementary information about the store.  
4. **item_categories.csv**: Supplementary information about item categories.  
5. **sample_submission.csv**: Sample submission documents in the correct format.  
6. **items.csv**: Supplementary information about items.

In [None]:
# Load the competition CSV files from the Kaggle input directory.
# The 'date' column of the training set is parsed into datetime values.
# dayfirst=True: the competition's data description gives dates day-first (dd.mm.yyyy);
# without it an ambiguous value such as 02.01.2013 may be read month-first.
# NOTE(review): confirm the date format against the raw file.
sales=pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv',parse_dates=['date'],dayfirst=True)
test=pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
shops=pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
item_cat=pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
sub=pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')
item=pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')

## View information of each data set.

### Meaning of each variable
1. **ID**: The ID of the (shop, product) tuple in the test set.  
2. **shop_id**: The unique identifier of the store. 
3. **item_id**: The unique identifier of the product.  
4. **item_category_id**: The unique identifier of the product category.
5. **item_cnt_day**: The number of sold products.  
6. **item_price**: The current price of the item.  
7. **date**: The date format is yyyy-mm-dd.  
8. **date_block_num**: Consecutive month number, used for convenience: 0 is Jan 2013, 1 is Feb 2013, ..., and 33 is Oct 2015.
9. **item_name**: The product name.  
10. **shop_name**: The store name.  
11. **item_category_name**: The product category name.

### Define a function.

In [None]:
def basic_information(df: pd.DataFrame) -> None:
    """Print a quick overview of a DataFrame.

    The report contains, in order: the first five rows, the ``info()``
    summary, the shape, the column labels, the ``describe()`` statistics
    and the per-column count of missing values.

    Args:
        df: The DataFrame to summarise.
    """
    print('----------------The first 5 rows of data----------------')
    print(df.head())
    print('----------------Brief summary----------------')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    print('----------------Data set size----------------')
    print(df.shape)
    print('----------------Column name----------------')
    print(df.columns)
    print('----------------Numerical features----------------')
    print(df.describe())
    print('----------------Missing data----------------')
    print(df.isnull().sum())

In [None]:
# Overview of the daily sales training data.
basic_information(sales)

In [None]:
# Overview of the test set.
basic_information(test)

In [None]:
# Overview of the shop lookup table.
basic_information(shops)

In [None]:
# Overview of the item-category lookup table.
basic_information(item_cat)

In [None]:
# Overview of the sample submission file.
basic_information(sub)

In [None]:
# Overview of the item lookup table.
basic_information(item)

## Processing the 'sales_train' data set

In [None]:
# Build one row per (shop_id, item_id) pair and one column per month
# (date_block_num) holding the summed item_cnt_day; combinations with no
# sales record are filled with 0.
# The aggregation is given by its string alias 'sum' (equivalent to np.sum
# here): recent pandas versions emit a FutureWarning when the NumPy callable
# is passed instead.
dataset = sales.pivot_table(
    index=['shop_id', 'item_id'],
    values=['item_cnt_day'],
    columns=['date_block_num'],
    fill_value=0,
    aggfunc=['sum'],  # Kept as a list so the column labels keep their top-level 'sum' entry.
)
dataset.head()  # Preview the first rows.

**pivot_table**:  
1. **index**: index. There are two levels of index here, and the order is the index order. The first level is the store code (shop_id), and the second level is the product code (item_id).  
2. **values**: The column(s) whose values are aggregated. Here it is the number of products sold (item_cnt_day).  
3. **columns**: The column grouping key, the column name or other grouping key used for grouping, as the column index of the result DataFrame. Here is a breakdown of the number of products sold under each consecutive month number.  
4. **fill_value**: Set the null value to 0.  
5. **aggfunc**: Set the function operation when aggregating data. The default is averaging. Use np.sum to sum up.

In [None]:
# Move the (shop_id, item_id) row index back into regular columns, modifying `dataset` in place.
dataset.reset_index(inplace = True)
dataset.head() # Preview the first rows.

**reset_index**：Reset the pivot table index. Among them, '**inplace = True**' means to modify the original variable without creating a new variable.

In [None]:
# Remove the two identifier columns in place (axis = 1 selects columns), leaving only the monthly values.
dataset.drop(['shop_id','item_id'],inplace = True, axis = 1)
dataset.head()

In [None]:
# Show the (rows, columns) size of the table.
dataset.shape

**drop**：Delete some data. Here the '**shop_id**' and '**item_id**' columns are deleted, because they are irrelevant to the training itself and were only needed to define and build the data set.  
**axis=1**：Delete the column.

# 3. Recurrent neural network-LSTM 

### (1). Divide training set and test set.

In [None]:
# Work from a single array view of the monthly table.
month_matrix = dataset.values

# Training inputs: every column except the last, with a trailing axis added.
X_train = np.expand_dims(month_matrix[:, :-1], axis=2)
# Training label: the last column, kept as a single-column 2-D array.
y_train = month_matrix[:, -1:]
# Prediction inputs: every column except the first, so the window length matches X_train.
X_test = np.expand_dims(month_matrix[:, 1:], axis=2)

# Check that inputs and labels have matching numbers of rows.
print(X_train.shape, y_train.shape)

expand_dims：Expand the array shape.

### (2). Build the model.

In [None]:
# Single-layer LSTM regressor: LSTM -> Dropout -> Dense(1) -> ReLU.
model_LSTM = Sequential()
# Take (timesteps, features) from the training data instead of hard-coding
# (33, 1), so the model still builds if the number of monthly columns changes.
model_LSTM.add(LSTM(units = 64,input_shape = (X_train.shape[1], X_train.shape[2])))
model_LSTM.add(Dropout(0.2)) # Fraction of the layer's inputs randomly set to 0 during training.

model_LSTM.add(Dense(1)) # One output value per sample.
model_LSTM.add(Activation('relu')) # ReLU keeps the predicted value non-negative.
# NOTE(review): 'accuracy' is a classification metric and carries little meaning
# for an MSE regression; it is kept because the training history's 'accuracy'
# series is plotted later in this notebook.
model_LSTM.compile(loss = 'mse',optimizer = 'adam', metrics = ['accuracy'])

model_LSTM.summary()

Add layers to the model.  
**compile**：Configures the training method: it sets the optimizer, the loss function, and the evaluation metric used during training.  
1. The loss function is the mean square error.  
2. The optimizer is Adaptive Moment Estimation, which is a variant of the gradient descent algorithm. However, the learning rate of each iteration parameter has a certain range, and the learning rate (step size) will not become too large because of the large gradient, and the value of the parameter is relatively stable.  
3. The evaluation metric is accuracy.

### (3). Train the model.

In [None]:
# Train for 20 epochs with a batch size of 4096 and keep the returned history for plotting.
# Settings tried earlier: batch_size = 4000 with epochs = 12, and batch_size = 4096 with epochs = 10.
history_lstm = model_LSTM.fit(X_train,y_train,batch_size = 4096,epochs = 20)

### (4). Draw loss and score curves during model training.

In [None]:
# Plot the training loss recorded for each epoch.
plt.plot(history_lstm.history['loss'])

In [None]:
# Plot the 'accuracy' metric recorded for each epoch.
# NOTE(review): accuracy is a classification metric; for this MSE regression it is of limited use.
plt.plot(history_lstm.history['accuracy'])

In [None]:
# Print the model's predictions for the shifted input window.
print(model_LSTM.predict(X_test))

In [None]:
# Predictions as a flat list: take the first value of each prediction row.
data_1 = [row[0] for row in model_LSTM.predict(X_test).tolist()]

In [None]:
# Labels as a flat list: take the first value of each label row.
data_2 = [row[0] for row in y_train.tolist()]

In [None]:
# Use a large figure, then draw predictions first and labels second as
# point clouds indexed by row position.
plt.rcParams['figure.figsize'] = (11.7, 8.27)
for series in (data_1, data_2):
    plt.scatter(x=range(len(series)), y=series, s=1)

In [None]:
# Difference in length between labels and predictions; 0 means they line up one-to-one.
abs(len(data_2)-len(data_1))

In [None]:
# Count the positions where the prediction equals the label exactly.
# The count starts from 0 (it previously started at 1, overstating the
# number of matches by one).
sum1 = sum(1 for actual, predicted in zip(data_2, data_1) if actual == predicted)
print(sum1)

In [None]:
# Express the exact-match count as a percentage of all predictions.
sum1/len(data_1)*100

It can be seen that, for this trained model, the share of predictions that exactly match the labels is around 85%.