In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression

**把数据的解释说明复制过来供xdm参考**

**File Descriptions and Data Field Information**

**train.csv**
The training data, comprising time series of features store_nbr, family, and onpromotion as well as the target sales.

store_nbr identifies the store at which the products are sold.

family identifies the type of product sold.

sales gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).

onpromotion gives the total number of items in a product family that were being promoted at a store at a given date.

**test.csv**
The test data, having the same features as the training data. You will predict the target sales for the dates in this file. The dates in the test data are for the 15 days after the last date in the training data.

**sample_submission.csv**
A sample submission file in the correct format.

**stores.csv**
Store metadata, including city, state, type, and cluster.
cluster is a grouping of similar stores.

**oil.csv**
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)

**holidays_events.csv**
Holidays and Events, with metadata

NOTE: Pay special attention to the transferred column. A holiday that is transferred officially falls on that calendar day, but was moved to another date by the government. A transferred day is more like a normal day than a holiday. To find the day that it was actually celebrated, look for the corresponding row where type is Transfer. For example, the holiday Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12, which means it was celebrated on 2012-10-12. Days that are type Bridge are extra days that are added to a holiday (e.g., to extend the break across a long weekend). These are frequently made up by the type Work Day which is a day not normally scheduled for work (e.g., Saturday) that is meant to pay back the Bridge.
Additional holidays are days added to a regular calendar holiday, for example, as typically happens around Christmas (making Christmas Eve a holiday).

**Additional Notes**
Wages in the public sector are paid every two weeks on the 15th and on the last day of the month. Supermarket sales could be affected by this.
A magnitude 7.8 earthquake struck Ecuador on April 16, 2016. People rallied in relief efforts donating water and other first need products which greatly affected supermarket sales for several weeks after the earthquake.

In [None]:
# Load the training data.
folder_path='../input/store-sales-time-series-forecasting'       # competition data directory

dtype = {
    'store_nbr': 'category',
    'family': 'category',
    'sales': 'float32',
    'onpromotion': 'uint64',
} # column dtypes applied while parsing the CSV

# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and `parse_dates` alone is enough to parse the 'date' column.
train_data=pd.read_csv(folder_path+'/train.csv',
                       dtype=dtype,
                       parse_dates=['date'])

# Working frame indexed by a daily PeriodIndex. `set_index` returns a new
# DataFrame, so `train_data` itself is left untouched without an extra copy.
sales_data=train_data.set_index('date').to_period('D')
sales_data # display the frame

In [None]:
# Count the missing values in each column.
sales_data.isna().sum()
# Author's observation: there are no missing values.

In [None]:
# Author's observation: sales correlates fairly strongly with the number of
# promoted items.
# Only numeric columns are correlated. They are selected explicitly because
# `sales_data` also holds categorical columns ('store_nbr', 'family'), and
# recent pandas versions no longer drop non-numeric columns silently in
# `corr()`.
sales_data.select_dtypes(include='number').corr()

In [None]:
# List the distinct product families.
# Author's observation: there are 33 families across all stores.
sales_data['family'].unique()

In [None]:
# Number of distinct product families sold by each store.
# Author's observation: every store has sales data for all 33 families.
families_by_store = sales_data.groupby('store_nbr').family.unique()
number_of_products_of_each_store = [len(families) for families in families_by_store]
number_of_products_of_each_store

In [None]:
# Total sales of all stores for each day.
# 'sales' is selected before aggregating so that only that column is summed;
# summing the whole frame would also try to aggregate the categorical
# columns ('store_nbr', 'family'), which is wasted work and raises on recent
# pandas versions.
total_sales=sales_data.groupby('date')['sales'].sum()
ax = total_sales.plot(**plot_params)
# Trailing ';' keeps the return value of `ax.set` out of the cell output.
ax.set(title="Total Sales of all stores", ylabel="Sales");

元旦，几乎所有店关门，所以接近0

数据看上去具备季节性，整体趋势是上升的，可能和经济走势挂钩（后续跟油价走势对比一下）

In [None]:
# Stationarity check: plot the autocorrelation function of the daily totals.
from statsmodels.graphics.tsaplots import plot_acf
acf_figure = plot_acf(total_sales)
acf_figure.show()
# Author's observation: the autocorrelation stays above zero over long lags,
# i.e. the series has strong long-range dependence.

In [None]:
# Smooth the daily totals with a centred 30-day rolling mean
# (at least 15 observations are required per window).
smoothing_window = total_sales.rolling(window=30, center=True, min_periods=15)
trend = smoothing_window.mean()

# Draw the raw series faintly, then overlay the smoothed trend on the same axes.
ax = trend.plot(ax=total_sales.plot(**plot_params, alpha=0.5), linewidth=3)

In [None]:
# Next: transactions.csv.
# Load the transaction counts. `infer_datetime_format` is intentionally not
# passed: it is deprecated since pandas 2.0 and `parse_dates` alone is enough.
transaction=pd.read_csv(folder_path+'/transactions.csv',
                        parse_dates=['date'])
transaction=transaction.set_index('date').to_period('D') # index by date as a daily PeriodIndex

In [None]:
# Total number of transactions of all stores for each day.
daily_transaction_sums = transaction.groupby('date').sum()
total_transactions = daily_transaction_sums['transactions']
ax = total_transactions.plot(**plot_params)
ax.set(ylabel="Transactions", title="Total Transactions of all stores")
# Author's observation: the series shows a clear yearly periodicity.

In [None]:
# Average sales per transaction (ASPT): daily total sales divided by the
# daily total number of transactions.
sales_and_transaction = pd.concat([total_sales, total_transactions], axis=1)
sales_and_transaction['ASPT'] = sales_and_transaction['sales'].div(
    sales_and_transaction['transactions']
)
sales_and_transaction['ASPT'].plot(**plot_params)

# Author's observation: ASPT gradually stabilises after 2017. One idea is to
# forecast the transaction count and ASPT separately and multiply them to
# obtain sales.

In [None]:
# Next: stores.csv.
# Load the store metadata, reading 'store_nbr' as a categorical column.
dtype = {'store_nbr': 'category'}
store = pd.read_csv(folder_path+'/stores.csv', dtype=dtype)
stores = store.copy().set_index('store_nbr') # keep `store` as loaded; index the copy by store number
stores
# Author's notes: 54 stores of five types; the type may correlate with
# sales (analysed below). A store's city determines which local holidays
# affect it.

In [None]:
# Attach each store's type to the sales rows by looking up 'store_nbr'
# in the `stores` table.
store_type_lookup = stores['type']
sales_data['type'] = sales_data['store_nbr'].map(store_type_lookup)
sales_data

In [None]:
# Daily total sales for each store type, indexed by date with 'type' and
# 'sales' as columns.
sales_per_day_and_type = sales_data.groupby(['date', 'type'])['sales'].sum()
sales_data_type = sales_per_day_and_type.reset_index().set_index('date')
sales_data_type

In [None]:
# Plot the daily sales of every store type on one shared set of axes so the
# series can be compared directly. Passing the same `ax` to each plot call
# keeps pandas from opening a new figure per type, and the legend then
# covers all types.
fig, ax = plt.subplots()
for i in sales_data_type['type'].unique():
    sales_data_type.loc[sales_data_type['type']==i, 'sales'].plot(ax=ax, label='type'+i)
ax.set(title="Total Sales by store type", ylabel="Sales")
ax.legend()
plt.show()

# Author's observations: the magnitudes differ between types, but the overall
# trends are essentially the same, so later forecasts need not be split by
# type. Because regions have different holidays, forecasting per region and
# summing the results may work better.

In [None]:
# Load holidays_events.csv (dates are left as read by pandas, unparsed).
holidays_csv = folder_path+'/holidays_events.csv'
holiday = pd.read_csv(holidays_csv)
holiday

由于不同地区有不同的节日，因此根据不同的地区分别进行预测，最后再加和，或许会好一些

从2016-4-16号开始便是Terremoto Manabi地震，可能对后续一个月的销售额都会产生影响，因此训练时建议去除这部分数据，通过之前的数据重新拟合一个正常情况下的结果再用来预测

In [None]:
# Finally: oil.csv.
# Load the daily oil price. `infer_datetime_format` is intentionally not
# passed: it is deprecated since pandas 2.0 and `parse_dates` alone is enough.
oil=pd.read_csv(folder_path+'/oil.csv',
                parse_dates=['date'])
oil=oil.set_index('date').to_period('D') # index by date as a daily PeriodIndex
oil.plot(**plot_params) # oil price over time

In [None]:
# Plot the smoothed sales trend for comparison with the oil price.
trend.plot(kind='line', linewidth=1)
# Author's observations: even allowing for some lag, sales appear to be
# higher the further the oil price falls. Possibly an economic downturn
# pushes people towards cheap everyday goods from these stores instead of
# pricier products. Suggestion: include the oil price as a macro variable in
# the model.