In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder,OneHotEncoder
from matplotlib import pyplot

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    range contains the column's min and max.  Float columns are converted to
    the first of float16/float32 whose range contains them, otherwise float64.
    Columns whose dtype is not listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  Its columns are reassigned in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value sitting exactly on a dtype's limit
                # (e.g. 127 for int8) still fits that dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN min/max (all-missing column) fails every comparison and
            # therefore falls through to float64.
            # NOTE: float16 stores roughly three significant decimal digits,
            # so values converted to it are rounded.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# load
def read_data(PATH, cat_id='FOODS', series_id='FOODS_3_090_CA_3_validation'):
    """Read the calendar, price and sales CSV files found under ``PATH``.

    The calendar and price tables are read in full and downcast with
    ``reduce_mem_usage``.  The sales table is first restricted to the rows
    whose ``cat_id`` and ``id`` columns equal ``cat_id`` / ``series_id`` and
    only those rows are downcast.

    Parameters
    ----------
    PATH : str
        Directory containing ``calendar.csv``, ``sell_prices.csv`` and
        ``sales_train_validation.csv``.
    cat_id : str, default 'FOODS'
        Value the ``cat_id`` column must equal for a sales row to be kept.
    series_id : str, default 'FOODS_3_090_CA_3_validation'
        Value the ``id`` column must equal for a sales row to be kept.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(calendar, sell_prices, sales_train_validation)``.
    """
    print('Reading files...')
    calendar = reduce_mem_usage(pd.read_csv(f'{PATH}/calendar.csv'))
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))

    sell_prices = reduce_mem_usage(pd.read_csv(f'{PATH}/sell_prices.csv'))
    print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))

    sales_train_validation = pd.read_csv(f'{PATH}/sales_train_validation.csv')
    keep = (sales_train_validation['cat_id'] == cat_id) & (sales_train_validation['id'] == series_id)
    # Filter first, then downcast an independent copy: avoids converting rows
    # that are thrown away and avoids assigning into a slice of the full table.
    sales_train_validation = reduce_mem_usage(sales_train_validation[keep].copy())
    print('Sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))

    # sample_submission.csv is intentionally not loaded here.

    return calendar, sell_prices, sales_train_validation

# Load the calendar, price and (filtered) sales frames from the Kaggle input directory.
food_calendar, food_sell_prices, food_sales_train_validation = read_data("/kaggle/input/m5-forecasting-accuracy/")

In [None]:
# Aggregation
# Reshape the sales frame from wide to long: every non-id column becomes one
# (day, demand) row.
food_sales_train_validation_melt = reduce_mem_usage(
    pd.melt(
        food_sales_train_validation,
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        var_name='day',
        value_name='demand',
    )
)
# First and last rows of the long frame.  DataFrame.append was removed in
# pandas 2.0, so the two previews are stacked with pd.concat instead.
# (Not rendered when the whole cell runs: only a cell's last expression is shown.)
pd.concat([food_sales_train_validation_melt.head(), food_sales_train_validation_melt.tail()])
# Merge everything into one wide table: calendar columns by day label,
# then sell prices by store / item / week.
food_sales_train_validation_melt_merge = reduce_mem_usage(
    pd.merge(food_sales_train_validation_melt, food_calendar, left_on="day", right_on="d", how="left")
)
food_sales_train_validation_melt_merge = reduce_mem_usage(
    pd.merge(
        food_sales_train_validation_melt_merge,
        food_sell_prices,
        on=["store_id", "item_id", "wm_yr_wk"],
        how="left",
    )
)

In [None]:
# Shorter working name for the merged long-format frame (same object, not a copy).
one_item_sales = food_sales_train_validation_melt_merge

# 特征工程

## 日期相关特征

In [None]:
# Lag features: one column per look-back step, ordered t-28, t-27, ..., t-1.
# Only the input sequence is built here; no forward (t+1 ... t+n) columns.
cols = [one_item_sales['demand'].shift(step) for step in range(28, 0, -1)]
colNames = ['demand_T-' + str(step) for step in range(28, 0, -1)]

# Collect the shifted series side by side and give each its lag label.
# Rows with incomplete history keep their NaNs at this point.
lags = pd.concat(cols, axis=1)
lags.columns = colNames
lags.head()

In [None]:
# Rolling-window statistics of past demand
def rolling_demand_stats(demand, width):
    """Return min/mean/max/std of the ``width`` observations preceding each row.

    The series is shifted by one step before the window is applied, so the
    statistics at row t cover rows t-width .. t-1 and never include the value
    at t itself.  Without the shift, the value being predicted (``demand`` at
    t) would be part of its own input features.

    Parameters
    ----------
    demand : pandas.Series
        Series to summarise.
    width : int
        Window length in rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``min_T-<width>``, ``mean_T-<width>``, ``max_T-<width>`` and
        ``std_T-<width>``; rows without ``width`` prior observations are NaN.
    """
    window = demand.shift(1).rolling(window=width)
    stats = pd.concat([window.min(), window.mean(), window.max(), window.std()], axis=1)
    stats.columns = [f'{name}_T-{width}' for name in ('min', 'mean', 'max', 'std')]
    return stats


wins_28 = rolling_demand_stats(one_item_sales['demand'], 28)
wins_7 = rolling_demand_stats(one_item_sales['demand'], 7)

In [None]:
# Place the merged sales columns, the lag columns and both sets of rolling
# statistics side by side, aligned on the row index.
df_train_one_item = pd.concat(
    objs=[one_item_sales, lags, wins_28, wins_7],
    axis="columns",
)

In [None]:
# Drop every row that has a missing value in any column whose name contains
# 'demand_T' (the lag columns).
colList = [name for name in df_train_one_item.columns if 'demand_T' in name]
df_train_one_item = df_train_one_item.dropna(subset=colList)


In [None]:
# Drop the `weekday` column, parse `date` and use it as the index
# (the `date` column itself is kept as well).
df_train_one_item = df_train_one_item.drop(['weekday'],axis=1)
df_train_one_item['date'] = pd.to_datetime(df_train_one_item.date)
df_train_one_item = df_train_one_item.set_index(df_train_one_item.date)

# Feature column name -> datetime component it is derived from.
date_features = {
    "wday": "weekday",
    "week": "weekofyear",
    "month": "month",
    "quarter": "quarter",
    "year": "year",
    "mday": "day",
}
for date_feat_name, date_feat_func in date_features.items():
    if date_feat_name in df_train_one_item.columns:
        # Column already present: keep its values, only normalise the dtype.
        df_train_one_item[date_feat_name] = df_train_one_item[date_feat_name].astype("int16")
    elif date_feat_func == "weekofyear":
        # Series.dt.weekofyear was removed in pandas 2.0; the ISO calendar
        # week from isocalendar() is its replacement.
        df_train_one_item[date_feat_name] = df_train_one_item["date"].dt.isocalendar().week.astype("int16")
    else:
        df_train_one_item[date_feat_name] = getattr(df_train_one_item["date"].dt, date_feat_func).astype("int16")

## Encode

In [None]:
# Replace each identifier column with integer codes, fitting a fresh
# LabelEncoder per column.
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id','state_id']
for cc in cat_feats:
    df_train_one_item[cc] = LabelEncoder().fit_transform(df_train_one_item[cc])

In [None]:
# Fill missing event names/types with an explicit 'unknown' category.
nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# Assign the filled columns back to the frame.  Calling
# df[col].fillna(..., inplace=True) is chained assignment, which does not
# update the frame when pandas copy-on-write is active.
df_train_one_item[nan_features] = df_train_one_item[nan_features].fillna('unknown')

# Integer-encode the event and snap columns, one encoder per column.
cat = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
for feature in cat:
    encoder = LabelEncoder()
    df_train_one_item[feature] = encoder.fit_transform(df_train_one_item[feature])

In [None]:
# One-hot encode the event/snap columns and preview the dense result.
# Note: `cat_feats` is rebound here from a list of column names to a DataFrame.
onehotencoder = OneHotEncoder()
onehot_matrix = onehotencoder.fit_transform(df_train_one_item[cat])
cat_feats = pd.DataFrame(onehot_matrix.toarray())
cat_feats.head()

# 特征缩放

# 数据集划分

In [None]:
# Model input columns, in the order they are fed to the estimator.
features = [
    # identifiers
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    # calendar
    'year', 'month', 'week', 'wday',
    # events
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    # snap flags
    'snap_CA', 'snap_TX', 'snap_WI',
    # price
    'sell_price',
    # lagged demand: demand_T-28 down to demand_T-1
    *('demand_T-' + str(lag) for lag in range(28, 0, -1)),
    # rolling statistics: 28-row window first, then 7-row window
    *(f'{stat}_T-{width}' for width in (28, 7) for stat in ('min', 'mean', 'max', 'std')),
]

# Single-step target; no forward 'demand_T+n' columns are used.
target = ['demand']

In [None]:
# Earliest index value (date) among rows with positive demand.  The result is
# not stored, and it is not displayed when the cell runs as a whole because it
# is not the cell's last expression.
df_train_one_item[df_train_one_item['demand']>0].index.min()
# Keep only rows on or after the hard-coded cutoff date.
# NOTE(review): '2011-09-25' is presumably the value produced by the expression
# above for this series — confirm, and re-check it if the selected series changes.
df_train_one_item = df_train_one_item[df_train_one_item.index >= '2011-09-25']

In [None]:
# Train/test split: hold out the last 20% of rows (by position) as the test set.
test_size = int(df_train_one_item.shape[0] * 0.2)
# Split on an explicit position instead of negative slices: with
# test_size == 0, `[:-0]` would give an empty train set and `[-0:]` would put
# every row in the test set.
split_at = df_train_one_item.shape[0] - test_size
train, test = df_train_one_item.iloc[:split_at], df_train_one_item.iloc[split_at:]

# Training set
X_train, y_train = train[features], train[target]

# Test set
X_test, y_test = test[features], test[target]

# 模型训练 测试 验证

## 线性回归

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [None]:
def LassoRegression(degree, alpha):
    """Build an unfitted polynomial-features -> standard-scaler -> Lasso pipeline.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.
    alpha : float
        Regularisation strength passed to ``Lasso``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps named ``"poly"``, ``"std_scaler"`` and ``"lasso_reg"``.
    """
    steps = [
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lasso_reg", Lasso(alpha=alpha)),
    ]
    return Pipeline(steps)

In [None]:
# Fit a degree-3 polynomial Lasso (alpha=0.1) on the training rows.
lasso1_reg = LassoRegression(3, 0.1)
lasso1_reg.fit(X_train, y_train)

# Score the held-out rows.
y_predict = lasso1_reg.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
print("MSE:", mse)
print('RMSE:', np.sqrt(mse))
print("R2:", r2_score(y_test, y_predict))
print("mape:",mean_absolute_percentage_error(y_test, y_predict))
print('mae:',mean_absolute_error(y_test, y_predict))

In [None]:
# Coefficients learned by the Lasso step of the fitted pipeline.
lasso1_reg.named_steps['lasso_reg'].coef_

In [None]:
# Plot the predictions against the held-out actual demand.
# `columns` is given as a list: a set is unordered and is rejected by the
# DataFrame constructor in recent pandas versions.
y_predict = pd.DataFrame(y_predict, columns=['demand'], index=y_test.index)

fig, ax = plt.subplots(figsize=(40, 10))
ax.plot(y_test['demand'], label='Expected')
ax.plot(y_predict['demand'], label='Predicted')
ax.set_xlabel('date')
ax.set_ylabel('demand')
ax.legend()
plt.show()