In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as plt
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    file_paths = [os.path.join(folder, name) for name in names]
    for file_path in file_paths:
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the four competition tables. The names `dates`, `data`, `sale_data` and
# `submission` are used by the rest of the notebook.
dates = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")
data = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv")
sale_data = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
submission = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sample_submission.csv")

# Preview the first rows of each table, in load order.
for table in (dates, data, sale_data, submission):
    print(table.head())

# First we want to see how sales are changing over time
This shows that sales are growing over time, but there also appears to be a cyclic rise and fall.

In [None]:
# Number of the last day column (`d_1913`) in the training table.
last_date = 1913

original_features = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Mean units sold per item for every day column.
# numeric_only=True restricts the mean to the numeric `d_<n>` columns: pandas >= 2.0
# raises TypeError when asked to average the string id columns, and a non-day label
# in the result would break the int(x[2:]) parsing below.
total_sales = data.mean(numeric_only=True)
# Index labels look like 'd_<n>'; strip the two-character prefix to get the day number.
x_data = np.array([int(x[2:]) for x in total_sales.index])
y_data = np.array(total_sales.array)
# Straight-line fit: m is the per-day growth, b the value at day 0.
m, b = np.polyfit(x_data, y_data, 1)

plt.pyplot.scatter(x_data, total_sales, s=1)
plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=3)
print("Gradient: ", m, "Intercept:", b)

# Second figure: zoom in on positions 400..1249 to make the cyclic pattern visible.
plt.pyplot.figure()
plt.pyplot.scatter(x_data[400:1250], total_sales.iloc[400:1250], s=5)

# Seeing how this is per state
The data shows that sales patterns are highly similar across states.
* CA has the highest sales.
* TX has the lowest sales and the slowest growth.
* WI has the fastest-growing sales.

In [None]:
state_groups = data.groupby('state_id')
# numeric_only=True: the remaining id columns are strings, and pandas >= 2.0 raises
# TypeError instead of silently dropping them from a grouped mean.
state_data = state_groups.mean(numeric_only=True)
print(state_data)
x_data = range(1, last_date+1)
day_cols = ['d_' + str(x) for x in x_data]

for i, g in enumerate(state_groups.groups.keys()):
    # Select the row by its state label rather than by enumerate position, so the
    # series always matches the label printed and shown in the legend.
    y_data = state_data.loc[g, day_cols].to_list()
    plt.pyplot.scatter(x_data, y_data, s=5, alpha=0.3)
    # Straight-line trend for this state.
    m, b = np.polyfit(x_data, y_data, 1)
    print("Group " + str(i) + ":", g, "Gradient:", m, "Intercept:", b)
    plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=4, label=g)
plt.pyplot.legend(loc="upper left")

# Now we look at category
The data shows that spending on food is drastically different from the other categories. Hobbies and household sales are similar.
* FOODS has the highest sales.
* HOBBIES has the lowest sales and the slowest growth.
* HOUSEHOLD has the fastest-growing sales (barely).

There isn't much significance found in breaking down categories by state.

In [None]:
category_groups = data.groupby('cat_id')
# numeric_only=True: pandas >= 2.0 raises TypeError when a grouped mean meets the
# string id columns, so average the numeric day columns only.
category_data = category_groups.mean(numeric_only=True)

x_data = range(1, last_date+1)
day_cols = ['d_' + str(x) for x in x_data]

# One scatter plus linear trend per category, all on the same axes.
for i, c in enumerate(category_groups.groups.keys()):
    # Select by category label so the series always matches the legend entry.
    y_data = category_data.loc[c, day_cols].to_list()
    plt.pyplot.scatter(x_data, y_data, s=5, alpha=0.3)
    m, b = np.polyfit(x_data, y_data, 1)
    print("Category ", c, "Gradient:", m, "Intercept:", b)
    plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=4, label=c)
plt.pyplot.legend(loc="upper left")

# Same breakdown per state: one figure per state, one panel per category.
cs_group = data.groupby(['state_id','cat_id']).mean(numeric_only=True).reset_index()
colours = ['blue', 'darkorange', 'green']
for state in state_groups.groups.keys():
    plt.pyplot.figure(figsize=(20, 12))
    for i, c in enumerate(category_groups.groups.keys()):
        # Panels occupy the first row of a 3x3 grid (same layout as subplot code 33<i+1>).
        plt.pyplot.subplot(3, 3, i+1)
        # The last `last_date` entries of the row are taken as the day values.
        y_data = cs_group[(cs_group.state_id == state) & (cs_group.cat_id == c)].iloc[0].tail(last_date).to_list()
        plt.pyplot.scatter(x_data, y_data, s=5, alpha=0.3, c=colours[i])

        m, b = np.polyfit(x_data, y_data, 1)
        print("State: " + state, "Category: " + c, "Gradient:", m, "Intercept:", b)
        plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=4, label=c, c=colours[i])
        plt.pyplot.legend(loc="upper left")
        plt.pyplot.title(state)


# Breakdown by store
* Stores are clearly very different.
* Unable to ignore stores
* **Something weird happened after day 500 for WI_1 and WI_2**

In [None]:
# Mean daily sales per (state, store). numeric_only=True keeps the string id columns
# out of the mean; pandas >= 2.0 raises TypeError otherwise.
ss_group = data.groupby(['state_id','store_id']).mean(numeric_only=True).reset_index()

# One figure per state, with every store of that state overlaid.
for state in state_groups.groups.keys():
    store_group = ss_group[ss_group.state_id == state].groupby('store_id')
    plt.pyplot.figure(figsize=(16, 4))
    for i, s in enumerate(store_group.groups.keys()):
        # The last `last_date` entries of the row are taken as the day values.
        y_data = ss_group[(ss_group.store_id == s)].iloc[0].tail(last_date).to_list()
        plt.pyplot.scatter(x_data, y_data, s=6, alpha=0.4)

        m, b = np.polyfit(x_data, y_data, 1)
        print("State: " + state, "Store: " + s, "Gradient:", m, "Intercept:", b)
        plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=4, label=s)
        plt.pyplot.legend(loc="upper left")
        plt.pyplot.title(state)

# Displaying anomalies
# Plot the two Wisconsin stores side by side, each in its own panel.
plt.pyplot.figure(figsize=(16, 4))
plt.pyplot.subplot(121)
y_data = ss_group[(ss_group.store_id == 'WI_1')].iloc[0].tail(last_date).to_list()
plt.pyplot.scatter(x_data, y_data, s=6, alpha=0.4, label='WI_1')
plt.pyplot.legend(loc="upper left")
plt.pyplot.title("WI_1 Anomaly at day 700")

plt.pyplot.subplot(122)
y_data = ss_group[(ss_group.store_id == 'WI_2')].iloc[0].tail(last_date).to_list()
plt.pyplot.scatter(x_data, y_data, s=6, alpha=0.4, label='WI_2',c='darkorange')
plt.pyplot.legend(loc="upper left")
plt.pyplot.title("WI_2 Anomaly at day 500")

# Category by store
* All stores are different enough to justify adding `store_id` as a parameter.
* This should mean state_id doesn't really need to affect the model.
* Anomaly in CA_2 detected too

In [None]:
store_group = data.groupby('store_id')
# numeric_only=True: pandas >= 2.0 raises TypeError when the grouped mean meets the
# remaining string id columns.
sc_group = data.groupby(['store_id', 'cat_id']).mean(numeric_only=True).reset_index()

# Stores are grouped into one 2x2 figure per state; the state is the first two
# characters of the store id.
prev_state = ''
state_count = 1
for store in store_group.groups.keys():
    if (prev_state != store[0:2]):
        # First store of a new state: start a fresh figure and restart the panel counter.
        plt.pyplot.figure(figsize=(16, 8))
        prev_state = store[0:2]
        state_count = 1
    plt.pyplot.subplot(2, 2, state_count)
    state_count += 1
    for i, c in enumerate(category_groups.groups.keys()):
        # The last `last_date` entries of the row are taken as the day values.
        y_data = sc_group[(sc_group.store_id == store) & (sc_group.cat_id == c)].iloc[0].tail(last_date).to_list()
        plt.pyplot.scatter(x_data, y_data, s=6, alpha=0.4)

        m, b = np.polyfit(x_data, y_data, 1)
        plt.pyplot.plot([0, last_date], [b, m*last_date + b], linewidth=4, label=c)
        plt.pyplot.legend(loc="upper left")
        plt.pyplot.title(store)
        

# Anomalies
* WI_1 in FOODS
* WI_2 in FOODS
* WI_2 in HOUSEHOLD
* CA_2 in FOODS

In [None]:
# Zoom in on each suspicious store/category series.
# Each entry: (subplot code, store, category, first x position shown, panel title).
anomaly_panels = [
    (221, 'WI_1', 'FOODS', 700, "WI_1 Anomaly at day 700"),
    (222, 'WI_2', 'FOODS', 550, "WI_2 Anomaly at day 500"),
    (223, 'WI_2', 'HOUSEHOLD', 550, "WI_2 Anomaly at day 500"),
    (224, 'CA_2', 'FOODS', 1600, "CA_2 Anomaly at day 1600"),
]

plt.pyplot.figure(figsize=(16, 8))
for panel_code, shop, category, first_shown, panel_title in anomaly_panels:
    plt.pyplot.subplot(panel_code)
    panel_row = sc_group[(sc_group.store_id == shop) & (sc_group.cat_id == category)].iloc[0]
    # Keep only the trailing entries so the series lines up with range(first_shown, last_date).
    y_data = panel_row.tail(last_date - first_shown).to_list()
    plt.pyplot.scatter(range(first_shown, last_date), y_data, s=6, alpha=0.4, label=category)
    plt.pyplot.legend(loc="upper left")
    plt.pyplot.title(panel_title)


# Differences in day of the week
* Generally Sat or Sun has the most sales. Rarely Fridays.
* Generally it lowers throughout the middle of the week.

In [None]:
# Don't warn us for too many figures
plt.rcParams.update({'figure.max_open_warning': 0})
store_group = data.groupby('store_id')
# numeric_only=True: pandas >= 2.0 raises TypeError when the grouped mean meets the
# remaining string id columns.
sc_group = data.groupby(['store_id', 'cat_id']).mean(numeric_only=True).reset_index()

# Column order for the box plots. Position 0 of each series is labelled 'Sat'.
# NOTE(review): assumes d_1 falls on a Saturday - confirm against calendar.csv.
days = ['Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri']
for store in store_group.groups.keys():
    for i, c in enumerate(category_groups.groups.keys()):
        y = pd.DataFrame()
        # Re-index the day values 0..last_date-1 so `index % 7` gives the weekday slot.
        y_data = pd.Series(sc_group[(sc_group.store_id == store) & (sc_group.cat_id == c)].iloc[0].tail(last_date).to_list())
        for j in range(0, len(days)):
            # One column per weekday: every 7th observation starting at offset j.
            y[days[j]] = y_data[y_data.index % len(days) == j].reset_index(drop=True)
        # One box-plot figure per (store, category) pair.
        y.plot.box()
        plt.pyplot.title(store + " - " + c)
        

# Differences in months
No significant trend is visible for either the month of the year or the day of the month.

In [None]:
# Summarise the overall daily mean sales by calendar month, then by day of the month.
m_data = total_sales.tail(last_date)
mdates = dates.set_index('d')
daily_values = m_data.reset_index(drop=True)
summary_stats = ("mean", "median", "max", "min")

# Column 0 holds the calendar month of each day, column 1 the sales value.
month_of_day = pd.Series([mdates.loc[d, 'month'] for d in total_sales.index])
month_alloc = pd.concat([month_of_day, daily_values], axis=1)
month_grp = month_alloc.groupby(0)

plt.pyplot.figure(figsize=(12,4))
for stat in summary_stats:
    plt.pyplot.scatter(month_grp.groups.keys(), getattr(month_grp, stat)(), label=stat)
plt.pyplot.title("Mean sales per month")
plt.pyplot.legend(loc="upper left")

# Same summary keyed by the last two characters of the date string (day of the month).
day_of_month = pd.Series([mdates.loc[d, 'date'][-2:] for d in total_sales.index])
month_alloc = pd.concat([day_of_month, daily_values], axis=1)
month_grp = month_alloc.groupby(0)

plt.pyplot.figure(figsize=(12,4))
for stat in summary_stats:
    plt.pyplot.scatter(month_grp.groups.keys(), getattr(month_grp, stat)(), label=stat)
plt.pyplot.title("Mean sales per month day")
plt.pyplot.legend(loc="upper left")

In [None]:
# numeric_only=True: pandas >= 2.0 raises TypeError when the grouped mean meets the
# remaining string id columns.
cat_data = category_groups.mean(numeric_only=True)

# The calendar lookups do not depend on the category, so build them once.
mdates = dates.set_index('d')
month_of_day = pd.Series([mdates.loc[d, 'month'] for d in total_sales.index])
# Last two characters of the date string = day of the month.
day_of_month = pd.Series([mdates.loc[d, 'date'][-2:] for d in total_sales.index])
summary_stats = ("mean", "median", "max", "min")

for cat in category_groups.groups.keys():
    m_data = cat_data.loc[cat].tail(last_date)

    # Left panel: summary statistics per calendar month.
    month_alloc = pd.concat([month_of_day, m_data.reset_index(drop=True)], axis=1)
    month_grp = month_alloc.groupby(0)

    plt.pyplot.figure(figsize=(16,4))
    plt.pyplot.subplot(121)
    for stat in summary_stats:
        plt.pyplot.scatter(month_grp.groups.keys(), getattr(month_grp, stat)(), label=stat)
    plt.pyplot.title("Sales per month for " + cat)
    plt.pyplot.legend(loc="upper left")

    # Right panel: summary statistics per day of the month.
    month_alloc = pd.concat([day_of_month, m_data.reset_index(drop=True)], axis=1)
    month_grp = month_alloc.groupby(0)

    plt.pyplot.subplot(122)
    for stat in summary_stats:
        plt.pyplot.scatter(month_grp.groups.keys(), getattr(month_grp, stat)(), label=stat)
    plt.pyplot.title("Sales per month day " + cat)
    plt.pyplot.legend(loc="upper left")

# Let's just see how a few products behave
This has an example of:
1. Frequently purchased items
2. Rarely purchased items
3. Items with anomalies

In [None]:
# Three example items; each is plotted over its most recent days with one x tick per week.

# Last 98 values of FOODS_2_001 at CA_3.
p1 = data.loc[data.id == 'FOODS_2_001_CA_3_validation'].T.tail(98).reset_index(drop=True)
fig, ax = plt.pyplot.subplots(figsize=(16, 4))
ax.set_xticks(range(0, 98, 7))
ax.scatter(p1.index, p1)

# Last 98 values of HOBBIES_2_100 at WI_3.
p2 = data.loc[data.id == 'HOBBIES_2_100_WI_3_validation'].T.tail(98).reset_index(drop=True)
fig, ax = plt.pyplot.subplots(figsize=(16, 4))
ax.set_xticks(range(0, 98, 7))
ax.scatter(p2.index, p2)

# Last 175 values of HOUSEHOLD_1_100 at TX_2.
p3 = data.loc[data.id == 'HOUSEHOLD_1_100_TX_2_validation'].T.tail(175).reset_index(drop=True)
fig, ax = plt.pyplot.subplots(figsize=(16, 4))
ax.set_xticks(range(0, 175, 7))
ax.scatter(p3.index, p3)


# Now with sales price

In [None]:
# wk_map: day -> wm_yr_wk (the week key used by sell_prices).
# Keys are both the 'd_<n>' labels and the integer day numbers.
# Index the calendar by day once instead of scanning `dates` for every single day.
week_of_day = dates.set_index('d').wm_yr_wk
wk_map = {}
for day in data.columns:
    if day.startswith('d_'):
        wk_map[day] = week_of_day.loc[day]
        wk_map[int(day[2:])] = wk_map[day]

# Integer keys for last_date .. last_date+47, so days after the training table
# can be looked up as well.
for day in range(last_date, last_date+48):
    wk_map[day] = week_of_day.loc['d_'+str(day)]

# Mean price per (item, store), and a per-week price table indexed by
# (item_id, store_id, wm_yr_wk) - lookups must use exactly this key order.
mean_price_lookup = sale_data.groupby(['item_id', 'store_id']).sell_price.mean()
sales_lookup = sale_data.set_index(['item_id', 'store_id', 'wm_yr_wk']).sort_index()

# Example: demand (left) next to weekly price (right) for HOBBIES_1_001 at CA_1.
x_data = range(0, last_date)
ys = []
for x in x_data:
    # x is 0-based, so x+1 is the day number.
    price_key = ('HOBBIES_1_001', 'CA_1', wk_map[x+1])
    # Select the `sell_price` cell explicitly so each entry is a scalar; indexing the
    # frame with the key alone returns a one-element row Series instead.
    # 2.5 is the placeholder used for weeks with no listed price.
    ys.append(sales_lookup.loc[price_key, 'sell_price'] if price_key in sales_lookup.index else 2.5)

# Drop the six leading id fields to leave the day values.
y = data[data.id == 'HOBBIES_1_001_CA_1_validation'].iloc[0].tail(-6)
plt.pyplot.figure(figsize=(12,4))
plt.pyplot.subplot(121)
plt.pyplot.scatter(x_data, y)
plt.pyplot.subplot(122)
plt.pyplot.scatter(x_data, ys)

# Creating a model

To create this model, a basic model can be made by training each `id` in combination with `weekday`.  
We must also account for the anomalies so some data cleanup must be performed.

In [None]:
# Forecast horizon in days, and the number of the last day column in `data`.
pred_int = 28
last_date = 1913


def _strip_day_prefix(label):
    """Turn a 'd_<n>' column label into the integer n; leave other labels unchanged."""
    if label[0:2] == 'd_':
        return int(label[2:])
    return label


# Training frame: integer day labels, with the final `pred_int` columns held back.
train_data = data.rename(mapper=_strip_day_prefix, axis='columns').iloc[:, 0:-pred_int]
def evaluate(preds, data):
    """Score forecasts against the last ``pred_int`` days of ``data``, one RMSSE per row.

    For every row, the squared forecast error over the final ``pred_int`` days is
    scaled by the mean squared day-to-day change of the preceding history.

    Parameters
    ----------
    preds : DataFrame
        An ``id`` column followed by the forecast columns; only the first
        ``pred_int`` forecast values of a row are used.
    data : DataFrame
        One row per series: the id in the first column and the daily values in
        the last ``last_date`` columns.

    Returns
    -------
    DataFrame
        Columns ``id`` and ``err``. ``err`` is NaN when the history never changes,
        because the scaling denominator is zero.

    Reads the module-level ``last_date`` and ``pred_int``. Prints progress dots and
    summary statistics as a side effect.
    """
    # Index the predictions by id once instead of scanning `preds` for every row.
    # drop_duplicates keeps the first row per id, matching a first-match lookup.
    pred_lookup = preds.drop_duplicates(subset='id').set_index('id')

    def calc_err(row):
        item_id = row.iloc[0]
        # Rows of a mixed-type frame are object dtype; cast the day values to float.
        used_data = row.tail(last_date).astype(float)
        actual = used_data.tail(pred_int).to_list()

        # Sum of squared one-day changes over the history (the first diff is NaN, so drop it).
        hist_err = used_data.head(-pred_int).diff().tail(-1).pow(2).sum()
        pred = pred_lookup.loc[item_id].to_list()
        pred_err = sum([(pred[i] - actual[i])**2 for i in range(0,pred_int)])

        den = hist_err/(last_date-pred_int-1)
        # A constant history gives a zero denominator; report NaN instead of dividing by zero.
        rmsse = np.sqrt((1/pred_int)*pred_err/den) if den != 0 else np.nan
        # Progress marker every 305 rows.
        if (int(row.name) % 305 == 0):
            print('.',end='')
        return [item_id, rmsse]
    # Rulers printed above the progress dots.
    print('.'*25)
    print('.'*50)
    print('.'*100)
    result = data.apply(calc_err, result_type='expand',axis=1).rename(columns={0: 'id', 1: 'err'})
    print()
    print("Mean error: ", result.err.mean())
    print("Median error: ", result.err.median())
    print("Largest errors:")
    print(result.nlargest(5, 'err'))
    return result

# evaluate(submission, data)
# Display the training frame as the cell output.
train_data

In [None]:
# Number of leading non-day fields that the row-wise models skip.
features = 6


def train(estimator, data):
    """Apply ``estimator`` to every row of ``data`` and collect the forecasts.

    ``estimator`` receives one row and returns a sequence of values. The k-th value
    (counting from zero) ends up in column ``F<k+1>``.

    Returns a DataFrame with the ``id`` column of ``data`` followed by the ``F``
    columns. Prints a row of asterisks before the run and a newline after it.
    """
    print("*" * 100)
    forecasts = data.apply(estimator, axis=1, result_type='expand')
    forecasts = forecasts.rename(mapper=lambda position: 'F' + str(position + 1), axis=1)
    print()
    return pd.concat([data.id, forecasts], axis=1)

def basic_model(row):
    """Baseline forecast: the rounded mean of the row's history, repeated ``pred_int`` times.

    The first ``features`` entries of ``row`` are skipped before averaging.
    """
    history = row.tail(-features)
    rounded_mean = np.round(history.mean())
    return [rounded_mean for _ in range(pred_int)]

# preds = train(basic_model, train_data)
# res = evaluate(preds, data)
# res

# Create linear model

In [None]:
from sklearn.linear_model import LinearRegression

def simple_linear_model(row):
    """Fit a 7-day sine/cosine regression to one series and plot the fit.

    The first ``features`` entries of ``row`` are dropped. The rest is regressed on
    sin and cos of the day label with a period of 7. Two figures are drawn: the raw
    series with its fit, and the intercept-centred series with the fit of a second
    regression trained on the centred data. The intercept of the first regression
    is printed.

    Returns the fitted values of the first regression on the training days.
    NOTE(review): these are in-sample fitted values, not a ``pred_int``-day forecast;
    the line that would switch ``days`` to future days is commented out below.
    """
    data = row.tail(-features)

    days = data.index.to_list()
    # Weekly seasonality encoded as one sine/cosine pair.
    wkday1 = [np.sin(2*np.pi*x/7) for x in days]
    wkday2 = [np.cos(2*np.pi*x/7) for x in days]

    X_train = np.column_stack((wkday1, wkday2))
    Y = data
    regressor = LinearRegression().fit(X_train, Y)

    plt.pyplot.figure(figsize=(24, 4))
    plt.pyplot.plot(days, Y)
    plt.pyplot.plot(days, regressor.predict(X_train))

#     days = range(days[-1]+1, days[-1]+pred_int+1)
    # `days` is unchanged, so the prediction design matrix equals the training one.
    X_pred = X_train
    print(regressor.intercept_)
    # Second regression on the intercept-centred series.
    reg2 = LinearRegression().fit(X_train, Y - regressor.intercept_)

    plt.pyplot.figure(figsize=(24, 4))
    plt.pyplot.plot(days, Y - regressor.intercept_)
    # Overlay the fit that was trained on the centred data, so both curves share a baseline.
    plt.pyplot.plot(days, reg2.predict(X_train))
    return regressor.predict(X_pred)
   
# preds = train(simple_linear_model, train_data)
# preds
# Try the model on the last 300 columns of the first training row.
# NOTE(review): this slice presumably holds day values only, yet simple_linear_model
# still drops the first `features` entries of whatever it is given - confirm intended.
x = simple_linear_model(train_data.iloc[0, -300:])

In [None]:
def linear_model(row):
    """Forecast ``pred_int`` days for one item from trend, weekday cycle and price.

    ``row`` is one row of the renamed sales frame: id fields first (id at position 0,
    item_id at 1, store_id at 4), then daily values labelled by integer day number.

    Steps:
      1. Drop the first ``features`` id fields. For four known store/category
         anomalies, also drop the early part of the history.
      2. Discard every run of more than 28 identical consecutive values.
      3. With too little recent data, fall back to a flat mean or to the last 14
         values repeated.
      4. Otherwise fit a linear regression on day number, sin/cos of the 7-day
         cycle and sell price, and predict the ``pred_int`` days after the last
         used point.

    Returns a list (fallbacks) or a NumPy array (regression) of forecasts.

    Reads the module-level ``features``, ``pred_int``, ``wk_map``, ``sales_lookup``
    and ``mean_price_lookup``.
    """
    data = row.tail(-features)
    rid = row.iloc[0]
    item_id = row.iloc[1]
    store_id = row.iloc[4]
    days = len(data)

    # Trim the history before each known anomaly.
    # NOTE(review): these cut-offs differ from the days named in the anomaly plots - confirm.
    if (rid.startswith('HOUSEHOLD') and rid.endswith('WI_2_validation')):
        data = data.tail(-700)
    elif (rid.startswith('FOODS') and rid.endswith('WI_2_validation')):
        data = data.tail(-500)
    elif (rid.startswith('FOODS') and rid.endswith('WI_1_validation')):
        data = data.tail(-500)
    elif (rid.startswith('FOODS') and rid.endswith('CA_2_validation')):
        data = data.tail(320)

    # Give each run of identical consecutive values an id, then drop runs longer than 28 days.
    changes = (data.diff(1) != 0).astype('int').cumsum()
    zeroes = changes.groupby(changes).size()
    zgroup = zeroes.where(zeroes > 28).dropna().index.to_list()

    # Day labels that survive the long-run filter.
    points = changes[~changes.isin(zgroup)].index.to_list()
    # If less than 3 weeks of data to use OR last data points were more than 3 weeks ago.
    if (len(points) < 21 or points[-1] <= days - 21):
        if (len(points) >= 2 and points[-1] > days - 21):
            first_used_date = [x for x in points if x > days-21][0]
            # Average the values from first_used_date onwards. Slicing `data` by the
            # label's position avoids counting the id fields as days, which a
            # row.tail(-first_used_date+1) offset would do.
            start = data.index.get_loc(first_used_date)
            return [data.iloc[start:].mean()]*pred_int
        # No usable recent data: repeat the last 14 values.
        return row.tail(14).to_list()*(pred_int//14)

    # Price assumed for weeks with no listed price: 1.75x the item's mean price in this store.
    maxp = mean_price_lookup.loc[item_id, store_id]*1.75

    def price_on(day):
        # sales_lookup is indexed by (item_id, store_id, wm_yr_wk); the key must follow
        # that order or the membership test never matches. `day` is already a 1-based
        # day number, so wk_map is read without an offset.
        key = (item_id, store_id, wk_map[day])
        if key in sales_lookup.index:
            return sales_lookup.loc[key, 'sell_price']
        return maxp

    prices = [price_on(p) for p in points]
    wkday1 = [np.sin(2*np.pi*x/7) for x in points]
    wkday2 = [np.cos(2*np.pi*x/7) for x in points]

    X_train = np.column_stack((points, wkday1, wkday2, prices))
    # Label-based selection of the surviving days.
    Y = data.loc[points]
    regressor = LinearRegression().fit(X_train, Y)

    # Forecast the pred_int days that follow the last used point.
    test_points = range(points[-1]+1, points[-1]+pred_int+1)
    test_prices = [price_on(p) for p in test_points]
    test_wkday1 = [np.sin(2*np.pi*x/7) for x in test_points]
    test_wkday2 = [np.cos(2*np.pi*x/7) for x in test_points]
    X_pred = np.column_stack((test_points, test_wkday1, test_wkday2, test_prices))

    # Progress marker every 305 rows.
    if (int(row.name) % 305 == 0):
        print("*", end='')
    return regressor.predict(X_pred)

# preds = train(linear_model, train_data)
# print()
# print(train_data)
# res = evaluate(preds, data)
# res

# This definitely did not do what I wanted it to do:
Here we learn to **ALWAYS** visualise predictions when possible

In [None]:
# Re-run the weekly regression on the last 300 columns of the first training row
# to look at its fit plots again.
x = simple_linear_model(train_data.iloc[0, -300:])
# plt.pyplot.plot(range(0,28), data.iloc[5,-28:])
# plt.pyplot.plot(range(0,28), preds.iloc[5,-28:])

In [None]:
# For each listed item, compare the last `num` values of the training frame (left)
# with the last `num` values of the full frame (right).
num = 200
anomalies = [
    'HOUSEHOLD_1_032_TX_1_validation',
    'HOUSEHOLD_1_020_CA_3_validation',
    'HOUSEHOLD_1_020_CA_3_validation',
    'FOODS_3_827_CA_4_validation',
    'HOUSEHOLD_1_400_CA_4_validation',
]
for rid in anomalies:
    anomaly_fig, (train_ax, full_ax) = plt.pyplot.subplots(1, 2, figsize=(16, 4))
    p = train_data.loc[train_data.id == rid].iloc[0].tail(num)
    p2 = data.loc[data.id == rid].iloc[0].tail(num)
    train_ax.scatter(p.index.to_list(), p)
    full_ax.scatter(p2.index.to_list(), p2)

# Using Autocorrelation

In [None]:
# Autocorrelation of the first item's sales history, one bar per lag.
# The number of lags equals the size of the row's last 28 entries.
x_corr = range(0, data.iloc[0, -28:].size)
first_item_sales = data.iloc[0, features:].astype(float)
lag_correlations = [first_item_sales.autocorr(lag=lag) for lag in x_corr]
plt.pyplot.bar(x_corr, lag_correlations)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
def _fit_sarimax(row, order, seasonal_order):
    """Fit a SARIMAX model to ``row`` after dropping its first ``features`` entries."""
    data = row.tail(-features)
    model = SARIMAX(data.astype(float), order=order, seasonal_order=seasonal_order)
    return model.fit()


def sarimax_model(row, order=(1, 1, 1), seasonal_order=(1, 1, 0, 8), steps=28):
    """Row-wise estimator: fit SARIMAX on one series and forecast ``steps`` days.

    The defaults reproduce the original hard-coded configuration.
    NOTE(review): the default seasonal period is 8, while the weekday analysis
    elsewhere in this notebook uses a 7-day cycle - confirm 8 is intended.

    Prints a progress marker every 305 rows. Returns the forecast produced by the
    fitted model.
    """
    result = _fit_sarimax(row, order, seasonal_order)
    if (int(row.name) % 305 == 0):
        print("*", end='')
    return result.forecast(steps)


def test_sarimax_model(row, order=(1, 1, 1), seasonal_order=(1, 1, 0, 8)):
    """Fit the same SARIMAX configuration and return ``(fitted values, fitted result)``.

    Used to inspect the in-sample fit and to forecast from the returned result object.
    """
    result = _fit_sarimax(row, order, seasonal_order)
    return (result.fittedvalues, result)

# Fit SARIMAX on the last 300 columns of the first training row, re-indexed from 0.
points = train_data.iloc[0, -300:].reset_index(drop=True)
# `result` holds the in-sample fitted values; `model` is the fitted result object
# (the second element returned by test_sarimax_model), not an unfitted model.
result, model = test_sarimax_model(points)
fcast = model.forecast(steps=28)
# Overlay fitted values, the input series and the 28-step forecast.
plt.pyplot.figure(figsize=(24,4))
plt.pyplot.plot(result.index.to_list(), result)
plt.pyplot.plot(points.index.to_list(), points)
plt.pyplot.plot(fcast.index.to_list(), fcast)
# The last 28 columns of the full frame serve as the ground truth for the forecast.
actual = data.iloc[0,-28:].reset_index(drop=True)

# Each printed pair is [squared error of the model, squared error of predicting the mean].
print("Test errors: ")
# NOTE(review): the fitted value at i is compared with the observation at i-1,
# while the baseline uses the observation at i - confirm the shift is intended.
print([sum([(result.loc[i] - points.loc[i-1])**2 for i in range(9,300)]), sum([(points.loc[i] - points.mean())**2 for i in range(9,300)])])
print("Forecast errors")
print([sum([(actual[i] - fcast.reset_index(drop=True)[i])**2 for i in range(0, 28)]), sum([(actual[i] - points.mean())**2 for i in range(0, 28)])])

# predictions = train(sarimax_model, train_data)
# THIS is wayyyy too slow.


# Try lightGBM model

In [None]:
# Free the large exploratory intermediates before training LightGBM.
#
# wk_map, sales_lookup and mean_price_lookup are deliberately NOT released here:
# linear_model reads them and is called again in a later cell, so deleting them
# makes a top-to-bottom run fail with NameError.
#
# globals().pop(name, None) is used instead of `del` so the cell can be re-run
# without raising NameError for names that are already gone.
for _name in ('category_groups', 'category_data', 'sc_group', 'state_groups',
              'ss_group', 'month_alloc', 'total_sales'):
    globals().pop(_name, None)
gc.collect()

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error 

# Columns passed to get_prediction_subset: three id fields plus every day column.
cols = ['id', 'store_id', 'item_id'] + ['d_' + str(day) for day in range(1, 1914)]


def get_prediction_subset(dataset, split_day=1885):
    """Train a LightGBM regressor on a slice of items and return its validation predictions.

    Parameters
    ----------
    dataset : DataFrame
        Wide sales frame with ``id``, ``store_id``, ``item_id`` and ``d_<n>`` columns.
    split_day : int, default 1885
        Days up to and including ``split_day`` are used for training; later days
        form the validation set.

    Returns
    -------
    DataFrame
        One row per item: ``id`` followed by ``F1``, ``F2``, ... holding the
        predictions for the validation days in day order.

    NOTE(review): the returned values are predictions for the validation days, which
    are also the rows used for early stopping - they are not forecasts beyond the
    data. Confirm this is what the submission should contain.

    Reads the module-level ``dates`` and ``sale_data`` frames.
    """
    feature_cols = ['id', 'day', 'wm_yr_wk', 'wday', 'sell_price']

    # Wide -> long, then attach the calendar week/weekday and the weekly sell price.
    dataset = pd.melt(dataset, id_vars=['id', 'store_id', 'item_id'], value_name='demand', var_name='day')
    dataset = pd.merge(dataset, dates[['d', 'wm_yr_wk', 'wday']], how = 'left', left_on = ['day'], right_on = ['d'])
    dataset = pd.merge(dataset, sale_data[['store_id', 'item_id', 'wm_yr_wk', 'sell_price']], how = 'left', on=['store_id', 'item_id', 'wm_yr_wk'])
    dataset = dataset.drop(['store_id', 'item_id', 'd'], axis=1)
    dataset['day'] = dataset['day'].map(lambda x: int(x[2:]))
    dataset['id'] = dataset['id'].astype('category')
    print(dataset)

    # Select features and target with .loc so no sliced frame is mutated afterwards.
    is_train = dataset['day'] <= split_day
    x_train = dataset.loc[is_train, feature_cols]
    y_train = dataset.loc[is_train, 'demand']
    x_val = dataset.loc[~is_train, feature_cols]
    y_val = dataset.loc[~is_train, 'demand']

    train_set = lgb.Dataset(x_train, y_train, categorical_feature=['id', 'wday'])
    val_set = lgb.Dataset(x_val, y_val, categorical_feature=['id', 'wday'])

    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'colsample_bytree': 0.75,
    }
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds` / `verbose_eval` arguments of lgb.train were removed
    # in LightGBM 4. Older releases expose the logger as `print_evaluation`.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    model = lgb.train(params,
                      train_set,
                      num_boost_round = 2500,
                      valid_sets = [train_set, val_set],
                      callbacks = [lgb.early_stopping(50), log_evaluation(100)])
    val_pred = model.predict(x_val)
    val_score = np.sqrt(mean_squared_error(y_val, val_pred))
    print(val_score)

    # Collect each item's predictions into one row, ordered by day.
    scored = pd.DataFrame({'id': x_val['id'], 'day': x_val['day'], 'preds': val_pred}).sort_values('day')
    per_item = scored.groupby('id', observed=True)['preds'].apply(list)

    result = pd.DataFrame(per_item.to_list())
    result.columns = ['F' + str(position + 1) for position in range(result.shape[1])]
    result.insert(0, 'id', list(per_item.index))

    return result
    

In [None]:
# Train and predict in four row slices of the sales frame, then stack the
# per-slice prediction frames into one with a fresh 0..n-1 index.
chunk_bounds = [(0, 10000), (10000, 17000), (17000, 24000), (24000, 30490)]
chunk_results = [get_prediction_subset(data[cols].iloc[low:high, :]) for low, high in chunk_bounds]
result = pd.concat(chunk_results, ignore_index=True)
result

In [None]:
# Write the LightGBM predictions to lgbm.csv in the working directory (row index
# omitted), then display the frame as the cell output.
result.to_csv('lgbm.csv', index=False)
result

In [None]:
def _day_label_to_int(label):
    """Turn a 'd_<n>' column label into the integer n; leave other labels unchanged."""
    return int(label[2:]) if label[0:2] == 'd_' else label


# Run linear_model over the full sales frame (integer day labels, no days held back).
# linear_model reads wk_map, sales_lookup and mean_price_lookup, so all three must
# still be defined when this cell runs.
full_history = data.rename(mapper=_day_label_to_int, axis='columns')
final = train(linear_model, full_history)
# Append the last 30490 rows of the sample submission below the forecasts and save.
final_sub = pd.concat([final, submission.tail(30490)])
final_sub.to_csv('linear2.csv', index=False)

In [None]:
# Display the combined submission frame as a final visual check.
final_sub