<a href="https://colab.research.google.com/github/spberry4/KaggleTimeSeriesComp/blob/main/BooksSold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install darts

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting darts
  Downloading darts-0.22.0-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 5.1 MB/s 
[?25hCollecting statsmodels>=0.13.0
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 43.8 MB/s 
[?25hCollecting nfoursid>=1.0.0
  Downloading nfoursid-1.0.1-py3-none-any.whl (16 kB)
Collecting shap>=0.40.0
  Downloading shap-0.41.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (569 kB)
[K     |████████████████████████████████| 569 kB 27.9 MB/s 
Collecting tbats>=1.1.0
  Downloading tbats-1.1.1-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Collecting catboost>=1.0.6
  Downloading catboost-1.1-cp37-none-manylinux1_x86_64.whl (76.8 MB)
[K     |████████████████████████████████| 76.8 MB 13 kB/s 
Collecting pytorch

In [None]:
import pandas as pd
from darts import TimeSeries
from darts.models import RegressionModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from google.colab import drive
# Mount Google Drive at /content/drive; the training CSV is read from this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Parse the "date" column while loading so the index is a DatetimeIndex from
# the start. infer_datetime_format only takes effect together with parse_dates,
# so on its own it did not convert the dates.
train = pd.read_csv('/content/drive/MyDrive/Data/train.csv',
                    index_col="date",
                    parse_dates=["date"])

In [None]:
# Peek at the last rows of the training data.
train.tail()

Unnamed: 0_level_0,row_id,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-31,70123,Spain,KaggleMart,Kaggle for Kids: One Smart Goose,614
2020-12-31,70124,Spain,KaggleRama,Kaggle Advanced Techniques,215
2020-12-31,70125,Spain,KaggleRama,Kaggle Getting Started,158
2020-12-31,70126,Spain,KaggleRama,Kaggle Recipe Book,135
2020-12-31,70127,Spain,KaggleRama,Kaggle for Kids: One Smart Goose,202


In [None]:
# Make sure the index holds real timestamps and the sales counts are numeric.
train = train.set_axis(pd.to_datetime(train.index), axis="index")
train = train.assign(num_sold=pd.to_numeric(train["num_sold"]))

In [None]:
# Inspect the index to confirm it is now a DatetimeIndex.
train.index

DatetimeIndex(['2017-01-01', '2017-01-01', '2017-01-01', '2017-01-01',
               '2017-01-01', '2017-01-01', '2017-01-01', '2017-01-01',
               '2017-01-01', '2017-01-01',
               ...
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=70128, freq=None)

In [None]:
# Column dtypes after the conversions above.
train.dtypes

row_id       int64
country     object
store       object
product     object
num_sold     int64
dtype: object

In [None]:
# (rows, columns) of the training frame; a bare last expression is displayed
# by the notebook, so print() is not needed.
train.shape

(70128, 5)


In [None]:
# Sum sales per day and product before plotting: the raw frame holds many rows
# per date for each product, so drawing it directly connects all of those
# points into one zigzagging line per product.
daily_by_product = train.groupby(["date", "product"])["num_sold"].sum().reset_index()
px.line(daily_by_product, x="date", y="num_sold", color="product", title="Books Sold")

In [None]:
# Number of distinct values in each column.
train.nunique()

row_id      70128
country         6
store           2
product         4
num_sold      699
dtype: int64

In [None]:
# Count missing values per column.
train.isna().sum()

row_id      0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [None]:
# List the distinct product titles.
train["product"].unique()

array(['Kaggle Advanced Techniques', 'Kaggle Getting Started',
       'Kaggle Recipe Book', 'Kaggle for Kids: One Smart Goose'],
      dtype=object)

In [None]:
# Rows for a single product, selected with an explicit boolean mask.
is_advanced_techniques = train["product"] == 'Kaggle Advanced Techniques'
train.loc[is_advanced_techniques]

Unnamed: 0_level_0,row_id,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-01,0,Belgium,KaggleMart,Kaggle Advanced Techniques,663
2017-01-01,4,Belgium,KaggleRama,Kaggle Advanced Techniques,240
2017-01-01,8,France,KaggleMart,Kaggle Advanced Techniques,610
2017-01-01,12,France,KaggleRama,Kaggle Advanced Techniques,220
2017-01-01,16,Germany,KaggleMart,Kaggle Advanced Techniques,700
...,...,...,...,...,...
2020-12-31,70108,Italy,KaggleRama,Kaggle Advanced Techniques,217
2020-12-31,70112,Poland,KaggleMart,Kaggle Advanced Techniques,606
2020-12-31,70116,Poland,KaggleRama,Kaggle Advanced Techniques,219
2020-12-31,70120,Spain,KaggleMart,Kaggle Advanced Techniques,561


In [None]:
def clean_decomp_books(df, model):
  """Plot a seasonal decomposition of daily sales for every product in ``df``.

  For each distinct value of ``df["product"]`` the ``num_sold`` values are
  summed per index value (date), decomposed with statsmodels'
  ``seasonal_decompose`` and the observed / trend / seasonal / residual
  components are drawn in a 2x2 plotly subplot grid.

  Args:
    df: DataFrame with a DatetimeIndex and "product" and "num_sold" columns.
    model: Decomposition type passed straight to ``seasonal_decompose``
      (the notebook calls this with "additive").

  Returns:
    None. One figure per product is shown as a side effect.
  """
  for word in df["product"].unique():
    # Total units sold per date for this product. Only num_sold is summed;
    # summing the whole frame also added up row_id and the string columns.
    daily_sales = df.loc[df["product"] == word, "num_sold"]
    daily_sales = daily_sales.groupby(level=0).sum()

    # asfreq returns a new object, so the result has to be kept; the original
    # call discarded it and the daily frequency was never set on the index.
    daily_sales = daily_sales.asfreq("D")
    # Dates absent from the data show up as NaN after asfreq, and
    # seasonal_decompose does not accept missing values, so bridge any gaps.
    daily_sales = daily_sales.interpolate()

    # Break the series into trend, seasonal and residual parts.
    decompose_data = seasonal_decompose(daily_sales, model=model)

    # Panels are listed in row-major order to match the subplot grid.
    components = [
        ("Observed", decompose_data.observed),
        ("Trend", decompose_data.trend),
        ("Seasonal", decompose_data.seasonal),
        ("Residuals", decompose_data.resid),
    ]

    fig = make_subplots(rows=2, cols=2,
                        subplot_titles=[title for title, _ in components])

    for position, (title, component) in enumerate(components):
      fig.add_trace(
          go.Scatter(x=component.index, y=component, name=title),
          row=position // 2 + 1, col=position % 2 + 1)

    fig.update_layout(hovermode="x unified",
                      showlegend=False,
                      title=f"{word} Time Series Decomposition")

    fig.show()

In [None]:
# Additive decomposition plots for every product in the training data.
clean_decomp_books(train, model="additive")

In [None]:
def feature_extraction(df):
  """Build per-product daily sales frames with calendar features.

  For each distinct value of ``df["product"]`` the ``num_sold`` values are
  summed per index value (date), put on a daily frequency, and ``month`` and
  ``year`` columns are derived from the index.

  Args:
    df: DataFrame with a DatetimeIndex and "product" and "num_sold" columns.

  Returns:
    dict mapping each product name to a DataFrame indexed by day with the
    columns "num_sold", "month" and "year". Dates absent from the input
    appear as rows with NaN ``num_sold``.
  """
  features = {}
  for word in df["product"].unique():
    daily = df.loc[df["product"] == word, ["num_sold"]]
    daily = daily.groupby(level=0).sum()
    # asfreq returns a new frame; keep it so the daily frequency really applies.
    daily = daily.asfreq("D")

    # The dates live in the index (there is no "date" column after the
    # groupby), so read the calendar parts from the index itself.
    daily["month"] = daily.index.month
    daily["year"] = daily.index.year

    features[word] = daily
  return features

    

In [None]:
# NOTE(review): the recorded output shows `row_id` as the index plus `date`,
# `month` and `year` columns, which no visible cell creates (the load cell sets
# `date` as the index). This looks like leftover kernel state — confirm the
# notebook reproduces under Restart & Run All.
train.head()

Unnamed: 0_level_0,date,country,store,product,num_sold,month,year
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663,1,2017
1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615,1,2017
2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480,1,2017
3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710,1,2017
4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240,1,2017


In [None]:
# One target series per (country, store, product) combination.
# `train` is loaded with "date" as its index, but time_col must name a regular
# column, so expose the index as a column first (no-op if it already is one).
sales_long = train if "date" in train.columns else train.reset_index()
y_all = TimeSeries.from_group_dataframe(sales_long,
                                        group_cols=["country", "store", "product"],
                                        time_col="date",
                                        value_cols="num_sold")

In [None]:
# Future covariates have to cover the forecast horizon as well as the training
# period, so derive month/year from the dates themselves and append the days
# that follow the last training date.
FORECAST_HORIZON = 12  # days beyond the last training date to cover
group_cols = ["country", "store", "product"]

cov_source = train if "date" in train.columns else train.reset_index()
observed_days = cov_source[group_cols + ["date"]].copy()
observed_days["date"] = pd.to_datetime(observed_days["date"])

# One row per (group, future day). Assumes daily data and that every group
# ends on the same last date — TODO confirm for other datasets.
future_days = pd.DataFrame({
    "date": pd.date_range(observed_days["date"].max() + pd.Timedelta(days=1),
                          periods=FORECAST_HORIZON, freq="D"),
    "_key": 1,
})
future_rows = (observed_days[group_cols].drop_duplicates().assign(_key=1)
               .merge(future_days, on="_key")
               .drop(columns="_key"))

calendar_frame = pd.concat([observed_days, future_rows], ignore_index=True, sort=False)
calendar_frame["month"] = calendar_frame["date"].dt.month
calendar_frame["year"] = calendar_frame["date"].dt.year

future_cov_all = TimeSeries.from_group_dataframe(calendar_frame,
                                        group_cols=group_cols,
                                        time_col="date",
                                        value_cols=["month", "year"])

In [None]:
# Linear regression on lagged sales plus calendar covariates.
target_lags = [-1, -8, -12]      # past target values fed to the regressor
future_covariate_lags = [0]      # covariate value at the forecast step itself
model = RegressionModel(lags=target_lags,
                        lags_future_covariates=future_covariate_lags,
                        model=LinearRegression())

In [None]:
# Fit one shared model across all series and their covariates.
model.fit(series=y_all, future_covariates=future_cov_all)

<darts.models.forecasting.regression_model.RegressionModel at 0x7f121a629ed0>

In [None]:
# Forecast the next 12 steps for every series.
# The recorded run of this cell raised a ValueError: the future covariates
# ended on 2020-12-31 while the forecast needs them through 2021-01-12.
model.predict(n=12, series=y_all, future_covariates=future_cov_all)

ERROR:main_logger:ValueError: The corresponding future_covariate of the series at index 0 isn't sufficiently long. Given horizon `n=12`, `min(lags_future_covariates)=0`, `max(lags_future_covariates)=0` and `output_chunk_length=1`
the future_covariate has to range from 2021-01-01 00:00:00 until 2021-01-12 00:00:00 (inclusive), but it ranges only from 2017-01-01 00:00:00 until 2020-12-31 00:00:00.


ValueError: ignored