In [1]:
%load_ext watermark

import ipywidgets as widgets
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List

from ipywidgets import interact, interact_manual


# NOTE(review): set_style() is called without a style name; presumably a
# specific seaborn style (or theme) was intended — confirm.
sns.set_style()

# Path of the CSV file that is read into `df_lag` with pd.read_csv.
LAGGED_SALES_DATA = 'lagged_sales_data.csv'

In [2]:
# Read the lagged sales table from disk, then print its column / dtype summary.
df_lag = pd.read_csv(filepath_or_buffer=LAGGED_SALES_DATA)
df_lag.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   1107 non-null   int64  
 1   week                   1107 non-null   int64  
 2   article_count          1107 non-null   float64
 3   sales_price            1107 non-null   float64
 4   original_price         1107 non-null   float64
 5   discount               1107 non-null   float64
 6   stock_begin_week       1107 non-null   float64
 7   month                  1107 non-null   int64  
 8   transaction_date       1107 non-null   object 
 9   article_count_minus-1  1107 non-null   float64
 10  article_count_minus-2  1107 non-null   float64
 11  article_count_minus-3  1107 non-null   float64
 12  article_count_minus-4  1107 non-null   float64
 13  article_7754922460402  1107 non-null   int64  
 14  article_7754922460403  1107 non-null   int64  
 15  arti

## Start and End Dates of Train-Test Period

In [3]:
# Inclusive lower / upper bounds of the train-test period, written as date
# strings so they can be compared directly with `transaction_date` values.
START_DATE, END_DATE = '2019-10-01', '2020-03-31'

In [4]:
# Every distinct transaction date present in the data ...
list_of_dates = list(df_lag.transaction_date.unique())

# ... narrowed to the [START_DATE, END_DATE] window (both ends inclusive) and
# put in ascending order. The comparison is a plain string comparison.
date_range = sorted(
    filter(lambda candidate: START_DATE <= candidate <= END_DATE, list_of_dates)
)

### Check Date Range

In [5]:
# `date_range` is built from the dates that occur in `df_lag`, so looking each
# date up in `df_lag` again can never come back empty. What can actually go
# wrong is (a) the window contains no dates at all, or (b) a period is missing,
# which shows up as an unusually large gap between consecutive dates.
if not date_range:
    print(f'No records between {START_DATE} and {END_DATE}')
else:
    # Differences between consecutive dates; the first (undefined) diff is dropped,
    # so entry i describes the step from date_range[i] to date_range[i + 1].
    date_gaps = pd.to_datetime(pd.Series(date_range)).diff().dropna()
    if not date_gaps.empty:
        # Treat the most frequent spacing as the expected cadence of the data.
        usual_gap = date_gaps.mode().iloc[0]
        for prev_dt, next_dt, gap in zip(date_range, date_range[1:], date_gaps):
            if gap > usual_gap:
                print(f'Gap of {gap.days} days without records between {prev_dt} and {next_dt}')

## Train-Test

In [6]:
# Target column (kept as a one-element list so frame[RESPONSE] stays a DataFrame).
RESPONSE = ['article_count']

# Feature columns, in the order they are handed to the model:
# calendar / price / lag features, then country indicators, then article indicators.
PREDICTORS = (
    ['year', 'week', 'month', 'original_price', 'article_count_minus-1']
    + [f'country_country_{idx}' for idx in range(3)]
    + [
        f'article_{stem}{suffix:02d}'
        for stem in ('77549224604', '77549227104')
        for suffix in range(2, 8)
    ]
)

In [7]:
from itertools import product
from xgboost import XGBRegressor


# (article, country) -> list of {'transaction_date', 'actual', 'predicted'} records
article_prediction_dict = {}

# Article indicator columns: two ID stems, each with suffixes 02..07.
articles = [
    f'article_{stem}{suffix:02d}'
    for stem in ('77549224604', '77549227104')
    for suffix in range(2, 8)
]

# Country indicator columns.
countries = [f'country_country_{idx}' for idx in range(3)]

# Walk forward through the window: for each date, fit a fresh model on all rows
# strictly before that date, then score that date's rows per (article, country).
for dt in date_range:
    train_data = df_lag[df_lag.transaction_date < dt]
    test_data = df_lag[df_lag.transaction_date == dt]

    # Without any earlier rows there is nothing to fit; keep the actuals for this
    # date but leave the predictions as NaN instead of failing inside fit().
    model = None
    if not train_data.empty:
        model = XGBRegressor()
        model.fit(train_data[PREDICTORS], train_data[RESPONSE])

    for article, country in product(articles, countries):
        # Rows of this date belonging to the (article, country) combination.
        # The mask is built once and reused for both features and response.
        combo_rows = test_data[(test_data[article] == 1) & (test_data[country] == 1)]

        actual = np.nan
        predicted = np.nan

        if not combo_rows.empty:
            # Only the first matching row is recorded for the combination.
            actual = combo_rows[RESPONSE].values[0][0]
            if model is not None:
                # Predict only when there is something to predict on, so the
                # model is never handed an empty frame.
                y_pred = model.predict(combo_rows[PREDICTORS])
                predicted = round(y_pred[0], 0)

        # Record actual and predicted for the combination; a combination with
        # no row on this date is stored as NaN / NaN.
        article_prediction_dict.setdefault((article, country), []).append(
            {'transaction_date': dt,
             'actual': actual,
             'predicted': predicted}
        )





In [8]:
@interact
def plot_actuals_and_predicted(article=articles, country=countries):
    """Plot actual vs. predicted article counts for one (article, country) pair.

    Reads the records stored in ``article_prediction_dict`` for the selected
    combination, orders them by ``transaction_date`` and draws both series on
    one axis. The figure title shows the mean absolute error.

    Parameters
    ----------
    article : str
        Article indicator column name; the default list is rendered by
        ``interact`` as a dropdown.
    country : str
        Country indicator column name; the default list is rendered by
        ``interact`` as a dropdown.

    Notes
    -----
    Dates on which the combination has no record are stored as NaN. They are
    left out of the error average (a plain sum would turn the whole metric into
    NaN) but remain in the plotted series, where they appear as gaps.
    """
    # Records for the (article, country) pair; empty if nothing was recorded.
    art_c_data = article_prediction_dict.get((article, country), [])

    # Order the records by transaction date.
    sorted_data = sorted(art_c_data, key=lambda entry: entry['transaction_date'])

    # Pull out the plotted series.
    x_vals = [entry['transaction_date'] for entry in sorted_data]
    actual_vals = np.array([entry['actual'] for entry in sorted_data], dtype=float)
    predicted_vals = np.array([entry['predicted'] for entry in sorted_data], dtype=float)

    # Mean absolute error over the dates that have both an actual and a prediction.
    abs_errors = np.abs(actual_vals - predicted_vals)
    scored_errors = abs_errors[~np.isnan(abs_errors)]
    mean_mae = scored_errors.mean() if scored_errors.size > 0 else np.nan

    # Plot!
    fig, ax = plt.subplots(1, 1, figsize=(14, 7))
    ax.plot(x_vals, actual_vals, label='Actual')
    ax.plot(x_vals, predicted_vals, label='Predicted')

    ax.set_xlabel('transaction_date')
    ax.set_ylabel('article_count')
    ax.set_title(f'Mean MAE = {mean_mae}')
    ax.tick_params(axis='x', labelrotation=90)
    ax.legend()
    plt.show()

interactive(children=(Dropdown(description='article', options=('article_7754922460402', 'article_7754922460403…