In [43]:
# Libraries setup
import os
import warnings
import pandas as pd
import numpy as np

from dotenv import load_dotenv
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

warnings.filterwarnings('ignore')
load_dotenv()

True

### Data Load Pipeline
The data-loading pipeline is almost the same as in the EDA notebook. For demonstration purposes it is kept here as an inline function rather than extracted into a separate module.

In [44]:
# Data locations come from environment variables (load_dotenv() is called in
# the setup cell); each value is None when its variable is not set.
fpath_data = os.getenv("FPATH_DATA")
fpath_dicts = os.getenv("FPATH_DICTS")

def get_data_order_items():
    """Load order items enriched with an English product category and purchase date.

    Reads ``order_items.csv``, ``products.csv``,
    ``product_category_name_translation.csv`` and ``orders.csv``. Each file
    name is appended directly to the module-level ``fpath_data`` string, so
    that prefix must already end with a path separator.

    Returns:
        pandas.DataFrame: order items that have a product category and a
        matching order. ``freight_value``, ``shipping_limit_date`` and
        ``product_category_name`` are removed; ``product_category_name_english``,
        ``order_purchase_timestamp`` and ``Purchase Date`` (the purchase
        timestamp floored to the day) are added.
    """
    # Base table. 'shipping_limit_date' is not parsed because it is dropped
    # below without ever being used.
    order_items = pd.read_csv(fpath_data + "order_items.csv")

    # Product -> category lookup; products without a category are discarded.
    product_categories = pd.read_csv(fpath_data + "products.csv")
    product_categories = product_categories[["product_id", "product_category_name"]].dropna()

    # Map categories to their English names. Categories without a translation
    # keep their original name instead of being dropped.
    categories_translation_data = pd.read_csv(fpath_data + "product_category_name_translation.csv")
    product_categories = pd.merge(product_categories, categories_translation_data, how='left', on='product_category_name')
    product_categories['product_category_name_english'] = (
        product_categories['product_category_name_english'].fillna(product_categories['product_category_name'])
    )

    # Attach the category to every order item; items whose product has no
    # category are treated as potentially incorrect data and removed.
    order_items = pd.merge(order_items, product_categories, how='left', on='product_id')
    order_items = order_items.dropna(subset=['product_category_name'])

    # Purchase timestamps live in the orders table. Take an explicit copy of
    # the column subset so the assignment below writes to an independent frame.
    orders = pd.read_csv(fpath_data + 'orders.csv')
    orders = orders[['order_id', 'order_purchase_timestamp']].copy()
    orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])

    # Inner join: order items without a matching order are dropped, so no
    # null timestamps are introduced by the merge itself.
    order_items = pd.merge(order_items, orders, how='inner', on='order_id')

    # Drop columns not related to this task.
    order_items = order_items.drop(columns=['freight_value', 'shipping_limit_date', 'product_category_name'])

    # Calendar day of the purchase (the column is already datetime here).
    order_items['Purchase Date'] = order_items['order_purchase_timestamp'].dt.floor('D')
    return order_items

In [45]:
order_items = get_data_order_items()

# Daily demand per category: the number of order-item rows in each
# (category, purchase day) group, ordered by category and then by day.
dataset = (
    order_items
    .groupby(['product_category_name_english', 'Purchase Date'])
    .agg(sold_products_quantity=('order_item_id', 'size'))
    .sort_values(by=['product_category_name_english', 'Purchase Date'], ascending=True)
    .reset_index()
)
dataset

Unnamed: 0,product_category_name_english,Purchase Date,sold_products_quantity
0,agro_industry_and_commerce,2017-01-23,2
1,agro_industry_and_commerce,2017-01-31,1
2,agro_industry_and_commerce,2017-02-05,1
3,agro_industry_and_commerce,2017-02-08,1
4,agro_industry_and_commerce,2017-02-12,1
...,...,...,...
18499,watches_gifts,2018-08-25,3
18500,watches_gifts,2018-08-26,2
18501,watches_gifts,2018-08-27,2
18502,watches_gifts,2018-08-28,1


### Feature Engineering
Feature engineering is the process of creating new features or transforming existing ones to improve 
the performance of machine learning models. 

##### Definition of the feature-engineering functions

In [46]:
# Time Based Features
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Add calendar features from the index and encode the category column.

    The input frame must be indexed by a datetime index and contain a
    ``product_category_name_english`` column.

    ``fit`` records the category levels present in the training frame and
    ``transform`` encodes every later frame against those same levels. This
    keeps the integer codes behind the categorical dtype identical between
    training and inference; deriving the levels separately from each frame
    would let the same category receive different codes in different frames.

    Attributes:
        categories_: category levels of ``product_category_name_english``
            observed during ``fit``.
    """

    _CATEGORY_COLUMN = 'product_category_name_english'

    def fit(self, X, y=None):
        """Remember the category levels present in ``X``; ``y`` is ignored."""
        self.categories_ = X[self._CATEGORY_COLUMN].astype('category').cat.categories
        return self

    def transform(self, X):
        """Return a copy of ``X`` with calendar columns and a categorical category column.

        Values of the category column that were not seen during ``fit``
        become missing (NaN) in the categorical column. When the transformer
        has not been fitted, the levels are derived from ``X`` itself.
        """
        X_ = X.copy()
        X_['dayofweek'] = X_.index.dayofweek
        X_['quarter'] = X_.index.quarter
        X_['month'] = X_.index.month
        X_['year'] = X_.index.year
        X_['dayofyear'] = X_.index.dayofyear
        X_['dayofmonth'] = X_.index.day
        X_['weekofyear'] = X_.index.isocalendar().week

        learned_categories = getattr(self, 'categories_', None)
        if learned_categories is None:
            # Not fitted: fall back to levels inferred from this frame.
            category_dtype = 'category'
        else:
            category_dtype = pd.CategoricalDtype(categories=learned_categories)
        X_[self._CATEGORY_COLUMN] = X_[self._CATEGORY_COLUMN].astype(category_dtype)
        return X_

# # Lag Features
# def create_lag_features(df, column, lags):
#     for lag in lags:
#         df[f'{column}_lag_{lag}'] = df.groupby('product_category_name_english')[column].shift(lag)
#     return df

# # Rolling Statistics
# def create_rolling_features(df, column, windows):
#     for window in windows:
#         df[f'{column}_rolling_mean_{window}'] = df.groupby('product_category_name_english')[column].rolling(window=window).mean().reset_index(0,drop=True)
#         df[f'{column}_rolling_std_{window}'] = df.groupby('product_category_name_english')[column].rolling(window=window).std().reset_index(0,drop=True)
#     return df

# # Exponential Moving Average
# def create_ema_features(df, column, spans):
#     for span in spans:
#         df[f'{column}_ema_{span}'] = df.groupby('product_category_name_english')[column].ewm(span=span).mean().reset_index(0,drop=True)
#     return df

# # Fourier Terms for Seasonality
# def fourier_series(dates, period, order):
#     t = (dates - pd.Timestamp("1970-01-01")) / pd.Timedelta('1D')
#     return pd.DataFrame({f'fourier_cos_{period}_{n}': np.cos(2 * n * np.pi * t / period),
#                          f'fourier_sin_{period}_{n}': np.sin(2 * n * np.pi * t / period)}
#                         for n in range(1, order + 1))

# # Price-related Features
# def price_related_features(df):
#     df['price_rolling_mean'] = df.groupby('product_category_name_english')['price'].rolling(window=7).mean().reset_index(0,drop=True)
#     df['price_relative_to_mean'] = df['price'] / df['price_rolling_mean']
#     return df

# # Sold Items Quantity Features
# def quantity_related_features(df):
#     # Sort the dataframe by timestamp and category
#     df = df.sort_values(['product_category_name_english', 'order_purchase_timestamp'])
#     # Calculate rolling count of sold items quantity
#     df['quantity_rolling_count'] = df.groupby('product_category_name_english')['order_item_id'].transform(
#         lambda x: x.rolling(window='7D').count()
#     )
#     # Calculate daily quantity
#     df['daily_quantity'] = df.groupby(['product_category_name_english', df['order_purchase_timestamp'].dt.date])['order_item_id'].transform('count')
#     # Calculate quantity relative to rolling count
#     df['quantity_relative_to_count'] = df['daily_quantity'] / df['quantity_rolling_count']
#     # Calculate cumulative sum of sold items within each category
#     df['cumulative_quantity'] = df.groupby('product_category_name_english')['daily_quantity'].cumsum()
#     return df

##### Final data pipeline that leverages functions written above

In [47]:
def prepare_data(dataset):
    """Aggregate sold quantities per purchase date and product category.

    The input frame is left untouched: date parsing is done on a copy, so the
    caller's ``Purchase Date`` column keeps its original dtype.

    Args:
        dataset: DataFrame with ``Purchase Date``,
            ``product_category_name_english`` and ``sold_products_quantity``
            columns.

    Returns:
        pandas.DataFrame: one row per (``Purchase Date``, category) pair with
        the summed ``sold_products_quantity``, sorted by category and then by
        date. ``Purchase Date`` is a datetime column.
    """
    # Copy only the columns that are needed so the caller's frame is not mutated.
    sales = dataset[['Purchase Date', 'product_category_name_english', 'sold_products_quantity']].copy()
    sales['Purchase Date'] = pd.to_datetime(sales['Purchase Date'])
    daily_sales = (
        sales
        .groupby(['Purchase Date', 'product_category_name_english'])['sold_products_quantity']
        .sum()
        .reset_index()
    )
    return daily_sales.sort_values(['product_category_name_english', 'Purchase Date'])

def create_forecast_data(last_date, forecast_end, category):
    """Build an empty forecast frame for one category.

    Args:
        last_date: last date with observed data; the horizon starts the day after.
        forecast_end: last date (inclusive) of the forecast horizon.
        category: value placed in ``product_category_name_english`` on every row.

    Returns:
        pandas.DataFrame: indexed by daily ``Purchase Date`` values, with a
        single ``product_category_name_english`` column.
    """
    horizon = pd.date_range(
        start=last_date + pd.Timedelta(days=1),
        end=forecast_end,
        freq='D',
    )
    frame = pd.DataFrame({'Purchase Date': horizon, 'product_category_name_english': category})
    return frame.set_index('Purchase Date')

##### Evaluation metrics

In [48]:
def wmae(y_true, y_pred):
    """Weighted mean absolute error with linearly increasing weights.

    The i-th observation (1-based) gets weight ``i``, so later observations
    contribute more to the score than earlier ones.
    """
    weights = np.arange(1, len(y_true) + 1)
    absolute_errors = np.abs(y_true - y_pred)
    return np.sum(absolute_errors * weights) / np.sum(weights)

def bias(y_true, y_pred):
    """Mean signed error: positive when predictions are too high on average."""
    signed_errors = y_pred - y_true
    return np.mean(signed_errors)

##### Main function

In [49]:
def xgboost_forecast(data, category, forecast_end='2018-09-24'):
    """Train an XGBoost model on one category and forecast up to ``forecast_end``.

    The last 30 days of ``data`` are held out. The model is fitted on
    everything before them, and that same model is used both to predict the
    held-out period and to forecast the future horizon.

    Args:
        data: frame with ``Purchase Date``, ``product_category_name_english``
            and ``sold_products_quantity`` columns.
        category: category label used for the rows of the future horizon.
        forecast_end: last date (inclusive) of the future horizon.

    Returns:
        tuple: ``(test_data, in_sample_forecast, forecast_data)``.
        ``test_data`` holds the held-out rows, ``in_sample_forecast`` the
        model's predictions for them (despite the name, these rows were not
        used for fitting), and ``forecast_data`` the future horizon with a
        ``forecast`` column.
    """
    indexed = data.set_index('Purchase Date')

    # Columns handed to the model, in the order FeatureEngineer produces them.
    features = [
        'product_category_name_english',
        'dayofweek', 'quarter', 'month', 'year',
        'dayofyear', 'dayofmonth', 'weekofyear',
    ]

    pipeline = Pipeline([
        ('feature_engineer', FeatureEngineer()),
        ('xgb', XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=42, enable_categorical=True))
    ])

    # Hold out the final 30 days as the test window.
    final_date = indexed.index.max()
    split_date = final_date - pd.Timedelta(days=30)
    train_data = indexed[indexed.index <= split_date]
    test_data = indexed[indexed.index > split_date]

    pipeline.fit(train_data[['product_category_name_english']], train_data['sold_products_quantity'])

    engineer = pipeline.named_steps['feature_engineer']
    model = pipeline.named_steps['xgb']

    def _predict(frame):
        # Engineer the features, then score with the fitted regressor.
        return model.predict(engineer.transform(frame)[features])

    # Predictions for the held-out window.
    in_sample_forecast = _predict(test_data[['product_category_name_english']])

    # Forecast for the days after the last observation.
    forecast_data = create_forecast_data(final_date, forecast_end, category)
    forecast_data['forecast'] = _predict(forecast_data)

    return test_data, in_sample_forecast, forecast_data

##### Main execution

In [50]:
# Single source of truth for the forecast horizon: it drives the skip check,
# the skip message and the horizon passed to xgboost_forecast.
FORECAST_END = '2018-09-24'
forecast_end_ts = pd.Timestamp(FORECAST_END)

data = prepare_data(dataset)

forecasts = {}
metrics = {}

for category in tqdm(data['product_category_name_english'].unique(), desc="Processing categories"):
    category_data = data[data['product_category_name_english'] == category]

    # Check last available date for the category
    last_date = category_data['Purchase Date'].max()
    print(f"\nCategory: {category}, Last date: {last_date}")

    # Nothing to forecast when observations already reach the horizon end.
    if last_date >= forecast_end_ts:
        print(f"Skipping forecast for {category} as data is available up to or beyond {FORECAST_END}")
        continue

    # NOTE: each category's forecast starts the day after its own last
    # observed date, so horizons differ in length and the combined forecast
    # table contains NaN where a category has no forecast for a given day.
    test_data, in_sample_forecast, forecast_data = xgboost_forecast(category_data, category, forecast_end=FORECAST_END)

    # Hold-out metrics: predictions vs. actuals over the test window
    actuals = test_data['sold_products_quantity']
    mae = mean_absolute_error(actuals, in_sample_forecast)
    wmae_score = wmae(actuals.values, in_sample_forecast)
    bias_score = bias(actuals.values, in_sample_forecast)

    metrics[category] = {
        'MAE': mae,
        'WMAE': wmae_score,
        'Bias': bias_score,
        'Last_Date': last_date
    }

    forecasts[category] = forecast_data['forecast']

Processing categories:   0%|          | 0/73 [00:00<?, ?it/s]


Category: agro_industry_and_commerce, Last date: 2018-08-26 00:00:00


Processing categories:   1%|▏         | 1/73 [00:00<00:26,  2.76it/s]


Category: air_conditioning, Last date: 2018-08-25 00:00:00


Processing categories:   3%|▎         | 2/73 [00:00<00:30,  2.34it/s]


Category: art, Last date: 2018-08-27 00:00:00


Processing categories:   4%|▍         | 3/73 [00:01<00:27,  2.51it/s]


Category: arts_and_craftmanship, Last date: 2018-08-24 00:00:00


Processing categories:   5%|▌         | 4/73 [00:01<00:22,  3.12it/s]


Category: audio, Last date: 2018-08-28 00:00:00


Processing categories:   7%|▋         | 5/73 [00:01<00:26,  2.58it/s]


Category: auto, Last date: 2018-08-28 00:00:00


Processing categories:   8%|▊         | 6/73 [00:02<00:30,  2.22it/s]


Category: baby, Last date: 2018-08-28 00:00:00


Processing categories:  10%|▉         | 7/73 [00:03<00:31,  2.07it/s]


Category: bed_bath_table, Last date: 2018-08-28 00:00:00


Processing categories:  11%|█         | 8/73 [00:03<00:35,  1.83it/s]


Category: books_general_interest, Last date: 2018-08-27 00:00:00


Processing categories:  12%|█▏        | 9/73 [00:04<00:36,  1.74it/s]


Category: books_imported, Last date: 2018-08-07 00:00:00


Processing categories:  14%|█▎        | 10/73 [00:04<00:31,  1.99it/s]


Category: books_technical, Last date: 2018-08-23 00:00:00


Processing categories:  16%|█▋        | 12/73 [00:05<00:24,  2.50it/s]


Category: cds_dvds_musicals, Last date: 2018-04-20 00:00:00

Category: christmas_supplies, Last date: 2018-08-19 00:00:00


Processing categories:  18%|█▊        | 13/73 [00:05<00:22,  2.67it/s]


Category: cine_photo, Last date: 2018-08-25 00:00:00


Processing categories:  19%|█▉        | 14/73 [00:05<00:19,  2.95it/s]


Category: computers, Last date: 2018-08-22 00:00:00


Processing categories:  21%|██        | 15/73 [00:06<00:19,  2.92it/s]


Category: computers_accessories, Last date: 2018-08-29 00:00:00


Processing categories:  22%|██▏       | 16/73 [00:06<00:22,  2.49it/s]


Category: consoles_games, Last date: 2018-08-28 00:00:00


Processing categories:  23%|██▎       | 17/73 [00:07<00:24,  2.28it/s]


Category: construction_tools_construction, Last date: 2018-08-27 00:00:00


Processing categories:  25%|██▍       | 18/73 [00:07<00:26,  2.07it/s]


Category: construction_tools_lights, Last date: 2018-08-28 00:00:00


Processing categories:  26%|██▌       | 19/73 [00:08<00:24,  2.18it/s]


Category: construction_tools_safety, Last date: 2018-08-24 00:00:00


Processing categories:  27%|██▋       | 20/73 [00:08<00:23,  2.27it/s]


Category: cool_stuff, Last date: 2018-08-28 00:00:00


Processing categories:  29%|██▉       | 21/73 [00:09<00:24,  2.15it/s]


Category: costruction_tools_garden, Last date: 2018-08-28 00:00:00


Processing categories:  30%|███       | 22/73 [00:09<00:22,  2.28it/s]


Category: costruction_tools_tools, Last date: 2018-08-22 00:00:00


Processing categories:  33%|███▎      | 24/73 [00:10<00:16,  2.98it/s]


Category: diapers_and_hygiene, Last date: 2018-08-26 00:00:00

Category: drinks, Last date: 2018-08-27 00:00:00


Processing categories:  34%|███▍      | 25/73 [00:10<00:17,  2.69it/s]


Category: dvds_blu_ray, Last date: 2018-08-23 00:00:00


Processing categories:  36%|███▌      | 26/73 [00:11<00:18,  2.48it/s]


Category: electronics, Last date: 2018-08-26 00:00:00


Processing categories:  37%|███▋      | 27/73 [00:11<00:22,  2.06it/s]


Category: fashio_female_clothing, Last date: 2018-08-22 00:00:00


Processing categories:  38%|███▊      | 28/73 [00:11<00:19,  2.35it/s]


Category: fashion_bags_accessories, Last date: 2018-08-28 00:00:00


Processing categories:  40%|███▉      | 29/73 [00:12<00:21,  2.03it/s]


Category: fashion_childrens_clothes, Last date: 2018-06-03 00:00:00


Processing categories:  41%|████      | 30/73 [00:12<00:17,  2.46it/s]


Category: fashion_male_clothing, Last date: 2018-08-17 00:00:00


Processing categories:  42%|████▏     | 31/73 [00:13<00:16,  2.55it/s]


Category: fashion_shoes, Last date: 2018-08-21 00:00:00


Processing categories:  44%|████▍     | 32/73 [00:13<00:17,  2.34it/s]


Category: fashion_sport, Last date: 2018-08-19 00:00:00


Processing categories:  45%|████▌     | 33/73 [00:13<00:14,  2.73it/s]


Category: fashion_underwear_beach, Last date: 2018-08-09 00:00:00


Processing categories:  47%|████▋     | 34/73 [00:14<00:14,  2.75it/s]


Category: fixed_telephony, Last date: 2018-08-13 00:00:00


Processing categories:  48%|████▊     | 35/73 [00:14<00:15,  2.49it/s]


Category: flowers, Last date: 2018-07-19 00:00:00


Processing categories:  49%|████▉     | 36/73 [00:15<00:13,  2.77it/s]


Category: food, Last date: 2018-08-29 00:00:00


Processing categories:  51%|█████     | 37/73 [00:15<00:16,  2.21it/s]


Category: food_drink, Last date: 2018-08-23 00:00:00


Processing categories:  52%|█████▏    | 38/73 [00:16<00:16,  2.10it/s]


Category: furniture_bedroom, Last date: 2018-08-25 00:00:00


Processing categories:  53%|█████▎    | 39/73 [00:16<00:14,  2.33it/s]


Category: furniture_decor, Last date: 2018-08-28 00:00:00


Processing categories:  55%|█████▍    | 40/73 [00:17<00:18,  1.78it/s]


Category: furniture_living_room, Last date: 2018-08-24 00:00:00


Processing categories:  56%|█████▌    | 41/73 [00:18<00:18,  1.73it/s]


Category: furniture_mattress_and_upholstery, Last date: 2018-07-03 00:00:00


Processing categories:  58%|█████▊    | 42/73 [00:18<00:14,  2.09it/s]


Category: garden_tools, Last date: 2018-08-27 00:00:00


Processing categories:  59%|█████▉    | 43/73 [00:19<00:16,  1.81it/s]


Category: health_beauty, Last date: 2018-08-29 00:00:00


Processing categories:  60%|██████    | 44/73 [00:19<00:17,  1.63it/s]


Category: home_appliances, Last date: 2018-08-25 00:00:00


Processing categories:  62%|██████▏   | 45/73 [00:20<00:17,  1.61it/s]


Category: home_appliances_2, Last date: 2018-08-23 00:00:00


Processing categories:  63%|██████▎   | 46/73 [00:21<00:16,  1.62it/s]


Category: home_comfort_2, Last date: 2018-07-28 00:00:00


Processing categories:  64%|██████▍   | 47/73 [00:21<00:13,  1.93it/s]


Category: home_confort, Last date: 2018-08-24 00:00:00


Processing categories:  66%|██████▌   | 48/73 [00:21<00:14,  1.77it/s]


Category: home_construction, Last date: 2018-08-26 00:00:00


Processing categories:  67%|██████▋   | 49/73 [00:22<00:13,  1.75it/s]


Category: housewares, Last date: 2018-08-28 00:00:00


Processing categories:  68%|██████▊   | 50/73 [00:23<00:13,  1.72it/s]


Category: industry_commerce_and_business, Last date: 2018-08-26 00:00:00


Processing categories:  70%|██████▉   | 51/73 [00:23<00:12,  1.80it/s]


Category: kitchen_dining_laundry_garden_furniture, Last date: 2018-09-03 00:00:00


Processing categories:  73%|███████▎  | 53/73 [00:24<00:08,  2.22it/s]


Category: la_cuisine, Last date: 2018-04-17 00:00:00

Category: luggage_accessories, Last date: 2018-08-27 00:00:00


Processing categories:  74%|███████▍  | 54/73 [00:25<00:09,  1.97it/s]


Category: market_place, Last date: 2018-08-22 00:00:00


Processing categories:  77%|███████▋  | 56/73 [00:25<00:07,  2.29it/s]


Category: music, Last date: 2018-08-19 00:00:00

Category: musical_instruments, Last date: 2018-08-22 00:00:00


Processing categories:  78%|███████▊  | 57/73 [00:26<00:08,  2.00it/s]


Category: office_furniture, Last date: 2018-08-28 00:00:00


Processing categories:  79%|███████▉  | 58/73 [00:27<00:08,  1.79it/s]


Category: party_supplies, Last date: 2018-08-29 00:00:00


Processing categories:  82%|████████▏ | 60/73 [00:27<00:05,  2.56it/s]


Category: pc_gamer, Last date: 2018-08-18 00:00:00

Category: perfumery, Last date: 2018-08-27 00:00:00


Processing categories:  84%|████████▎ | 61/73 [00:28<00:05,  2.06it/s]


Category: pet_shop, Last date: 2018-08-26 00:00:00


Processing categories:  85%|████████▍ | 62/73 [00:29<00:05,  1.86it/s]


Category: portateis_cozinha_e_preparadores_de_alimentos, Last date: 2018-08-27 00:00:00


Processing categories:  88%|████████▊ | 64/73 [00:29<00:03,  2.64it/s]


Category: security_and_services, Last date: 2017-09-17 00:00:00

Category: signaling_and_security, Last date: 2018-08-24 00:00:00


Processing categories:  89%|████████▉ | 65/73 [00:29<00:03,  2.49it/s]


Category: small_appliances, Last date: 2018-08-22 00:00:00


Processing categories:  90%|█████████ | 66/73 [00:30<00:03,  2.09it/s]


Category: small_appliances_home_oven_and_coffee, Last date: 2018-08-23 00:00:00


Processing categories:  92%|█████████▏| 67/73 [00:30<00:02,  2.39it/s]


Category: sports_leisure, Last date: 2018-08-29 00:00:00


Processing categories:  93%|█████████▎| 68/73 [00:31<00:02,  2.10it/s]


Category: stationery, Last date: 2018-08-27 00:00:00


Processing categories:  95%|█████████▍| 69/73 [00:32<00:02,  1.92it/s]


Category: tablets_printing_image, Last date: 2018-06-09 00:00:00


Processing categories:  96%|█████████▌| 70/73 [00:32<00:01,  2.18it/s]


Category: telephony, Last date: 2018-08-28 00:00:00


Processing categories:  97%|█████████▋| 71/73 [00:33<00:01,  1.93it/s]


Category: toys, Last date: 2018-08-29 00:00:00


Processing categories:  99%|█████████▊| 72/73 [00:33<00:00,  1.70it/s]


Category: watches_gifts, Last date: 2018-08-29 00:00:00


Processing categories: 100%|██████████| 73/73 [00:34<00:00,  2.11it/s]


In [51]:
# Assemble the per-category results: forecasts become one column per category
# (indexed by date), metrics one row per category.
forecast_df = pd.DataFrame(forecasts)
metrics_df = pd.DataFrame(metrics).transpose()

print("\nForecasts up to 2018-09-24:")
print(forecast_df)
print("\nMetrics for each category:")
print(metrics_df)

# Persist both tables to the current working directory.
forecast_df.to_csv('XGBoost_category_demand_forecasts.csv')
print("Forecasts have been saved to 'XGBoost_category_demand_forecasts.csv'")
metrics_df.to_csv('XGBoost_category_demand_metrics.csv')
print("Metrics have been saved to 'XGBoost_category_demand_metrics.csv'")


Forecasts up to 2018-09-24:
               agro_industry_and_commerce  air_conditioning       art  \
Purchase Date                                                           
2017-09-18                            NaN               NaN       NaN   
2017-09-19                            NaN               NaN       NaN   
2017-09-20                            NaN               NaN       NaN   
2017-09-21                            NaN               NaN       NaN   
2017-09-22                            NaN               NaN       NaN   
...                                   ...               ...       ...   
2018-09-20                       2.094255          1.039503  1.002913   
2018-09-21                       2.508539          1.139947  0.978940   
2018-09-22                       1.822733          0.958454  1.105342   
2018-09-23                       2.015668          1.043854  1.122530   
2018-09-24                       1.655123          1.137339  1.130967   

               arts_a