In [1]:
from river import (
    base,
    compose, 
    metrics, 
    drift,
    forest,
    cluster,
    preprocessing,
    dummy
)
from typing import Any, Dict, Hashable
import datetime as dt

In [2]:
# Hand-written sample sales event used by the preprocessing and model cells below.
x = {
    'concept_drift_stage': 0,
    'day_of_week': 1,
    'event_id': '0eb35a0c-8d25-47de-8dda-78f42dc0bf69',
    'is_holiday': False,
    'is_promotion_active': True,
    'month': 9,
    'product_id': 'SKU_00013',
    'promotion_id': 'PROMO_2023_9_SKU',
    'quantity_sold': 32,
    'store_id': 'STORE_005',
    'timestamp': '2023-09-05T23:56:11.782186+00:00',
    # 'total_sales_amount' is currently left out of the sample:
    # 'total_sales_amount': 1086.4,
    'unit_price': 33.95,
}

In [3]:
class CustomOrdinalEncoder:
    """
    Incremental, picklable ordinal encoder for dictionary samples.

    Every feature (dictionary key) gets its own mapping from category value to
    an integer ID. IDs are handed out per feature in order of first
    appearance, starting at 0.

    Note: categories are dictionary keys, so values that compare equal and hash
    equal share one ID within a feature (e.g. ``True`` and ``1``).
    """

    def __init__(self):
        # feature name -> {category value -> integer ID}
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        # feature name -> next unused integer ID for that feature
        self._feature_next_ids: Dict[Hashable, int] = {}

    @staticmethod
    def _is_hashable(value: Any) -> bool:
        """Return True when ``value`` can actually be used as a dictionary key."""
        # isinstance(value, Hashable) is not sufficient: a tuple containing a
        # list passes that check but still raises TypeError when hashed.
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def learn_one(self, x: Dict[Hashable, Any]):
        """
        Learn the categories present in a single sample.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are category values.
               Every key in the dictionary is treated as a categorical feature.

        Unhashable values are skipped with a printed warning; a feature that
        has only ever been seen with unhashable values is not registered.
        """
        for feature_name, category_value in x.items():
            if not self._is_hashable(category_value):
                print(f"Warning: Skipping unhashable value for feature '{feature_name}': {category_value}")
                continue
            # First sighting of a feature creates its mapping and its counter.
            feature_map = self._feature_mappings.setdefault(feature_name, {})
            next_id = self._feature_next_ids.setdefault(feature_name, 0)
            if category_value not in feature_map:
                feature_map[category_value] = next_id
                self._feature_next_ids[feature_name] = next_id + 1

    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        """
        Replace the category values of a single sample with their integer IDs.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are category values.

        Returns:
            A new dictionary holding the integer ID of every feature the
            encoder has learned. Features the encoder has never learned are
            left out of the result.

        Raises:
            KeyError: If a learned feature carries a category value that was
                      never learned for it. Unhashable values can never have
                      been learned, so they raise KeyError as well.
        """
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            feature_map = self._feature_mappings.get(feature_name)
            if feature_map is None:
                # Unknown feature: this encoder only outputs encoded features.
                continue
            if not self._is_hashable(category_value) or category_value not in feature_map:
                raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}' during transform.")
            transformed_sample[feature_name] = feature_map[category_value]
        return transformed_sample

    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        """Returns the current mappings for all features (the live internal dict, not a copy)."""
        return self._feature_mappings

    def get_feature_next_ids(self) -> Dict[Hashable, int]:
        """Returns the next available IDs for all features (the live internal dict, not a copy)."""
        return self._feature_next_ids

    def __repr__(self) -> str:
        """String representation: class name, feature count and categories per feature."""
        num_features = len(self._feature_mappings)
        feature_details = ", ".join(
            f"{name}: {len(mapping)} categories"
            for name, mapping in self._feature_mappings.items()
        )
        # Use the real class name so the repr cannot drift from the class definition.
        return f"{type(self).__name__}(features={num_features} [{feature_details}])"
    
class DictImputer(base.Transformer):
    """
    Fill in selected features that are absent from a sample or set to None.

    Parameters
    ----------
    on
        Names of the features to check and, if needed, fill.
    fill_value
        Value written for every checked feature that is missing or None.
    """

    def __init__(self, on: list, fill_value):
        self.fill_value = fill_value
        self.on = on

    def transform_one(self, x: dict):
        # Work on a copy so the caller's sample is not modified.
        completed = x.copy()
        for name in self.on:
            if completed.get(name) is not None:
                continue
            # Covers both a missing key and a key explicitly set to None.
            completed[name] = self.fill_value
        return completed
    

def extract_device_info(x):
    """Return the 'os' and 'browser' entries of ``x['device_info']`` as a flat dict.

    Raises KeyError when 'device_info' or either of the two entries is missing.
    """
    device = x['device_info']
    return {field: device[field] for field in ('os', 'browser')}

def extract_timestamp_info(x):
    """Split the ISO-8601 string in ``x['timestamp']`` into calendar and clock fields.

    The timestamp must carry a UTC offset (e.g. ``+00:00``). The fractional
    seconds part is optional: ``datetime.isoformat()`` omits it when the
    microsecond field is 0, so both layouts are accepted.

    Returns:
        A dict with integer 'year', 'month', 'day', 'hour', 'minute' and
        'second' entries, taken as written in the string (no timezone
        conversion is applied).

    Raises:
        KeyError: If ``x`` has no 'timestamp' entry.
        ValueError: If the string matches neither accepted layout.
    """
    raw = x['timestamp']
    try:
        parsed = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # Same layout without the fractional seconds part.
        parsed = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': parsed.year,
        'month': parsed.month,
        'day': parsed.day,
        'hour': parsed.hour,
        'minute': parsed.minute,
        'second': parsed.second
    }

def extract_coordinates(x):
    """Return the 'lat' and 'lon' entries of ``x['location']`` as a flat dict.

    Raises KeyError when 'location' or either of the two entries is missing.
    """
    location = x['location']
    return {axis: location[axis] for axis in ('lat', 'lon')}


In [4]:
# Stateful encoders passed to process_sample, which updates them in place
# through learn_one on every call.
encoders = dict(
    one_hot_encoder=preprocessing.OneHotEncoder(),
    standard_scaler=preprocessing.StandardScaler(),
)

In [5]:
from pprint import pprint

def process_sample(x, encoders):
    """
    Turn one raw sales event into an encoded feature dictionary.

    Steps: pick the pass-through fields, derive calendar fields from the
    timestamp, pick the identifier fields, merge the three, then standard-scale
    the numerical features and one-hot encode the categorical ones.

    Args:
        x: Raw event dictionary. Must contain every selected key as well as
           'timestamp'.
        encoders: Dictionary with a "standard_scaler" and a "one_hot_encoder"
           entry. Both are updated with this sample (learn_one) before being
           used to transform it.

    Returns:
        A tuple ``(features, encoders)``: the merged scaled and one-hot
        features, and a dictionary holding the same two encoder objects.

    Side effects:
        Pretty-prints the merged, not yet encoded, feature dictionary.
    """
    def _select(sample, *keys):
        # One-off selector: build it, run its learn step, return the kept keys.
        selector = compose.Select(*keys)
        selector.learn_one(sample)
        return selector.transform_one(sample)

    # Fields used as they arrive ('total_sales_amount' is currently excluded).
    passthrough = _select(
        x,
        'concept_drift_stage',
        'day_of_week',
        'is_holiday',
        'is_promotion_active',
        'month',
        'unit_price',
    )

    # Calendar fields derived from the timestamp string.
    timestamp_only = _select(x, "timestamp")
    timestamp_parser = compose.FuncTransformer(extract_timestamp_info)
    timestamp_parser.learn_one(timestamp_only)
    calendar = timestamp_parser.transform_one(timestamp_only)

    identifiers = _select(x, 'product_id', 'promotion_id', 'store_id')

    # Later operands win on duplicate keys, so the timestamp-derived 'month'
    # replaces the raw 'month' field.
    x_to_process = passthrough | calendar | identifiers
    pprint(x_to_process)

    # 'total_sales_amount' is currently excluded from the numerical features.
    numerical_features = ['unit_price']
    # 'hour', 'minute' and 'second' are extracted above but currently not encoded.
    categorical_features = [
        'is_promotion_active',
        'is_holiday',
        'day_of_week',
        'concept_drift_stage',
        'year',
        'month',
        'day',
        'product_id',
        'promotion_id',
        'store_id',
    ]
    x_num = _select(x_to_process, *numerical_features)
    x_cat = _select(x_to_process, *categorical_features)

    # Each encoder learns from the sample first and transforms it afterwards.
    scaler = encoders["standard_scaler"]
    scaler.learn_one(x_num)
    x_num = scaler.transform_one(x_num)

    one_hot = encoders["one_hot_encoder"]
    one_hot.learn_one(x_cat)
    x_cat = one_hot.transform_one(x_cat)

    fitted = {
        "one_hot_encoder": encoders["one_hot_encoder"],
        "standard_scaler": encoders["standard_scaler"],
    }
    return x_num | x_cat, fitted


In [6]:
# Encode the sample event. process_sample returns (features, encoders); index [0]
# keeps only the feature dictionary. The call also pretty-prints the merged raw
# features and updates the shared `encoders` in place.
processed_x = process_sample(x, encoders)[0]
processed_x

{'concept_drift_stage': 0,
 'day': 5,
 'day_of_week': 1,
 'hour': 23,
 'is_holiday': False,
 'is_promotion_active': True,
 'minute': 56,
 'month': 9,
 'product_id': 'SKU_00013',
 'promotion_id': 'PROMO_2023_9_SKU',
 'second': 11,
 'store_id': 'STORE_005',
 'unit_price': 33.95,
 'year': 2023}


{'unit_price': 0.0,
 'store_id_STORE_005': 1,
 'year_2023': 1,
 'promotion_id_PROMO_2023_9_SKU': 1,
 'is_holiday_False': 1,
 'is_promotion_active_True': 1,
 'product_id_SKU_00013': 1,
 'day_of_week_1': 1,
 'concept_drift_stage_0': 1,
 'month_9': 1,
 'day_5': 1}

In [7]:
from river import time_series, linear_model

# Linear regressor handed to SNARIMAX as its inner regressor.
regressor_snarimax = linear_model.PARegressor(C=0.01, mode=1)

model = time_series.SNARIMAX(
    p=2,   # non-seasonal AR order (start slightly lower)
    d=1,   # differencing order, for trend
    q=1,   # non-seasonal MA order (start slightly lower)
    # NOTE(review): m counts observations, so 7 means "weekly" only if the
    # series has one observation per day - confirm against the event data.
    m=7,   # season length
    sp=1,  # seasonal AR order
    sd=0,  # no seasonal differencing initially
    sq=1,  # seasonal MA order
    regressor=regressor_snarimax,  # the PARegressor defined above
)


In [8]:
# Update the model with the sample's encoded features and its observed quantity sold.
model.learn_one(
    x=processed_x,
    y=x["quantity_sold"],
)

In [9]:
# One-step-ahead forecast, passing the encoded sample as that step's features.
y_pred = model.forecast(
    horizon=1,
    xs=[processed_x],
)

In [10]:
# Display the forecast list (one value per horizon step).
y_pred

[32.0]

In [11]:
import pandas as pd

In [12]:
# Load the sales dataset. The path is relative, so it resolves against the
# kernel's current working directory.
data = pd.read_parquet("../fastapi_app/data/sales_forecasting.parquet")

In [13]:
# Inspect the last rows of the dataset.
data.tail(20)

Unnamed: 0,event_id,timestamp,product_id,store_id,quantity_sold,unit_price,total_sales_amount,is_promotion_active,promotion_id,day_of_week,month,is_holiday,concept_drift_stage
9982,d552d8e9-bccc-400f-aab7-3460fc28f06d,2023-08-06T09:55:10.407069+00:00,SKU_00004,STORE_005,96,19.69,1890.24,True,PROMO_2023_8_SKU,6,8,False,0
9983,be09786e-cbcb-4dad-9161-373d94fcb6e6,2023-08-06T10:26:03.407069+00:00,SKU_00009,STORE_001,61,182.5,11132.5,False,,6,8,False,0
9984,7274cea3-48c9-41a9-9e40-f24188d0a6e0,2023-08-06T10:26:42.407069+00:00,SKU_00004,STORE_003,28,24.57,687.96,False,,6,8,False,0
9985,96d021b6-8d30-42d2-be9b-6c7243440041,2023-08-06T10:27:29.407069+00:00,SKU_00011,STORE_001,100,76.35,7635.0,False,,6,8,False,0
9986,205557e7-9596-483e-bc8d-d16b324f8936,2023-08-06T11:09:45.407069+00:00,SKU_00002,STORE_003,19,165.07,3136.33,False,,6,8,False,0
9987,ec51a9be-0e38-4854-8b1b-c051db37b9cd,2023-08-06T11:14:08.407069+00:00,SKU_00006,STORE_002,9,7.51,67.59,False,,6,8,False,0
9988,a46e8b01-f876-477f-8e96-a0e7a19b9c8f,2023-08-06T11:15:36.407069+00:00,SKU_00016,STORE_001,30,190.76,5722.8,False,,6,8,False,0
9989,579c6248-810a-416c-ba76-b1bbdbe61998,2023-08-06T11:16:32.407069+00:00,SKU_00005,STORE_001,35,190.53,6668.55,False,,6,8,False,0
9990,169c7c75-5b42-48b5-9848-dad294397c9a,2023-08-06T11:59:46.407069+00:00,SKU_00011,STORE_001,102,76.35,7787.7,False,,6,8,False,0
9991,dc12a630-8064-4540-bf9c-86ee93f7a40f,2023-08-06T12:03:09.407069+00:00,SKU_00003,STORE_003,63,14.36,904.68,False,,6,8,False,0


In [14]:
# Take the first (index, row) pair of the frame; x_prev stays {} and nothing is
# printed when the frame is empty.
row = next(data.iterrows(), None)
x_prev = {}
if row is not None:
    x_prev = row[1].to_dict()
    print(x_prev)


{'event_id': 'c59603a7-14a7-4798-a0b1-ec159ec496e7', 'timestamp': '2023-05-12T15:59:24.036464+00:00', 'product_id': 'SKU_00014', 'store_id': 'STORE_005', 'quantity_sold': 26, 'unit_price': 184.67, 'total_sales_amount': 4801.42, 'is_promotion_active': False, 'promotion_id': None, 'day_of_week': 4, 'month': 5, 'is_holiday': False, 'concept_drift_stage': 0}


In [1]:
from pydantic import BaseModel

class Car(BaseModel):
    """Pydantic model describing a car."""

    brand: str
    model: str
    year: int
    color: str
    price: float

    def get_car_information(self) -> str:
        """Return all fields as one 'Label: value' line, separated by commas."""
        labelled_fields = (
            ("Brand", self.brand),
            ("Model", self.model),
            ("Year", self.year),
            ("Color", self.color),
            ("Price", self.price),
        )
        return ", ".join(f"{label}: {value}" for label, value in labelled_fields)

In [2]:
# Build an example Car and print its one-line summary. The price is passed as an
# int and shows up as 25000.0 in the printed output.
car = Car(brand="Toyota", model="Camry", year=2022, color="Blue", price=25000)
print(car.get_car_information())

Brand: Toyota, Model: Camry, Year: 2022, Color: Blue, Price: 25000.0
