In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm  # Import tqdm for progress tracking
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
ds = pd.read_csv("../dataset/sales.csv", parse_dates=['release_date'])
ds = ds.sample(n=1)
ds.head(1)

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,2,3,4,5,6,7,8,9,10,11
86239,86239,4474,12,AW19,culottes,black,AI19/04474.png,technical,2019-07-15,22,...,2.0,2.0,4.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [3]:
ds.shape

(1, 22)

In [4]:
df = pd.DataFrame(ds)

In [5]:
# Convert release_date to datetime format
df["release_date"] = pd.to_datetime(df["release_date"])

# Extract date, month, year, and quarter from release_date
df["date"] = df["release_date"].dt.day
df["month"] = df["release_date"].dt.month
df["year"] = df["release_date"].dt.year
df["quarter"] = df["release_date"].dt.quarter

# Remove the release_date column
df.drop("release_date", axis=1, inplace=True)

In [6]:
# Encode specific columns using Label Encoder
label_encoder = LabelEncoder()
df["season"] = label_encoder.fit_transform(df["season"])
df["category"] = label_encoder.fit_transform(df["category"])
df["color"] = label_encoder.fit_transform(df["color"])
df["fabric"] = label_encoder.fit_transform(df["fabric"])

In [7]:
# Function to frame the time series data
def frame_series(df, train_window=2, forecast_horizon=1):
    X, y = [], []
    selected_features = ['external_code', 'retail', 'season','category','color','fabric']
    for i in tqdm(range(df.shape[0]), desc="Framing time series"):
        sales = df.iloc[i, -12:].values  # Extract the sales data for 12 weeks
        additional_features = df[selected_features].iloc[0].values
        for j in range(len(sales) - train_window - forecast_horizon + 1):
            features = list(sales[j : j + train_window]) + list(additional_features)
            target = sales[j + train_window : j + train_window + forecast_horizon]
            X.append(features)
            y.append(target)

    return X, y

In [8]:
# Frame the time series data
X, y = frame_series(df)

Framing time series: 100%|██████████| 1/1 [00:00<00:00, 500.04it/s]


In [9]:
X

[[4.0, 1.0, 4474, 12, 0, 0, 0, 0],
 [1.0, 1.0, 4474, 12, 0, 0, 0, 0],
 [1.0, 0.0, 4474, 12, 0, 0, 0, 0],
 [0.0, 1.0, 4474, 12, 0, 0, 0, 0],
 [1.0, 0.0, 4474, 12, 0, 0, 0, 0],
 [0.0, 0.0, 4474, 12, 0, 0, 0, 0],
 [0.0, 0.0, 4474, 12, 0, 0, 0, 0],
 [0.0, 15, 4474, 12, 0, 0, 0, 0],
 [15, 7, 4474, 12, 0, 0, 0, 0],
 [7, 2019, 4474, 12, 0, 0, 0, 0]]

In [10]:
y

[array([1.0], dtype=object),
 array([0.0], dtype=object),
 array([1.0], dtype=object),
 array([0.0], dtype=object),
 array([0.0], dtype=object),
 array([0.0], dtype=object),
 array([15], dtype=object),
 array([7], dtype=object),
 array([2019], dtype=object),
 array([3], dtype=object)]

In [11]:

# Combine X and y using numpy hstack
combined_data = np.hstack((X, y))

In [12]:
combined_data

array([[4.0, 1.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       [1.0, 1.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 0.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 1.0],
       [0.0, 1.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [1.0, 0.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [0.0, 0.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 15],
       [0.0, 15.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 7],
       [15.0, 7.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 2019],
       [7.0, 2019.0, 4474.0, 12.0, 0.0, 0.0, 0.0, 0.0, 3]], dtype=object)

In [9]:
# Create and train the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

  rf_model.fit(X, y)


RandomForestRegressor(random_state=42)

In [16]:
print("Forecasted sales for the next week:", forecast)

Forecasted sales for the next week: [0.5832652  0.74540094 0.84420343 ... 0.5832652  0.5832652  0.5832652 ]
