In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm  # Import tqdm for progress tracking
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
ds = pd.read_csv("../../dataset/sales.csv", parse_dates=['release_date'])
ds = ds.sample(n=1000)
ds.head(1)

Unnamed: 0.1,Unnamed: 0,external_code,retail,season,category,color,image_path,fabric,release_date,restock,...,2,3,4,5,6,7,8,9,10,11
98189,98189,5063,7,AW19,kimono dress,brown,AI19/05063.png,nice,2019-10-28,13,...,0.0,2.0,0.0,0.0,0.0,0.0,1.0,-1.0,2.0,1.0


In [3]:
ds.shape

(1000, 22)

In [4]:
df = pd.DataFrame(ds)
df = df.sort_values(by="release_date")

In [5]:
# Convert release_date to datetime format
df["release_date"] = pd.to_datetime(df["release_date"])

# Extract date, month, year, and quarter from release_date
df["date"] = df["release_date"].dt.day
df["month"] = df["release_date"].dt.month
df["year"] = df["release_date"].dt.year
df["quarter"] = df["release_date"].dt.quarter

# Remove the release_date column
df.drop("release_date", axis=1, inplace=True)

In [6]:
# Encode specific columns using Label Encoder
label_encoder = LabelEncoder()
df["season"] = label_encoder.fit_transform(df["season"])
df["category"] = label_encoder.fit_transform(df["category"])
df["color"] = label_encoder.fit_transform(df["color"])
df["fabric"] = label_encoder.fit_transform(df["fabric"])

In [7]:
# Function to frame the time series data
def frame_series(df, train_window=2, forecast_horizon=1):
    X, y = [], []   
    for i in tqdm(range(df.shape[0]), desc="Framing time series"):
        sales = df.iloc[i, -12:].values  # Extract the sales data for 12 weeks    
        additional_features = df[['external_code', 'retail', 'season','category','color','fabric'
                                 ,'year','quarter','month','date']].iloc[i].values
        for j in range(len(sales) - train_window - forecast_horizon + 1):
            features = list(sales[j : j + train_window]) + list(additional_features)
            target = sales[j + train_window : j + train_window + forecast_horizon]
            X.append(features)
            y.append(target)

    return X, y

In [8]:
# Frame the time series data
X, y = frame_series(df)

Framing time series: 100%|██████████| 1000/1000 [00:00<00:00, 1259.45it/s]


In [9]:

# Combine X and y using numpy hstack
combined_data = np.hstack((X, y))

In [10]:
combined_data

array([[0.0, 0.0, 2.0, ..., 12.0, 5.0, 1.0],
       [0.0, 1.0, 2.0, ..., 12.0, 5.0, 0.0],
       [1.0, 0.0, 2.0, ..., 12.0, 5.0, 1.0],
       ...,
       [0.0, 23.0, 5574.0, ..., 12.0, 23.0, 12],
       [23.0, 12.0, 5574.0, ..., 12.0, 23.0, 2019],
       [12.0, 2019.0, 5574.0, ..., 12.0, 23.0, 4]], dtype=object)

In [11]:
# Assuming X and y are your NumPy arrays containing features and target values, respectively
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
# Create and train the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

  rf_model.fit(X, y)


In [13]:
y_pred = rf_model.predict(X_test)

In [14]:
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 8.097080024230682
