# Environment Setup

## Import Libraries

In [6]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Import Data

In [2]:
generated_data_filepath = '../../data/generated/demand-forecast/'

vendor_data = pd.read_pickle(f'{generated_data_filepath}vendors.pkl')
purchase_history_data = pd.read_pickle(f'{generated_data_filepath}purchase_history.pkl')
product_demand_data = pd.read_pickle(f'{generated_data_filepath}product_demand.pkl')

# Feature Engineering

In [3]:
# Assuming vendors_data is already prepared
vendors_data_encoded = pd.get_dummies(vendor_data, columns=['location', 'ingredient_supplied', 'category'], drop_first=True)

In [4]:
# Extracting date features
purchase_history_data['purchase_date'] = pd.to_datetime(purchase_history_data['purchase_date'])
purchase_history_data['purchase_day'] = purchase_history_data['purchase_date'].dt.day
purchase_history_data['purchase_month'] = purchase_history_data['purchase_date'].dt.month
purchase_history_data['purchase_year'] = purchase_history_data['purchase_date'].dt.year

# Encoding categorical variables
purchase_history_data_encoded = pd.get_dummies(purchase_history_data, columns=['vendor_name', 'ingredient', 'category'], drop_first=True)


In [5]:
# Extracting date features
product_demand_data['timestamp'] = pd.to_datetime(product_demand_data['timestamp'])
product_demand_data['forecast_day'] = product_demand_data['timestamp'].dt.day
product_demand_data['forecast_month'] = product_demand_data['timestamp'].dt.month
product_demand_data['forecast_year'] = product_demand_data['timestamp'].dt.year

# Encoding categorical variables if needed
demand_forecast_data_encoded = pd.get_dummies(product_demand_data, columns=['product_id'], drop_first=True)


In [9]:
# Selecting relevant features and target from purchase history data

X = purchase_history_data_encoded.drop(['purchase_date', 'quantity'], axis=1)  # Features
y = purchase_history_data_encoded['quantity']  # Target

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Initializing and training the model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Now, using demand_forecast_data_encoded for predicting future demand
forecast_features = demand_forecast_data_encoded.drop(['timestamp'], axis=1)  # Assuming timestamp is dropped for prediction

forecast_predictions = model.predict(forecast_features)

Mean Squared Error (MSE): 207.29073636987562


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- demand
- economic_indicator
- forecast_day
- forecast_month
- forecast_year
- ...
Feature names seen at fit time, yet now missing:
- category_Fruit
- category_Grain
- category_Herb
- category_Legume
- category_Meat
- ...


In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

NameError: name 'y_test' is not defined