# Environment Setup

## Import Libraries

In [12]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Import Data

In [13]:
generated_data_filepath = '../../data/generated/demand-forecast/'

vendor_data = pd.read_pickle(f'{generated_data_filepath}vendors.pkl')
purchase_history_data = pd.read_pickle(f'{generated_data_filepath}purchase_history.pkl')
product_demand_data = pd.read_pickle(f'{generated_data_filepath}product_demand.pkl')

In [20]:
vendor_data.head()

Unnamed: 0,vendor_name,location,ingredient_supplied,category
0,Longo's,Grahamville,Almonds,Nut
1,FreshCo,North Lori,Rosemary,Herb
2,Rexall,North Kevin,Onions,Vegetable
3,No Frills,Greentown,Yogurt,Dairy
4,Longo's,South Michaelmouth,Olive Oil,Oil


In [21]:
purchase_history_data.head()

Unnamed: 0,vendor_name,ingredient,category,quantity,purchase_date
0,IGA,Flour,Grain,20.0,2022-01-24 16:30:42.811353
1,Real Canadian Superstore,Tomatoes,Vegetable,11.0,2019-10-22 12:24:44.940169
2,Co-op,Garlic,Vegetable,18.0,2020-11-09 00:17:37.230027
3,T&T Supermarket,Cucumber,Vegetable,18.0,2022-05-06 07:41:38.109536
4,Giant Tiger,Olive Oil,Oil,1.0,2021-02-10 01:29:29.674654


In [22]:
product_demand_data.head()

Unnamed: 0,timestamp,product_id,demand,price,promotion,temperature,economic_indicator,social_media_sentiment,previous_demand
0,2023-01-01,Bell Peppers,94,60.891186,0,38.630601,51.370497,-0.157013,16
1,2023-01-01,Cheese,98,22.10871,0,23.877024,59.302178,-0.927554,97
2,2023-01-02,Salt,33,2.60925,0,-4.800778,1.240155,-0.965119,182
3,2023-01-02,Basil,78,30.68136,1,-13.291511,18.838625,0.044827,240
4,2023-01-03,Bell Peppers,32,26.702867,0,-15.476022,69.579086,-0.436603,280


# Feature Engineering

In [30]:

# Merge datasets based on relevant keys
from sklearn.calibration import LabelEncoder
from sklearn.discriminant_analysis import StandardScaler


# Merge datasets based on relevant keys
merged_data = pd.merge(purchase_history_data, vendor_data, on='vendor_name', how='left')
merged_data = pd.merge(merged_data, product_demand_data, left_on='ingredient', right_on='product_id', how='left')

# Handle missing values in merged_data if any
merged_data.fillna(method='ffill', inplace=True)  # Forward fill missing values


# # Check if 'category_y' exists in merged_data
# if 'category_y' in merged_data.columns:
#     merged_data['category_encoded'] = encoder.fit_transform(merged_data['category_y'])
# else:
#     print("Column 'category_y' not found in merged_data.")

# # Drop unnecessary columns
# merged_data.drop(['category_x', 'category_y'], axis=1, inplace=True)


# Data preprocessing
encoder = LabelEncoder()
merged_data['vendor_name'] = encoder.fit_transform(merged_data['vendor_name'])
merged_data['location'] = encoder.fit_transform(merged_data['location'])
merged_data['ingredient_supplied'] = encoder.fit_transform(merged_data['ingredient_supplied'])
merged_data['category_x'] = encoder.fit_transform(merged_data['category_x'])

# Convert datetime columns to numerical features
merged_data['purchase_date'] = pd.to_datetime(merged_data['purchase_date']).astype(int) // 10**9  # Convert to Unix timestamp
merged_data['timestamp'] = pd.to_datetime(merged_data['timestamp']).astype(int) // 10**9  # Convert to Unix timestamp

# Convert datetime columns to numeric (Unix timestamp)
merged_data['purchase_date'] = pd.to_datetime(merged_data['purchase_date']).astype(int) // 10**9
merged_data['timestamp'] = pd.to_datetime(merged_data['timestamp']).astype(int) // 10**9

# Drop unnecessary columns for modeling
X = merged_data.drop(['quantity', 'vendor_name', 'ingredient', 'product_id'], axis=1)

# Handle remaining categorical columns if any using one-hot encoding
X = pd.get_dummies(X)


# Scale numerical features
scaler = StandardScaler()
X[['demand', 'price', 'temperature', 'economic_indicator', 'social_media_sentiment', 'previous_demand']] = scaler.fit_transform(X[['demand', 'price', 'temperature', 'economic_indicator', 'social_media_sentiment', 'previous_demand']])

# Target variable
y = merged_data['quantity']

merged_data.head()

  merged_data.fillna(method='ffill', inplace=True)  # Forward fill missing values


Unnamed: 0,vendor_name,ingredient,category_x,quantity,purchase_date,location,ingredient_supplied,category_y,timestamp,product_id,demand,price,promotion,temperature,economic_indicator,social_media_sentiment,previous_demand
0,6,Flour,2,20.0,1,92,2,Fruit,-10,,,,,,,,
1,6,Flour,2,20.0,1,63,10,Nut,-10,,,,,,,,
2,6,Flour,2,20.0,1,65,22,Vegetable,-10,,,,,,,,
3,6,Flour,2,20.0,1,98,27,Fruit,-10,,,,,,,,
4,6,Flour,2,20.0,1,84,0,Nut,-10,,,,,,,,


In [32]:
# Train-test split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Model training and evaluation
model = RandomForestRegressor()
model.fit(X_train, y_train)


In [39]:
y_pred = model.predict(X_test)


Mean Squared Error: 210.08146407496594


In [41]:

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_log_error, explained_variance_score

def evaluate_model_performance():
    
    # Compute evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    msle = mean_squared_log_error(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)
    
    # Print metrics
    print(f"Mean Absolute Error: {mae}")
    print(f"R^2 Score: {r2}")
    print(f"Mean Squared Logarithmic Error: {msle}")
    print(f"Explained Variance Score: {evs}")


In [42]:
evaluate_model_performance()

Mean Absolute Error: 12.506239598174144
R^2 Score: -0.005788541274223791
Mean Squared Logarithmic Error: 0.6685557250769816
Explained Variance Score: -0.005785111395540943


In [38]:
import pickle


with open('../../models/demand-forecast-analysis/final_model.pkl', 'wb') as file:
   pickle.dump(model, file)

In [18]:
merged_data.columns

Index(['vendor_name', 'ingredient', 'category_x', 'quantity', 'purchase_date',
       'location', 'ingredient_supplied', 'category_y', 'timestamp',
       'product_id', 'demand', 'price', 'promotion', 'temperature',
       'economic_indicator', 'social_media_sentiment', 'previous_demand'],
      dtype='object')

In [15]:
from sklearn.model_selection import train_test_split

# Prepare data for model training
X = merged_data.drop(['quantity', 'purchase_date'], axis=1)  # Features
y = merged_data['quantity']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
from sklearn.ensemble import RandomForestRegressor

# Example: Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


ValueError: could not convert string to float: 'No Frills'

In [None]:
from sklearn.metrics import mean_squared_error

# Example: Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
def predict_quantity(model, input_data):
    # Example function to predict quantity using the trained model
    prediction = model.predict(input_data)
    return prediction

# Example usage:
input_data = X_test.iloc[0].values.reshape(1, -1)  # Take the first row of test data as input
predicted_quantity = predict_quantity(model, input_data)
print(f'Predicted Quantity: {predicted_quantity}')


In [4]:
# Assuming vendors_data is already prepared
vendors_data_encoded = pd.get_dummies(vendor_data, columns=['location', 'ingredient_supplied', 'category'], drop_first=True)

In [10]:
# Extracting date features
from sklearn.calibration import LabelEncoder

purchase_history_data['purchase_date'] = pd.to_datetime(purchase_history_data['purchase_date'])
purchase_history_data['purchase_day'] = purchase_history_data['purchase_date'].dt.day
purchase_history_data['purchase_month'] = purchase_history_data['purchase_date'].dt.month
purchase_history_data['purchase_year'] = purchase_history_data['purchase_date'].dt.year

# Encode categorical variables
encoder = LabelEncoder()
purchase_history_data['vendor_name'] = encoder.fit_transform(purchase_history_data['vendor_name'])
purchase_history_data['ingredient'] = encoder.fit_transform(purchase_history_data['ingredient'])
purchase_history_data['category'] = encoder.fit_transform(purchase_history_data['category'])

In [11]:
purchase_history_data.head()

Unnamed: 0,vendor_name,ingredient,category,quantity,purchase_date,purchase_day,purchase_month,purchase_year
0,1,46,9,15.0,2018-05-29 05:50:59.787032,29,5,2018
1,8,19,11,46.0,2018-11-09 18:59:55.983225,9,11,2018
2,10,38,3,12.0,2022-06-04 02:00:25.382733,4,6,2022
3,8,14,4,11.0,2022-06-19 02:10:15.708447,19,6,2022
4,15,34,11,27.0,2021-03-07 22:32:44.949837,7,3,2021


In [6]:
# Extracting date features
product_demand_data['timestamp'] = pd.to_datetime(product_demand_data['timestamp'])
product_demand_data['forecast_day'] = product_demand_data['timestamp'].dt.day
product_demand_data['forecast_month'] = product_demand_data['timestamp'].dt.month
product_demand_data['forecast_year'] = product_demand_data['timestamp'].dt.year

# Encoding categorical variables if needed
demand_forecast_data_encoded = pd.get_dummies(product_demand_data, columns=['product_id'], drop_first=True)


In [7]:
# Selecting relevant features and target from purchase history data

X = purchase_history_data_encoded.drop(['purchase_date', 'quantity'], axis=1)  # Features
y = purchase_history_data_encoded['quantity']  # Target

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Initializing and training the model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train, y_train)

# Making predictions on test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")

# Now, using demand_forecast_data_encoded for predicting future demand
forecast_features = demand_forecast_data_encoded.drop(['timestamp'], axis=1)  # Assuming timestamp is dropped for prediction

forecast_predictions = model.predict(forecast_features)

Mean Squared Error (MSE): 209.37291535002086


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- demand
- economic_indicator
- forecast_day
- forecast_month
- forecast_year
- ...
Feature names seen at fit time, yet now missing:
- category_Fruit
- category_Grain
- category_Herb
- category_Legume
- category_Meat
- ...


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

NameError: name 'y_test' is not defined