In [15]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('product_demand_prediction_dataset.csv')


In [4]:
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Extract useful features from the 'Date' column
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)  # 5, 6 are Saturday, Sunday

# Drop the original 'Date' column
df = df.drop('Date', axis=1)


In [5]:
# Fill missing values
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].fillna(df[column].mode()[0])

In [6]:
# Create new features
df['PriceChange'] = df['CompetitorPrice'] - df['Price']
df['SalesMovingAvg'] = df['Sales'].rolling(window=3, min_periods=1).mean()

In [7]:
# Standardize continuous variables
scaler = StandardScaler()
continuous_cols = ['Sales', 'Price', 'CompetitorPrice', 'EconomicIndicator', 'StockLevel', 'Demand', 'PriceChange', 'SalesMovingAvg']
df[continuous_cols] = scaler.fit_transform(df[continuous_cols])


In [8]:
# Save the scaler for the Demand column
demand_scaler = StandardScaler()
df['Demand'] = demand_scaler.fit_transform(df[['Demand']])

In [9]:
# Encode categorical variables
label_enc_cols = ['ProductID', 'StoreID']
le = LabelEncoder()

for col in label_enc_cols:
    df[col] = le.fit_transform(df[col])

# One-hot encoding for other categorical variables
categorical_cols = ['Promotion', 'Season', 'Holiday', 'Weather']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [10]:
# Split data into features (X) and target (y)
X = df.drop('Demand', axis=1)  # Features
y = df['Demand']  # Target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
rf_model = RandomForestRegressor()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')

# Fit the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Get the best estimator
best_rf_model = grid_search.best_estimator_

# Predict on the test set
y_pred_scaled = best_rf_model.predict(X_test)

# Inverse transform the predicted demand to original format
y_pred_original = demand_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))
y_test_original = demand_scaler.inverse_transform(y_test.values.reshape(-1, 1))

Fitting 3 folds for each of 216 candidates, totalling 648 fits


324 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [17]:
# Evaluate the model
print('Random Forest:')
print(f'MAE: {mean_absolute_error(y_test_original, y_pred_original)}')
print(f'MSE: {mean_squared_error(y_test_original, y_pred_original)}')
print(f'R-squared: {r2_score(y_test_original, y_pred_original)}\n')

# Create a DataFrame to compare actual and predicted demand
comparison_df = pd.DataFrame({
    'Actual Demand': y_test_original.flatten(),
    'Predicted Demand': y_pred_original.flatten()
})

# Display comparison for the first few rows
print(comparison_df.head())

Random Forest:
MAE: 0.8554157794424238
MSE: 0.987268979337604
R-squared: -0.0077740230118388265

   Actual Demand  Predicted Demand
0       0.554564          0.063318
1       0.554564          0.103088
2       1.155903         -0.120759
3      -0.313277          0.074691
4       0.985068          0.158434
