In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error

In [None]:
# Read the four input tables from CSV files in the working directory.
# The file names are placeholders: point them at the real data source for your setup.
sales_value, sales_units, price_per_unit, distribution = [
    pd.read_csv(csv_name)
    for csv_name in (
        'sales_value.csv',
        'sales_units.csv',
        'price_per_unit.csv',
        'distribution.csv',
    )
]

In [None]:
# Basic data engineering: replace blanks (NaN) with zeroes.
# Rebinding instead of `inplace=True` keeps each statement a plain, re-runnable expression.
sales_value = sales_value.fillna(0)
sales_units = sales_units.fillna(0)
price_per_unit = price_per_unit.fillna(0)
distribution = distribution.fillna(0)


def difference_and_scale(frame):
    """First-difference ``frame`` across its columns, then min-max scale the result.

    Lag-1 differencing removes the level/trend of each row; it does NOT remove
    seasonality (that would need a lag equal to the seasonal period, e.g.
    ``diff(periods=52, axis=1)`` for weekly data with a yearly cycle).

    Parameters
    ----------
    frame : pd.DataFrame
        Numeric table. NOTE(review): presumably one row per product and one
        column per week - confirm against the CSV layout.

    Returns
    -------
    diffed : pd.DataFrame
        Column-to-column differences; the first column (NaN after ``diff``) is 0.
    normalized : np.ndarray
        ``diffed`` scaled by a ``MinMaxScaler`` fitted on it. The scaler works
        column by column, so each column is scaled independently across rows.
    fitted_scaler : MinMaxScaler
        The scaler fitted on ``diffed``, kept so the scaling can be inverted later.
    """
    diffed = frame.diff(axis=1).fillna(0)
    fitted_scaler = MinMaxScaler()
    normalized = fitted_scaler.fit_transform(diffed)
    return diffed, normalized, fitted_scaler


# One scaler per feature: refitting a single shared scaler would keep only the
# parameters of the last feature and make the others impossible to invert.
# NOTE(review): the scalers are fitted on all rows, i.e. before any train/test split.
sales_value_diff, sales_value_normalized, sales_value_scaler = difference_and_scale(sales_value)
sales_units_diff, sales_units_normalized, sales_units_scaler = difference_and_scale(sales_units)
price_per_unit_diff, price_per_unit_normalized, price_per_unit_scaler = difference_and_scale(price_per_unit)
distribution_diff, distribution_normalized, distribution_scaler = difference_and_scale(distribution)

# Backward compatibility: `scaler` used to end up fitted on the distribution feature.
scaler = distribution_scaler

# Combine the normalized features into a single dataset.
# Stacking on a new last axis puts the four features side by side for every (row, column) cell.
data = np.stack(
    [
        sales_value_normalized,
        sales_units_normalized,
        price_per_unit_normalized,
        distribution_normalized,
    ],
    axis=-1,
)

In [None]:
# Prepare the target variable (Estimated sales units for 52 weeks)
# Assuming we want to forecast the last 52 weeks from the normalized sales_units
target = sales_units_diff.iloc[:, -52:].sum(axis=1)

In [None]:
# Split the dataset into train and test sets (60%, 40% split)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=42)

In [None]:
# Perform hyperparameter tuning with auto_arima for ARIMA and SARIMA parameters
predicted_sales_units = []
for i in range(X_train.shape[0]):
    train_series = y_train.iloc[i]
    
    # Apply auto_arima to find the best parameters
    model = auto_arima(
        train_series, 
        start_p=0, max_p=5, 
        start_q=0, max_q=5,
        d=1,  # Differencing parameter
        seasonal=True,
        start_P=0, max_P=5,
        start_Q=0, max_Q=5,
        D=1,  # Seasonal differencing parameter
        m=52,  # Seasonal period (52 weeks for weekly data with yearly seasonality)
        trace=True,  # Output the process of model selection
        error_action='ignore',  # Ignore if any errors during the process
        suppress_warnings=True,  # Suppress warnings to keep the output clean
        stepwise=True,  # Use stepwise approach for faster computation
        random_state=42,
        n_fits=50  # Number of models to try in random search
    )
    
    # Fit the model
    model_fit = model.fit(train_series)
    
    # Forecast the next 52 weeks
    forecast = model_fit.predict(n_periods=52)
    
    # Sum the forecasted values to get the total sales units for the next 52 weeks
    predicted_total_units = np.sum(forecast)
    predicted_sales_units.append(predicted_total_units)

In [None]:
# Turn the collected per-row predictions into a numpy array for vectorised maths.
predicted_sales_units = np.array(predicted_sales_units)

# Score the predictions against the held-out targets (Mean Squared Error),
# and add up the predictions to get the total across all products.
mse = mean_squared_error(y_test, predicted_sales_units)
total_predicted_sales_units = np.sum(predicted_sales_units)

# Report the error, the individual predictions, and the overall total.
print(f'Mean Squared Error: {mse}')
print(f'Predicted sales units: {predicted_sales_units}')
print(f'Total Predicted Sales Units: {total_predicted_sales_units}')