In [13]:
import numpy as np
import pandas as pd
import matplotlib as mp
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [14]:
data = pd.read_csv('https://raw.githubusercontent.com/qmdismnp/Schulich_DS_MBAN/refs/heads/main/dataset.csv?')

In [15]:
data

Unnamed: 0,order_date,requested_delivery_date,Customer Country Code,Product Code,Description,order_type,Customer Order Code,value,Curr,items,Route
0,13.07.2009,28.01.2010,RU,L10705000,Parka Outdoor Lifestyle STD,VO,3200435553,2337.00,RUB,6,RU0001
1,15.07.2009,24.03.2010,RU,L10705000,Parka Outdoor Lifestyle STD,VO,3200435694,10160.25,RUB,23,RU0001
2,16.07.2009,04.02.2010,RU,L10705000,Parka Outdoor Lifestyle STD,VO,3200435741,2992.50,RUB,7,RU0001
3,17.07.2009,04.02.2010,RU,L10705000,Parka Outdoor Lifestyle STD,VO,3200435907,4061.25,RUB,9,RU0001
4,21.07.2009,01.02.2010,RU,L10705000,Parka Outdoor Lifestyle STD,VO,3200435963,2208.75,RUB,5,RU0001
...,...,...,...,...,...,...,...,...,...,...,...
2415,13.07.2011,15.02.2012,HR,L12919200,Parka Outdoor Lifestyle STD,VO,3200819196,128.52,EUR,12,FI0003
2416,13.07.2011,15.02.2012,HR,L12919200,Parka Outdoor Lifestyle STD,VO,3200819201,128.52,EUR,12,FI0003
2417,13.07.2011,15.02.2012,HR,L12919200,Parka Outdoor Lifestyle STD,VO,3200819206,128.52,EUR,12,FI0003
2418,13.07.2011,15.02.2012,HR,L12919200,Parka Outdoor Lifestyle STD,VO,3200819210,107.10,EUR,10,FI0003


In [16]:
# Handle missing or invalid values
data.replace(r'\\N', np.nan, regex=True, inplace=True)  # Replace invalid strings
data.fillna(0, inplace=True)  # Replace NaN values with 0 (or use appropriate imputation)

Q1

In [17]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

def calculate_monthly_orders_with_sarima(data):
    """
    Groups transactional data by month to calculate the number of unique orders.
    Applies SARIMA model for forecasting and evaluates its performance.
    Forecasts for the next five months and the next two months.
    """
    # Step 1: Preprocessing
    data['order_date'] = pd.to_datetime(data['order_date'], format='%d.%m.%Y')
    data['year_month'] = data['order_date'].dt.to_period('M')
    monthly_orders = (
        data.groupby('year_month')['Customer Order Code']
        .nunique()
        .reset_index(name='distinct_orders')
    )

    # Prepare data for SARIMA
    monthly_orders['year_month'] = pd.to_datetime(monthly_orders['year_month'].astype(str))
    monthly_orders.set_index('year_month', inplace=True)

    # Split data into training and testing sets
    train_size = int(len(monthly_orders)) - 5
    train_data = monthly_orders.iloc[:train_size]
    test_data = monthly_orders.iloc[train_size:]

    # Step 2: Fit SARIMA model
    sarima_model = SARIMAX(train_data['distinct_orders'],
                           order=(1, 0, 2),
                           seasonal_order=(1, 1, 1, 12))
    sarima_result = sarima_model.fit(disp=False)

    # Step 3: Forecast for the test set (existing data)
    forecast_test = sarima_result.forecast(steps=len(test_data))
    
    # Step 4: Forecast for future months
    future_forecast_5_months = sarima_result.forecast(steps=5)


    return future_forecast_5_months

Q2

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def classify_and_evaluate_product_demand(data, months=5):
    """
    Prepares data for a classification model by encoding features, trains a logistic regression model,
    and evaluates it. Forecasts demand for the next specified number of months.
    """
    # Add seasonality based on the order date
    def get_season(month):
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Fall'

    data['order_date'] = pd.to_datetime(data['order_date'], format='%d.%m.%Y')
    data['Season'] = data['order_date'].dt.month.apply(get_season)

    # Encode categorical variables
    encoded_data = pd.get_dummies(data, columns=['Season', 'Customer Country Code','Curr', 'Route', 'order_type'], drop_first=True)

    # Define features and target variable
    X = encoded_data.drop(columns=['Product Code', 'year_month', 'order_date', 'requested_delivery_date', 'Customer Order Code', 'Description'])
    y = encoded_data['Product Code']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a logistic regression model
    logistic_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)
    logistic_model.fit(X_train, y_train)

    # Make predictions
    y_pred_logistic = logistic_model.predict(X_test)

    # Forecast demand for the next `months`
    future_forecast_demand = logistic_model.predict(X_test.sample(n=months, random_state=42))

    return y_pred_logistic,future_forecast_demand

Q3

In [19]:
import pandas as pd
import numpy as np

def simulate_quantity_demand(data, n_months=5):
    """
    Recalculates 25th and 75th percentiles for each product and simulates demand for the next n months.
    :param data: Pandas DataFrame with columns 'Product Code' and 'items'.
    :param n_months: Number of months to simulate demand for.
    :return: DataFrame with simulated demand for each product.
    """
    # Recalculate percentiles for each product
    # Ensure the 'items' column is numeric
    data['items'] = pd.to_numeric(data['items'], errors='coerce')
    # Drop rows with NaN in the 'items' column after conversion
    data = data.dropna(subset=['items'])
    # Group by 'Product Code' and calculate the quantiles
    quantity_bounds = data.groupby('Product Code')['items'].quantile([0.25, 0.5, 0.75]).unstack().reset_index()
    # Rename the columns for clarity
    quantity_bounds.columns = ['Product Code', '25th Percentile','50th Percentile', '75th Percentile']


    # Simulate demand for the next n_months
    simulated_demand = []

    for _, row in quantity_bounds.iterrows():
        mean_quantity = row['50th Percentile']
        std_dev_quantity = (row['75th Percentile'] - row['25th Percentile']) / 6  # Assuming normal distribution
        product_demand = np.random.normal(mean_quantity, std_dev_quantity, n_months).clip(0)  # Ensure no negative values
        simulated_demand.append(product_demand)

    # Create a DataFrame for the simulated demand
    simulated_demand_data = pd.DataFrame(
        simulated_demand,
        columns=[f"Month {i+1}" for i in range(n_months)],
        index=quantity_bounds['Product Code']
    ).reset_index()

    # Rename columns for clarity
    simulated_demand_data.rename(columns={'index': 'Product Code'}, inplace=True)

    return simulated_demand_data

In [20]:
data.describe()

Unnamed: 0,Customer Order Code,value
count,2420.0,2420.0
mean,3200672000.0,1162.976624
std,100440.8,2560.595118
min,3200435000.0,-0.03
25%,3200614000.0,33.03
50%,3200711000.0,70.49
75%,3200729000.0,1363.2
max,3201062000.0,38937.5


In [21]:
# Drop rows with NaN in 'lead_time'
df = df.dropna(subset=['lead_time'])
    
df['year_month'] = df['order_date'].dt.to_period('M')
# Group by 'Product Code' and calculate quantiles
lead_time_bounds = df.groupby('year_month')['lead_time'].quantile([0.05, 0.5, 0.95]).unstack().reset_index()

lead_time_bounds

NameError: name 'df' is not defined

In [None]:
# Rename columns for clarity
lead_time_bounds.columns = ['year_month', '5th Percentile', '50th Percentile', '95th Percentile']

lead_time_bounds

Unnamed: 0,year_month,5th Percentile,50th Percentile,95th Percentile
0,2009-07,6.166667,6.5,7.9
1,2009-08,5.7,6.866667,9.833333
2,2009-09,5.97,6.15,8.666667
3,2009-10,5.266667,6.366667,7.866667
4,2009-11,7.7,7.7,7.7
5,2009-12,6.191667,8.533333,11.363333
6,2010-01,5.666667,7.466667,8.61
7,2010-02,5.186667,7.0,8.263333
8,2010-03,4.856667,6.133333,8.466667
9,2010-04,2.966667,4.266667,7.153333


In [None]:
simulated_leadtime = {}

for _, row in lead_time_bounds.iterrows():
        q25 = row['5th Percentile']
        q75 = row['95th Percentile']
        
        # Calculate mean and std
        mean = row['50th Percentile']
        std_dev = (q75 - q25) / 1.35
        
        # Generate samples
        samples = np.random.normal(loc=mean, scale=std_dev, size=1).clip(min=0)
        
        # Store in dictionary using the year-month as key
        simulated_leadtime[row['year_month']] = samples


In [None]:
# Convert the simulated_leadtime dictionary to a DataFrame
simulated_leadtime_df = pd.DataFrame.from_dict(
    simulated_leadtime, orient='index'
).reset_index()

# Rename columns for clarity
simulated_leadtime_df.columns = ['year_month'] + [f"Lead Time" ]

# Display the DataFrame
print(simulated_leadtime_df)

   year_month  Lead Time
0     2009-07   6.691526
1     2009-08   4.882458
2     2009-09   1.543551
3     2009-10   5.469893
4     2009-11   7.700000
5     2009-12   2.998189
6     2010-01   5.071348
7     2010-02   7.351076
8     2010-03   4.273749
9     2010-04   3.883436
10    2010-05   6.433333
11    2010-06   2.833333
12    2010-12  13.612714
13    2011-01   6.757844
14    2011-02   7.342526
15    2011-03   3.616200
16    2011-04   7.512580
17    2011-05   1.629396
18    2011-06   9.176285
19    2011-07  11.037648
20    2011-08   5.190213
21    2011-09   8.355481
22    2011-10   3.962408
23    2011-11   0.577703
24    2012-01   5.065594
25    2012-04   6.000000


Q4

In [22]:
import pandas as pd
import numpy as np

def calculate_and_simulate_lead_time(data, months=5, n_samples=1):
    """
    Calculate lead time bounds and simulate lead time using normal distribution.

    Parameters:
        data (pd.DataFrame): Input data containing 'order_date' and 'requested_delivery_date'.
        months (int): Number of months to filter data for lead time calculation.
        n_samples (int): Number of lead time samples to generate per month.

    Returns:
        pd.DataFrame: Simulated lead times for the filtered months.
    """

    # Step 2: Convert date columns and calculate lead time in months
    data['order_date'] = pd.to_datetime(data['order_date'], format='%d.%m.%Y')
    data['requested_delivery_date'] = pd.to_datetime(data['requested_delivery_date'], format='%d.%m.%Y')
    data['lead_time'] = (data['requested_delivery_date'] - data['order_date']).dt.days / 30

    # Step 3: Drop rows with NaN in 'lead_time'
    data = data.dropna(subset=['lead_time'])

    # Step 4: Group by 'year_month' and calculate lead time quantiles
    data['year_month'] = data['order_date'].dt.to_period('M')
    lead_time_bounds = data.groupby('year_month')['lead_time'].quantile([0.05, 0.5, 0.95]).unstack().reset_index()
    lead_time_bounds.columns = ['year_month', '5th Percentile', '50th Percentile', '95th Percentile']

    # Step 5: Simulate lead time using normal distribution
    simulated_leadtime = {}
    for _, row in lead_time_bounds.iterrows():
        q05 = row['5th Percentile']
        q95 = row['95th Percentile']

        # Calculate mean and std deviation
        mean = row['50th Percentile']
        std_dev = (q05 - q95) / 1.35

        # Generate samples
        samples = np.random.normal(mean, scale=std_dev, size=n_samples).clip(min=0)

        # Store in dictionary
        simulated_leadtime[row['year_month']] = samples

    # Step 6: Convert simulated leadtime dictionary to DataFrame
# Convert the simulated_leadtime dictionary to a DataFrame
    simulated_leadtime_df = pd.DataFrame.from_dict(
        simulated_leadtime, orient='index'
    ).reset_index()

    # Rename columns for clarity
    simulated_leadtime_df.columns = ['year_month'] + [f"Lead Time" ]

    return simulated_leadtime_df

In [23]:
calculate_and_simulate_lead_time(data)

ValueError: scale < 0

In [None]:
import pandas as pd

def consolidated_mc_function(data):
    """
    Consolidates the outputs of four individual functions into a single table.

    :param data: Pandas DataFrame containing the necessary input data.
    :return: Pandas DataFrame summarizing the results of all calculations.
    """
    # Call each individual function
    forecast_5_months, forecast_2_months, mape_test = calculate_monthly_orders_with_sarima(data)
    logistic_predictions, forecast_demand = classify_and_evaluate_product_demand(data)
    simulated_demand_data = simulate_quantity_demand(data)
    simulated_lead_time = calculate_and_simulate_lead_time(data)

    # Compile results into a dictionary
    results_dict = {
        "SARIMA Forecast (5 months)": forecast_5_months,
        "SARIMA Forecast (2 months)": forecast_2_months,
        "MAPE Test": mape_test,
        "Logistic Predictions": logistic_predictions,
        "Forecasted Demand (Logistic)": forecast_demand,
        "Simulated Demand": simulated_demand_data,
        "Simulated Lead Time": simulated_lead_time
    }

    # Convert the dictionary to a DataFrame for better presentation
    results_df = pd.DataFrame.from_dict(results_dict, orient='index').transpose()
    
    return results_df
