In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.cluster import KMeans

In [11]:
def generate_base_pattern(cluster_id):
    """Generate structured demand patterns per cluster, avoiding full sparsity,
    and shuffle the values randomly to introduce variation."""
    patterns = {  ## can be modified
        0: np.array([1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0]), 
        1: np.array([2, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0]), 
        2: np.array([3, 0, 3, 3, 0, 1, 0, 0, 3, 1, 0, 0, 2]), 
        3: np.array([5, 1, 2, 1, 4, 1, 2, 0, 1, 2, 1, 1, 0]), 
    }
    pattern = patterns[cluster_id % len(patterns)]
    np.random.shuffle(pattern)  # Shuffle the pattern randomly
    return pattern / pattern.sum()  # Normalize to maintain demand shape

def generate_demand_vector(quarter_demand, max_weekly_demand, min_zero_weeks, base_pattern):
    """Generate sparse demand with some nonzero values for realism."""
    if quarter_demand == 0:
        return np.zeros(13, dtype=int)  # Fully zero case (edge case)
    
    # Scale pattern
    scaled_pattern = base_pattern * quarter_demand
    demand_vector = np.round(scaled_pattern).astype(int)
    
    # Ensure at least one nonzero value if quarter_demand > 0
    if np.sum(demand_vector) == 0:
        demand_vector[np.random.randint(0, 13)] = 1
    
    # Limit max weekly demand
    demand_vector[demand_vector > max_weekly_demand] = max_weekly_demand

    # Ensure sparse weeks
    nonzero_indices = np.where(demand_vector > 0)[0]
    num_nonzero = len(nonzero_indices)
    if num_nonzero > 0:
        num_zeros_to_set = min(min_zero_weeks, num_nonzero - 1)  # Ensure at least one nonzero
        zero_indices = np.random.choice(nonzero_indices, num_zeros_to_set, replace=False)
        demand_vector[zero_indices] = 0
    
    return demand_vector

def generate_sparse_dataset(num_ids=100, num_clusters=4):
    """Generate dataset with temporal structure and coherent historical sums."""
    data = []
    
    # Generate features per ID (constant across quarters)
    feature_matrix = np.array([  ## can be modified
        [
            np.random.randint(1, 6),  # lead_time
            np.random.randint(1, 4),  # criticality
            np.random.randint(1, 11), # equipment_type
        ] for _ in range(num_ids)
    ])

    # Cluster IDs based on static features
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(feature_matrix)

    # Cluster demand settings
    constraints = {
        0: {"quarter_demand_range": 3, "max_weekly_range": 2, "zero_weeks_range": 10},
        1: {"quarter_demand_range": 5, "max_weekly_range": 3, "zero_weeks_range": 7},
        2: {"quarter_demand_range": 10, "max_weekly_range": 5, "zero_weeks_range": 4},
        3: {"quarter_demand_range": 15, "max_weekly_range": 4, "zero_weeks_range": 5},  ## can be modified
    }

    for id_num in range(num_ids):
        # Get ID features and cluster
        lead_time, criticality, equipment_type = feature_matrix[id_num]
        cluster_id = cluster_labels[id_num] % num_clusters
        settings = constraints[3]
        
        # Initialize demand history
        demand_history = []
        base_pattern = generate_base_pattern(cluster_id)

        # Generate 7 quarters per ID
        for period in range(7):
            # Determine year and quarter
            if period < 4:
                year, quarter = 2023, period + 1
            else:
                year, quarter = 2024, period - 3

            # Generate demand
            q_demand = np.random.randint(max(1, settings["quarter_demand_range"]-2), settings["quarter_demand_range"] + 1)
            demand_vec = generate_demand_vector(
                q_demand,
                settings["max_weekly_range"],
                settings["zero_weeks_range"],
                base_pattern
            )
            current_sum = sum(demand_vec)

            # Calculate historical sums
            sum_d_q1 = demand_history[-1] if len(demand_history) >=1 else max(0, current_sum + np.random.randint(-1, 2))
            sum_d_q2 = demand_history[-2] if len(demand_history) >=2 else max(0, current_sum + np.random.randint(-1, 2))
            
            demand_history.append(current_sum)

            data.append([
                id_num + 1,  # ID column
                year,
                lead_time,
                criticality,
                quarter,
                equipment_type,
                sum_d_q2,
                sum_d_q1,
                settings["max_weekly_range"],
                settings["zero_weeks_range"],
                cluster_id,
                demand_vec.tolist()
            ])

    columns = [
        "id", "year", "lead_time", "criticality", "quarter", "equipment_type",
        "sum_d_q2", "sum_d_q1", "max_weekly_demand",
        "min_zero_weeks", "cluster_id", "demand_next_q"
    ]
    
    return pd.DataFrame(data, columns=columns)

# Generate dataset with 100 IDs (700 rows total)
dataset = generate_sparse_dataset(100)
dataset.head(50)

Unnamed: 0,id,year,lead_time,criticality,quarter,equipment_type,sum_d_q2,sum_d_q1,max_weekly_demand,min_zero_weeks,cluster_id,demand_next_q
0,1,2023,2,3,1,1,5,4,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]"
1,1,2023,2,3,2,1,3,4,4,5,1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
2,1,2023,2,3,3,1,4,2,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]"
3,1,2023,2,3,4,1,2,2,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]"
4,1,2024,2,3,1,1,2,2,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]"
5,1,2024,2,3,2,1,2,2,4,5,1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
6,1,2024,2,3,3,1,2,2,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0]"
7,2,2023,2,2,1,2,3,2,4,5,1,"[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0]"
8,2,2023,2,2,2,2,5,2,4,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0]"
9,2,2023,2,2,3,2,2,4,4,5,1,"[0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0]"


In [None]:
# def generate_base_pattern(cluster_id):
#     """Generate structured demand patterns per cluster, avoiding full sparsity,
#     and shuffle the values randomly to introduce variation."""
#     patterns = {
#         0: np.array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]), 
#         1: np.array([0, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0]), 
#         2: np.array([0, 0, 3, 3, 0, 1, 0, 0, 3, 1, 0, 0, 2]), 
#         3: np.array([5, 1, 2, 1, 4, 1, 2, 0, 1, 2, 1, 1, 0]), 
#     }
#     pattern = patterns[cluster_id % len(patterns)]
#     np.random.shuffle(pattern)  # Shuffle the pattern randomly
#     return pattern / pattern.sum()  # Normalize to maintain demand shape

# def generate_demand_vector(quarter_demand, max_weekly_demand, min_zero_weeks, base_pattern):
#     """Generate sparse demand with some nonzero values for realism."""
#     if quarter_demand == 0:
#         return np.zeros(13, dtype=int)  # Fully zero case (edge case)
    
#     # Scale pattern
#     scaled_pattern = base_pattern * quarter_demand
#     demand_vector = np.round(scaled_pattern).astype(int)
    
#     # Ensure at least one nonzero value if quarter_demand > 0
#     if np.sum(demand_vector) == 0:
#         demand_vector[np.random.randint(0, 13)] = 1
    
#     # Limit max weekly demand
#     demand_vector[demand_vector > max_weekly_demand] = max_weekly_demand

#     # Ensure sparse weeks
#     nonzero_indices = np.where(demand_vector > 0)[0]
#     num_nonzero = len(nonzero_indices)
#     if num_nonzero > 0:
#         num_zeros_to_set = min(min_zero_weeks, num_nonzero - 1)  # Ensure at least one nonzero
#         zero_indices = np.random.choice(nonzero_indices, num_zeros_to_set, replace=False)
#         demand_vector[zero_indices] = 0
    
#     return demand_vector

# def generate_sparse_dataset(num_samples=1000, num_clusters=4, sparse_ratio=0.8):
#     """Generate dataset where demand characteristics depend on cluster ID."""
#     data = []

#     feature_matrix = np.array([
#         [
#             np.random.randint(1, 6),  # lead_time
#             np.random.randint(1, 4),  # criticality
#             np.random.randint(1, 5),  # quarter
#             np.random.randint(1, 11), # equipment_type
#         ] for _ in range(num_samples)
#     ])

#     # Apply K-Means clustering
#     kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
#     cluster_labels = kmeans.fit_predict(feature_matrix)

#     # Define cluster-specific demand settings
#     constraints = {
#         0: {"quarter_demand_range": 3, "max_weekly_range": 2, "zero_weeks_range": 10},  # Very low demand, high sparsity
#         1: {"quarter_demand_range": 5, "max_weekly_range": 3, "zero_weeks_range": 7},  # Moderate demand, still sparse
#         2: {"quarter_demand_range": 10, "max_weekly_range": 5, "zero_weeks_range": 4},  # Medium demand, less sparse
#         3: {"quarter_demand_range": 20, "max_weekly_range": 8, "zero_weeks_range": 2},  # High demand, low sparsity
#     }

#     # Generate demand vectors with structured patterns
#     for i in range(num_samples):
#         lead_time, criticality, quarter, equipment_type = feature_matrix[i]
#         cluster_id = cluster_labels[i] % num_clusters

#         # Get cluster-specific demand constraints
#         constraints_settings = 3
#         settings = constraints[constraints_settings]
#         quarter_demand = np.random.randint(settings["quarter_demand_range"]-10, settings["quarter_demand_range"])
#         max_weekly_demand = settings["max_weekly_range"]
#         min_zero_weeks = settings["zero_weeks_range"]

#         base_pattern = generate_base_pattern(cluster_id)
#         demand_vector = generate_demand_vector(quarter_demand, max_weekly_demand, min_zero_weeks, base_pattern)

#         sum_d_q1 = max(0, sum(demand_vector) + np.random.randint(min(-1, -sum(demand_vector)*0.2), max(sum(demand_vector)*0.1+1, 2)))
#         sum_d_q2 = max(0, sum(demand_vector) + np.random.randint(min(-1, -sum(demand_vector)*0.2), max(sum(demand_vector)*0.1+1, 2)))
#         row = [lead_time, criticality, quarter, equipment_type, sum_d_q2, sum_d_q1, max_weekly_demand, quarter_demand, min_zero_weeks, cluster_id, demand_vector.tolist()]
#         data.append(row)
#     features = ["lead_time", "criticality", "quarter", "equipment_type", "sum_d_q2", "sum_d_q1"]  # USE FOR PREDICTION
#     extra_columns =["max_weekly_demand", "quarter_demand", "min_zero_weeks", "cluster_id"]  # DO NOT USE FOR PREDICTION
#     columns = features + extra_columns + ["demand_next_q"]
#     df = pd.DataFrame(data, columns=columns)

#     return df

# # Generate mostly sparse dataset
# dataset = generate_sparse_dataset(1000)
# dataset.head(10)