In [2]:
# Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Root folder containing the dataset sub-directories
data_path = "data"

# Read the four source tables from disk
receivals = pd.read_csv(f"{data_path}/kernel/receivals.csv")
purchase_orders = pd.read_csv(f"{data_path}/kernel/purchase_orders.csv")
materials = pd.read_csv(f"{data_path}/extended/materials.csv")
transportation = pd.read_csv(f"{data_path}/extended/transportation.csv")

# Parse the timestamp columns; utc=True yields timezone-aware (UTC) datetimes
receivals["date_arrival"] = pd.to_datetime(receivals["date_arrival"], utc=True)
for po_date_col in ("delivery_date", "created_date_time", "modified_date_time"):
    purchase_orders[po_date_col] = pd.to_datetime(purchase_orders[po_date_col], utc=True)

Before analyzing the data for forecasting, we first want to make sure it behaves as expected. To do this, we merge the `receivals` and `purchase_orders` tables into a single dataset. This lets us see delivered quantities in the context of their corresponding purchase orders, product details, and other relevant information. 

Because the merge is a left join with `receivals` as the main table, purchase orders that do not have any matching receivals are left out of the merged dataset. These rows represent orders that never resulted in deliveries and are unlikely to be informative for forecasting incoming raw materials.

With this cleaned and merged dataset, we can now examine the data for unusual or unexpected values, such as negative or zero quantities and weights, to ensure it is intuitive and consistent.

In [18]:
# Attach purchase-order attributes to every receival row (receivals is the main table)
merge_keys = ['purchase_order_id', 'purchase_order_item_no']
merged_df = pd.merge(
    receivals,
    purchase_orders,
    how='left',  # every receival is kept; PO columns are filled only where a match exists
    on=merge_keys,
    suffixes=('', '_po'),  # overlapping PO column names get the "_po" suffix
)

# Report the size of the merged table
print(f"Merged dataframe shape: {merged_df.shape}")

# Tally purchase order items whose (order id, item no) key never appears in the merged table
po_keys = ['purchase_order_id', 'purchase_order_item_no']
po_index = purchase_orders.set_index(po_keys).index
received_index = merged_df.set_index(po_keys).index
has_receival = po_index.isin(received_index)  # boolean mask aligned with purchase_orders rows

po_total = len(purchase_orders)
po_with_match = purchase_orders[has_receival]
po_without_match = po_total - len(po_with_match)

print(f"Total purchase order items: {po_total}")
print(f"Number of PO items with at least one matching receival: {len(po_with_match)}")
print(f"Number of PO items without any matching receival: {po_without_match}")
print(f"Fraction of PO items without matching receival: {po_without_match / po_total:.2%}")


Merged dataframe shape: (122593, 20)
Total purchase order items: 33171
Number of PO items with at least one matching receival: 22299
Number of PO items without any matching receival: 10872
Fraction of PO items without matching receival: 32.78%


Next, we examine the merged receivals and purchase orders table for non-positive values in key numeric columns. Negative or zero values for quantities or net weights are physically impossible and indicate potential data issues. By identifying these, we can better understand anomalies in the dataset before moving on to further cleaning or analysis.

In [19]:
# Numeric columns that are inspected for zero or negative entries
check_columns = [
    "net_weight",
    "quantity",
]

# Function to count zero and negative values in a dataframe
def count_non_positive(df, columns):
    """Print how many zero and negative entries each listed column contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    columns : iterable of str
        Names of the columns to check. A name that is not a column of
        ``df`` is reported on its own line instead of being skipped
        silently, so a misspelled column cannot pass as "checked".

    Returns
    -------
    None
        Results are printed: a header line followed by one line per
        requested column.

    Notes
    -----
    Missing values (NaN) compare false against both ``== 0`` and ``< 0``
    and are therefore not included in any of the counts.
    """
    print("Non-positive values:")
    for col in columns:
        if col not in df.columns:
            # Make the gap visible rather than dropping the column from the report
            print(f"{col}: column not found in dataframe (skipped)")
            continue
        num_zero = (df[col] == 0).sum()
        num_negative = (df[col] < 0).sum()
        total = num_zero + num_negative
        print(f"{col}: {total} non-positive values ({num_zero} zero, {num_negative} negative)")

# Report zero/negative counts for the selected columns of the merged table
count_non_positive(df=merged_df, columns=check_columns)


Non-positive values:
net_weight: 137 non-positive values (137 zero, 0 negative)
quantity: 0 non-positive values (0 zero, 0 negative)


After examining the merged table, we find that `net_weight` contains 137 zero values, while `quantity` has no zero or negative values. These zero-weight entries likely correspond to missing measurements or data entry errors. Given their small number relative to the overall dataset (137 of 122,593 rows, roughly 0.1%), we will drop these zero `net_weight` entries to ensure our analysis only includes valid deliveries with meaningful weight information.