In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [4]:
file_path = 'venvx/AnnonymData.csv'
try:
    # For large files, be mindful of memory.
    # We'll load it directly for now, but consider 'chunksize' or 'dtype' optimization if needed.
    df = pd.read_csv(file_path)
    print(f"Successfully loaded data from '{file_path}'.")
    print(f"Dataset shape: {df.shape}") # (rows, columns)
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
    print("\nDataset info:")
    df.info(verbose=True, show_counts=True)
except FileNotFoundError:
    print(f"ERROR: File not found at '{file_path}'. Please check the path and filename.")
    exit()
except Exception as e:
    print(f"An error occurred during data loading: {e}")
    exit()
print("-" * 50)

Successfully loaded data from 'venvx/AnnonymData.csv'.
Dataset shape: (6538739, 14)

First 5 rows of the dataset:
    OrderId                     TransactionId DateOfService  \
0  11518978  4c5060636f584ef9a1effa77282755f5    2020-01-02   
1  11285143  68472c70b9c84fb784834ecc257827d7    2020-01-02   
2  11285146  7262eace0d104592b1269e38f5b45ec1    2020-01-02   
3  11285152  8e451931e8fc4554869c3e4533b65e23    2020-01-02   
4  11285155  bfa8fa0812ee40baa98e5aaf52d30e0b    2020-01-02   

           DateOfOrder  OrderQty                     MenuName MenuPrice  \
0  2020-02-05 11:54:08         1             Mittagessen (Gs)      3,10   
1  2019-12-16 10:30:51         1  Smart Eating Buffet (WGrus)      0,00   
2  2019-12-16 10:31:33         1  Smart Eating Buffet (WGrus)      2,90   
3  2019-12-16 10:32:05         1  Smart Eating Buffet (WGrus)      0,00   
4  2019-12-16 10:32:31         1  Smart Eating Buffet (WGrus)      2,90   

  MenuSubsidy      BookingNr                       Group

In [5]:
print("--- Step 2: Initial Data Cleaning & Preprocessing ---")
df_processed = df.copy()

date_columns = ['DateOfService', 'DateOfOrder', 'DateOfCancel']
for col in date_columns:
    if col in df_processed.columns:
        # Attempt conversion, coercing errors will turn unparseable dates into NaT (Not a Time)
        df_processed[col] = pd.to_datetime(df_processed[col], errors='coerce')
        print(f"Column '{col}' converted. NaNs introduced by coercion: {df_processed[col].isnull().sum()}")
    else:
        print(f"Warning: Date column '{col}' not found.")

--- Step 2: Initial Data Cleaning & Preprocessing ---
Column 'DateOfService' converted. NaNs introduced by coercion: 0
Column 'DateOfOrder' converted. NaNs introduced by coercion: 0
Column 'DateOfCancel' converted. NaNs introduced by coercion: 5604260


In [6]:
print("\nConverting financial columns ('MenuPrice', 'MenuSubsidy') to numeric...")
financial_columns = ['MenuPrice', 'MenuSubsidy']
for col in financial_columns:
    if col in df_processed.columns:
        if df_processed[col].dtype == 'object': # Only process if it's an object type
            # Remove currency symbols (e.g., €, $, £) and commas (as thousands separators)
            # This regex is an example, adjust if your currency format is different
            df_processed[col] = df_processed[col].astype(str).str.replace(r'[€\$£,]', '', regex=True)
            # Convert to numeric, coercing errors.
            # If your numbers use ',' as decimal (e.g., German format), you'd first remove '.', then replace ',' with '.'
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
            print(f"Column '{col}' converted to numeric. NaNs introduced: {df_processed[col].isnull().sum()}")
        elif pd.api.types.is_numeric_dtype(df_processed[col]):
            print(f"Column '{col}' is already numeric.")
        else:
            print(f"Column '{col}' is of type {df_processed[col].dtype} and was not processed as a typical currency string.")
    else:
        print(f"Warning: Financial column '{col}' not found.")


Converting financial columns ('MenuPrice', 'MenuSubsidy') to numeric...
Column 'MenuPrice' converted to numeric. NaNs introduced: 0
Column 'MenuSubsidy' converted to numeric. NaNs introduced: 0


In [7]:
print("\nEnsuring 'OrderQty' and 'CanceledQty' are numeric...")
for col in ['OrderQty', 'CanceledQty']:
    if col in df_processed.columns:
        if not pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
            print(f"Column '{col}' converted to numeric. NaNs introduced: {df_processed[col].isnull().sum()}")
        else:
            print(f"Column '{col}' is already numeric.")
    else:
        print(f"Warning: Column '{col}' not found.")


Ensuring 'OrderQty' and 'CanceledQty' are numeric...
Column 'OrderQty' is already numeric.
Column 'CanceledQty' is already numeric.


In [9]:
if 'Site' in df_processed.columns:
    missing_site_percentage = df_processed['Site'].isnull().mean() * 100
    print(f"\nMissing values in 'Site': {missing_site_percentage:.2f}%")
    # Example: df_processed['Site'].fillna('Unknown', inplace=True)
    # Or, if you know the main sites are MS, LP, BK, you might investigate if missing sites can be inferred.
    # For now, let's fill with 'Unknown' as a placeholder strategy.
    df_processed['Site'] = df_processed['Site'].fillna('UnknownSite')
    print("Filled missing 'Site' values with 'UnknownSite'.")


Missing values in 'Site': 0.00%
Filled missing 'Site' values with 'UnknownSite'.


In [10]:
print("\nFiltering out irrelevant orders...")
initial_rows = len(df_processed)

# Condition 1: Orders where DateOfCancel is after DateOfService
if 'DateOfCancel' in df_processed.columns and 'DateOfService' in df_processed.columns:
    condition1_filter = (df_processed['DateOfCancel'].notna()) & \
                        (df_processed['DateOfService'].notna()) & \
                        (df_processed['DateOfCancel'] > df_processed['DateOfService'])
    rows_to_drop_cond1 = df_processed[condition1_filter]
    if not rows_to_drop_cond1.empty:
        print(f"Found {len(rows_to_drop_cond1)} orders where DateOfCancel is after DateOfService. These will be dropped.")
        df_processed = df_processed[~condition1_filter]
    else:
        print("No orders found where DateOfCancel is after DateOfService.")


Filtering out irrelevant orders...
Found 70970 orders where DateOfCancel is after DateOfService. These will be dropped.


In [11]:
# Condition 2: Orders where OrderQty < CanceledQty
# Ensure both columns are numeric and handle potential NaNs before comparison
if 'OrderQty' in df_processed.columns and 'CanceledQty' in df_processed.columns:
    # Fill NaNs with 0 for comparison, assuming NaN in Qty means 0 for this specific filter logic
    order_qty_filled = df_processed['OrderQty'].fillna(0)
    canceled_qty_filled = df_processed['CanceledQty'].fillna(0)

    condition2_filter = order_qty_filled < canceled_qty_filled
    rows_to_drop_cond2 = df_processed[condition2_filter]
    if not rows_to_drop_cond2.empty:
        print(f"Found {len(rows_to_drop_cond2)} orders where OrderQty < CanceledQty. These will be dropped.")
        df_processed = df_processed[~condition2_filter]
    else:
        print("No orders found where OrderQty < CanceledQty.")

rows_after_filtering = len(df_processed)
print(f"Rows dropped due to filtering: {initial_rows - rows_after_filtering}")
print(f"Dataset shape after filtering: {df_processed.shape}")


Found 793547 orders where OrderQty < CanceledQty. These will be dropped.
Rows dropped due to filtering: 864517
Dataset shape after filtering: (5674222, 14)


In [12]:
# f. Create the Target Variable: `NeededMeals`
# NeededMeals = OrderQty - CanceledQty (after filtering and ensuring NaNs in Qty are handled)
print("\nCreating the target variable 'NeededMeals'...")
if 'OrderQty' in df_processed.columns and 'CanceledQty' in df_processed.columns:
    # Assuming that if CanceledQty is NaN for an order that wasn't filtered out, it means 0 cancellations for that order.
    # And if OrderQty is NaN (should be rare after initial checks), treat as 0 for this calculation.
    df_processed['OrderQty_filled'] = df_processed['OrderQty'].fillna(0)
    df_processed['CanceledQty_filled'] = df_processed['CanceledQty'].fillna(0)

    df_processed['NeededMeals'] = df_processed['OrderQty_filled'] - df_processed['CanceledQty_filled']

    # Clean up temporary columns
    df_processed.drop(columns=['OrderQty_filled', 'CanceledQty_filled'], inplace=True)

    print("'NeededMeals' column created.")
    print("Summary of 'NeededMeals':")
    print(df_processed['NeededMeals'].describe())

    # Sanity check: NeededMeals should not be negative if OrderQty < CanceledQty was filtered.
    # However, if OrderQty was 0 and CanceledQty was 0, NeededMeals is 0.
    # If OrderQty was 0 and CanceledQty was 1 (bank account not covered), this record should have been filtered.
    if not df_processed[df_processed['NeededMeals'] < 0].empty:
        print(f"Warning: Found {len(df_processed[df_processed['NeededMeals'] < 0])} records with negative NeededMeals. Review filtering logic.")
        print(df_processed[df_processed['NeededMeals'] < 0][['OrderQty', 'CanceledQty', 'NeededMeals']].head())
    else:
        print("No negative 'NeededMeals' found after calculation, which is good.")
else:
    print("Could not create 'NeededMeals' as 'OrderQty' or 'CanceledQty' is missing.")


Creating the target variable 'NeededMeals'...
'NeededMeals' column created.
Summary of 'NeededMeals':
count    5.674222e+06
mean     9.985110e-01
std      4.098289e-01
min      0.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.200000e+02
Name: NeededMeals, dtype: float64
No negative 'NeededMeals' found after calculation, which is good.
