In [21]:
import pandas as pd

In [22]:
# Define food and drinks sets
food = {
    'Add On', 'Bar Snack', 'Bites', 'Brunch', 'Burger', 'Chargrill', 'Chicken or Prawn Add ',
    'Combo', 'Dessert', 'Doneness', 'Foods', 'GF Add Ons', 'GF Burger and Hotdog', 'Mains',
    'Pasta', 'Pizza', 'Platter', 'Serving Choice', 'Sides', 'Smoothie', 'Soup', 'Starter', 'Hotdog'
}

drinks = {
    'Bottled Beer', 'Bourbon', 'Choice of Margarita / ', 'Ciders', 'Cocktail', 'Coffee',
    'Coffee Options', 'Cognac', 'Corkage', 'Draught Beers', 'Drinks', 'Gin', 'Juices',
    'Liqueurs & Aperitifs', 'Live Craft Beer', 'Mineral Water', 'Mocktails', 'Red Wine',
    'Rose Wine', 'Rum', 'Soft Drink', 'Sparkling & Champagne', 'Tea', 'Tequila', 'Vodka',
    'Whisky', 'White Wine'
}

# Date range boundaries
start_date = pd.Timestamp('2024-12-01')
end_date = pd.Timestamp('2025-03-31')

def clean_itemgroup_data(file_path):
    # Read Excel file
    df = pd.read_excel(file_path, engine='xlrd', skiprows=4)
    df = df[:-1]
    df = df.dropna(axis=1, how='all')
    df.columns = ['Date'] + list(df.columns[1:])
    
    # Drop unnecessary columns
    if 'Unnamed: 2' in df.columns:
        df = df.drop(columns=['Unnamed: 2'])
    
    df = df[df['Date'] != 'Total']
    df = df[df['Date'].notna()]
    
    # Drop columns with 'Unnamed' prefix
    unnamed_columns = [col for col in df.columns if col.startswith('Unnamed')]
    df = df.drop(columns=unnamed_columns)
    
    # Format the Date column as Timestamp
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')
    
    # Filter by date range (using Timestamps, not string comparison)
    df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
    
    # (Removed the line converting Date back to string to keep it as Timestamp)
    
    # Convert all columns except 'Date' to float, handling non-numeric values
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='raise')
    
    # Calculate total_food and total_drinks for each row
    food_cols = list(food.intersection(df.columns))
    drink_cols = list(drinks.intersection(df.columns))

    df['total_food'] = df[food_cols].sum(axis=1, skipna=True)
    df['total_drinks'] = df[drink_cols].sum(axis=1, skipna=True)
    
    # Drop all columns except Date, total_food, total_drinks, Total
    columns_to_keep = ['Date', 'total_food', 'total_drinks', 'Total']
    df = df[[col for col in columns_to_keep if col in df.columns]]
    
    # Convert column names to lowercase
    df.columns = df.columns.str.lower()
    
    return df

def clean_daily_summary(file_path):
    # Read CSV file with appropriate skipping of rows and footers
    df = pd.read_csv(file_path, skiprows=4, skipfooter=7, engine='python')
    
    # Drop unnamed columns
    unnamed_columns = [col for col in df.columns if col.startswith('Unnamed:')]
    df.drop(columns=unnamed_columns, inplace=True)
    
    # Convert START column to date (Timestamp) format
    df['START'] = pd.to_datetime(df['START'], format='%d/%m/%Y %H:%M:%S', errors='raise')
    
    # Filter by date range (Timestamp comparison)
    df = df[(df['START'] >= start_date) & (df['START'] <= end_date)]
    
    # (Removed the line converting START to string to keep it as Timestamp)
    
    # Drop all other columns except START, ORDERS COUNT, AVG ORDER
    columns_to_keep = ['START', 'ORDERS COUNT', 'AVG ORDER']
    df = df[[col for col in columns_to_keep if col in df.columns]]
    
    # Rename START to date
    df.rename(columns={'START': 'date'}, inplace=True)
    
    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()
    
    # Group by date and sum the values for duplicates
    df = df.groupby('date', as_index=False).sum()

    return df

def merge_outlet(itemgroup, daily_summary, outlet_name):
    # Convert the 'date' columns to date only (discarding any time info)
    itemgroup['date'] = pd.to_datetime(itemgroup['date'], errors='raise').dt.date
    daily_summary['date'] = pd.to_datetime(daily_summary['date'], errors='raise').dt.date

    # Merge itemgroup and daily summary using an outer join on date (now date-only)
    merged = pd.merge(itemgroup, daily_summary, on='date', how='outer', suffixes=('_itemgroup', '_daily'))
    
    # Sort by date
    merged = merged.sort_values(by='date').reset_index(drop=True)
    
    # Rename the merged DataFrame for each outlet
    merged.name = f"{outlet_name}_merged"
    return merged

def print_mismatched_dates(df_full, merged_df, outlet_name):
    # Filter df_full to the target outlet and copy
    df_outlet = df_full[df_full['Outlet'] == outlet_name].copy()
    
    # Convert both sets of Date columns to datetime
    df_outlet['Date'] = pd.to_datetime(df_outlet['Date'], errors='raise')
    merged_df['date'] = pd.to_datetime(merged_df['date'], errors='raise')
    
    # Reduce to date-only
    df_outlet_date_only = df_outlet['Date'].dt.date
    merged_outlet_date_only = merged_df['date'].dt.date
    
    # Build sets of unique date-only values
    df_dates = set(df_outlet_date_only.unique())
    merged_dates = set(merged_outlet_date_only.unique())
    
    # Find mismatched dates
    mismatched_dates = df_dates.symmetric_difference(merged_dates)
    
    print(f"Outlet: {outlet_name}")
    print(f"Forgotten Dates:")
    for date in mismatched_dates:
        if date in df_dates:
            print(f"  - {date} is in df but not in merged")
        else:
            print(f"  - {date}")
    print("-"*50)

def check_duplicates_in_df(df, outlet_name):
    """
    Checks for any duplicate dates (ignoring time) in df for a given outlet.
    """
    # 1) Convert the 'Date' column to datetime if not already
    df['Date'] = pd.to_datetime(df['Date'], errors='raise')
    
    # 2) Create a date_only column
    df['date'] = df['Date'].dt.date
    
    # 3) Group by date_only and see if any date appears more than once
    duplicates = (
        df.groupby('date')
          .size()
          .reset_index(name='count')
          .query('count > 1')
    )
    
    # 4) Report results
    print(f"==== Duplicates for {outlet_name} ====")
    if duplicates.empty:
        print("No duplicate dates found.")
    else:
        print(duplicates)
    print("--------------------------------------------------\n")

def merge_on_date_only(
    df_outlet,       # the smaller subset DataFrame
    df_merged,       # the superset merged DataFrame
    date_col_outlet='Date',
    date_col_merged='date',
    how='left'
):
    """
    Merge df_outlet with df_merged on date only (ignoring time).
    By default, keeps all rows in df_merged (the 'superset').
    """
    # Convert to datetime
    df_outlet[date_col_outlet] = pd.to_datetime(df_outlet[date_col_outlet])
    df_merged[date_col_merged] = pd.to_datetime(df_merged[date_col_merged])
    
    # Extract date only
    df_outlet['date_only'] = df_outlet[date_col_outlet].dt.date
    df_merged['date_only'] = df_merged[date_col_merged].dt.date
    
    # Merge on the date_only column
    merged_final = pd.merge(
        df_merged,
        df_outlet,
        on='date_only',
        how=how
    )
    
    return merged_final

In [23]:
df = pd.read_excel(r'daily_data\Picotin_overview.xlsx', engine='openpyxl')
df = df.drop(columns=['Timestamp'])
df = df.dropna(axis=1, how='all')

# Manually replace incorrect entries
df.loc[167, 'Date'] = '2024-12-11 00:00:00'
df.loc[170, 'Date'] = '2024-12-12 00:00:00'
df.loc[456, 'Date'] = '2025-04-01 00:00:00'
df.loc[459, 'Date'] = '2025-04-02 00:00:00'
df.loc[461, 'Date'] = '2025-04-03 00:00:00'

df['Date'] = pd.to_datetime(df['Date'], errors='raise')

# Removed string formatting so we keep Timestamps
# df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')

# Filter with actual Timestamps
df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)].copy()

ASQ_itemgroup = clean_itemgroup_data(r'daily_data\ASQ_itemgroup_daily.xls')
ASQ_daily_summary = clean_daily_summary(r'daily_data\ASQ_daily_summary.csv')

KATONG_itemgroup = clean_itemgroup_data(r'daily_data\KATONG_itemgroup_daily.xls')
KATONG_daily_summary = clean_daily_summary(r'daily_data\KATONG_daily_summary.csv')

RC_itemgroup = clean_itemgroup_data(r'daily_data\RC_itemgroup_daily.xls')
RC_daily_summary = clean_daily_summary(r'daily_data\RC_daily_summary.csv')


In [24]:
outlets = ['ASQ', 'KATONG', 'RC']

for outlet in outlets:
    # Access the respective DataFrames dynamically
    daily_summary = globals().get(f'{outlet}_daily_summary')
    itemgroup = globals().get(f'{outlet}_itemgroup')
    
    if daily_summary is None or itemgroup is None:
        print(f"\nWarning: DataFrames for {outlet} are not loaded.")
        continue

    # Convert Timestamps to *date* only (ignore time)
    summary_date_only = daily_summary['date'].dropna().dt.date
    itemgroup_date_only = itemgroup['date'].dropna().dt.date

    # Find unique dates (no time)
    summary_dates = set(summary_date_only.unique())
    itemgroup_dates = set(itemgroup_date_only.unique())

    # Find missing dates, ignoring time
    missing_in_itemgroup = summary_dates - itemgroup_dates
    missing_in_summary   = itemgroup_dates - summary_dates

    # Find duplicate dates, ignoring time
    duplicates_in_summary    = summary_date_only[summary_date_only.duplicated()]
    duplicates_in_itemgroup  = itemgroup_date_only[itemgroup_date_only.duplicated()]

    # Display the results for the current outlet
    print(f"\nOutlet: {outlet}")
    print("Dates missing in itemgroup:", sorted(missing_in_itemgroup))
    print("Dates missing in daily summary:", sorted(missing_in_summary))
    print("Duplicate dates in daily summary:", duplicates_in_summary.unique())
    print("Duplicate dates in itemgroup:", duplicates_in_itemgroup.unique())



Outlet: ASQ
Dates missing in itemgroup: []
Dates missing in daily summary: []
Duplicate dates in daily summary: []
Duplicate dates in itemgroup: []

Outlet: KATONG
Dates missing in itemgroup: []
Dates missing in daily summary: [datetime.date(2025, 3, 31)]
Duplicate dates in daily summary: []
Duplicate dates in itemgroup: []

Outlet: RC
Dates missing in itemgroup: []
Dates missing in daily summary: [datetime.date(2024, 12, 1), datetime.date(2024, 12, 5), datetime.date(2024, 12, 24), datetime.date(2025, 1, 18), datetime.date(2025, 3, 31)]
Duplicate dates in daily summary: [datetime.date(2024, 12, 4) datetime.date(2024, 12, 23)
 datetime.date(2025, 1, 17)]
Duplicate dates in itemgroup: []


In [25]:
ASQ_merged = merge_outlet(ASQ_itemgroup, ASQ_daily_summary, 'ASQ')
KATONG_merged = merge_outlet(KATONG_itemgroup, KATONG_daily_summary, 'KATONG')
RC_merged = merge_outlet(RC_itemgroup, RC_daily_summary, 'RC')

In [26]:
df_copy = df.copy()

# Print data types of both date columns
print("Data type of 'Date' in df:")
print(df['Date'].dtype)
print("\nData type of 'date' in ASQ_merged:")
print(ASQ_merged['date'].dtype)

# Check for empty or null values in both date columns
print("\nNumber of empty or NaN values in 'Date' column of df:")
print(df['Date'].isna().sum())

print("\nNumber of empty or NaN values in 'date' column of ASQ_merged:")
print(ASQ_merged['date'].isna().sum())

# Print rows where Date is NaN in both dataframes
print("\nRows with empty 'Date' in df:")
print(df[df['Date'].isna()])

print("\nRows with empty 'date' in ASQ_merged:")
print(ASQ_merged[ASQ_merged['date'].isna()])


Data type of 'Date' in df:
datetime64[ns]

Data type of 'date' in ASQ_merged:
object

Number of empty or NaN values in 'Date' column of df:
0

Number of empty or NaN values in 'date' column of ASQ_merged:
0

Rows with empty 'Date' in df:
Empty DataFrame
Columns: [Outlet, Date, Day, Breakfast Sales, Lunch Sales, Evening Sales, Dinner Lunch Sales, Night Sales, Total Sales, Number of Breakfast Floor Staff, Number of Lunch Floor Staff, Number of Evening Floor Staff, Number of Dinner Floor Staff, Number of Night Floor Staff, Number of Breakfast Kitchen Staff, Number of Lunch Kitchen Staff, Number of Evening Kitchen Staff, Number of Dinner Kitchen Staff, Number of Night Kitchen Staff, Total Taxi Claims, Last Bill Closed, Closing Time, Closing Manager Name, Total Floor Staff Hours, Total Kitchen Staff Hours]
Index: []

[0 rows x 25 columns]

Rows with empty 'date' in ASQ_merged:
Empty DataFrame
Columns: [date, total_food, total_drinks, total, orders count, avg order]
Index: []


In [27]:
print_mismatched_dates(df, ASQ_merged, "Asia Square")
print_mismatched_dates(df, KATONG_merged, "Katong")
print_mismatched_dates(df, RC_merged, "Rochester")

Outlet: Asia Square
Forgotten Dates:
  - 2024-12-05
  - 2024-12-31
  - 2024-12-02
  - 2024-12-17
  - 2024-12-03
  - 2024-12-04
--------------------------------------------------
Outlet: Katong
Forgotten Dates:
  - 2024-12-09
  - 2024-12-08
  - 2024-12-05
  - 2024-12-28
  - 2024-12-13
  - 2024-12-02
  - 2024-12-17
  - 2024-12-01
  - 2024-12-11
  - 2024-12-03
  - 2024-12-18
  - 2024-12-06
  - 2024-12-07
  - 2024-12-04
--------------------------------------------------
Outlet: Rochester
Forgotten Dates:
  - 2024-12-09
  - 2024-12-08
  - 2024-12-05
  - 2024-12-22
  - 2025-02-09
  - 2024-12-31
  - 2025-01-28
  - 2024-12-29
  - 2024-12-15
  - 2024-12-02
  - 2025-03-31
  - 2024-12-06
  - 2024-12-07
  - 2024-12-04
  - 2024-12-10
--------------------------------------------------


In [28]:
df_KATONG = df[df['Outlet'] == 'Katong'].copy() 
df_ASQ = df[df['Outlet'] == 'Asia Square'].copy()
df_RC = df[df['Outlet'] == 'Rochester'].copy()

In [29]:
check_duplicates_in_df(df_KATONG, 'KATONG')
check_duplicates_in_df(df_ASQ, 'ASQ')
check_duplicates_in_df(df_RC, 'RC')

==== Duplicates for KATONG ====
          date  count
14  2024-12-29      2
--------------------------------------------------

==== Duplicates for ASQ ====
         date  count
6  2024-12-16      2
--------------------------------------------------

==== Duplicates for RC ====
          date  count
3   2024-12-12      2
77  2025-03-02      2
--------------------------------------------------



In [31]:
# Merge for Katong
KATONG_final = merge_on_date_only(
    df_outlet=df_KATONG,
    df_merged=KATONG_merged,
    date_col_outlet='Date',   # col in df_KATONG
    date_col_merged='date',   # col in KATONG_merged
    how='left'                # keep all rows from KATONG_merged
)

# Merge for Asia Square
ASQ_final = merge_on_date_only(
    df_outlet=df_ASQ,
    df_merged=ASQ_merged,
    date_col_outlet='Date',
    date_col_merged='date',
    how='left'
)

# Merge for Rochester
RC_final = merge_on_date_only(
    df_outlet=df_RC,
    df_merged=RC_merged,
    date_col_outlet='Date',
    date_col_merged='date',
    how='left'
)

# Combine all three into one DataFrame
combined = pd.concat([KATONG_final, ASQ_final, RC_final], ignore_index=True)


In [32]:
combined.to_excel('picotin_dec_march.xlsx', index=False)