In [90]:
import pandas as pd

In [91]:
import pandas as pd

# Date range boundaries
start_date = pd.Timestamp('2024-12-01')
end_date = pd.Timestamp('2025-03-31')

def clean_itemgroup_data(file_path):
    # Read Excel file
    df = pd.read_excel(file_path, engine='xlrd', skiprows=4)
    df = df[:-1]
    df = df.dropna(axis=1, how='all')
    df.columns = ['Date'] + list(df.columns[1:])
    
    # Drop unnecessary columns
    if 'Unnamed: 2' in df.columns:
        df = df.drop(columns=['Unnamed: 2'])
    
    df = df[df['Date'] != 'Total']
    df = df[df['Date'].notna()]
    
    # Drop columns with 'Unnamed' prefix
    unnamed_columns = [col for col in df.columns if col.startswith('Unnamed')]
    df = df.drop(columns=unnamed_columns)
    
    # Format the Date column
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')
    
    # Filter by date range
    df = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]
    
    # Convert the date back to the desired format
    df['Date'] = df['Date'].dt.strftime('%m/%d/%Y')
    
    # Convert all columns except 'Date' to float, handling non-numeric values
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Calculate total_food and total_drinks for each row
    food_cols = list(food.intersection(df.columns))
    drink_cols = list(drinks.intersection(df.columns))

    df['total_food'] = df[food_cols].sum(axis=1, skipna=True)
    df['total_drinks'] = df[drink_cols].sum(axis=1, skipna=True)
    
    # Drop all columns except Date, total_food, total_drinks, Total
    columns_to_keep = ['Date', 'total_food', 'total_drinks', 'Total']
    df = df[[col for col in columns_to_keep if col in df.columns]]
    
    # Convert column names to lowercase
    df.columns = df.columns.str.lower()
    
    return df

def clean_daily_summary(file_path):
    # Read CSV file with appropriate skipping of rows and footers
    df = pd.read_csv(file_path, skiprows=4, skipfooter=7, engine='python')
    
    # Drop unnamed columns
    unnamed_columns = [col for col in df.columns if col.startswith('Unnamed:')]
    df.drop(columns=unnamed_columns, inplace=True)
    
    # Convert START column to date format
    df['START'] = pd.to_datetime(df['START'], format='%d/%m/%Y %H:%M:%S')
    
    # Filter by date range
    df = df[(df['START'] >= start_date) & (df['START'] <= end_date)]
    
    # Format the date to the desired format
    df['START'] = df['START'].dt.strftime('%d/%m/%Y')
    
    # Drop all other columns except START, ORDERS COUNT, AVG ORDER
    columns_to_keep = ['START', 'ORDERS COUNT', 'AVG ORDER']
    df = df[[col for col in columns_to_keep if col in df.columns]]
    
    # Rename START to date
    df.rename(columns={'START': 'date'}, inplace=True)
    
    # Convert all column names to lowercase
    df.columns = df.columns.str.lower()
    
    return df


In [92]:
ASQ_itemgroup = clean_itemgroup_data(r'daily_data\ASQ_itemgroup_daily.xls')
ASQ_daily_summary = clean_daily_summary(r'daily_data\KATONG_daily_summary.csv')

KATONG_itemgroup = clean_itemgroup_data(r'daily_data\KATONG_itemgroup_daily.xls')
KATONG_daily_summary = clean_daily_summary(r'daily_data\KATONG_daily_summary.csv')

RC_itemgroup = clean_itemgroup_data(r'daily_data\RC_itemgroup_daily.xls')
RC_daily_summary = clean_daily_summary(r'daily_data\RC_daily_summary.csv')

In [93]:
RC_itemgroup['date']

12     12/01/2024
13     12/02/2024
14     12/03/2024
16     12/04/2024
17     12/05/2024
          ...    
147    03/27/2025
149    03/28/2025
150    03/29/2025
151    03/30/2025
152    03/31/2025
Name: date, Length: 121, dtype: object

In [94]:
RC_daily_summary['date']

11     02/12/2024
12     03/12/2024
13     04/12/2024
14     04/12/2024
15     06/12/2024
          ...    
125    26/03/2025
126    27/03/2025
127    28/03/2025
128    29/03/2025
129    30/03/2025
Name: date, Length: 119, dtype: object

In [95]:
# Convert the date columns to datetime for accurate comparison
RC_daily_summary['date'] = pd.to_datetime(RC_daily_summary['date'], dayfirst=True, errors='coerce')
RC_itemgroup['date'] = pd.to_datetime(RC_itemgroup['date'], dayfirst=True, errors='coerce')

# Find unique dates from both DataFrames
summary_dates = set(RC_daily_summary['date'].dropna().unique())
itemgroup_dates = set(RC_itemgroup['date'].dropna().unique())

# Find missing dates
missing_in_itemgroup = summary_dates - itemgroup_dates
missing_in_summary = itemgroup_dates - summary_dates

# Find duplicate dates in each DataFrame
duplicates_in_summary = RC_daily_summary['date'][RC_daily_summary['date'].duplicated()]
duplicates_in_itemgroup = RC_itemgroup['date'][RC_itemgroup['date'].duplicated()]

# Display the results
print("Dates missing in RC_itemgroup:", sorted(missing_in_itemgroup))
print("Dates missing in RC_daily_summary:", sorted(missing_in_summary))
print("\nDuplicate dates in RC_daily_summary:", duplicates_in_summary.unique())
print("Duplicate dates in RC_itemgroup:", duplicates_in_itemgroup.unique())


Dates missing in RC_itemgroup: [Timestamp('2024-12-02 00:00:00'), Timestamp('2024-12-03 00:00:00'), Timestamp('2024-12-04 00:00:00'), Timestamp('2024-12-06 00:00:00'), Timestamp('2024-12-07 00:00:00'), Timestamp('2024-12-08 00:00:00'), Timestamp('2024-12-09 00:00:00'), Timestamp('2024-12-10 00:00:00'), Timestamp('2024-12-11 00:00:00'), Timestamp('2024-12-13 00:00:00'), Timestamp('2024-12-14 00:00:00'), Timestamp('2024-12-15 00:00:00'), Timestamp('2024-12-16 00:00:00'), Timestamp('2024-12-17 00:00:00'), Timestamp('2024-12-18 00:00:00'), Timestamp('2024-12-19 00:00:00'), Timestamp('2024-12-20 00:00:00'), Timestamp('2024-12-21 00:00:00'), Timestamp('2024-12-22 00:00:00'), Timestamp('2024-12-23 00:00:00'), Timestamp('2024-12-25 00:00:00'), Timestamp('2024-12-26 00:00:00'), Timestamp('2024-12-27 00:00:00'), Timestamp('2024-12-28 00:00:00'), Timestamp('2024-12-29 00:00:00'), Timestamp('2024-12-30 00:00:00'), Timestamp('2024-12-31 00:00:00'), Timestamp('2025-01-04 00:00:00'), Timestamp('2025-