In [2]:
import pandas as pd

# File paths
data_path = "../data"

# Load datasets with encoding handling for problematic files
datasets = {
    "train": pd.read_csv(data_path + "/train.csv"),
    "sample_submission": pd.read_csv(data_path + "/sample_submission.csv"),
    "meal_info": pd.read_csv(data_path + "/meal_info.csv"),
    "fulfilment_center_info": pd.read_csv(data_path + "/fulfilment_center_info.csv"),
    "test": pd.read_csv(data_path + "/test.csv"),
    "balaji_fast_food": pd.read_csv(data_path + "/Balaji Fast Food Sales.csv"),
    "cafe_ocean": pd.read_csv(data_path + "/Cafe_Ocean.csv"),
    "country_code": pd.read_csv(data_path + "/Country-Code.csv"),
    "dataset": pd.read_csv(data_path + "/dataset.csv"),
    "dataset2": pd.read_csv(data_path + "/DATASET2.csv"),
    # Handle encoding for these files
    "zomato": pd.read_csv(data_path + "/zomato.csv", encoding="ISO-8859-1"),
    "chaska_cafe": pd.read_csv(data_path + "/chaska_cafe_sales_data.csv", encoding="ISO-8859-1"),
}

# Inspect datasets
for name, df in datasets.items():
    print(f"\n{name.upper()} - Overview:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(df.head())

# Check for missing data
for name, df in datasets.items():
    print(f"\n{name.upper()} - Missing Data:")
    print(df.isnull().sum())

# Check data types
for name, df in datasets.items():
    print(f"\n{name.upper()} - Data Types:")
    print(df.dtypes)

# Check for irregular values
for name, df in datasets.items():
    print(f"\n{name.upper()} - Sample Data with Irregularities:")
    print(df[df.isin(['######', None, '']).any(axis=1)].head())  # Identify rows with invalid placeholders


  "dataset2": pd.read_csv(data_path + "/DATASET2.csv"),



TRAIN - Overview:
Shape: (456548, 9)
Columns: ['id', 'week', 'center_id', 'meal_id', 'checkout_price', 'base_price', 'emailer_for_promotion', 'homepage_featured', 'num_orders']
        id  week  center_id  meal_id  checkout_price  base_price  \
0  1379560     1         55     1885          136.83      152.29   
1  1466964     1         55     1993          136.83      135.83   
2  1346989     1         55     2539          134.86      135.86   
3  1338232     1         55     2139          339.50      437.53   
4  1448490     1         55     2631          243.50      242.50   

   emailer_for_promotion  homepage_featured  num_orders  
0                      0                  0         177  
1                      0                  0         270  
2                      0                  0         189  
3                      0                  0          54  
4                      0                  0          40  

SAMPLE_SUBMISSION - Overview:
Shape: (32573, 2)
Columns: ['id', 

In [5]:
# Handle missing values
for name, df in datasets.items():
    print(f"\n{name.upper()} - Missing Values Before Handling:")
    print(df.isnull().sum())

    # Check if 'Date' or 'date' exists and convert to datetime
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    elif 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Forward fill and backward fill missing values
    df.fillna(method='ffill', inplace=True)  # Forward fill
    df.fillna(method='bfill', inplace=True)  # Backward fill

    print(f"\n{name.upper()} - Missing Values After Handling:")
    print(df.isnull().sum())



TRAIN - Missing Values Before Handling:
id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64

TRAIN - Missing Values After Handling:
id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64

SAMPLE_SUBMISSION - Missing Values Before Handling:
id            0
num_orders    0
dtype: int64

SAMPLE_SUBMISSION - Missing Values After Handling:
id            0
num_orders    0
dtype: int64

MEAL_INFO - Missing Values Before Handling:
meal_id     0
category    0
cuisine     0
dtype: int64

MEAL_INFO - Missing Values After Handling:
meal_id     0
category    0
cuisine     0
dtype: int64

FULFILMENT_C

  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill
  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill
  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill


Date            0
Bill Number     0
Item Desc       0
Time            0
Quantity        0
Rate            0
Tax             0
Discount        0
Total           0
Category        0
dtype: int64

COUNTRY_CODE - Missing Values Before Handling:
Country Code    0
Country         0
dtype: int64

COUNTRY_CODE - Missing Values After Handling:
Country Code    0
Country         0
dtype: int64

DATASET - Missing Values Before Handling:
sl no              0
date               0
item_name          0
item_type          0
Unnamed: 4      1000
quantity           0
Unnamed: 6      1000
Unnamed: 7      1000
Unnamed: 8      1000
time_of_sale       0
Unnamed: 10     1000
Unnamed: 11     1000
Unnamed: 12     1000
item               0
count              0
dtype: int64

DATASET - Missing Values After Handling:
sl no              0
date               0
item_name          0
item_type          0
Unnamed: 4      1000
quantity           0
Unnamed: 6      1000
Unnamed: 7      1000
Unnamed: 8      1000
time_of_sale

  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill
  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill
  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill


In [6]:
# Standardize time information
for name, df in datasets.items():
    print(f"\n{name.upper()} - Before Time Alignment:")

    if 'week' in df.columns:
        # Convert 'week' to a standardized date (assuming Week 1 corresponds to a fixed start date)
        df['standardized_date'] = pd.to_datetime("2023-01-01") + pd.to_timedelta(df['week'] * 7, unit='D')
    elif 'Date' in df.columns:
        # Ensure all 'Date' columns are consistently named
        df.rename(columns={'Date': 'standardized_date'}, inplace=True)
    elif 'date' in df.columns:
        # Ensure lowercase 'date' columns are consistently named
        df.rename(columns={'date': 'standardized_date'}, inplace=True)

    # Print a sample of the updated time column
    if 'standardized_date' in df.columns:
        print(df[['standardized_date']].head())



TRAIN - Before Time Alignment:
  standardized_date
0        2023-01-08
1        2023-01-08
2        2023-01-08
3        2023-01-08
4        2023-01-08

SAMPLE_SUBMISSION - Before Time Alignment:

MEAL_INFO - Before Time Alignment:

FULFILMENT_CENTER_INFO - Before Time Alignment:

TEST - Before Time Alignment:
  standardized_date
0        2025-10-19
1        2025-10-19
2        2025-10-19
3        2025-10-19
4        2025-10-19

BALAJI_FAST_FOOD - Before Time Alignment:
  standardized_date
0        2022-07-03
1        2022-07-03
2        2022-07-03
3        2023-02-03
4        2022-10-02

CAFE_OCEAN - Before Time Alignment:
  standardized_date
0        2020-01-01
1        2020-01-01
2        2020-01-01
3        2020-01-01
4        2020-01-01

COUNTRY_CODE - Before Time Alignment:

DATASET - Before Time Alignment:
  standardized_date
0        2022-01-04
1        2022-01-05
2        2022-01-06
3        2022-01-07
4        2022-01-07

DATASET2 - Before Time Alignment:
  standardized_date
