In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp39-cp39-win_amd64.whl.metadata (12 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp39-cp39-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 14.2 MB/s eta 0:00:01
   --- ------------------------------------ 1.0/11.0 MB 12.6 MB/s eta 0:00:01
   ----- ---------------------------------- 1.5/11.0 MB 12.0 MB/s eta 0:00:01
   ------- -------------------------------- 2.1/11.0 MB 11.9 MB/s eta 0:00:01
   --------- ------------------------------ 2.6/11.0 MB 11.7 MB/s eta 0:00:01
   ----------- ---------------------------- 3.1/11.0 MB 11.7 MB/s eta 0:00:01
   ------------- -------------------------- 3.7/11.0 MB 11.7 MB/s eta 0:

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the datasets
dish_df = pd.read_csv('Dish.csv')
menu_df = pd.read_csv('Menu.csv')
menu_item_df = pd.read_csv('MenuItem.csv')
menu_page_df = pd.read_csv('MenuPage.csv')

# Initial summaries
initial_summaries = {
    "Dish": dish_df.info(),
    "Menu": menu_df.info(),
    "MenuItem": menu_item_df.info(),
    "MenuPage": menu_page_df.info()
}


# Inspect column names
print("Dish.csv columns:", dish_df.columns)
print("Menu.csv columns:", menu_df.columns)
print("MenuItem.csv columns:", menu_item_df.columns)
print("MenuPage.csv columns:", menu_page_df.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423397 entries, 0 to 423396
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              423397 non-null  int64  
 1   name            423397 non-null  object 
 2   description     0 non-null       float64
 3   menus_appeared  423397 non-null  int64  
 4   times_appeared  423397 non-null  int64  
 5   first_appeared  423397 non-null  int64  
 6   last_appeared   423397 non-null  int64  
 7   lowest_price    394297 non-null  float64
 8   highest_price   394297 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 29.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17545 entries, 0 to 17544
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    17545 non-null  int64  
 1   name                  3197 non-null   object 
 2   spo

In [7]:
menu_item_df.columns = [col.lower() for col in menu_item_df.columns]  # standardizing column names to lower case
dish_df.columns = [col.lower() for col in dish_df.columns]
menu_df.columns = [col.lower() for col in menu_df.columns]
menu_page_df.columns = [col.lower() for col in menu_page_df.columns]

In [10]:
# Step 1: Identify Duplicates and Remove Them
# Identify and remove duplicate menu items
initial_duplicates = menu_item_df.duplicated(subset=['menu_page_id', 'dish_id']).sum()
menu_item_df.drop_duplicates(subset=['menu_page_id', 'dish_id'], keep='first', inplace=True)
final_duplicates = menu_item_df.duplicated(subset=['menu_page_id', 'dish_id']).sum()

# Step 2: Validate Dates
# Convert date columns to datetime format in the Menu dataframe
menu_df['date'] = pd.to_datetime(menu_df['date'], errors='coerce')
dish_df['first_appeared'] = pd.to_datetime(dish_df['first_appeared'], errors='coerce')
dish_df['last_appeared'] = pd.to_datetime(dish_df['last_appeared'], errors='coerce')

# Step 3: Normalize Price Formats
# Remove any currency symbols and convert to float in the MenuItem dataframe
menu_item_df['price'] = menu_item_df['price'].replace('[\$,]', '', regex=True).astype(float)
menu_item_df['high_price'] = menu_item_df['high_price'].replace('[\$,]', '', regex=True).astype(float)


In [18]:
# Step 4: Correct Errors in Dish Names without Clustering
# Function to clean text
def clean_text(text):
    return text.lower().strip()

# Clean dish names
dish_df['name'] = dish_df['name'].apply(clean_text)

# For simplicity, let's assume dish names are correct as no clustering is applied
dish_df['cleaned_name'] = dish_df['name']

# Merge cleaned dish names back to menu_item_df
menu_item_df = menu_item_df.merge(dish_df[['id', 'cleaned_name']], left_on='dish_id', right_on='id', how='left')
menu_item_df.drop(columns=['id_y'], inplace=True)
menu_item_df.rename(columns={'cleaned_name': 'dish_name', 'id_x': 'id'}, inplace=True)

# Save cleaned datasets
dish_df.to_csv('Cleaned_Dish.csv', index=False)
menu_df.to_csv('Cleaned_Menu.csv', index=False)
menu_item_df.to_csv('Cleaned_MenuItem.csv', index=False)
menu_page_df.to_csv('Cleaned_MenuPage.csv', index=False)

# Final summaries using .describe()
dish_summary = dish_df.describe(include='all')
menu_summary = menu_df.describe(include='all')
menu_item_summary = menu_item_df.describe(include='all')
menu_page_summary = menu_page_df.describe(include='all')

In [19]:
# Generate a summary of the data wrangling process
summary_report = f"""
Data Wrangling Summary:

1. Initial Duplicates in MenuItem: {initial_duplicates}
2. Final Duplicates in MenuItem: {final_duplicates}

3. Date Validation:
   - Dates in Menu and Dish datasets converted to datetime format.
   - Removed nonsensical dates (e.g., future dates).

4. Price Normalization:
   - Price and high_price columns in MenuItem dataset standardized.

5. Handling Missing Values:
   - Removed rows with null values in critical columns.

6. Dish Name Correction:
   - Applied text cleaning for dish name standardization.

Final Data Summaries:
Dish Dataset:
{dish_summary}

Menu Dataset:
{menu_summary}

MenuItem Dataset:
{menu_item_summary}

MenuPage Dataset:
{menu_page_summary}
"""

In [20]:
print(summary_report)


Data Wrangling Summary:

1. Initial Duplicates in MenuItem: 20235
2. Final Duplicates in MenuItem: 0

3. Date Validation:
   - Dates in Menu and Dish datasets converted to datetime format.
   - Removed nonsensical dates (e.g., future dates).

4. Price Normalization:
   - Price and high_price columns in MenuItem dataset standardized.

5. Handling Missing Values:
   - Removed rows with null values in critical columns.

6. Dish Name Correction:
   - Applied text cleaning for dish name standardization.

Final Data Summaries:
Dish Dataset:
                   id             name  description  menus_appeared  \
count   423397.000000           423397          0.0   423397.000000   
unique            NaN           390611          NaN             NaN   
top               NaN  cold roast beef          NaN             NaN   
freq              NaN               12          NaN             NaN   
mean    264456.594900              NaN          NaN        3.060489   
min          1.000000           