## Import Libraries

In [1]:
import ast
import os

import numpy as np
import pandas as pd

In [4]:
# Read the full CSV, then keep a reproducible 30% random sample of its rows.
# Note: `df` is rebound to the sample, so the shape printed below is the sample's.
df = pd.read_csv(r'../data/zomato.csv')
df = df.sample(frac=0.3, random_state=42)
print(f"Original dataset shape: {df.shape}")


Original dataset shape: (15515, 17)


In [5]:
# Per-column count and percentage of missing values, worst columns first
print("\n MISSING VALUES ANALYSIS ")
missing_data = df.isna().sum()
missing_percentage = (missing_data / len(df)) * 100

# Place the two Series side by side; `keys` supplies the column names
missing_df = pd.concat(
    [missing_data, missing_percentage],
    axis=1,
    keys=['Missing_Count', 'Missing_Percentage'],
).sort_values(by='Missing_Percentage', ascending=False)
print(missing_df)


 MISSING VALUES ANALYSIS 
                             Missing_Count  Missing_Percentage
dish_liked                            8426           54.308733
rate                                  2347           15.127296
phone                                  389            2.507251
approx_cost(for two people)            115            0.741218
rest_type                               84            0.541412
cuisines                                15            0.096681
location                                 8            0.051563
url                                      0            0.000000
address                                  0            0.000000
votes                                    0            0.000000
name                                     0            0.000000
online_order                             0            0.000000
book_table                               0            0.000000
reviews_list                             0            0.000000
menu_item                   

### Comprehensive Data Cleaning

In [6]:
# Work on an independent deep copy so the sampled frame `df` stays untouched
df_clean = df.copy(deep=True)

# 1. Clean rating column
def clean_rating(rate):
    """Convert a raw rating value into a float score.

    Parameters
    ----------
    rate : str, number or missing
        Raw rating; the text before the first ``'/'`` is parsed (e.g.
        ``'4.1/5'`` gives ``4.1``). The placeholders ``'NEW'``, ``'-'`` and
        ``'nan'`` are treated as "no rating".

    Returns
    -------
    float
        The parsed score, or ``np.nan`` when the value is missing, a
        placeholder, or not parseable as a number.
    """
    if pd.isna(rate):
        return np.nan
    # Go through str() so an already-numeric rating is parsed instead of
    # being discarded for lacking a .split() method.
    text = str(rate).strip()
    if text in ('NEW', '-', 'nan', ''):
        return np.nan
    try:
        return float(text.split('/')[0])
    except ValueError:
        # Only an unparseable number is expected here; anything else should surface.
        return np.nan
# Parse every raw `rate` entry into the numeric `rating` column
df_clean['rating'] = df_clean['rate'].map(clean_rating)

# 2. Clean cost column
def clean_cost(cost):
    """Convert a raw cost value into a float.

    Parameters
    ----------
    cost : str, number or missing
        Raw cost; commas and the rupee sign are removed before parsing
        (e.g. ``'1,200'`` gives ``1200.0``).

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the value is missing or not
        parseable as a number.
    """
    if pd.isna(cost):
        return np.nan
    # Remove commas and currency symbols
    cost_str = str(cost).replace(',', '').replace('₹', '').strip()
    try:
        return float(cost_str)
    except ValueError:
        # Only an unparseable number is expected here; anything else should surface.
        return np.nan
df_clean['cost'] = df_clean['approx_cost(for two people)'].map(clean_cost)

# Impute gaps: rating and cost fall back to their column median, votes to zero
rating_median = df_clean['rating'].median()
cost_median = df_clean['cost'].median()
df_clean['rating'] = df_clean['rating'].fillna(rating_median)
df_clean['cost'] = df_clean['cost'].fillna(cost_median)
df_clean['votes'] = df_clean['votes'].fillna(0)

print(f"After cleaning - Rating nulls: {df_clean['rating'].isnull().sum()}, Cost nulls: {df_clean['cost'].isnull().sum()}")
# Clean and process cuisines
def process_cuisines(cuisines):
    """Return the cuisines value lower-cased and stripped, or 'not_specified' when it is missing."""
    return 'not_specified' if pd.isna(cuisines) else str(cuisines).lower().strip()

df_clean['cuisines_clean'] = df_clean['cuisines'].map(process_cuisines)

# Clean location: missing -> 'unknown', then lower-case and trim
location_filled = df_clean['location'].fillna('unknown')
df_clean['location_clean'] = location_filled.str.lower().str.strip()

# Clean restaurant type: missing -> 'not_specified', then lower-case and trim
rest_type_filled = df_clean['rest_type'].fillna('not_specified')
df_clean['rest_type_clean'] = rest_type_filled.str.lower().str.strip()

# Process dishes liked
def process_dishes(dishes):
    """Normalise a raw "dishes liked" value into a clean comma-separated string.

    Parameters
    ----------
    dishes : str or missing
        Raw list of dish names separated by ``','`` or ``';'``.

    Returns
    -------
    str
        Lower-cased, trimmed dish names joined with ``', '``, or
        ``'no_dishes'`` when the value is missing, empty, or contains only
        separators/whitespace.
    """
    if pd.isna(dishes) or dishes == '':
        return 'no_dishes'
    # Treat ';' as an alternative separator, then drop empty fragments
    fragments = str(dishes).replace(';', ',').split(',')
    trimmed = (fragment.strip() for fragment in fragments)
    dish_list = [dish.lower() for dish in trimmed if dish]
    return ', '.join(dish_list) if dish_list else 'no_dishes'

# Normalise every raw `dish_liked` entry into the `dishes_clean` column
df_clean['dishes_clean'] = df_clean['dish_liked'].map(process_dishes)

print("Text features processed")

After cleaning - Rating nulls: 0, Cost nulls: 0
Text features processed


In [7]:
def create_restaurant_text(row):
    """Build the single descriptive string used for embedding a restaurant.

    Each cleaned text field is emitted as ``label: value`` unless it still
    holds its "missing" placeholder; available services are appended last.
    Segments are joined with ``' | '``; if nothing is available the result is
    ``'no information available'``.
    """
    # (label, column, placeholder meaning "no data"), in output order
    field_specs = (
        ('cuisines', 'cuisines_clean', 'not_specified'),
        ('type', 'rest_type_clean', 'not_specified'),
        ('location', 'location_clean', 'unknown'),
        ('popular dishes', 'dishes_clean', 'no_dishes'),
    )
    segments = [
        f"{label}: {row[column]}"
        for label, column, placeholder in field_specs
        if row[column] != placeholder
    ]

    # Service flags: only an exact 'Yes' counts as available
    service_specs = (
        ('online_order', 'online ordering available'),
        ('book_table', 'table booking available'),
    )
    offered = [text for column, text in service_specs if row[column] == 'Yes']
    if offered:
        segments.append(f"services: {', '.join(offered)}")

    if not segments:
        return 'no information available'
    return ' | '.join(segments)

# Build the embedding text one row at a time
df_clean['embedding_text'] = df_clean.apply(create_restaurant_text, axis='columns')


### Numerical Feature Engineering

In [8]:
def min_max_scale(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    When every value is identical the range is zero; in that case all zeros
    are returned instead of the NaNs a 0/0 division would produce.
    """
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return values * 0.0
    return (values - lowest) / span


# Create normalized numerical features
df_clean['rating_normalized'] = df_clean['rating'] / 5.0  # Scale rating by its divisor of 5
df_clean['cost_normalized'] = min_max_scale(df_clean['cost'])  # Min-max normalization
df_clean['votes_log'] = np.log1p(df_clean['votes'])  # Log transform votes
df_clean['votes_normalized'] = min_max_scale(df_clean['votes_log'])

print("Numerical features normalized")

Numerical features normalized


### Select features for embedding

In [9]:
# Select the features needed for embedding and drop the `_clean` suffixes
embedding_data = (
    df_clean
    .loc[:, [
        'name', 'address', 'location_clean', 'cuisines_clean',
        'rest_type_clean', 'embedding_text', 'rating', 'cost',
        'rating_normalized', 'cost_normalized', 'votes_normalized',
        'online_order', 'book_table', 'votes',
    ]]
    .copy()
    .rename(columns={
        'location_clean': 'location',
        'cuisines_clean': 'cuisines',
        'rest_type_clean': 'restaurant_type',
    })
)

print(f"Embedding data shape before cleaning: {embedding_data.shape}")



Embedding data shape before cleaning: (15515, 14)


In [10]:
# Keep only rows where both critical columns are present
has_required = embedding_data[['name', 'embedding_text']].notna().all(axis=1)
embedding_data = embedding_data[has_required]

print(f"Final embedding dataset shape: {embedding_data.shape}")

Final embedding dataset shape: (15515, 14)


In [11]:
# Sanity check: per-column null counts, then one example of the text to be embedded
print("Final data check:")
print("Null values in each column:")
print(embedding_data.isna().sum())

print("\nSample embedding text:")
first_text = embedding_data['embedding_text'].iloc[0]
print(first_text)

Final data check:
Null values in each column:
name                 0
address              0
location             0
cuisines             0
restaurant_type      0
embedding_text       0
rating               0
cost                 0
rating_normalized    0
cost_normalized      0
votes_normalized     0
online_order         0
book_table           0
votes                0
dtype: int64

Sample embedding text:
cuisines: oriya, fast food | type: quick bites | location: btm | popular dishes: rasgulla, mutton kosha, chicken kasha, samosa chaat, kheer, veg thali | services: online ordering available


In [12]:
# Output location, defined once so the directory, the file and the log
# message cannot drift apart. (`os` is imported in the notebook's import cell.)
OUTPUT_DIR = '../processed_data'
OUTPUT_PATH = f'{OUTPUT_DIR}/restaurants_for_embedding.csv'

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save the processed data
embedding_data.to_csv(OUTPUT_PATH, index=False)

print(f"Data saved to '{OUTPUT_PATH}'")
print(f"Ready for embedding with {len(embedding_data)} restaurants")


# Display final summary
NUMERIC_SUMMARY_COLUMNS = [
    'rating', 'cost', 'votes',
    'rating_normalized', 'cost_normalized', 'votes_normalized',
]
print("\n=== FINAL DATASET SUMMARY ===")
print(f"Total restaurants: {len(embedding_data)}")
print(f"Columns: {list(embedding_data.columns)}")
print("\nNumerical features summary:")
print(embedding_data[NUMERIC_SUMMARY_COLUMNS].describe())


# Preview the final data
print("\n=== DATA PREVIEW ===")
embedding_data.head()

Data saved to '../processed_data/restaurants_for_embedding.csv'
Ready for embedding with 15515 restaurants

=== FINAL DATASET SUMMARY ===
Total restaurants: 15515
Columns: ['name', 'address', 'location', 'cuisines', 'restaurant_type', 'embedding_text', 'rating', 'cost', 'rating_normalized', 'cost_normalized', 'votes_normalized', 'online_order', 'book_table', 'votes']

Numerical features summary:
             rating          cost         votes  rating_normalized  \
count  15515.000000  15515.000000  15515.000000       15515.000000   
mean       3.704705    554.779826    286.243764           0.740941   
std        0.394439    431.608506    777.574957           0.078888   
min        1.800000     40.000000      0.000000           0.360000   
25%        3.500000    300.000000      7.000000           0.700000   
50%        3.700000    400.000000     42.000000           0.740000   
75%        3.900000    700.000000    201.000000           0.780000   
max        4.900000   4100.000000  16832.

Unnamed: 0,name,address,location,cuisines,restaurant_type,embedding_text,rating,cost,rating_normalized,cost_normalized,votes_normalized,online_order,book_table,votes
8440,Kalingas,"399, 16th Main, N.S Palya, BTM, Bangalore",btm,"oriya, fast food",quick bites,"cuisines: oriya, fast food | type: quick bites...",3.9,250.0,0.78,0.051724,0.533646,Yes,No,179
23575,Angel Restaurant,"1st Cross, Bilekahalli, Near IIMB College, Ban...",bannerghatta road,"kerala, south indian, north indian",quick bites,"cuisines: kerala, south indian, north indian |...",2.8,450.0,0.56,0.100985,0.525325,Yes,No,165
16711,Cafe Talkhouse,"4rd floor, 7th Cross, Opposite to BMTC Bus Dep...",hsr,north indian,quick bites,cuisines: north indian | type: quick bites | l...,3.8,400.0,0.76,0.08867,0.326587,No,No,23
42588,Paratha Xpress,"17/1 Cambridge Road, Opposite The Frank Antony...",ulsoor,"north indian, chinese",quick bites,"cuisines: north indian, chinese | type: quick ...",3.5,200.0,0.7,0.039409,0.225794,No,No,8
5266,Classic Lassi Shop,"Skywalk, 5/1, Near Body Craft, Assaye Road, Ul...",ulsoor,"beverages, ice cream",beverage shop,"cuisines: beverages, ice cream | type: beverag...",3.5,150.0,0.7,0.027094,0.388876,Yes,No,43
