In [None]:
#Data loading and cleaning
import pandas as pd

# Load dataset
df = pd.read_csv(r"D:\Swiggy\data\swiggy.csv")
df.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [6]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


(148541, 11)

In [3]:
df.describe()

Unnamed: 0,id
count,148541.0
mean,363466.378912
std,167890.977174
min,211.0
25%,233320.0
50%,412628.0
75%,502223.0
max,581031.0


In [None]:
#Removing duplicates
df = df.drop_duplicates()

In [7]:
# Drop rows with missing values
df = df.dropna(subset=['name', 'rating', 'rating_count', 'cost', 'cuisine', 'address', 'lic_no'])
print("After dropping rows with missing values:", df.shape)

After dropping rows with missing values: (148255, 11)


In [11]:
import numpy as np

#clean rating
df = df[df['rating'] != '--']
df['rating'] = df['rating'].astype(float)

#Clean Cost
df['cost'] = df['cost'].astype(str)
df['cost'] = df['cost'].str.replace('₹', '', regex=False).str.replace(',', '', regex=False)
df['cost'] = df['cost'].astype(int)

#Clean rating count
def cleaned_rc(value):
    value = str(value).strip().lower()
    if 'too few' in value:
        return 0
    elif 'k' in value:
        try:
            num = float(value.split('k')[0])
            return int(num * 1000)
        except:
            return 0
    elif '+' in value:
        try:
            return int(value.split('+')[0].strip())
        except:
            return 0
    else:
        try:
            return int(value)
        except:
            return 0


df['rating_count'] = df['rating_count'].apply(cleaned_rc)

df.reset_index(drop=True, inplace=True)

#Save cleaned data
df.to_csv(r"D:\Swiggy\data\cleaned_data.csv", index=False)

print("Cleaning complete. Cleaned data saved as 'cleaned_data.csv'")


Cleaning complete. Cleaned data saved as 'cleaned_data.csv'


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


# Load the cleaned data
cleaned_df = pd.read_csv(r"D:\Swiggy\data\cleaned_data.csv")

df_encoded = cleaned_df.copy()

# Split multiple cuisines into separate rows
df_encoded['cuisine'] = df_encoded['cuisine'].str.split(', ')
df_encoded = df_encoded.explode('cuisine')

# One-hot encode 'city' and 'cuisine'
df_ohe = pd.get_dummies(df_encoded[['city', 'cuisine']], prefix=['city', 'cuisine'])

# Drop original 'city' and 'cuisine' columns and join encoded columns
df_encoded = df_encoded.drop(columns=['city', 'cuisine'])
df_encoded = pd.concat([df_encoded, df_ohe], axis=1)

# Save the encoded data
df_encoded.to_csv(r"D:\Swiggy\data\encoded_data.csv", index=False)
print(" Encoded data saved successfully!")


 Encoded data saved successfully!


In [13]:
import pickle

# Save the pkl file
encoder = df_encoded.columns.tolist()
with open(r"D:\Swiggy\data\encoder.pkl", 'wb') as f:
    pickle.dump(encoder, f)

print("pkl file saved successfully")

pkl file saved successfully


In [35]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pickle

df_encoded = pd.read_csv(r"D:\Swiggy\data\encoded_data.csv")

df_cluster = df_encoded.drop(columns=['id', 'name', 'lic_no', 'link', 'address', 'menu'])

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster)

#Apply KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to original data
df_encoded['cluster'] = clusters

# Save clustered data
df_encoded.to_csv(r"D:\Swiggy\data\clustered_data.csv", index=False)

with open(r"D:\Swiggy\data\scaler.pkl", 'wb') as f:
    pickle.dump(scaler, f)

with open(r"D:\Swiggy\data\kmeans_model.pkl", 'wb') as f:
    pickle.dump(kmeans, f)

print("KMeans clustering done and saved!")


KMeans clustering done and saved!


In [36]:
import pandas as pd
import numpy as np
import pickle

# Load encoded & clustered data
df_encoded = pd.read_csv(r"D:\Swiggy\data\encoded_data.csv")
df_clustered = pd.read_csv(r"D:\Swiggy\data\clustered_data.csv")

with open(r"D:\Swiggy\data\encoder.pkl", 'rb') as f:
    encoder = pickle.load(f)

with open(r"D:\Swiggy\data\scaler.pkl", 'rb') as f:
    scaler = pickle.load(f)

with open(r"D:\Swiggy\data\kmeans_model.pkl", 'rb') as f:
    kmeans = pickle.load(f)

cleaned_df = pd.read_csv(r"D:\Swiggy\data\cleaned_data.csv")

cleaned_df['cluster'] = df_clustered['cluster']

df_multi = cleaned_df.copy()
df_multi['cuisine'] = df_multi['cuisine'].str.split(',')
df_multi = df_multi.explode('cuisine')
df_multi['cuisine'] = df_multi['cuisine'].str.strip()

# ---------- Recommendation Function ----------
def recommend_restaurants(city, cuisine, rating, cost, top_n=5):
    # Create a single-row input DataFrame matching encoder columns
    input_data = pd.DataFrame(columns=encoder)
    input_data.loc[0] = 0  # initialize with zeros

    input_data.at[0, 'rating'] = rating
    input_data.at[0, 'rating_count'] = 100  # assume average rating count
    input_data.at[0, 'cost'] = cost

    city_col = f'city_{city}'
    cuisine_col = f'cuisine_{cuisine}'

    if city_col in input_data.columns:
        input_data.at[0, city_col] = 1
    if cuisine_col in input_data.columns:
        input_data.at[0, cuisine_col] = 1

    feature_input = input_data.drop(columns=['id', 'name', 'lic_no', 'link', 'address', 'menu', 'cluster'], errors='ignore')
    scaled_input = scaler.transform(feature_input)

    cluster_label = kmeans.predict(scaled_input)[0]

    # Filter similar restaurants
    similar_restaurants = df_multi[df_multi['cluster'] == cluster_label]

    # Sort by rating and get top N unique restaurants
    top_n_results = similar_restaurants.sort_values(by='rating', ascending=False)
    top_n_results = top_n_results.drop_duplicates(subset='name').head(top_n)

    return top_n_results[['name', 'city', 'cuisine', 'rating', 'cost']]



In [37]:

# ✅ Example test
recommendations = recommend_restaurants(city="Chennai", cuisine="Chinese", rating=4.0, cost=300, top_n=5)
print(recommendations)

                                                  name  \
7829   Lubov Patisserie - Cakes, Macarons and Desserts   
31134                                      Momo Street   
31138                           Gobblers Rolls & Bowls   
21837                                    HRX by Eatfit   
46253                                  Nawab'z Kitchen   

                              city       cuisine  rating  cost  
7829   Basaveshwaranagar,Bangalore        Bakery     5.0   500  
31134       Abids & Koti,Hyderabad       Chinese     5.0   300  
31138       Abids & Koti,Hyderabad  North Indian     5.0   300  
21837            Mayur Vihar,Delhi        Salads     5.0   270  
46253                Airoli,Mumbai  North Indian     5.0   450  
