In [1]:
#Data loading and cleaning
import pandas as pd

# Load dataset
file_path = r"D:\Swiggy\data\swiggy.csv"
df = pd.read_csv(file_path)

# Drop rows with missing critical fields
df = df.dropna(subset=['name', 'rating', 'cost', 'cuisine'])

# Remove rows with invalid ratings (e.g., '--')
df = df[df['rating'] != '--']

# Convert 'rating' to float
df['rating'] = df['rating'].astype(float)

# Clean 'cost': remove ₹ symbol and commas
df['cost'] = df['cost'].str.replace('₹', '', regex=False).str.replace(',', '', regex=False)

# Convert cost to integer, drop rows with errors
df['cost'] = pd.to_numeric(df['cost'], errors='coerce')
df = df.dropna(subset=['cost'])
df['cost'] = df['cost'].astype(int)

# Reset index
df.reset_index(drop=True, inplace=True)

# Save cleaned data
df.to_csv(r"D:\Swiggy\data\cleaned_data.csv", index=False)

print("✅ Cleaning complete. Cleaned data saved as 'cleaned_data.csv'")
print(f"Shape of cleaned data: {df.shape}")


✅ Cleaning complete. Cleaned data saved as 'cleaned_data.csv'
Shape of cleaned data: (61421, 11)


In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import pickle

# Load the cleaned data
cleaned_df = pd.read_csv(r"D:\Swiggy\data\cleaned_data.csv")

# STEP 1: Expand 'cuisine' column into multiple values
# Split comma-separated cuisines into list
cleaned_df['cuisine'] = cleaned_df['cuisine'].str.split(',')

# Trim whitespace from each cuisine
cleaned_df['cuisine'] = cleaned_df['cuisine'].apply(lambda x: [i.strip() for i in x])

# Create cuisine dummy variables
cuisine_dummies = cleaned_df['cuisine'].explode().str.get_dummies().groupby(level=0).sum()

# STEP 2: One-hot encode the 'city' column
city_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # <-- updated line
city_encoded = city_encoder.fit_transform(cleaned_df[['city']])
city_df = pd.DataFrame(city_encoded, columns=city_encoder.get_feature_names_out(['city']))


# STEP 3: Combine numerical + encoded data
encoded_df = pd.concat([
    cleaned_df[['rating', 'cost']].reset_index(drop=True),
    city_df.reset_index(drop=True),
    cuisine_dummies.reset_index(drop=True)
], axis=1)

# Save encoded data
encoded_df.to_csv(r"D:\Swiggy\data\encoded_data.csv", index=False)

# Save encoder for city (if needed later in app)
with open(r"D:\Swiggy\data\encoder.pkl", 'wb') as f:
    pickle.dump(city_encoder, f)

print("✅ Encoding complete. Saved to 'encoded_data.csv'")
print(f"Shape of encoded data: {encoded_df.shape}")


✅ Encoding complete. Saved to 'encoded_data.csv'
Shape of encoded data: (61421, 895)


In [4]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load cleaned and encoded data
cleaned_df = pd.read_csv(r"D:\Swiggy\data\cleaned_data.csv")
encoded_df = pd.read_csv(r"D:\Swiggy\data\encoded_data.csv")

# Load encoder
with open(r"D:\Swiggy\data\encoder.pkl", 'rb') as f:
    city_encoder = pickle.load(f)

# Recommendation function
def recommend_restaurants(city, cuisines, min_rating, max_cost, top_n=5):
    # 1. Encode city
    city_encoded = city_encoder.transform([[city]])
    city_df = pd.DataFrame(city_encoded, columns=city_encoder.get_feature_names_out(['city']))
    
    # 2. Encode cuisines
    # Get cuisine columns from encoded_df
    all_cuisine_columns = [col for col in encoded_df.columns if col not in ['rating', 'cost'] and not col.startswith('city_')]
    cuisine_vec = [1 if cuisine in cuisines else 0 for cuisine in all_cuisine_columns]
    cuisine_df = pd.DataFrame([cuisine_vec], columns=all_cuisine_columns)
    
    # 3. Combine input features
    input_features = pd.DataFrame({
        'rating': [min_rating],
        'cost': [max_cost]
    })
    user_vector = pd.concat([input_features, city_df, cuisine_df], axis=1)
    
    # 4. Match column order
    user_vector = user_vector[encoded_df.columns]
    
    # 5. Compute cosine similarity
    similarities = cosine_similarity(user_vector, encoded_df)[0]
    top_indices = similarities.argsort()[::-1][:top_n]
    
    # 6. Get results from original cleaned data
    recommended = cleaned_df.iloc[top_indices][['name', 'city', 'rating', 'cost', 'cuisine', 'address']]
    return recommended

# 🔍 Example usage:
results = recommend_restaurants(
    city="Chennai",
    cuisines=["South Indian", "Biryani"],
    min_rating=4.0,
    max_cost=300,
    top_n=5
)

print("📌 Top Recommended Restaurants:\n")
print(results)




📌 Top Recommended Restaurants:

                              name                       city  rating  cost  \
48893                    Food Land                  North-goa     4.8   400   
18898         Royal Biryani Darbar          Mogappair,Chennai     4.3   350   
9432   RNR Biryani - Taste of 1953  Electronic City,Bangalore     4.3   350   
8527   RNR Biryani - Taste of 1953  Kanakapura Road,Bangalore     4.3   350   
4041            AMBUR STAR BIRYANI      Koramangala,Bangalore     4.1   330   

                    cuisine                                            address  
48893  South Indian,Biryani  Food Land, Baga Beach,Bardez.North Goa 403516,...  
18898  Biryani,South Indian  Royal Biryani Darbar, Plot No 152BRam Nagar si...  
9432   Biryani,South Indian  RNR Biryani - Taste of 1953, Taste of 1953, No...  
8527   Biryani,South Indian  RNR Biryani - Taste of 1953, Rainbow Centrum, ...  
4041   Biryani,South Indian  AMBUR STAR BIRYANI, 104,JATTI BUILDING,1ST MAI...  
