# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the Dataset

In [2]:
# Read the raw CSV into a DataFrame.
# Note: the file name really does contain a space before ".csv".
dataset_path = "Dataset .csv"
df = pd.read_csv(dataset_path)

In [5]:
# Display a few random rows to sanity-check the data.
# The seed is pinned (same value the split and the models use) so that
# "Restart Kernel -> Run All" shows the same five rows every time.
df.sample(5, random_state=42)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
3489,9883,Aggarwal Eating Point,1,New Delhi,"3-5, Mini DDA Market, Amrit Puri B, Garhi, Eas...",East of Kailash,"East of Kailash, New Delhi",77.253708,28.556926,"Street Food, Mithai",...,Indian Rupees(Rs.),No,No,No,No,1,0.0,White,Not rated,2
5031,7317,Bikaner,1,New Delhi,"L-10, Near IGI Airport, Mahipalpur, New Delhi",Mahipalpur,"Mahipalpur, New Delhi",77.124921,28.54646,"Mithai, North Indian, South Indian, Chinese, S...",...,Indian Rupees(Rs.),No,No,No,No,2,2.9,Orange,Average,6
133,17294565,Nacho Mamas Burritos,216,Augusta,"976 Broad St, Augusta, GA 30901",Augusta,"Augusta, Augusta",-81.9694,33.4764,Mexican,...,Dollar($),No,No,No,No,1,4.0,Green,Very Good,380
8788,304484,Chauhan Hotel,1,Noida,"Main Road, Opposite Sector 51, Sector 52, Noida",Sector 52,"Sector 52, Noida",77.367691,28.583308,North Indian,...,Indian Rupees(Rs.),No,No,No,No,1,2.9,Orange,Average,11
6182,300337,Hot Spot Roll Corner,1,New Delhi,"G 68, Satyam Tower, Near Post Office, Paschim ...",Paschim Vihar,"Paschim Vihar, New Delhi",77.101517,28.669808,Fast Food,...,Indian Rupees(Rs.),No,No,No,No,1,3.2,Orange,Average,34


# Data Cleaning

In [6]:
# Columns removed before modelling. "Rating color" / "Rating text" appear to
# restate the target rating (see the sampled rows) - TODO confirm; the rest are
# identifiers, free-text location fields, or other columns not used as features.
drop_cols = [
    "Restaurant ID",
    "Restaurant Name",
    "Address",
    "Locality",
    "Locality Verbose",
    "Rating color",
    "Rating text",
    "Currency",
    "Switch to order menu",
]
# drop() returns a new frame, so the raw `df` stays untouched.
df_cleaned = df.drop(drop_cols, axis=1)

In [7]:
# Show the columns that remain after the drop above
df_cleaned.columns

Index(['Country Code', 'City', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Has Table booking', 'Has Online delivery',
       'Is delivering now', 'Price range', 'Aggregate rating', 'Votes'],
      dtype='object')

In [8]:
# Handle missing values: discard every row that has at least one missing entry
df_cleaned = df_cleaned.dropna(axis=0, how="any")

# Data Preprocessing

In [10]:
# Encode categorical variables as integer codes.
categorical_cols = ["Cuisines", "Has Table booking", "Has Online delivery", "Is delivering now", "City"]

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original labels later.
label_encoders = {name: LabelEncoder().fit(df_cleaned[name]) for name in categorical_cols}

# Overwrite each categorical column with its integer codes.
for col, le in label_encoders.items():
    df_cleaned[col] = le.transform(df_cleaned[col])

In [11]:
# Separate the target (y) from the features (X)
target_col = "Aggregate rating"
y = df_cleaned[target_col]
X = df_cleaned.drop(target_col, axis=1)  # every remaining column is a feature

In [12]:
# Hold out 20% of the rows for testing and train on the other 80%.
# The fixed random_state makes the split identical on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Modeling Phase

In [13]:
# Candidate regressors keyed by display name.
# The tree-based models are seeded so their results are repeatable.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
])

In [14]:
# Fit each candidate on the training split and score it on the held-out split.
# `results` maps model name -> {"RMSE": ..., "R² Score": ...}.
results = {}
for name, model in models.items():
    # fit() returns the fitted estimator, so predict() can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the mean squared error
    r2 = r2_score(y_test, y_pred)
    results.update({name: {"RMSE": rmse, "R² Score": r2}})

In [15]:
# Print one summary line per model, metrics rounded to three decimals.
# The loop variable is deliberately not called `model`: that name is still
# bound to the last fitted estimator from the training cell, and rebinding it
# to a string here would break any later cell that calls `model.predict(...)`.
for model_name, metrics in results.items():
    print(f"{model_name}: RMSE = {metrics['RMSE']:.3f}, R² Score = {metrics['R² Score']:.3f}")

Linear Regression: RMSE = 1.234, R² Score = 0.335
Decision Tree: RMSE = 0.428, R² Score = 0.920
Random Forest: RMSE = 0.296, R² Score = 0.962
