In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

# Read the raw CSV from its hard-coded path (the file name contains a space).
df = pd.read_csv('/content/Dataset .csv')

# Strip leading/trailing whitespace from the header labels so that later
# lookups by column name match exactly.
df.columns = df.columns.str.strip()

# Cleaning pipeline, applied in order:
#   1. discard rows whose target ('Aggregate rating') is missing,
#   2. discard exact duplicate rows,
#   3. discard rows that still have a missing value in ANY column.
df = (
    df.dropna(subset=['Aggregate rating'])
    .drop_duplicates()
    .dropna()
)

# Column the models are trained to predict.
target_col = 'Aggregate rating'

# Candidate predictors, kept in this order; any name that is absent from the
# loaded frame is silently skipped so the script still runs on a reduced file.
feature_cols = [
    col
    for col in (
        'Has Table booking',
        'Has Online delivery',
        'Price range',
        'Votes',
        'Is delivering now',
        'Switch to order menu',
        'Average Cost for two',
        'City',
        'Cuisines',
    )
    if col in df.columns
]

# Work on an independent copy of the predictors so later in-place encoding
# does not touch `df`; the target is cast to float.
X = df.loc[:, feature_cols].copy()
y = df[target_col].astype(float)

# Coerce the numeric features FIRST. If either column was read from the CSV as
# text (object dtype), label-encoding it beforehand would replace the real
# amounts with arbitrary category codes and make this conversion a no-op.
# Values that cannot be parsed become NaN and are zero-filled below.
for numeric_col in ('Average Cost for two', 'Price range'):
    if numeric_col in X.columns:
        X[numeric_col] = pd.to_numeric(X[numeric_col], errors='coerce')

# Encode the remaining text (object dtype) columns as integer codes.
# NOTE: the single encoder is re-fitted for every column, so after the loop
# `le` only holds the mapping of the last column it processed.
le = LabelEncoder()
for col in X.select_dtypes(include='object').columns:
    X[col] = le.fit_transform(X[col].astype(str))

X = X.fillna(0)  # fill any remaining missing values (e.g. coerced NaNs)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Regressors to compare, keyed by the label used when reporting results.
# Insertion order determines the order in which they are trained and printed.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Fit every candidate on the training split, then report R² and mean squared
# error measured on the held-out test split.
for name, model in models.items():
    preds = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(
        f"\n=== {name} ===",
        f"R² Score: {r2:.4f}",
        f"MSE      : {mse:.4f}",
        sep="\n",
    )

# Optional: report how much each feature contributed to the fitted random
# forest, listed from most to least important.
rf = models['Random Forest']
importances = rf.feature_importances_

# Label each importance with its feature name, then order descending.
feat_imp = pd.Series(data=importances, index=X.columns)
feat_imp = feat_imp.sort_values(ascending=False)

print("\nRandom Forest Feature Importances:")
print(feat_imp)



=== Linear Regression ===
R² Score: 0.3006
MSE      : 1.6017

=== Decision Tree ===
R² Score: 0.9167
MSE      : 0.1908

=== Random Forest ===
R² Score: 0.9557
MSE      : 0.1014

Random Forest Feature Importances:
Votes                   0.954746
Cuisines                0.015891
Average Cost for two    0.011557
City                    0.011333
Price range             0.002805
Has Online delivery     0.002479
Has Table booking       0.000994
Is delivering now       0.000194
Switch to order menu    0.000000
dtype: float64
