
# Task 1: Predict Restaurant Ratings

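This notebook implements Task 1 of the Machine Learning Internship project. It loads the restaurant dataset, imputes missing values, label-encodes the categorical columns, and trains a Linear Regression model and a Decision Tree regressor to predict each restaurant's aggregate rating. Both models are evaluated with mean squared error and R-squared, and the Decision Tree's most influential features are plotted.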


In [None]:

# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Load the dataset
# The CSV location is configurable: set the RESTAURANT_DATA_PATH environment
# variable to read the file from somewhere else. When the variable is not set,
# the original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get('RESTAURANT_DATA_PATH', r'C:\Users\robis\archive\Dataset .csv')
df = pd.read_csv(DATA_PATH)

# Quick overview of the data
print(df.head())
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())


In [None]:

# --- Data Preprocessing ---
#
# Purpose: rewrite `df` in place so that it contains no missing values and
# every column is numeric, which is what the regressors trained later require.
#
# NOTE(review): the imputers and encoders are fitted on the whole frame, i.e.
# before any train/test split and on every column (including the prediction
# target if it is numeric). Statistics from rows that later end up in the test
# set therefore influence the training features. Consider fitting on the
# training split only - confirm whether this matters for the task.

# 1. Handle missing values
num_imputer = SimpleImputer(strategy='mean')            # numeric -> column mean
cat_imputer = SimpleImputer(strategy='most_frequent')   # text -> column mode

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Impute missing values.
# SimpleImputer raises a ValueError when handed a frame with zero columns, so
# each group is only processed when it is non-empty. This also makes the cell
# safe to re-run: after the first run no object columns remain.
if len(numeric_cols) > 0:
    df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])
if len(categorical_cols) > 0:
    df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# 2. Encode categorical variables
# One encoder per column, kept in `label_encoders`, so the integer codes can be
# mapped back to the original labels later with
# label_encoders[col].inverse_transform(...). A single shared encoder would
# only remember the mapping of the last column it was fitted on.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Check if missing values still exist
print(df.isnull().sum())


In [None]:

# --- Feature Selection ---

# Column the models learn to predict; every other column of `df` is a feature.
target_column = 'Aggregate rating'  # Change if your column name is different!

# NOTE(review): X keeps *all* remaining columns. If the dataset contains
# identifier columns or columns derived from the rating itself, they would
# leak the target into the features - verify against the actual schema.
X = df.drop(target_column, axis=1)
y = df[target_column]

# --- Split data into train and test sets ---
# 80 % of the rows train the models, 20 % are held out for evaluation; the
# fixed seed keeps the split reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:

# --- Model Training ---
# Both estimators are fitted on the same training split. `fit` returns the
# estimator itself, so construction and fitting are chained.

# Linear Regression Model
lr_model = LinearRegression().fit(X_train, y_train)

# Decision Tree Regressor (seeded so repeated runs build the same tree)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Echo the fitted tree as the cell output.
dt_model


In [None]:

# --- Model Evaluation ---

def evaluate_regressor(model, features, targets):
    """Score a fitted regressor on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    features : feature matrix passed to ``model.predict``.
    targets : true target values for the rows in ``features``.

    Returns
    -------
    tuple
        ``(predictions, mse, r2)``: the model's predictions, their mean
        squared error and their R-squared score against ``targets``.
    """
    predictions = model.predict(features)
    mse = mean_squared_error(targets, predictions)
    r2 = r2_score(targets, predictions)
    return predictions, mse, r2


def print_performance(model_name, mse, r2):
    """Print the metrics of one model as a titled block, rounded to 2 decimals."""
    print(f"\n--- {model_name} Performance ---")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")


# Both models are scored on the same test split so the numbers are comparable.
lr_preds, lr_mse, lr_r2 = evaluate_regressor(lr_model, X_test, y_test)
dt_preds, dt_mse, dt_r2 = evaluate_regressor(dt_model, X_test, y_test)

# --- Results ---

print_performance("Linear Regression", lr_mse, lr_r2)
print_performance("Decision Tree Regression", dt_mse, dt_r2)


In [None]:

# --- Feature Importance (Decision Tree) ---
# Pair every importance score of the fitted tree with its feature name and
# keep the ten largest.
feature_importances = pd.Series(dt_model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(10)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
top_features.plot(kind='barh', ax=ax, color='skyblue')
# barh draws the first row at the bottom; nlargest() sorts descending, so the
# axis is flipped to put the most important feature at the top of the chart.
ax.invert_yaxis()
ax.set_title("Top 10 Influential Features (Decision Tree)")
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
plt.show()
