In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Swiggy menu data preparation: load the raw CSV, clean it, engineer an
# input-only feature, and build scaled/encoded train and test sets whose
# preprocessing statistics come from the training rows only.

# Load dataset
data = pd.read_csv("Swiggy.csv", header=None)  # Load without header

# Assign column names based on the structure observed in the data.head() output
data.columns = [
    "State",
    "City",
    "Restaurant Name",
    "Location",
    "Category",
    "Dish Name",
    "Price (INR)",
    "Rating",
    "Rating Count",
]
print(data.head())

# 1. Data Cleaning
# Strip thousands separators (e.g. "1,200") before converting Price to float.
data["Price (INR)"] = data["Price (INR)"].replace(",", "", regex=True).astype(float)

# The first record is read with State "elhi" while its City is "New Delhi" and
# the following rows say "Delhi"; left alone it becomes its own one-hot
# category. Exact-value replacement, so other states are untouched.
# NOTE(review): looks like the file lost its first character - confirm and fix
# the CSV itself if possible.
data["State"] = data["State"].replace("elhi", "Delhi")

# 2. Feature Engineering
# Engineered features must be built from inputs only. Deriving a feature from
# "Price (INR)" (the target) would hand the model the answer. This one weights
# the rating by how many votes support it; log1p damps very large counts and
# maps a count of 0 to 0. It is created before preprocessing so that missing
# values are imputed and the column is scaled like the other numeric inputs.
data["Weighted Rating"] = data["Rating"] * np.log1p(data["Rating Count"])

# Define features
num_features = ["Rating", "Rating Count", "Weighted Rating"]
cat_features = ["State", "City", "Restaurant Name", "Location", "Category", "Dish Name"]
target_features = ["Price (INR)"]

# Preprocessing for numerical features
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Preprocessing for categorical features
# Dense output is required because the result is requested as a DataFrame.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Combine preprocessing steps for Input Data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)
preprocessor.set_output(transform="pandas")

# Output Data (Target)
# NOTE(review): the mean imputer also fills missing prices, i.e. it invents
# labels for rows without a target - consider dropping those rows instead.
# Predictions made on this scaled target must be inverse-transformed.
preprocessor_out = ColumnTransformer(
    transformers=[
        ("num", num_transformer, target_features),
    ]
)
preprocessor_out.set_output(transform="pandas")

# 3. Data Splitting
# Choose the train/test rows BEFORE fitting anything, so that imputation
# values, scaling statistics and the one-hot vocabulary are learned from the
# training rows alone and the test set stays unseen.
train_rows, test_rows = train_test_split(data, test_size=0.2, random_state=42)

preprocessor.fit(train_rows)
preprocessor_out.fit(train_rows)

# Apply the train-fitted transformations to every row. Categories that occur
# only in test rows are encoded as all zeros (handle_unknown="ignore").
data_preprocessed = preprocessor.transform(data)
print(data_preprocessed.head())

data_preprocessed_out = preprocessor_out.transform(data)
print(data_preprocessed_out.head())

X = data_preprocessed
y = data_preprocessed_out

# The transformed frames keep the original row index, so the partition chosen
# above can be looked up by label.
X_train, X_test = X.loc[train_rows.index], X.loc[test_rows.index]
y_train, y_test = y.loc[train_rows.index], y.loc[test_rows.index]

# Display processed training data
print(X_train.head())
print(y_train.head())

   State       City Restaurant Name   Location    Category  \
0   elhi  New Delhi     Sagar Ratna  Sector 61      Thalis   
1  Delhi  New Delhi     Sagar Ratna  Sector 61      Thalis   
2  Delhi  New Delhi     Sagar Ratna  Sector 61      Thalis   
3  Delhi  New Delhi     Sagar Ratna  Sector 61  Meal Combo   
4  Delhi  New Delhi     Sagar Ratna  Sector 61  Meal Combo   

                Dish Name  Price (INR)  Rating  Rating Count  
0            Deluxe Thali        410.0     4.6         349.0  
1         Executive Thali        375.0     4.0         139.0  
2     Sagar Special Thali        350.0     4.6         180.0  
3  Rava Masala Dosa Combo        365.0     0.0           0.0  
4       Masala Dosa Combo        330.0     4.8          10.0  
   num__Rating  num__Rating Count  cat__State_Delhi  cat__State_Telangana  \
0     0.967794           3.271411               0.0                   0.0   
1     0.685677           1.107966               1.0                   0.0   
2     0.967794    