In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

train_url = 'https://github.com/FlipRoboTechnologies/ML-Datasets/raw/main/Restaurant%20Food%20Cost/Data_Train.xlsx'
test_url = 'https://github.com/FlipRoboTechnologies/ML-Datasets/raw/main/Restaurant%20Food%20Cost/Data_Test.xlsx'

train_data = pd.read_excel(train_url)
test_data = pd.read_excel(test_url)
print(train_data.head())
print(train_data.info())
train_data = train_data.dropna()
def extract_hours(time_str):
    if pd.isna(time_str):
        return 0
    opening, closing = time_str.split(' to ')
    opening_hour = int(opening.split()[0])
    closing_hour = int(closing.split()[0])
    return closing_hour - opening_hour

train_data['HOURS'] = train_data['TIME'].apply(extract_hours)
test_data['HOURS'] = test_data['TIME'].apply(extract_hours)
X = train_data.drop(['COST', 'RESTAURANT_ID'], axis=1)
y = train_data['COST']
categorical_cols = ['TITLE', 'CUISINES', 'CITY', 'LOCALITY']
numerical_cols = ['RATING', 'VOTES', 'HOURS']
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100))
])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print('MAE:', mean_absolute_error(y_val, y_pred))

X_test = test_data.drop(['RESTAURANT_ID'], axis=1)
test_predictions = model.predict(X_test)

output = pd.DataFrame({'RESTAURANT_ID': test_data['RESTAURANT_ID'], 'COST': test_predictions})
output.to_csv('predictions.csv', index=False)


               TITLE  RESTAURANT_ID  \
0      CASUAL DINING           9438   
1  CASUAL DINING,BAR          13198   
2      CASUAL DINING          10915   
3        QUICK BITES           6346   
4     DESSERT PARLOR          15387   

                                     CUISINES  \
0                 Malwani, Goan, North Indian   
1              Asian, Modern Indian, Japanese   
2  North Indian, Chinese, Biryani, Hyderabadi   
3                            Tibetan, Chinese   
4                                    Desserts   

                                     TIME     CITY        LOCALITY RATING  \
0  11am – 4pm, 7:30pm – 11:30pm (Mon-Sun)    Thane  Dombivali East    3.6   
1                    6pm – 11pm (Mon-Sun)  Chennai       Ramapuram    4.2   
2     11am – 3:30pm, 7pm – 11pm (Mon-Sun)  Chennai      Saligramam    3.8   
3                 11:30am – 1am (Mon-Sun)   Mumbai     Bandra West    4.1   
4                    11am – 1am (Mon-Sun)   Mumbai     Lower Parel    3.8   

       

ValueError: not enough values to unpack (expected 2, got 1)