In [None]:
# Step 1: Load the dataset
import pandas as pd
import numpy as np

# Path of the input CSV, resolved relative to the notebook's working directory.
DATA_PATH = 'swiggy_final_versionV2.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Quick sanity check of the loaded frame: column labels first, then the row count.
print(data.columns)
print(data.shape[0])

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu', 'weekly_avg_salary',
       'no_years_open'],
      dtype='object')
148242


In [None]:
# Collect the distinct raw values of the 'cuisine' column, then show how many
# there are followed by the (abbreviated) array itself.
cuisines = data['cuisine'].unique()
print(len(cuisines), cuisines, sep='\n')

2120
['Beverages,Pizzas' 'Sweets,Bakery' 'Beverages' ... 'Biryani,Thai'
 'South American,Andhra' 'Barbecue,Italian-American']


In [None]:
import re

# A cuisine label is considered suspicious when it contains a run of digits or any
# of the listed keywords (matched case-insensitively).
pattern = re.compile(r'\d+|discount|offer|default|delivery|brand', re.IGNORECASE)

# Keep every label with at least one match. str() lets the regex run on entries
# that are not already strings.
weird_values = [label for label in cuisines if pattern.search(str(label))]

print(weird_values)
print(len(weird_values))

[]
0


In [None]:
# TODO: Currently working here
# Checking for weird values in the dataset in city, rating, rating_count, cost,
# cuisine, weekly_avg_salary, no_years_open.


def _count_by_value(frame, column):
    """Group `frame` by `column` and return a frame of each group key with its row count.

    The result has two columns: `column` (the group key) and 'Count'.
    """
    return frame.groupby(column)[column].count().reset_index(name='Count')


# One frequency table per column of interest; the names are used by the export cell.
unique_city = _count_by_value(data, 'city')
unique_rating = _count_by_value(data, 'rating')
unique_rating_count = _count_by_value(data, 'rating_count')
unique_cost = _count_by_value(data, 'cost')
unique_cuisine = _count_by_value(data, 'cuisine')
unique_weekly_avg_salary = _count_by_value(data, 'weekly_avg_salary')
unique_no_years_open = _count_by_value(data, 'no_years_open')

In [None]:
# Write each frequency table to its own CSV (without the index column) so the
# value distributions can be inspected outside the notebook.
count_exports = [
    (unique_city, 'unique_city.csv'),
    (unique_rating, 'unique_rating.csv'),
    (unique_rating_count, 'unique_rating_count.csv'),
    (unique_cost, 'unique_cost.csv'),
    (unique_cuisine, 'unique_cuisine.csv'),
    (unique_weekly_avg_salary, 'unique_weekly_avg_salary.csv'),
    (unique_no_years_open, 'unique_no_years_open.csv'),
]
for counts_frame, csv_name in count_exports:
    counts_frame.to_csv(csv_name, index=False)

In [None]:
# Step 2: Data Preprocessing
# Columns dropped before modelling; the remaining columns become the features/target.
columns_to_remove = ['id', 'name', 'lic_no', 'link', 'address', 'menu']
# `data` is rebound to the reduced frame, so on a second execution of this cell the
# columns are already gone. errors='ignore' keeps the cell re-runnable instead of
# raising KeyError in that situation.
data = data.drop(columns=columns_to_remove, errors='ignore')
# Convert both string feature columns to the pandas 'category' dtype in one pass.
data = data.astype({'cuisine': 'category', 'city': 'category'})

In [None]:
# Show the first five rows of the two categorical columns, then the columns that
# remain after preprocessing.
print(data.loc[:, ['cuisine', 'city']].head())
print(data.columns)

                      cuisine    city
0            Beverages,Pizzas  Abohar
1               Sweets,Bakery  Abohar
2                   Beverages  Abohar
3            Fast Food,Indian  Abohar
4  Italian-American,Fast Food  Abohar
Index(['city', 'rating', 'rating_count', 'cost', 'cuisine',
       'weekly_avg_salary', 'no_years_open'],
      dtype='object')


In [None]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Target: the restaurant rating. Features: every other column, with the two
# categorical columns expanded into one indicator column per distinct value.
y = data['rating']
X = pd.get_dummies(data.drop(columns='rating'), columns=['cuisine', 'city'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Number of feature columns after one-hot encoding.
print(X_train.shape[1])

2945


In [None]:
# Step 3: Define XGBoost parameters
import xgboost as xgb
# Hyperparameters handed to xgb.XGBRegressor(**params).
# n_estimators was raised from 10 to 100: with learning_rate=0.1, ten boosting
# rounds leave the ensemble badly underfit -- the evaluation recorded in this
# notebook for that setting reported a negative R-squared, i.e. worse than
# predicting the mean rating.
# NOTE(review): metric and prediction outputs saved in the notebook were produced
# with the old value; re-run the cells below to refresh them.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100,
}

In [None]:

# Step 4: Train the model
# Build the regressor from the hyperparameter dict, then fit it on the training split.
model = xgb.XGBRegressor(**params)
model.fit(X=X_train, y=y_train)

In [None]:

# Step 5: Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions for the held-out rows, compared against the true ratings below.
y_pred = model.predict(X_test)

print('MAE:', mean_absolute_error(y_test, y_pred))
# RMSE is taken as the square root of the MSE rather than via `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and later removed, so passing it
# raises TypeError on current releases. This form works on old and new versions.
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R-squared:', r2_score(y_test, y_pred))

MAE: 0.7837067008106143
RMSE: 1.186182856527383
R-squared: -0.03925034751473566


In [None]:

# Step 6: Predict ratings
# Use the trained XGBoost model to predict the rating of a restaurant based on its features.
new_data = pd.DataFrame({
    'city': ['Abohar'],
    'rating_count': [1.0],
    'cost': [200.0],
    'cuisine': ['Beverages,Pizzas'],
    'weekly_avg_salary': [4529.75],
    'no_years_open': [1],
})

# One-hot encode the single row, then align it to the training feature layout:
# reindex() puts the columns in the training order and fills every indicator
# column this row did not produce with 0.
new_data_encoded = (
    pd.get_dummies(new_data, columns=['cuisine', 'city'])
    .reindex(columns=X_train.columns, fill_value=0)
)

rating = model.predict(new_data_encoded)
print('Predicted Rating:', rating[0])

Predicted Rating: 1.5029217
