In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("zomato_clean.csv")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45584 entries, 0 to 45583
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Delivery_person_Age               45584 non-null  float64
 1   Delivery_person_Ratings           45584 non-null  float64
 2   Restaurant_latitude               45584 non-null  float64
 3   Restaurant_longitude              45584 non-null  float64
 4   Delivery_location_latitude        45584 non-null  float64
 5   Delivery_location_longitude       45584 non-null  float64
 6   Road_traffic_density              45584 non-null  int64  
 7   Vehicle_condition                 45584 non-null  int64  
 8   multiple_deliveries               45584 non-null  float64
 9   Festival                          45584 non-null  int64  
 10  City                              45584 non-null  int64  
 11  Time_taken (min)                  45584 non-null  int64  
 12  Deli

In [4]:
selected_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Time_Orderd_Hour',
                'Time_Orderd_Minute',
    'Time_Order_picked_Hour',
                'Time_Order_picked_Min',
    'Weather_conditions_Cloudy',
                'Weather_conditions_Fog',
                'Weather_conditions_Sandstorms',
                'Weather_conditions_Stormy',
                'Weather_conditions_Sunny',
                'Weather_conditions_Windy',
    'multiple_deliveries',
    'Festival',
                'City',
    'Road_traffic_density',
    'Vehicle_condition',
                'Type_of_vehicle_electric_scooter',
    'Type_of_vehicle_motorcycle',
    'Type_of_vehicle_scooter',
    'distance',
    'Time_taken (min)'
]

# Creating a new DataFrame with the selected columns
new_data_frame = data[selected_columns]

In [5]:
#Divide the dataset into train and test dataset
from sklearn.model_selection import train_test_split

In [6]:
#Divide the dataset into independent and dependent dataset
X = new_data_frame.iloc[:,:-1] #Independent features
y = new_data_frame.iloc[:,-1]  #Dependent or Target feature

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.32, random_state=42)

In [8]:
# Standardize the data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
from catboost import CatBoostRegressor
cat = CatBoostRegressor()
cat.fit(x_train,y_train)

Learning rate set to 0.070438
0:	learn: 8.9615842	total: 156ms	remaining: 2m 36s
1:	learn: 8.5939048	total: 162ms	remaining: 1m 20s
2:	learn: 8.2688551	total: 168ms	remaining: 55.9s
3:	learn: 7.9663612	total: 175ms	remaining: 43.6s
4:	learn: 7.7222740	total: 181ms	remaining: 36s
5:	learn: 7.4784685	total: 188ms	remaining: 31.1s
6:	learn: 7.2417663	total: 194ms	remaining: 27.6s
7:	learn: 7.0511812	total: 204ms	remaining: 25.3s
8:	learn: 6.8303045	total: 211ms	remaining: 23.2s
9:	learn: 6.6799548	total: 222ms	remaining: 22s
10:	learn: 6.4894956	total: 230ms	remaining: 20.7s
11:	learn: 6.3380221	total: 236ms	remaining: 19.4s
12:	learn: 6.2174347	total: 243ms	remaining: 18.5s
13:	learn: 6.0875142	total: 249ms	remaining: 17.5s
14:	learn: 5.9934856	total: 256ms	remaining: 16.8s
15:	learn: 5.8661901	total: 261ms	remaining: 16.1s
16:	learn: 5.7655637	total: 267ms	remaining: 15.4s
17:	learn: 5.6377679	total: 274ms	remaining: 14.9s
18:	learn: 5.5220760	total: 282ms	remaining: 14.5s
19:	learn: 5.

<catboost.core.CatBoostRegressor at 0x21fdb31beb0>

In [10]:
y_pred = cat.predict(x_test)

In [11]:
import plotly.graph_objs as go
#Conclusion :
# 1] The regression model appears to capture the overall trend of the data but may struggle with certain data points, leading to larger prediction errors.
# 2] The presence of outliers suggests that the model's performance could be improved either by addressing these outliers or by using a more robust regression technique.
# 3] The jagged nature of the regression line indicates a close fit to the data points, which might be an issue if the model is overfitting.

# Assuming you have y_test and y_pred defined

# Assuming y_test and y_pred are already defined with 30 samples
y_test_sample = y_test[:50]
y_pred_sample = y_pred[:50]

# Convert range to list for x-axis
x_values = list(range(len(y_test_sample)))

# Create traces for actual and predicted data points
trace_actual = go.Scatter(x=x_values, y=y_test_sample, mode='markers', name='Actual', marker=dict(color='green'))
trace_predicted = go.Scatter(x=x_values, y=y_pred_sample, mode='markers', name='Predicted', marker=dict(color='red'))

# Create a trace for the regression line
trace_regression = go.Scatter(x=x_values, y=y_pred_sample, mode='lines', name='Regression line', line=dict(color='blue', width=2))

# Combine traces into a data list
data = [trace_actual, trace_predicted, trace_regression]

# Define layout for the plot
layout = go.Layout(
    title='Comparison of Actual and Predicted Data Points with CatBoostRegressor',
    xaxis=dict(title='Data Point Index'),
    yaxis=dict(title='y'),
    hovermode='closest',  # Show closest data point's values upon hover
)

# Create figure object that combines data and layout
fig = go.Figure(data=data, layout=layout)

# Display the interactive plot
fig.show()

In [12]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import numpy as np
mae=mean_absolute_error(y_pred, y_test)
mse=mean_squared_error(y_pred, y_test)
rmse= np.sqrt(mean_squared_error(y_pred, y_test))
r2_square=r2_score(y_pred, y_test)

In [13]:
print(mae)
print(mse)
print(rmse)
print(r2_square)

3.2110173690624015
16.197201120793878
4.02457465091578
0.7720139991887034


In [14]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8175725248281838