In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the ride dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Datasets/dynamic_pricing.csv')
# Bare last expression so the notebook displays the (rows, columns) tuple.
data.shape

(1000, 10)

In [3]:
# Preview the first rows to confirm the columns parsed as expected.
data.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422


In [4]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.2+ KB


In [5]:
# Count missing values per column.
data.isnull().sum()

Number_of_Riders           0
Number_of_Drivers          0
Location_Category          0
Customer_Loyalty_Status    0
Number_of_Past_Rides       0
Average_Ratings            0
Time_of_Booking            0
Vehicle_Type               0
Expected_Ride_Duration     0
Historical_Cost_of_Ride    0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Number_of_Past_Rides,Average_Ratings,Expected_Ride_Duration,Historical_Cost_of_Ride
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,60.372,27.076,50.031,4.25722,99.588,372.502623
std,23.701506,19.068346,29.313774,0.435781,49.16545,187.158756
min,20.0,5.0,0.0,3.5,10.0,25.993449
25%,40.0,11.0,25.0,3.87,59.75,221.365202
50%,60.0,22.0,51.0,4.27,102.0,362.019426
75%,81.0,38.0,75.0,4.6325,143.0,510.497504
max,100.0,89.0,100.0,5.0,180.0,836.116419


# EDA

### let’s have a look at the relationship between expected ride duration and the historical cost of the ride:

In [7]:
# Scatter of expected ride duration against the historical fare, with an
# ordinary-least-squares trend line overlaid.
fig = px.scatter(
    data,
    x="Expected_Ride_Duration",
    y="Historical_Cost_of_Ride",
    trendline="ols",
    title="Expected Ride Duration vs. Historical Cost of Ride",
)
fig.show()

### let’s have a look at the distribution of the historical cost of rides based on the vehicle type:

In [8]:
# One box per vehicle type showing the spread of historical fares.
fig = px.box(
    data,
    x="Vehicle_Type",
    y="Historical_Cost_of_Ride",
    title="Historical Cost of Ride Distribution by Vehicle Type",
)
fig.show()

### let’s have a look at the correlation matrix:

In [9]:
# Re-list the column dtypes (same call as earlier in the notebook). The
# object-typed columns are the ones left out of the numeric column list that
# the next cell uses for the correlation matrix.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Number_of_Riders         1000 non-null   int64  
 1   Number_of_Drivers        1000 non-null   int64  
 2   Location_Category        1000 non-null   object 
 3   Customer_Loyalty_Status  1000 non-null   object 
 4   Number_of_Past_Rides     1000 non-null   int64  
 5   Average_Ratings          1000 non-null   float64
 6   Time_of_Booking          1000 non-null   object 
 7   Vehicle_Type             1000 non-null   object 
 8   Expected_Ride_Duration   1000 non-null   int64  
 9   Historical_Cost_of_Ride  1000 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 78.2+ KB


In [10]:
# Pairwise correlations restricted to the int64/float64 columns; the
# object-typed categorical columns are excluded.
num_columns = [
    'Number_of_Riders',
    'Number_of_Drivers',
    'Number_of_Past_Rides',
    'Average_Ratings',
    'Expected_Ride_Duration',
    'Historical_Cost_of_Ride',
]
corr_matrix = data.loc[:, num_columns].corr()

In [11]:
# Render the correlation matrix as a heatmap; rows and columns share the same
# labels because the matrix is square over num_columns.
axis_labels = corr_matrix.columns
fig = go.Figure(
    data=go.Heatmap(
        z=corr_matrix.values,
        x=axis_labels,
        y=axis_labels,
        colorscale='Viridis',
    )
).update_layout(title='Correlation Matrix')
fig.show()

# Implementing a Dynamic Pricing Strategy

### Calculate demand_multiplier based on percentile for high and low demand

In [12]:
high_demand_percentile = 75
low_demand_percentile = 25

# Compute each percentile cutoff once instead of repeating the call inline.
riders = data['Number_of_Riders']
high_demand_cutoff = np.percentile(riders, high_demand_percentile)
low_demand_cutoff = np.percentile(riders, low_demand_percentile)

# Rows above the high cutoff are scaled by the high cutoff; every other row is
# scaled by the low cutoff.
# NOTE(review): because all rows at or below the high cutoff are divided by the
# (smaller) low cutoff, a mid-demand row can end up with a larger multiplier
# than a high-demand row -- confirm this is intended.
data['demand_multiplier'] = np.where(
    riders > high_demand_cutoff,
    riders / high_demand_cutoff,
    riders / low_demand_cutoff,
)

### Calculate supply_multiplier based on percentile for high and low supply

In [13]:
# Despite the "_multiplier" suffix, these two constants are percentile ranks
# passed to np.percentile, not multipliers.
high_supply_multiplier = 75
low_supply_multiplier = 25

# Compute each percentile cutoff once instead of repeating the call inline.
drivers = data['Number_of_Drivers']
high_supply_cutoff = np.percentile(drivers, high_supply_multiplier)
low_supply_cutoff = np.percentile(drivers, low_supply_multiplier)

# Rows above the high cutoff are scaled by the high cutoff; every other row is
# scaled by the low cutoff.
# NOTE(review): the multiplier grows with the number of drivers, and the
# adjusted cost multiplies by it, so more supply raises the price -- confirm
# this direction is intended.
data['supply_multiplier'] = np.where(
    drivers > high_supply_cutoff,
    drivers / high_supply_cutoff,
    drivers / low_supply_cutoff,
)

In [14]:
# Inspect the first rows now that the demand and supply multiplier columns exist.
data.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,demand_multiplier,supply_multiplier
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273,1.111111,1.184211
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,1.45,1.026316
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,1.05,2.818182
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,1.098765,2.545455
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,1.95,2.0


In [15]:
# Max, min and mean of the demand multiplier, one value per line.
print(
    data['demand_multiplier'].max(),
    data['demand_multiplier'].min(),
    data['demand_multiplier'].mean(),
    sep='\n',
)

2.025
0.5
1.2215154320987656


In [16]:
# Max, min and mean of the supply multiplier, one value per line.
print(
    data['supply_multiplier'].max(),
    data['supply_multiplier'].min(),
    data['supply_multiplier'].mean(),
    sep='\n',
)

3.4545454545454546
0.45454545454545453
1.5999736842105263


In [17]:
# Threshold constants for the adjusted-cost calculation. Only
# demand_threshold_low and supply_threshold_high are referenced by the
# adjusted_ride_cost cell, where np.maximum uses them as lower bounds on the
# multipliers; demand_threshold_high and supply_threshold_low are not used in
# the visible notebook.
demand_threshold_high = 1.2  # not referenced in the visible cells
demand_threshold_low = 0.8  # lower bound applied to demand_multiplier
supply_threshold_high = 0.8  # lower bound applied to supply_multiplier
supply_threshold_low = 1.2  # not referenced in the visible cells

### Calculate adjusted_ride_cost for dynamic pricing

In [18]:
# Floor each multiplier at its threshold, then scale the historical fare by
# the product of the two floored multipliers.
demand_factor = np.maximum(data['demand_multiplier'], demand_threshold_low)
supply_factor = np.maximum(data['supply_multiplier'], supply_threshold_high)
data["adjusted_ride_cost"] = data['Historical_Cost_of_Ride'] * (demand_factor * supply_factor)


In [19]:
# Inspect the first rows now that the adjusted_ride_cost column exists.
data.head()

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride,demand_multiplier,supply_multiplier,adjusted_ride_cost
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273,1.111111,1.184211,374.022728
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753,1.45,1.026316,258.753086
2,42,31,Rural,Silver,0,3.99,Afternoon,Premium,76,329.795469,1.05,2.818182,975.894774
3,89,28,Rural,Regular,67,4.31,Afternoon,Premium,134,470.201232,1.098765,2.545455,1315.085824
4,78,22,Rural,Regular,74,3.77,Afternoon,Economy,149,579.681422,1.95,2.0,2260.757547


### let’s calculate the profit percentage we got after implementing this dynamic pricing strategy:

In [20]:
# Percentage change of the adjusted fare relative to the historical fare.
# The column is stored under the key 'profile_percentage' (sic); later cells
# read it by that exact name, so the spelling is kept.
historical_cost = data['Historical_Cost_of_Ride']
data['profile_percentage'] = (data['adjusted_ride_cost'] - historical_cost) / historical_cost * 100

In [21]:
# Split rides by the sign of the profit percentage: strictly positive means
# the adjusted fare exceeds the historical fare, strictly negative means it
# falls below it. The profitable filter previously used `> 1`, which silently
# dropped rides with a gain between 0% and 1% from both groups. Rides with
# exactly 0% change still belong to neither group.
profitable_ride = data[data['profile_percentage'] > 0]

loss_ride = data[data['profile_percentage'] < 0]

In [22]:
# Number of rows in each group; these feed the donut chart.
profitable_count = profitable_ride.shape[0]
loss_count = loss_ride.shape[0]

In [23]:
# Donut chart (a pie with a 0.4 hole) comparing the number of profitable and
# loss rides.
labels = ['Profitable Rides', 'Loss Rides']
values = [profitable_count, loss_count]

donut = go.Pie(labels=labels, values=values, hole=0.4)
fig = go.Figure(data=[donut]).update_layout(
    title='Profitability of Rides (Dynamic Pricing vs. Historical Pricing)'
)
fig.show()

### let’s have a look at the relationship between the expected ride duration and the cost of the ride based on the dynamic pricing strategy:

In [24]:
# Same duration-vs-cost scatter as in the EDA section, but plotting the
# dynamically adjusted fare; an ordinary-least-squares trend line is overlaid.
fig = px.scatter(
    data,
    x="Expected_Ride_Duration",
    y="adjusted_ride_cost",
    trendline="ols",
    title="Expected Ride Duration vs. Cost of Ride",
)
fig.show()

# Training a Predictive Model

In [25]:
# Convert the vehicle type feature into a numeric feature
# (Premium -> 1, Economy -> 0).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell after the column is already encoded keeps the 0/1 codes.
# Mapping with the dict directly would replace every unmatched value -- which
# includes the already-encoded 0/1 -- with NaN on a second run.
vehicle_type_codes = {"Premium": 1, "Economy": 0}
data["Vehicle_Type"] = data["Vehicle_Type"].map(
    lambda vehicle_type: vehicle_type_codes.get(vehicle_type, vehicle_type)
)

### let’s split the data and train a Machine Learning model to predict the cost of a ride:

In [26]:
# Build the feature matrix and the target, then hold out 20% of the rows for
# testing. random_state pins the shuffle so the split is repeatable.
feature_columns = ["Number_of_Riders", "Number_of_Drivers", "Vehicle_Type", "Expected_Ride_Duration"]
x = np.array(data[feature_columns])
# Double brackets keep y two-dimensional here; it is flattened in a later cell.
y = np.array(data[["adjusted_ride_cost"]])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [27]:
# Flatten both target arrays to one dimension before fitting and scoring.
y_train, y_test = y_train.ravel(), y_test.ravel()

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model -- and therefore the example prediction
# and the R^2 score further down -- is reproducible across kernel restarts.
# Without random_state the bootstrap sampling changes on every run, even
# though the train/test split itself is seeded.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)

### let’s test this Machine Learning model using some input values:

In [29]:
def get_vehicle_type_numeric(vehicle_type):
    """Return the numeric code for a vehicle type name.

    "Premium" maps to 1 and "Economy" maps to 0. Any other value (including a
    differently-cased name) yields None.
    """
    codes = {"Economy": 0, "Premium": 1}
    return codes.get(vehicle_type)

In [30]:
# Predicting using user input values
def predict_price(number_of_riders, number_of_drivers, vehicle_type, Expected_Ride_Duration):
    """Predict the ride cost for one set of inputs using the notebook-level `model`.

    Args:
        number_of_riders: Rider count for the request.
        number_of_drivers: Driver count for the request.
        vehicle_type: Vehicle type name; it is translated to a numeric code by
            get_vehicle_type_numeric.
        Expected_Ride_Duration: Expected ride duration.

    Returns:
        Whatever `model.predict` returns for the single input row (the example
        call in this notebook prints a one-element array).

    Raises:
        ValueError: If get_vehicle_type_numeric returns None for vehicle_type.
    """
    vehicle_code = get_vehicle_type_numeric(vehicle_type)
    if vehicle_code is None:
        raise ValueError("Invalid vehicle type")

    # Single row; the column order matches the feature order used for training.
    features = np.array([[number_of_riders, number_of_drivers, vehicle_code, Expected_Ride_Duration]])
    return model.predict(features)

In [31]:
# Example inputs for a single prediction.
user_number_of_riders = 40
user_number_of_drivers = 30
user_vehicle_type = "Premium"
Expected_Ride_Duration = 30

# Pass the inputs by keyword so each argument's role is explicit at the call site.
predicted_price = predict_price(
    number_of_riders=user_number_of_riders,
    number_of_drivers=user_number_of_drivers,
    vehicle_type=user_vehicle_type,
    Expected_Ride_Duration=Expected_Ride_Duration,
)
print("Predicted price:", predicted_price)

Predicted price: [480.14594226]


### let's compare the actual and predicted results:

In [32]:
# Predict on the held-out test split, i.e. the rows the model was not fitted on.
y_pred = model.predict(x_test)

In [33]:
from sklearn.metrics import r2_score

# Coefficient of determination of the test-set predictions; left as the bare
# last expression so the notebook displays the value.
r2_score(y_true=y_test, y_pred=y_pred)

0.9017631619604874

In [34]:
# Actual vs. predicted values on the test split. Points that fall on the
# dashed red diagonal are predictions equal to the actual value.
actual = y_test.flatten()
diagonal = [min(actual), max(actual)]

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=actual, y=y_pred, mode='markers', name='Actual vs Predicted')
)
# Ideal case: a straight line from (min, min) to (max, max) of the actual values.
fig.add_trace(
    go.Scatter(
        x=diagonal,
        y=diagonal,
        mode='lines',
        name='Ideal',
        line=dict(color='red', dash='dash'),
    )
)
fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True,
)

fig.show()