# Uber & Lyft Price Prediction - Linear Regression Model

# Import Libraries

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Data loading

In [44]:
# Read the rideshare CSV (expected in the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='rideshare_kaggle.csv')

Data loaded successfully.

# Exploratory Data Analysis

### Data Summary

In [45]:
# Print the frame's index range plus each column's name, non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 57 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           693071 non-null  object 
 1   timestamp                    693071 non-null  float64
 2   hour                         693071 non-null  int64  
 3   day                          693071 non-null  int64  
 4   month                        693071 non-null  int64  
 5   datetime                     693071 non-null  object 
 6   timezone                     693071 non-null  object 
 7   source                       693071 non-null  object 
 8   destination                  693071 non-null  object 
 9   cab_type                     693071 non-null  object 
 10  product_id                   693071 non-null  object 
 11  name                         693071 non-null  object 
 12  price                        637976 non-null  float64
 13 

- 693071 rows of data
- 57 total columns <br>
-> 29 float columns <br>
-> 17 int columns <br>
-> 11 object columns

In [46]:
# Summarise only the object-dtype columns (count, unique, top, freq);
# transposed so that each source column becomes one row of the summary.
data.describe(include='object').T

Unnamed: 0,count,unique,top,freq
id,693071,693071,424553bb-7174-41ea-aeb4-fe06d4f4b9d7,1
datetime,693071,31350,2018-11-26 23:21:14,156
timezone,693071,1,America/New_York,693071
source,693071,12,Financial District,58857
destination,693071,12,Financial District,58851
cab_type,693071,2,Uber,385663
product_id,693071,13,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,55096
name,693071,13,UberXL,55096
short_summary,693071,9,Overcast,218895
long_summary,693071,11,Mostly cloudy throughout the day.,202340


- The `timezone` column has only 1 unique value across all 693071 rows, which makes this column useless as a feature.

### Check Missing Values

In [47]:
# Count the missing values in each column.
data.isna().sum()

id                                 0
timestamp                          0
hour                               0
day                                0
month                              0
datetime                           0
timezone                           0
source                             0
destination                        0
cab_type                           0
product_id                         0
name                               0
price                          55095
distance                           0
surge_multiplier                   0
latitude                           0
longitude                          0
temperature                        0
apparentTemperature                0
short_summary                      0
long_summary                       0
precipIntensity                    0
precipProbability                  0
humidity                           0
windSpeed                          0
windGust                           0
windGustTime                       0
v

In [48]:
# Drop every row that contains at least one missing value.
# The result is reassigned instead of using inplace=True, so the frame object
# returned by read_csv is never mutated; the name `data` is kept because the
# following cells read from it.
data = data.dropna()

# Total number of missing cells left in the frame (displayed as the cell output).
data.isna().sum().sum()

0

No more missing values

### Check Duplicates

In [49]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

No duplicates

# Feature Engineering

In [50]:
# Display the `name` column (the ride product / car type).
data['name']

0               Shared
1                  Lux
2                 Lyft
3         Lux Black XL
4              Lyft XL
              ...     
693065             WAV
693066          UberXL
693067           UberX
693069       Black SUV
693070        UberPool
Name: name, Length: 637976, dtype: object

### Feature Selection

For this model, we will only use the trip distance (`distance`) and the car type (`name`) as features, with `price` as the target.

In [51]:
# Keep all rows but only the two predictors and the target column.
data_fe = data.loc[:, ['distance', 'name', 'price']]

### Data Split

In [52]:
# Target: the ride price. Predictors: every remaining column of data_fe.
y = data_fe['price']
X = data_fe.drop(columns=['price'])

In [53]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Preprocessing

In [54]:
# Numeric branch: standardise the feature (zero mean, unit variance).
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encode. handle_unknown='ignore' means a category
# that was not present at fit time does not raise during transform.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each input column to its branch: 'distance' is scaled, 'name' is encoded.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, ['distance']),
    ('cat', categorical_transformer, ['name']),
])

# Full estimator: preprocessing followed by an ordinary linear regression.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Model

### Train

In [55]:
# Learn the preprocessing statistics and regression coefficients from the
# training split only; the test split is not used here.
model.fit(X=X_train, y=y_train)

### Evaluation

In [56]:
## define rmse and mape function

def calculate_rmse(y_true, y_pred):
    """
    Return the Root Mean Squared Error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``, expressed in
    the same units as the target.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def calculate_mape(y_true, y_pred):
    """
    Calculate the Mean Absolute Percentage Error (MAPE), in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same positional order as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Notes
    -----
    Both inputs are converted to NumPy arrays first, so the comparison is
    positional. Without this, two pandas Series with different indexes would
    be aligned by label and could produce NaN.

    Denominators are clamped to machine epsilon, so a zero target yields a
    very large finite value (or 0 when the prediction is also exactly 0)
    rather than inf/NaN.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Guard against division by zero for targets that are exactly 0.
    denominator = np.maximum(np.abs(actual), np.finfo(np.float64).eps)

    mape = np.mean(np.abs(actual - predicted) / denominator) * 100
    return mape

In [57]:
# Predict once per split and reuse the result for both metrics, rather than
# running the full pipeline twice on the same data.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Calculate RMSE and MAPE for train set
train_rmse = calculate_rmse(y_train, train_pred)
train_mape = calculate_mape(y_train, train_pred)

# Calculate RMSE and MAPE for test set
test_rmse = calculate_rmse(y_test, test_pred)
test_mape = calculate_mape(y_test, test_pred)

print(f'Train RMSE: {train_rmse}, Train MAPE: {train_mape}')
print(f'Test RMSE: {test_rmse}, Test MAPE: {test_mape}')

Train RMSE: 3.041919721282186, Train MAPE: 14.168123990767073
Test RMSE: 3.0538790857818836, Test MAPE: 14.161330841514278


In [62]:
# Mean of the target over the full (train + test) data; gives a sense of scale
# for interpreting the RMSE values.
y.mean()

16.545125490614065

# Conclusion

Our project to predict ride-hailing prices for Uber and Lyft services involved the following key steps:

- Dropped the rows with missing values, which were exclusively associated with the 'taxi' car type.
- Focused feature selection on 'distance' and 'car type' to streamline complexity.
- Employed StandardScaler for scaling and One Hot Encoder for encoding during preprocessing.
- Chose linear regression, recognizing its simplicity and effectiveness for capturing linear relationships.
- Evaluated model performance using RMSE and MAPE, with a mean price of 16.5.
- Observed a test RMSE of 3.05 and a test MAPE of 14.16%, reflecting our model's predictive capabilities.

In conclusion, our project marks progress in creating a transparent pricing model for ride-hailing services. Despite challenges, strategic decisions and methodical approaches pave the way for continued refinement and advancement in ride-hailing pricing analysis.