In [1]:
# Predict the price of the Uber ride from a given pickup point to the agreed drop-off location. 
# Perform following tasks: 
# 1.  Pre-process the dataset. 
# 2.  Identify outliers. 
# 3.  Check the correlation. 
# 4.  Implement linear regression and random forest regression models. 
# 5.  Evaluate the models and compare their respective scores like R2, RMSE, etc. 
# Dataset link: https://www.kaggle.com/datasets/yasserh/uber-fares-dataset

<!-- # Linear Regression – It is a simple and widely used supervised learning algorithm that models the relationship between a dependent variable (fare amount) and independent variables (distance, passengers, time). It fits a straight line (Y = mX + c) minimizing the error between predicted and actual values using the least squares method.

# Random Forest Regression – This is an ensemble learning technique that builds multiple decision trees and averages their predictions to improve accuracy and reduce overfitting. It captures non-linear relationships and interactions between features better than Linear Regression.

# R² (Coefficient of Determination) – It measures how well the regression predictions approximate the real data points. R² ranges from 0 to 1, where a higher value indicates a better fit.

# Formula: R² = 1 - (SSR/SST),
# where SSR = Sum of squared residuals, and SST = Total sum of squares.

# RMSE (Root Mean Square Error) – It is the square root of the average of squared differences between predicted and actual values. It represents the model’s prediction error in the same units as the target variable.

# Formula: RMSE = √(Σ(y_pred - y_actual)² / n)

# Comparison – Linear Regression is fast and interpretable but may underperform with complex data. Random Forest handles non-linearity and feature interactions better, generally achieving higher R² and lower RMSE, indicating better predictive performance. -->

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
# Load dataset
df = pd.read_csv('uber.csv')
print('Initial shape:', df.shape)

Initial shape: (200000, 9)


In [4]:
# Drop missing values and unnecessary columns
df = df.dropna(subset=['dropoff_longitude','dropoff_latitude'])
df = df.drop(['Unnamed: 0','key'], axis=1)

In [5]:
# Convert datetime and create simple features
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['hour'] = df['pickup_datetime'].dt.hour
df['dayofweek'] = df['pickup_datetime'].dt.dayofweek

In [6]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,dayofweek
0,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,19,3
1,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,20,4
2,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,21,0
3,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,8,4
4,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,17,3


In [7]:
# Calculate distance (approx.)
df['distance'] = np.sqrt((df['dropoff_longitude'] - df['pickup_longitude'])**2 +
(df['dropoff_latitude'] - df['pickup_latitude'])**2)

In [8]:
# Keep only reasonable fares and distances
df = df[(df['fare_amount']>0) & (df['distance']>0)]

In [9]:
# Select features
X = df[['distance','passenger_count','hour','dayofweek']]
y = df['fare_amount']

In [10]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [12]:
# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [13]:
# Evaluation
print('Linear Regression:')
print('R2:', r2_score(y_test, y_pred_lr))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_lr)))

Linear Regression:
R2: 0.0005449262952890166
RMSE: 9.455089371167269


In [14]:
print('\nRandom Forest Regression:')
print('R2:', r2_score(y_test, y_pred_rf))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred_rf)))


Random Forest Regression:
R2: 0.7413677259583211
RMSE: 4.809781625574562
