In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


Matplotlib is building the font cache; this may take a moment.


In [3]:
%matplotlib inline

In [11]:
# load data from excel file
file_path='uber_rides_data.csv'

In [12]:
#loading data into a pandas dataframe
df = pd.read_csv(file_path)

In [13]:
# printing first five rows
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [14]:
# shape of the dataset
df.shape


(200000, 8)

In [15]:
# 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


In [16]:
# finding missing values
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [17]:
# number of missing values in the dropoff_longitude column is 1.


In [20]:
# data type of  pickup_datetime column 
print(df['pickup_datetime'].dtypes)


object


In [21]:
#  pickup_datetime column is of object datatype

In [22]:
# changing the pickup_datetime column to datetime format
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [23]:
print(df['pickup_datetime'].dtypes)

datetime64[ns, UTC]


In [24]:
# removing null values
df = df.dropna()

In [25]:
df.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [26]:
# Average fair amount
average_fare = df['fare_amount'].mean()


In [27]:
print(average_fare)

11.359891549457748


In [29]:
# distance between each pickup and dropoff points using Haversine formula.

def haversine_dist(lat1, lon1, lat2, lon2):
    """
    Parameters:
        lat1, lon1: Latitude and longitude of the pickup location.
        lat2, lon2: Latitude and longitude of the dropoff location.
    
    """
    # Radius of the Earth in kilometers
    R = 6371.0
    
    # Convert latitude and longitude from degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    
    # Haversine formula
    d_lon = lon2 - lon1
    d_lat = lat2 - lat1
    a = np.sin(d_lat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(d_lon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    
    return distance

In [30]:
df['haversine_distance'] = df.apply(lambda row: haversine_dist(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['haversine_distance'] = df.apply(lambda row: haversine_dist(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)


In [31]:
df['haversine_distance']

0          1.683323
1          2.457590
2          5.036377
3          1.661683
4          4.475450
            ...    
199995     0.112210
199996     1.875050
199997    12.850319
199998     3.539715
199999     5.417783
Name: haversine_distance, Length: 199999, dtype: float64

In [32]:
#  Median Haversine Distance
median_distance = df['haversine_distance'].median()
print("Median Haversine Distance:", median_distance, "kilometers")

Median Haversine Distance: 2.1209923961833708 kilometers


In [33]:
# Maximum haversine distance 
max_distance = df['haversine_distance'].max()
print("Maximum Haversine Distance:", max_distance, "kilometers")

Maximum Haversine Distance: 16409.239135313168 kilometers


In [34]:
#  rides with haversine distance = 0
rides_with_zero_distance = (df['haversine_distance'] == 0.0).sum()
print("Number of rides with 0.0 Haversine Distance:", rides_with_zero_distance)

Number of rides with 0.0 Haversine Distance: 5632


In [35]:
# mean 'fare_amount' for rides with 0 haversine distance
mean_fare_for_zero_distance = df[df['haversine_distance'] == 0.0]['fare_amount'].mean()
print("Mean Fare Amount for Rides with 0.0 Haversine Distance:", mean_fare_for_zero_distance)

Mean Fare Amount for Rides with 0.0 Haversine Distance: 11.585317826704546


In [36]:
# Maximum 'fare_amount' for a ride
max_fare = df['fare_amount'].max()
print("Maximum Fare Amount:", max_fare)

Maximum Fare Amount: 499.0


In [37]:
# Minimum 'fare_amount' for a ride
min_fare = df['fare_amount'].min()
print("Minimum Fare Amount:", min_fare)

Minimum Fare Amount: -52.0


In [38]:
#Haversine distance between pickup and dropoff location for the costliest ride
haversine_dist_costliest_ride =df[df['fare_amount'] == df['fare_amount'].max()]['haversine_distance']

In [48]:
haversine_dist_costliest_ride

170081    0.00079
Name: haversine_distance, dtype: float64

In [49]:
# rides recorded in the year 2014
df['pickup_year'] = df['pickup_datetime'].dt.year
rides_in_2014 = (df['pickup_year'] == 2014).sum()
print("Number of Rides Recorded in 2014:", rides_in_2014)

Number of Rides Recorded in 2014: 29968


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_year'] = df['pickup_datetime'].dt.year


In [50]:
#  rides recorded in the first quarter of 2014?

df['pickup_quarter'] = df['pickup_datetime'].dt.quarter
rides_in_first_quarter_2014 = (df['pickup_year'] == 2014) & (df['pickup_quarter'] == 1)
num_rides_in_first_quarter_2014 = rides_in_first_quarter_2014.sum()
print("Number of Rides Recorded in the First Quarter of 2014:", num_rides_in_first_quarter_2014)

Number of Rides Recorded in the First Quarter of 2014: 7687


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_quarter'] = df['pickup_datetime'].dt.quarter


In [52]:
#Day of the week in September 2010 on which maximum rides were recorded 

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['day_of_week'] = df['pickup_datetime'].dt.day_name()
df['year'] = df['pickup_datetime'].dt.year
september_2010_rides = df[(df['year'] == 2010) & (df['pickup_datetime'].dt.month == 9)]
day_counts = september_2010_rides['day_of_week'].value_counts()
max_rides_day = day_counts.idxmax()
max_rides_count = day_counts.max()
print("Day of the week in September 2010 on which maximum rides were recorded")
print("Day:", max_rides_day)
print("Number of Rides:", max_rides_count)

Day of the week in September 2010 on which maximum rides were recorded
Day: Thursday
Number of Rides: 457


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_week'] = df['pickup_datetime'].dt.day_name()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['pickup_datetime'].dt.year


In [64]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [55]:
df_encoded = pd.get_dummies(df, columns=['day_of_week'], drop_first=True)

In [57]:
df_encoded.columns

Index(['ride_id', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'haversine_distance', 'pickup_year',
       'pickup_quarter', 'year', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday'],
      dtype='object')

In [60]:
columns_to_exclude=['ride_id','pickup_datetime','pickup_longitude',
       'pickup_latitude','dropoff_longitude','dropoff_latitude',
       'pickup_year','pickup_quarter','year']
df_filtered = df_encoded.drop(columns=columns_to_exclude)

In [61]:
df_filtered

Unnamed: 0,fare_amount,passenger_count,haversine_distance,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,7.5,1,1.683323,0,0,0,1,0,0
1,7.7,1,2.457590,0,0,0,0,0,0
2,12.9,1,5.036377,1,0,0,0,0,0
3,5.3,3,1.661683,0,0,0,0,0,0
4,16.0,5,4.475450,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
199995,3.0,1,0.112210,0,0,1,0,0,0
199996,7.5,1,1.875050,0,0,0,0,0,0
199997,30.9,2,12.850319,1,0,0,0,0,0
199998,14.5,1,3.539715,0,0,0,0,0,1


In [62]:
df_filtered.iloc[:, 1:]

Unnamed: 0,passenger_count,haversine_distance,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,1,1.683323,0,0,0,1,0,0
1,1,2.457590,0,0,0,0,0,0
2,1,5.036377,1,0,0,0,0,0
3,3,1.661683,0,0,0,0,0,0
4,5,4.475450,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
199995,1,0.112210,0,0,1,0,0,0
199996,1,1.875050,0,0,0,0,0,0
199997,2,12.850319,1,0,0,0,0,0
199998,1,3.539715,0,0,0,0,0,1


In [65]:
X = df_filtered.iloc[:, 1:]
y = df_filtered['fare_amount']

# Split the data into training and testing sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Train a Random Forest Regression model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Make predictions on the testing data
lr_preds = lr_model.predict(X_test)
rf_preds = rf_model.predict(X_test)

# Calculate adjusted R-squared for both models
def adjusted_r_squared(y_true, y_pred, n, p):
    r_squared = r2_score(y_true, y_pred)
    adjusted_r2 = 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))
    return adjusted_r2

n = len(y_test)
p = X_test.shape[1]

lr_adjusted_r2 = adjusted_r_squared(y_test, lr_preds, n, p)
rf_adjusted_r2 = adjusted_r_squared(y_test, rf_preds, n, p)

print("Adjusted R-squared for Linear Regression:", lr_adjusted_r2)
print("Adjusted R-squared for Random Forest Regression:", rf_adjusted_r2)

Adjusted R-squared for Linear Regression: 0.0006130998123348164
Adjusted R-squared for Random Forest Regression: 0.6311571634149258
