In [1]:
import numpy as np
import pandas as pd

In [9]:
# Location of the raw rides workbook, relative to the notebook's working directory.
excel_file_path = 'uber_rides_data.xlsx'

# Load the first worksheet of the workbook into a DataFrame.
df = pd.read_excel(io=excel_file_path)


In [10]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(200000, 8)

In [14]:
# Per-column dtypes; pickup_datetime is still `object` at this point.
df.dtypes

ride_id                int64
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [16]:
# Number of rides whose dropoff longitude is missing.
df['dropoff_longitude'].isna().sum()

1

In [17]:
# dtype of the raw pickup timestamp column.
df['pickup_datetime'].dtype

dtype('O')

In [22]:
# NOTE: dropna() returns a new frame that is only displayed here. The result is
# not assigned, so `df` itself still contains the row with missing dropoff values.
df.dropna()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [23]:
# Summary statistics for the numeric columns.
# NOTE(review): the min/max rows show negative fares, a passenger count of 208 and
# coordinates outside valid latitude/longitude ranges; none of these are filtered
# in the cells visible here.
df.describe()

Unnamed: 0,ride_id,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967153,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [24]:
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Each coordinate may be a scalar or an array-like; the inputs are broadcast
    against one another, so a single point can be compared with many points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
    radius : float, default 6371.0
        Sphere radius; the result is expressed in the same unit. The default
        is the mean Earth radius in kilometres.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) along the sphere's surface. NaN inputs propagate to NaN.
    """
    # Convert each argument on its own so scalars and arrays can be mixed.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value)) for value in (lat1, lon1, lat2, lon2)
    )

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt(1 - a) return NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [25]:
# Compute every ride's pickup-to-dropoff distance in one vectorised call.
# haversine() is built from NumPy ufuncs, so passing whole columns avoids the
# per-row Python overhead of df.apply(..., axis=1) on a 200k-row frame.
df['haversine_distance'] = haversine(
    df['pickup_latitude'], df['pickup_longitude'],
    df['dropoff_latitude'], df['dropoff_longitude'],
)

In [27]:
# Median pickup-to-dropoff distance across all rides.
df['haversine_distance'].median()

2.120992396182902

In [28]:
# Display the maximum haversine distance.
# NOTE(review): a value this large likely stems from the out-of-range
# coordinates visible in df.describe() - confirm before relying on it.
df['haversine_distance'].max()

16409.23913531317

In [33]:
# Rides whose computed distance is exactly zero, followed by how many there are.
zero_distance_rides = df.loc[df['haversine_distance'].eq(0.0)]
zero_distance_rides.shape[0]

5632

In [35]:
# A haversine distance of 0.0 means the recorded pickup and dropoff
# coordinates are identical, which suggests the ride may not have actually
# taken place (or the location was recorded incorrectly).
# Average fare charged on those zero-distance rides:
zero_distance_rides['fare_amount'].mean()

11.585317826704578

In [36]:
# Highest fare recorded in the dataset.
df['fare_amount'].max()

499.0

In [37]:
# Row(s) carrying the highest fare_amount; ties, if any, are all kept.
costliest_ride = df.loc[df['fare_amount'].eq(df['fare_amount'].max())]

In [38]:
# Distance covered by the costliest ride (first matching row if several tie).
haversine_distance_costliest_ride = haversine(
    costliest_ride['pickup_latitude'].iloc[0],
    costliest_ride['pickup_longitude'].iloc[0],
    costliest_ride['dropoff_latitude'].iloc[0],
    costliest_ride['dropoff_longitude'].iloc[0],
)

In [39]:
# Comparing the costliest ride's distance with its fare shows whether the
# fare is justified by the distance travelled. A very short distance paired
# with a very high fare could indicate overcharging.
haversine_distance_costliest_ride

0.0007899213191009994

In [41]:
# Parse the raw pickup timestamps into datetimes.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Calendar year of each pickup.
df['pickup_year'] = df['pickup_datetime'].dt.year

# Number of rides picked up during 2014.
rides_in_2014 = int(df['pickup_year'].eq(2014).sum())


In [42]:
# Display the 2014 ride count.
rides_in_2014

29968

In [43]:
# Calendar quarter (1-4) of each pickup timestamp.
df['pickup_quarter'] = df['pickup_datetime'].dt.quarter

In [44]:
# Rides recorded in the first quarter of 2014 (Q1 2014).
rides_in_q1_2014 = int(
    (df['pickup_year'].eq(2014) & df['pickup_quarter'].eq(1)).sum()
)

In [45]:
# Display the Q1 2014 ride count.
rides_in_q1_2014

7687

In [51]:
# Weekday name (e.g. 'Thursday') of each pickup timestamp.
df['pickup_day_of_week'] = df['pickup_datetime'].dt.day_name()

In [52]:
# Rides picked up during September 2010.
september_2010_rides = df.loc[
    df['pickup_datetime'].dt.year.eq(2010)
    & df['pickup_datetime'].dt.month.eq(9)
]

In [53]:
# Number of September 2010 rides per weekday, most frequent first.
day_of_week_counts = september_2010_rides['pickup_day_of_week'].value_counts()

In [58]:
# Weekday with the highest ride count in September 2010.
max_rides_day = day_of_week_counts.idxmax()

In [59]:
# Display the busiest weekday.
max_rides_day

'Thursday'

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

In [64]:
# Expose the weekday name under the feature name used for modelling.
df['ride_week_day'] = df['pickup_day_of_week']

# Map each weekday name to an integer code. LabelEncoder numbers the classes
# in sorted (alphabetical) order, so the codes do not follow calendar order.
label_encoder = LabelEncoder()
label_encoder.fit(df['ride_week_day'])
df['ride_week_day_encoded'] = label_encoder.transform(df['ride_week_day'])

In [66]:
# Copy the haversine distance into the `distance` column used as a model feature.
df['distance']=df['haversine_distance']

In [72]:
# Preview the first rows, including the derived columns added above.
df.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance,pickup_year,pickup_quarter,pickup_day_of_week,ride_week_day,ride_week_day_encoded,distance
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,2015,2,Thursday,Thursday,4,1.683323
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,2009,3,Friday,Friday,0,2.45759
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,2009,3,Monday,Monday,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,2009,2,Friday,Friday,0,1.661683
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,2014,3,Thursday,Thursday,4,4.47545


In [80]:
# Replace missing distances (rides with a missing dropoff coordinate) with the
# column mean. The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df['distance']`: that chained in-place form acts on
# a temporary selection, raises a FutureWarning in pandas 2.x, and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['distance'] = df['distance'].fillna(df['distance'].mean())

In [81]:
# Feature matrix (X) and regression target (y).
X = df.loc[:, ['passenger_count', 'distance', 'ride_week_day_encoded']]
y = df.loc[:, 'fare_amount']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [82]:
# Candidate regressors, all trained and scored on the same split.
# The tree-based estimators are seeded so that their scores (and therefore the
# model ranking) are reproducible across kernel restarts.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Test-set size (n) and number of predictors (p) for the adjusted R-squared:
#   adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# These do not change between models, so compute them once.
n_obs, n_features = X_test.shape

# Maps model name -> adjusted R-squared on the held-out test set.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r_squared = r2_score(y_test, y_pred)
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
    results[model_name] = adj_r_squared


In [83]:
# Pick the model whose adjusted R-squared on the test set is lowest.
min_adj_r_squared_model, _ = min(results.items(), key=lambda item: item[1])

print("Model with the least adjusted R-squared value:", min_adj_r_squared_model)

Model with the least adjusted R-squared value: Linear Regression
