In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np

In [15]:
# Load the dataset
df = pd.read_csv('uber.csv')


In [16]:
# Show first few rows to verify data loaded correctly
print(df.head())

   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

           pickup_datetime  pickup_longitude  pickup_latitude  \
0  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  
2         -73.962565         40.772647                1  
3         

In [17]:
# Convert pickup_datetime to datetime type for easier manipulation
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')


In [18]:
# Check for missing or zero coordinates (likely invalid data)
print("Rows with zero coordinates:", df[(df['pickup_longitude'] == 0) | (df['pickup_latitude'] == 0) | (df['dropoff_longitude'] == 0) | (df['dropoff_latitude'] == 0)].shape[0])


Rows with zero coordinates: 3968


In [19]:
# Remove rows with zero coordinates
df = df[(df['pickup_longitude'] != 0) & (df['pickup_latitude'] != 0) & (df['dropoff_longitude'] != 0) & (df['dropoff_latitude'] != 0)]


In [20]:
# Define a function to calculate Haversine distance between pickup and dropoff points
def haversine(lat1, lon1, lat2, lon2):
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Earth radius in km
    return c * r


In [21]:
# Calculate distance and add as a new column
df['distance_km'] = haversine(df['pickup_latitude'], df['pickup_longitude'], df['dropoff_latitude'], df['dropoff_longitude'])


In [22]:
# Remove rides with zero or near zero distance (invalid)
df = df[df['distance_km'] > 0.01]


In [23]:
# Check the data after cleaning
print(df[['fare_amount', 'distance_km']].describe())


         fare_amount    distance_km
count  193506.000000  193506.000000
mean       11.314580       4.835734
std         9.545667      94.720384
min       -52.000000       0.010016
25%         6.000000       1.286273
50%         8.500000       2.185514
75%        12.500000       3.949647
max       230.000000   16409.239135


In [24]:
# Features and target
X = df[['distance_km']]  # you can add more features later
y = df['fare_amount']

In [25]:
# Split data: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# --- Linear Regression ---
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)


In [27]:
y_pred_lr = lr_model.predict(X_test)

In [28]:
# --- Random Forest Regression ---
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [29]:
y_pred_rf = rf_model.predict(X_test)

In [30]:
# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    print(f"--- {model_name} Evaluation ---")
    print(f"R2 Score: {r2_score(y_true, y_pred):.4f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.4f}")
    print()


In [31]:
# Evaluate both models
evaluate_model(y_test, y_pred_lr, 'Linear Regression')
evaluate_model(y_test, y_pred_rf, 'Random Forest Regression')


--- Linear Regression Evaluation ---
R2 Score: 0.0010
RMSE: 9.6439
MAE: 5.9663

--- Random Forest Regression Evaluation ---
R2 Score: 0.7167
RMSE: 5.1353
MAE: 2.7960

