In [7]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

# --- Load ---
# The dataset location is machine-specific, so it can be overridden with the
# UBER_CSV_PATH environment variable. The original absolute path is kept as the
# default (also used when the variable is set but empty), so existing runs are
# unaffected.
DATA_PATH = (
    os.environ.get("UBER_CSV_PATH")
    or "S:/Sameet Patil - SEM 7/LP3/codes/uber.csv"
)
df = pd.read_csv(DATA_PATH)

# --- Basic valid-range filters for coordinates, fare, passengers ---
# Columns that must be present (non-null) for a row to be kept.
REQUIRED_COLUMNS = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "fare_amount",
    "passenger_count",
]
# Upper bound on accepted fares; fares must also be strictly positive.
MAX_FARE = 500
# Inclusive bounds on the accepted passenger count.
MIN_PASSENGERS, MAX_PASSENGERS = 1, 6

df = df.dropna(subset=REQUIRED_COLUMNS)

# keep only valid lat/lon ranges
df = df[
    (df.pickup_latitude.between(-90, 90))
    & (df.dropoff_latitude.between(-90, 90))
    & (df.pickup_longitude.between(-180, 180))
    & (df.dropoff_longitude.between(-180, 180))
]

# reasonable fares and passenger counts
df = df[(df.fare_amount > 0) & (df.fare_amount <= MAX_FARE)]
df = df[df.passenger_count.between(MIN_PASSENGERS, MAX_PASSENGERS)]


# --- Haversine distance (vectorized) ---
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two sets of points.

    Applies the haversine formula on a sphere of radius 6371 km.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in decimal degrees.

    Inputs may be Python scalars, lists/tuples, or NumPy arrays; they are
    combined element-wise following NumPy broadcasting.

    Returns
    -------
    Distance(s) in kilometres: a NumPy scalar for scalar inputs, otherwise an
    array with the broadcast shape of the inputs.
    """
    R = 6371.0  # km
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    # np.subtract instead of the `-` operator so plain lists/tuples are accepted.
    dphi = np.radians(np.subtract(lat2, lat1))
    dlambda = np.radians(np.subtract(lon2, lon1))
    a = (
        np.sin(dphi / 2.0) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2.0) ** 2
    )
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))


# Attach the pickup-to-dropoff haversine distance as a new column.
df = df.assign(
    distance_km=haversine(
        df["pickup_latitude"].values,
        df["pickup_longitude"].values,
        df["dropoff_latitude"].values,
        df["dropoff_longitude"].values,
    )
)

# Keep trips longer than 0.01 km and at most 200 km; anything else is treated
# as a likely data error. Then renumber the rows from zero.
is_plausible = (df["distance_km"] > 0.01) & (df["distance_km"] <= 200)
df = df.loc[is_plausible].reset_index(drop=True)

n_rows = df.shape[0]
print("After cleaning, rows:", n_rows)
summary_cols = ["fare_amount", "passenger_count", "distance_km"]
print(df[summary_cols].describe().transpose())

# --- Features & train/test ---
feature_cols = ["distance_km", "passenger_count"]
X = df[feature_cols]
y = df["fare_amount"]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random Forest (100 trees, fixed seed, n_jobs=-1 for parallel fitting)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

After cleaning, rows: 192746
                    count       mean       std       min       25%       50%  \
fare_amount      192746.0  11.324385  9.550877  0.010000  6.000000  8.500000   
passenger_count  192746.0   1.690266  1.305734  1.000000  1.000000  1.000000   
distance_km      192746.0   3.390447  4.045997  0.010016  1.286296  2.185674   

                       75%         max  
fare_amount      12.500000  230.000000  
passenger_count   2.000000    6.000000  
distance_km       3.948294  177.995633  


In [9]:
# Evaluation helper
def eval_print(y_true, y_pred, name):
    """Print a model's name followed by its R2 and RMSE scores.

    Parameters
    ----------
    y_true : true target values.
    y_pred : predicted target values.
    name : label printed on its own line (after a blank line) before the scores.

    Each score is rounded to 4 decimal places. Nothing is returned.
    """
    print(f"\n{name}")
    # Same label/metric order as the printed report: R2 first, then RMSE.
    metrics = (("R2:", r2_score), ("RMSE:", root_mean_squared_error))
    for label, metric in metrics:
        print(label, round(metric(y_true, y_pred), 4))


# Report test-set metrics for each fitted model, in the same order as before.
for model_name, predictions in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
):
    eval_print(y_test, predictions, model_name)


Linear Regression
R2: 0.66
RMSE: 5.6553

Random Forest
R2: 0.7311
RMSE: 5.029
