
# Uber Fare Prediction — Cleaned Practical Notebook

**Tasks covered:**  
1. Pre-process the dataset (datetime parsing, dtype fixes).  
2. Identify outliers (IQR method) and optionally remove them.  
3. Check correlations between features and target.  
4. Implement Linear Regression and Random Forest Regression.  
5. Evaluate models (R², RMSE, MAE) and compare results.

**Note:** This notebook expects a CSV file named `uber.csv` in the same folder as the notebook. If your dataset has a different name or location, update the `DATA_PATH` variable in the imports/settings cell below.


In [None]:

# Basic imports and settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
# Silence every warning category so library notices do not clutter cell output
warnings.simplefilter('ignore')

# Location of the input CSV; edit this if your file has a different name
DATA_PATH = "uber.csv"


In [None]:

# Read the CSV referenced by DATA_PATH, report what was loaded, then preview the first rows
df = pd.read_csv(DATA_PATH)
for label, value in (("Loaded:", DATA_PATH), ("Shape:", df.shape)):
    print(label, value)
df.head()


In [None]:

# Inspect columns and basic info
print(df.columns.tolist())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() would additionally render a stray "None" in the cell output.
df.info()
# Transposed summary statistics: one row per column, easier to scan for wide frames
display(df.describe(include='all').T)


In [None]:

# Remove index-like / identifier columns when the dataset has them (adjust to your dataset)
droppable = [name for name in ('Unnamed: 0', 'key') if name in df.columns]
df.drop(columns=droppable, inplace=True)
print("After dropping cols, shape:", df.shape)


In [None]:

# Derive integer 'month' and 'hour' features from the pickup timestamp, then discard the raw column
if 'pickup_datetime' not in df.columns:
    print("No pickup_datetime column found; ensure your timestamp column is named 'pickup_datetime'")
else:
    # Unparseable timestamps become NaT, which the nullable Int64 dtype carries through as <NA>
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
    pickup_parts = df['pickup_datetime'].dt
    df['month'] = pickup_parts.month.astype('Int64')
    df['hour'] = pickup_parts.hour.astype('Int64')
    # the original timestamp is not used as a feature
    df.drop(columns=['pickup_datetime'], inplace=True)


In [None]:

# Force the expected numeric columns to numeric dtypes; values that cannot be parsed become NaN
num_cols = ['fare_amount','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count']
for column in (name for name in num_cols if name in df.columns):
    df[column] = pd.to_numeric(df[column], errors='coerce')
# Display the resulting dtypes as the cell output
df.dtypes


In [None]:

# Single, vectorized haversine distance function
def haversine_vectorized(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Great-circle (haversine) distance between two sets of points.

    Parameters
    ----------
    lon1, lat1 : scalar or array-like
        Longitude / latitude of the first point(s), in degrees.
    lon2, lat2 : scalar or array-like
        Longitude / latitude of the second point(s), in degrees.
        All four inputs must be broadcastable against each other.
    radius_km : float, default 6371.0
        Sphere radius in kilometres (the default is the value the
        original implementation hard-coded).

    Returns
    -------
    numpy scalar or ndarray
        Distance(s) in kilometres. NaN inputs propagate to NaN outputs.
    """
    # convert degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, (lon1, lat1, lon2, lat2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # For (near-)antipodal points floating-point rounding can push `a` a few
    # ulps above 1, which would make arcsin(sqrt(a)) return NaN. Absorb only
    # that rounding overshoot; values far outside [0, 1] (which can only come
    # from out-of-range latitudes) are left alone and still surface as NaN.
    a = np.where((a > 1.0) & (a <= 1.0 + 1e-12), 1.0, a)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Compute the trip distance only when all four coordinate columns exist.
# coord_cols is ordered (lon1, lat1, lon2, lat2) to match haversine_vectorized's signature.
coord_cols = ['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']
if not set(coord_cols).issubset(df.columns):
    print("Coordinate columns missing; distance not computed.")
else:
    coord_arrays = [df[name].values for name in coord_cols]
    df['distance_km'] = haversine_vectorized(*coord_arrays)
    print("Added distance_km column.")


In [None]:

# Handle missing or suspicious values (vectorised; no per-element Python lambdas)

# passenger_count: anything missing or outside the realistic range 1..6 is replaced
# with the median of the realistic values. The same 1..6 mask defines both the
# median and the replacement, so negative counts are no longer left untouched.
if 'passenger_count' in df.columns:
    realistic_pc = df['passenger_count'].between(1, 6)  # NaN compares as False
    med_pc = df.loc[realistic_pc, 'passenger_count'].median()
    df['passenger_count'] = df['passenger_count'].where(realistic_pc, med_pc)

# fare_amount: missing or non-positive fares are replaced with the median positive fare.
# NOTE(review): this imputes the prediction target; dropping such rows instead may be
# preferable for modelling -- confirm the intended behaviour.
if 'fare_amount' in df.columns:
    positive_fare = df['fare_amount'] > 0  # NaN compares as False
    med_fare = df.loc[positive_fare, 'fare_amount'].median()
    df['fare_amount'] = df['fare_amount'].where(positive_fare, med_fare)

# Drop rows that still have NaN in an essential column (in practice: distance_km,
# which is NaN when any coordinate is missing)
essential_cols = ['fare_amount','distance_km']
present_essentials = [c for c in essential_cols if c in df.columns]
if present_essentials:
    df = df.dropna(subset=present_essentials)

print("After cleaning, shape:", df.shape)


In [None]:

# Outlier detection using IQR for numeric columns of interest
def remove_outliers_iqr(df, cols, k=1.5):
    """Return a copy of ``df`` without rows outside the IQR fences of ``cols``.

    Columns are processed one after another in the order given; each column's
    quartiles are computed on the rows that survived the previous columns.
    Names in ``cols`` that are not columns of ``df`` are skipped. A row is kept
    when its value lies in ``[Q1 - k*IQR, Q3 + k*IQR]`` (bounds inclusive).
    The input frame is not modified.
    """
    result = df.copy()
    for column in cols:
        if column not in result.columns:
            continue
        series = result[column]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        margin = k * (q3 - q1)
        # between() is inclusive on both ends by default
        result = result[series.between(q1 - margin, q3 + margin)]
    return result

# Candidate columns for the IQR screen, limited to those that exist in the frame
numeric_check_cols = []
for candidate in ('fare_amount','distance_km','passenger_count','hour','month'):
    if candidate in df.columns:
        numeric_check_cols.append(candidate)
print("Numeric cols checked for outliers:", numeric_check_cols)
df_no_out = remove_outliers_iqr(df, numeric_check_cols, 1.5)
print("Shape before outlier removal:", df.shape, "after:", df_no_out.shape)
# The filtered frame is only reported here; to model on it instead, uncomment:
# df = df_no_out


In [None]:

# Pairwise correlations between all numeric columns, drawn as an annotated heatmap
corr_matrix = df.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix (numeric features)')
plt.show()


In [None]:

# Prepare features and target
# Candidate features, restricted to the columns actually present
features = [
    c for c in ['distance_km','passenger_count','hour','month',
                'pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']
    if c in df.columns
]
print("Using features:", features)

# Cast to plain float64: 'hour' and 'month' are nullable Int64, and filling such a
# column with a fractional median can raise; <NA> becomes NaN under this cast.
X = df[features].astype('float64')
y = df['fare_amount'].copy()

# Train-test split (done BEFORE imputation so the test rows cannot influence the fill values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Fill any remaining NaNs with medians learned from the training rows only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
print("Train/Test sizes:", X_train.shape, X_test.shape)


In [None]:

# Standardise features for the linear model: statistics are learned from the
# training split only and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Linear Regression: fit on the scaled training split, score on the scaled test split
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# R², root-mean-squared error and mean absolute error on the held-out rows
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

lr_scores = (r2_lr, rmse_lr, mae_lr)
print("Linear Regression — R2: {:.4f}, RMSE: {:.4f}, MAE: {:.4f}".format(*lr_scores))


In [None]:

# Random Forest Regression, trained on the unscaled split (trees are insensitive to feature scale)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Same three hold-out metrics as for the linear model
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

rf_scores = (r2_rf, rmse_rf, mae_rf)
print("Random Forest — R2: {:.4f}, RMSE: {:.4f}, MAE: {:.4f}".format(*rf_scores))


In [None]:

# Side-by-side metrics for both models, one row per model
results = pd.DataFrame(
    [
        ('LinearRegression', r2_lr, rmse_lr, mae_lr),
        ('RandomForest', r2_rf, rmse_rf, mae_rf),
    ],
    columns=['model', 'r2', 'rmse', 'mae'],
)
display(results)


In [None]:

# Rank the Random Forest's feature importances (largest first) and plot them as bars
if hasattr(rf, 'feature_importances_'):
    importances = pd.Series(data=rf.feature_importances_, index=X.columns)
    importances = importances.sort_values(ascending=False)
    print("Feature importances (Random Forest):")
    display(importances)
    fig, ax = plt.subplots(figsize=(6,4))
    importances.plot.bar(ax=ax)
    ax.set_title('Feature importances')
    plt.show()


In [None]:

# Preview a small random sample of the cleaned dataframe (nothing is written to disk).
# A fixed random_state keeps the displayed rows identical across Restart & Run All.
df.sample(5, random_state=42)
