# Uber Trips — College Practical (EDA + Cleaning + Simple Model)

This notebook loads **`uber.csv`**, performs **EDA**, **data cleaning**, **feature engineering** (Haversine distance), builds visualizations, and trains a **Linear Regression** model to predict fare.

**What you'll get**
- Cleaned dataset with features
- Descriptive statistics
- Plots (hour, day-of-week, daily trend, fare & distance histograms, fare vs distance scatter)
- Model metrics (R², MAE) and coefficients

> Tip: Run the cells from top to bottom. If your file has a different name, change `CSV_PATH` in the configuration cell below (the cell right after the imports).


In [None]:
# If needed, install libraries (uncomment if your environment doesn't have them)
# %pip install -q pandas numpy matplotlib scikit-learn

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Notebook options: allow up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)


In [None]:
# --- Configuration ---
# Path of the input CSV that is handed to load_dataset() further down.
CSV_PATH = 'uber.csv'   # <-- change if your filename is different
# Directory that plot_and_save() writes its figure files into.
PLOTS_DIR = Path('plots')
# Create the directory (and any missing parents) up front; no error if it already exists.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

def load_dataset(csv_path: str, sample: int = None) -> pd.DataFrame:
    """Read a CSV file into a DataFrame, optionally down-sampled.

    The delimiter is auto-detected first; if that attempt fails, the file is
    re-read as a plain comma-separated file.

    Args:
        csv_path: Path of the CSV file to read.
        sample: If given and the file has more rows than this, keep a random
            sample of exactly this many rows (fixed seed, so it is repeatable).

    Returns:
        The loaded frame without any ``Unnamed...`` columns and with a fresh
        0..n-1 index.
    """
    import csv
    import pandas as pd

    try:
        # sep=None makes the python engine sniff the delimiter. `low_memory`
        # is a C-engine option and is rejected by the python engine, so it
        # must not be passed here (passing it made this attempt always fail).
        df = pd.read_csv(csv_path, sep=None, engine='python')
    except (ValueError, csv.Error):
        # Sniffing/parsing failed: retry with the default comma-separated parser.
        df = pd.read_csv(csv_path, low_memory=False)

    # Drop unnamed index-like columns (e.g. a saved index written without a header).
    unnamed = [c for c in df.columns if str(c).lower().startswith('unnamed')]
    if unnamed:
        df = df.drop(columns=unnamed)

    if sample is not None and len(df) > sample:
        df = df.sample(n=sample, random_state=42)
    return df.reset_index(drop=True)

def haversine_km(lat1, lon1, lat2, lon2):
    """Vectorized Haversine (great-circle) distance in kilometers.

    Args:
        lat1, lon1: Latitude/longitude of the first point(s), in degrees.
        lat2, lon2: Latitude/longitude of the second point(s), in degrees.
            Each argument may be a pandas Series, a NumPy array, a list or a
            plain number; array-likes are combined element-wise.

    Returns:
        Distance(s) in kilometers on a sphere of radius 6371 km. Series input
        yields a Series; other input yields a NumPy array or scalar.
    """
    R = 6371.0

    def _to_radians(values):
        # Series/ndarray keep their own type via astype; plain numbers and
        # lists have no astype, so route them through np.asarray instead.
        if hasattr(values, 'astype'):
            return np.radians(values.astype(float))
        return np.radians(np.asarray(values, dtype=float))

    lat1 = _to_radians(lat1)
    lon1 = _to_radians(lon1)
    lat2 = _to_radians(lat2)
    lon2 = _to_radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally above 1 for near-antipodal points,
    # which would turn arcsin into NaN; cap it at 1.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

def plot_and_save(series_or_df, kind, title, xlabel, ylabel, filename: str):
    """Draw one chart on its own figure, save it under PLOTS_DIR and show it.

    Matplotlib only, no explicit colors.

    Args:
        series_or_df: Data to plot. For 'scatter' the first two columns are
            used as x and y (accessed positionally via ``.iloc``); for the
            other kinds the object's own ``.plot`` method is used.
        kind: One of 'bar', 'line', 'hist' (50 bins) or 'scatter'.
        title: Figure title.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        filename: File name (relative to PLOTS_DIR) for the saved image.

    Returns:
        Path of the saved image.

    Raises:
        ValueError: If ``kind`` is not one of the supported values.
    """
    if kind not in ('bar', 'line', 'hist', 'scatter'):
        # Validate before creating the figure so a bad `kind` does not leave
        # an empty figure open.
        raise ValueError('Unsupported plot kind')

    fig, ax = plt.subplots()
    try:
        if kind == 'scatter':
            ax.scatter(series_or_df.iloc[:, 0], series_or_df.iloc[:, 1], s=5)
        elif kind == 'hist':
            series_or_df.plot(kind='hist', bins=50, ax=ax)
        else:
            # Passing ax explicitly keeps the drawing on this figure even when
            # a DataFrame is given.
            series_or_df.plot(kind=kind, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.tight_layout()
        out_path = PLOTS_DIR / filename
        fig.savefig(out_path, dpi=150)
        plt.show()
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)
    return out_path


In [None]:
# Load the CSV; when it holds more than 120000 rows, load_dataset() keeps a
# fixed-seed random sample of that size.
df = load_dataset(CSV_PATH, sample=120000)  # sample to keep things snappy
print(f'Rows: {len(df)} | Columns: {len(df.columns)}')
df.head()  # preview the first rows as the cell output

In [None]:
# Identify and parse a datetime-like column (prefer 'pickup_datetime')
candidates = ['pickup_datetime', 'date_time', 'datetime', 'timestamp', 'date/time', 'date', 'time']

# Lower-cased name -> actual column label; setdefault keeps the first column
# when two labels differ only by case.
columns_by_lower = {}
for c in df.columns:
    columns_by_lower.setdefault(str(c).lower(), c)

# Exact case-insensitive match, honouring the priority order of `candidates`.
dt_col = next((columns_by_lower[cand] for cand in candidates if cand in columns_by_lower), None)

if dt_col is None:
    # Fuzzy fallback: first column whose name mentions both 'pickup' and 'time'.
    dt_col = next(
        (c for c in df.columns if 'pickup' in str(c).lower() and 'time' in str(c).lower()),
        None,
    )

if dt_col is None:
    raise ValueError("No datetime-like column found (e.g., 'pickup_datetime'). Found columns: " + ', '.join(map(str, df.columns)))

# Unparseable values become NaT (errors='coerce'). The deprecated
# infer_datetime_format flag is intentionally not passed: recent pandas
# versions infer the format on their own and warn when the flag is used.
df['pickup_datetime'] = pd.to_datetime(df[dt_col], errors='coerce')
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.day_name()
df['date'] = df['pickup_datetime'].dt.date

df[['pickup_datetime','hour','day_of_week','date']].head()

In [None]:
# Validate that the essential columns exist, then drop nulls and obvious outliers
required = [
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'pickup_datetime',
]
missing = list(filter(lambda name: name not in df.columns, required))
if missing:
    raise ValueError(f'Missing required columns: {missing}')

before = len(df)
df = df.dropna(subset=required)

# Build every row filter on the null-free frame, then apply them together.
# NYC-ish bounds (guards against swapped coords too)
lat_ok = df['pickup_latitude'].between(40.0, 42.0) & df['dropoff_latitude'].between(40.0, 42.0)
lon_ok = df['pickup_longitude'].between(-75.0, -72.0) & df['dropoff_longitude'].between(-75.0, -72.0)
# Reasonable passenger count: 1 to 6 inclusive
passengers_ok = df['passenger_count'].between(1, 6)
# Positive, reasonable fares: strictly between 0 and 300
fare_ok = df['fare_amount'].gt(0) & df['fare_amount'].lt(300)

df = df[lat_ok & lon_ok & passengers_ok & fare_ok]

# Remove exact duplicate rows and renumber
df = df.drop_duplicates().reset_index(drop=True)
after = len(df)
print(f'Cleaning reduced rows from {before} -> {after}')
df.head()

In [None]:
# Great-circle trip length from pickup point to dropoff point
pickup = (df['pickup_latitude'], df['pickup_longitude'])
dropoff = (df['dropoff_latitude'], df['dropoff_longitude'])
df['distance_km'] = haversine_km(*pickup, *dropoff)

# Keep only trips longer than 0.05 km and shorter than 100 km
before = len(df)
plausible = df['distance_km'].gt(0.05) & df['distance_km'].lt(100)
df = df.loc[plausible].reset_index(drop=True)
after = len(df)
print(f'Feature engineering filtered rows: {before} -> {after}')
df[['fare_amount','distance_km','hour','passenger_count']].head()

In [None]:
# Descriptive statistics for every column, also written to summary.csv
try:
    # Older pandas needs this flag to summarise datetime columns numerically.
    summary = df.describe(include='all', datetime_is_numeric=True)
except TypeError:
    # pandas 2.0 removed `datetime_is_numeric` (datetimes are always treated
    # as numeric there), so the keyword is rejected; call without it.
    summary = df.describe(include='all')
summary_path = Path('summary.csv')
summary.to_csv(summary_path, index=True)
summary

In [None]:
# Trips per pickup hour, ordered by hour
hour_counts = df['hour'].value_counts(sort=False).sort_index()
plot_and_save(
    hour_counts, kind='bar', title='Trips by Hour of Day',
    xlabel='Hour', ylabel='Trips', filename='trips_by_hour.png',
)

# Trips per weekday, forced into Mon..Sun order
order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
dow_counts = df['day_of_week'].value_counts().reindex(order)
plot_and_save(
    dow_counts, kind='bar', title='Trips by Day of Week',
    xlabel='Day', ylabel='Trips', filename='trips_by_dow.png',
)

# Number of trips per calendar date
daily = df.groupby('date').size()
plot_and_save(
    daily, kind='line', title='Daily Trip Counts',
    xlabel='Date', ylabel='Trips', filename='daily_trips.png',
)

# Distribution of fares
plot_and_save(
    df['fare_amount'], kind='hist', title='Fare Amount Distribution',
    xlabel='Fare ($)', ylabel='Frequency', filename='fare_hist.png',
)

# Distribution of trip distances
plot_and_save(
    df['distance_km'], kind='hist', title='Trip Distance Distribution',
    xlabel='Distance (km)', ylabel='Frequency', filename='distance_hist.png',
)

# Fare against distance on a fixed-seed sample of at most 20000 trips
small = df[['distance_km','fare_amount']].sample(n=min(20000, len(df)), random_state=42)
plot_and_save(
    small, kind='scatter', title='Fare vs Distance (sample)',
    xlabel='Distance (km)', ylabel='Fare ($)', filename='fare_vs_distance_scatter.png',
)


In [None]:
# Linear Regression: fare ~ distance_km + hour + passenger_count
features = ['distance_km','hour','passenger_count']
X = df[features].to_numpy(dtype=float)
y = df['fare_amount'].to_numpy(dtype=float)

# 80/20 split with a fixed seed so the numbers are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred_tr = model.predict(X_train)
y_pred_te = model.predict(X_test)

# Goodness of fit first, then one coefficient per feature, then bookkeeping
metrics = {
    'train_r2': float(r2_score(y_train, y_pred_tr)),
    'test_r2': float(r2_score(y_test, y_pred_te)),
    'test_mae': float(mean_absolute_error(y_test, y_pred_te)),
}
metrics.update({f'coef_{name}': float(weight) for name, weight in zip(features, model.coef_)})
metrics.update(
    intercept=float(model.intercept_),
    n_train=int(len(y_train)),
    n_test=int(len(y_test)),
)
metrics

In [None]:
# Write the cleaned frame (including the engineered columns added above) to a
# CSV in the current working directory, without the row index.
out_csv = Path('uber_clean_features.csv')
df.to_csv(out_csv, index=False)
print('Saved cleaned dataset with features to:', out_csv.resolve())

## Next Steps (optional)
- Try **Regularized** models (Ridge/Lasso) or **Tree-based** models.
- Add **rush-hour** flags, **month/season** dummies.
- Clip or transform fares (e.g., log) and compare metrics.
- Evaluate with **R²**, **MAE**, and **Residual plots**.
