In [1]:
# --- Import necessary libraries ---
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# --- Configuration ---
# Both monthly trip files live under the same prefix; only the year-month suffix differs.
TRIP_DATA_URL_TEMPLATE = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet'
URL_JAN = TRIP_DATA_URL_TEMPLATE.format(month='2023-01')  # source of the training frame
URL_FEB = TRIP_DATA_URL_TEMPLATE.format(month='2023-02')  # source of the validation frame
# Columns that get vectorized into the model's feature matrix.
LOCATION_FEATURES = ['PULocationID', 'DOLocationID']

In [3]:
# --- Data Loading and Preprocessing ---
def load_and_preprocess_data(url, min_duration=1, max_duration=60):
    """
    Loads taxi trip data and prepares it for modelling.

    Reads the parquet file at ``url``, derives a ``duration`` column (minutes
    between ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``), keeps only
    the trips whose duration lies in ``[min_duration, max_duration]``, and
    casts the pickup/dropoff location IDs to strings.

    Args:
        url (str): URL of the parquet file (passed to ``pd.read_parquet``).
        min_duration (int): Minimum trip duration in minutes (inclusive).
        max_duration (int): Maximum trip duration in minutes (inclusive).

    Returns:
        pd.DataFrame: Filtered DataFrame with the added ``duration`` column
        and string-typed ``PULocationID`` / ``DOLocationID`` columns.
    """
    print(f"Loading data from: {url}")
    df = pd.read_parquet(url)

    # Calculate trip duration in minutes
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # Filter data based on duration (both bounds inclusive).
    # The explicit .copy() makes the filtered result an independent frame, so
    # the column assignments below do not trigger SettingWithCopyWarning.
    in_range = (df['duration'] >= min_duration) & (df['duration'] <= max_duration)
    df = df[in_range].copy()

    # Convert location IDs to strings so DictVectorizer one-hot encodes them
    # as categories instead of treating them as numeric values.
    df['PULocationID'] = df['PULocationID'].astype(str)
    df['DOLocationID'] = df['DOLocationID'].astype(str)

    print(f"Data shape after preprocessing: {df.shape}")
    return df

In [4]:
# January trips form the training set and February trips the validation set.
# Both use the function's default duration window.
df_train = load_and_preprocess_data(url=URL_JAN)
df_val = load_and_preprocess_data(url=URL_FEB)

Loading data from: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Data shape after preprocessing: (3009173, 20)
Loading data from: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Data shape after preprocessing: (2855951, 20)


In [5]:
# --- Feature Engineering ---
def create_feature_matrix(df, features, dv=None):
    """
    Creates a feature matrix from a DataFrame using DictVectorizer.

    When ``dv`` is omitted, a new DictVectorizer is fitted on ``df``. When an
    already-fitted vectorizer is supplied, it is reused with ``transform``
    only, so the resulting columns line up with the matrix it was fitted on
    (e.g. for encoding validation data after fitting on training data).

    Args:
        df (pd.DataFrame): Input DataFrame.
        features (list): List of column names to use as features.
        dv (DictVectorizer, optional): A fitted vectorizer to reuse. If None
            (the default), a new one is created and fitted on ``df``.

    Returns:
        tuple: A tuple containing:
            - The feature matrix returned by the vectorizer (a scipy sparse
              matrix with DictVectorizer's default settings).
            - DictVectorizer: The vectorizer used (newly fitted, or the one
              passed in as ``dv``).
    """
    print("Creating feature matrix...")
    # One dict per row, restricted to the requested feature columns.
    dicts = df[features].to_dict(orient='records')
    if dv is None:
        dv = DictVectorizer()
        X = dv.fit_transform(dicts)
    else:
        # Reuse the fitted mapping; never refit on the new data.
        X = dv.transform(dicts)
    print(f"Feature matrix shape: {X.shape}")
    print(f"Number of features: {len(dv.feature_names_)}")
    return X, dv

In [6]:
# Fit the vectorizer on the training frame; `dv` holds the learned feature mapping.
X_train, dv = create_feature_matrix(df=df_train, features=LOCATION_FEATURES)

# Encode the validation frame with the already-fitted vectorizer (transform only,
# no refit) so both matrices share the same columns.
print("Creating feature matrix for validation data...")
val_dicts = df_val[LOCATION_FEATURES].to_dict('records')
X_val = dv.transform(val_dicts)
print(f"Validation feature matrix shape: {X_val.shape}")

Creating feature matrix...
Feature matrix shape: (3009173, 515)
Number of features: 515
Creating feature matrix for validation data...
Validation feature matrix shape: (2855951, 515)


In [7]:
# --- Model Training ---
def train_model(X_train, y_train):
    """
    Fit a LinearRegression estimator on the given training data.

    Prints a progress message before and after fitting.

    Args:
        X_train (scipy.sparse.csr_matrix): Training feature matrix.
        y_train (np.ndarray): Training target variable.

    Returns:
        sklearn.linear_model.LinearRegression: The fitted estimator.
    """
    print("Training Linear Regression model...")
    # fit() returns the estimator itself, so construction and fitting can be chained.
    regressor = LinearRegression().fit(X_train, y_train)
    print("Model training complete.")
    return regressor

In [8]:
# The regression target is the trip duration (minutes) of the training frame,
# pulled out as a NumPy array.
y_train = df_train['duration'].to_numpy()

# Fit the linear model on the vectorized training features.
model = train_model(X_train=X_train, y_train=y_train)

Training Linear Regression model...
Model training complete.


In [9]:
# --- Model Evaluation ---
def evaluate_model(model, X, y, data_name):
    """
    Score a fitted model with Root Mean Squared Error (RMSE).

    Predicts on ``X``, compares the predictions against ``y``, prints the
    result rounded to two decimals, and returns the unrounded value.

    Args:
        model (sklearn.linear_model.LinearRegression): The trained model.
        X (scipy.sparse.csr_matrix): Feature matrix.
        y (np.ndarray): Target variable.
        data_name (str): Dataset label used in the printed messages
            (e.g., 'train', 'validation').

    Returns:
        float: RMSE score.
    """
    print(f"Evaluating model on {data_name} data...")
    predictions = model.predict(X)
    # RMSE is the square root of the mean squared error.
    squared_error = mean_squared_error(y, predictions)
    score = np.sqrt(squared_error)
    print(f'RMSE on {data_name}: {score:.2f}')
    return score

In [10]:
# In-sample error: score the model on the data it was fitted on.
train_rmse = evaluate_model(model, X_train, y_train, data_name='train')

# Out-of-sample error: the validation target is the duration column of the validation frame.
y_val = df_val['duration'].to_numpy()
val_rmse = evaluate_model(model, X_val, y_val, data_name='validation')

Evaluating model on train data...
RMSE on train: 7.65
Evaluating model on validation data...
RMSE on validation: 7.81


In [11]:
# --- Summary ---
# Report both scores in one call; sep="\n" puts each entry on its own line.
print(
    "\n--- Summary ---",
    f"RMSE on training data: {train_rmse:.2f}",
    f"RMSE on validation data: {val_rmse:.2f}",
    sep="\n",
)


--- Summary ---
RMSE on training data: 7.65
RMSE on validation data: 7.81
