In [31]:
# import
import os
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# # Lets plot
# import seaborn as sns
# import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [32]:
# setting path to the data directory
CURRENT_DIRECTORY = os.getcwd()
PARENT_DIRECTORY = os.path.dirname(CURRENT_DIRECTORY)
DATA_PATH = os.path.join(PARENT_DIRECTORY, '_data')

In [33]:
# Read the data
def read_data(filepath):
    if filepath.endswith('.parquet'):
        df = pq.read_table(filepath).to_pandas()
        df.columns = df.columns.str.lower()
    elif filepath.endswith('.csv'):
        df = pd.read_csv(filepath)
        df.columns = df.columns.str.lower()
    else:
        raise ValueError('Unsupported file format')
    return df

In [51]:
# Calculate trip duration and filter outliers
def preprocess_data(df):
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
    df['pulocationid'] = df['pulocationid'].astype(str)
    df['dolocationid'] = df['dolocationid'].astype(str)
    return df

In [52]:
# One-hot encoding
def one_hot_encode(df, dv=None):
    dicts = df[['pulocationid', 'dolocationid']].to_dict(orient='records')
    if dv is None:
        dv = DictVectorizer(sparse=False)
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)
    return X, dv

In [53]:
# Calculate RMSE
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

In [54]:
# Train model
def train_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    train_rmse = rmse(y_train, y_pred)
    return model, train_rmse

In [55]:
# Read and preprocess January data
# READ JANUARY DATA
january_data_path = os.path.join(DATA_PATH, 'yellow_tripdata_2023-01.parquet')
january_df = read_data(january_data_path)

print(f'Number of columns in January data: {january_df.shape[1]}')
january_df = preprocess_data(january_df)
january_duration_std_dev = january_df['duration'].std()
print(f'Standard deviation of trips duration in January: {january_duration_std_dev:.2f}')

# Filter outliers in January data
january_filtered_df, records_left_fraction = preprocess_data(january_df)
print(f'Fraction of records left after dropping outliers: {records_left_fraction:.2f}')

# Fit DictVectorizer and transform January data
X_train, dv = one_hot_encode(january_filtered_df)
print(f'Dimensionality of the feature matrix: {X_train.shape[1]}')

# Train model on January data
model, train_rmse = train_model(X_train, january_filtered_df['duration'].values)
print(f'Training RMSE: {train_rmse:.2f}')

Number of columns in January data: 19
Standard deviation of trips duration in January: 9.94


ValueError: too many values to unpack (expected 2)

In [None]:


# Read and preprocess February data
february_data_path = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
february_df = read_data(february_data_path)
february_df = preprocess_data(february_df)
february_duration_std_dev = february_df['duration'].std()
print(f'Standard deviation of trips duration in February: {february_duration_std_dev:.2f}')

# Filter outliers in February data
february_filtered_df, _ = outliers(february_df)
print(f'Fraction of records left after dropping outliers: {records_left_fraction:.2f}')

# One-hot encode February data using the same DictVectorizer
X_val, _ = one_hot_encode(february_filtered_df, dv)
print(f'Validation Feature Matrix size: {X_val.shape}')

# Predict on February data and calculate RMSE
y_val = february_filtered_df['duration'].values
y_pred = model.predict(X_val)
val_rmse = rmse(y_val, y_pred)
print(f'Validation RMSE: {val_rmse:.2f}')
