# Quiz answers are in the last cell of this notebook!

# Install required packages

In [1]:
!pip install pyarrow scikit-learn



# Overview
The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.

We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records". 
Download the data for January and February 2023 from [here](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).

# Solution

In [2]:
import pandas as pd
import sklearn
# Print the library versions used for this run.
print(f"Pandas version: {pd.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")

# Download the January and February 2023 Yellow Taxi trip records.
# -nc (no-clobber) skips the download when the file already exists locally.
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

def load_prep_data(filepath, min_duration=1, max_duration=60):
    """Load a trip-record parquet file and build a cleaned copy with durations.

    Parameters
    ----------
    filepath : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
        The file must contain ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` columns.
    min_duration, max_duration : float, default 1 and 60
        Inclusive bounds, in minutes, on the trip durations to keep.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``df_raw`` is the frame exactly as read (no ``duration`` column).
        ``df`` is a separate frame with an added float ``duration`` column
        (minutes), restricted to rows inside the duration bounds.
    """
    df_raw = pd.read_parquet(filepath)

    # Trip duration as a Timedelta column, converted to float minutes.
    # The .dt accessor converts the whole column at once instead of calling
    # a Python lambda for every row.
    elapsed = df_raw.tpep_dropoff_datetime - df_raw.tpep_pickup_datetime
    duration_minutes = elapsed.dt.total_seconds() / 60

    # assign() returns a new frame, so the raw input stays untouched.
    df = df_raw.assign(duration=duration_minutes)

    # Dropping outliers (between() is inclusive on both ends by default).
    df = df[df.duration.between(min_duration, max_duration)]
    return df_raw, df

# Read the locally downloaded parquet files: January first, then February.
# Each call yields a (raw frame, prepared frame) pair.
(df_raw_jan, df_jan), (df_raw_feb, df_feb) = map(
    load_prep_data,
    ('./yellow_tripdata_2023-01.parquet', './yellow_tripdata_2023-02.parquet'),
)

Pandas version: 2.2.2
scikit-learn version: 1.5.2
File ‘yellow_tripdata_2023-01.parquet’ already there; not retrieving.

File ‘yellow_tripdata_2023-02.parquet’ already there; not retrieving.



In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Apply one-hot encoding on PULocationID and DOLocationID columns.
# DictVectorizer one-hot encodes only string-valued features; numeric values
# are passed through unchanged as a single numeric column. Since pickup and
# dropoff IDs are categorical values stored as integers (however
# 'location ID A' > 'location ID B' has no meaning), we have to convert them
# to string for DictVectorizer to one-hot encode them.
# Before one hot encoding:
#   [('PULocationID', 'DOLocationID'), 
#   (161, 141), 
#   (43, 237)]
# After one hot encoding:
#   [("DOLocationID=141","DOLocationID=237","PULocationID=161","PULocationID=43"), 
#   (1.0, 0.0, 1.0, 0.0),
#   (0.0, 1.0, 0.0, 1.0)]

categorical = ['PULocationID', 'DOLocationID']

# January is the training set: the vectorizer learns its columns from it.
train_dicts = df_jan[categorical].astype(str).to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_jan['duration'].values

# Training a model
lr = LinearRegression()
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
print(f"Performance on train set: {root_mean_squared_error(y_train, y_train_pred)}")

# Validating
# February is the validation set. It gets its own variable so the training
# records are not overwritten, and it is only transformed (never fitted) so
# the columns match X_train; features not seen during fit are ignored.
val_dicts = df_feb[categorical].astype(str).to_dict(orient='records')

X_val = dv.transform(val_dicts)
y_val = df_feb['duration'].values

y_val_pred = lr.predict(X_val)
print(f"Performance on validation set: {root_mean_squared_error(y_val, y_val_pred)}")

Performance on train set: 7.649261937621321
Performance on validation set: 7.811817646307258


# Quiz answers

In [4]:
# Q1. Downloading the data
# Column count of the January frame as read from disk.
n_columns = len(df_raw_jan.columns)
print(f"Q1. Read the data for January. How many columns are there?\n  Answer: {n_columns}")  # 19

# Q2. Computing duration
# Computed on the unfiltered January frame, so outlier trips are included.
duration = df_raw_jan.tpep_dropoff_datetime - df_raw_jan.tpep_pickup_datetime
duration_std_minutes = duration.std().total_seconds() / 60
print(f"Q2. What's the standard deviation of the trips duration in January?\n  Answer: {duration_std_minutes:0.2f}")  # 42.59

# Q3. Dropping outliers
# Share of January rows that survive the duration filter.
kept_fraction = df_jan.shape[0] / df_raw_jan.shape[0]
print(f"Q3. What fraction of the records left after you dropped the outliers?\n  Answer: {kept_fraction:0.0%}")  # 98%

# Q4. One-hot encoding
# Number of columns of the one-hot encoded training matrix.
n_features = X_train.shape[1]
print(f"Q4. What's the dimensionality of this matrix (number of columns)?\n  Answer: {n_features}")  # 515

# Q5. Training a model
rmse_train = root_mean_squared_error(y_train, y_train_pred)
print(f"Q5. What's the RMSE on train?\n  Answer: {rmse_train:0.3f}")  # 7.649

# Q6. Evaluating the model
rmse_val = root_mean_squared_error(y_val, y_val_pred)
print(f"Q6. What's the RMSE on validation?\n  Answer: {rmse_val:0.3f}")  # 7.812

Q1. Read the data for January. How many columns are there?
  Answer: 19
Q2. What's the standard deviation of the trips duration in January?
  Answer: 42.59
Q3. What fraction of the records left after you dropped the outliers?
  Answer: 98%
Q4. What's the dimensionality of this matrix (number of columns)?
  Answer: 515
Q5. What's the RMSE on train?
  Answer: 7.649
Q6. What's the RMSE on validation?
  Answer: 7.812
