In [22]:
import pandas as pd

In [23]:
# Training data: yellow-taxi trip records for January 2023 (path is relative to this notebook).
train = pd.read_parquet("../../data/yellow_tripdata_2023-01.parquet", engine="pyarrow")

In [24]:
# Print the row count and per-column dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

# Homework

In [25]:
# Q1: How many columns are in the January 2023 yellow taxi trips data?
# Answer: 19
# The second entry of .shape is the number of columns.
train.shape[1]

19

In [26]:
# Q2: What is the standard deviation of the trip duration?
# Answer: 42.59
def calculate_trip_duration(data):
    """Return each trip's duration in minutes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        columns holding datetimes (or values ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.Series
        Drop-off minus pick-up, expressed as float minutes, named
        ``"duration"`` and sharing the index of ``data``. Values are negative
        when a drop-off timestamp precedes its pick-up timestamp.

    Notes
    -----
    ``data`` is never modified. Only the two timestamp columns are read, so
    the full frame is not copied.
    """
    dropoff = pd.to_datetime(data["tpep_dropoff_datetime"])
    pickup = pd.to_datetime(data["tpep_pickup_datetime"])

    # Timedelta -> seconds -> minutes.
    minutes = (dropoff - pickup).dt.total_seconds() / 60

    # Subtracting two differently named Series drops the name; restore the
    # name callers see.
    return minutes.rename("duration")

# Attach the trip duration (in minutes) to the training frame, then summarise
# its distribution rounded to two decimals; the `std` row answers Q2.
train["duration"] = calculate_trip_duration(data=train)
train["duration"].describe().round(2)

count    3066766.00
mean          15.67
std           42.59
min          -29.20
25%            7.12
50%           11.52
75%           18.30
max        10029.18
Name: duration, dtype: float64

In [27]:
# Q3: What fraction of the records is left after dropping the outliers? (keep only records with a duration between 1 and 60 minutes, inclusive)
# Answer: 0.98

# NOTE: `train` is rebound to the filtered frame below, so re-running this cell
# on its own reports 1.0; re-run from the data-loading cell to reproduce the answer.
nrows = len(train)
train = train[train["duration"].between(1, 60)]  # inclusive on both ends

len(train) / nrows

0.9812202822125979

In [28]:
# Q4: One-hot-encoding location ids. How many feature columns are in the resulting dataframe?
# Answer: 515
from sklearn.feature_extraction import DictVectorizer


def make_dicts(data):
    """Turn the pickup/drop-off location ids into a list of feature dicts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    list[dict]
        One ``{"PULocationID": str, "DOLocationID": str}`` dict per row, in
        row order. The ids are cast to strings so that a ``DictVectorizer``
        treats them as categories rather than as numeric features.

    Notes
    -----
    ``data`` is never modified. The two columns are selected before casting,
    so the rest of the frame is not copied.
    """
    location_ids = data[["PULocationID", "DOLocationID"]].astype(str)
    return location_ids.to_dict(orient="records")

train_dicts = make_dicts(data=train)


# Fit the vectorizer on the training records only; the fitted `dv` is reused
# with .transform() on the validation records so both matrices share columns.
dv = DictVectorizer()

X = dv.fit_transform(train_dicts)
X.shape

(3009173, 515)

In [29]:
# Q5: Train a linear regression model to predict the trip duration. What is the RMSE?
# Answer: 7.64
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Target: trip duration in minutes for the filtered training rows.
y = train["duration"].values

lr = LinearRegression()
lr.fit(X, y)
# Predictions are made on the same X the model was fitted on, so this is the
# training RMSE, not a held-out estimate.
y_pred = lr.predict(X)
rmse = root_mean_squared_error(y, y_pred)
rmse

7.649261931416412

In [30]:
# Q6: Evaluate the model on the validation set. What is the RMSE?
# Answer: 7.81
# Validation data: yellow-taxi trip records for February 2023.
# NOTE(review): the .info() output shows this file's schema differs slightly
# from January's (int32 rather than int64 for VendorID / PULocationID /
# DOLocationID, and `Airport_fee` rather than `airport_fee`). The model here
# only uses the two location-id columns cast to str, so this looks harmless,
# but confirm before using other columns.
validation = pd.read_parquet("../../data/yellow_tripdata_2023-02.parquet", engine="pyarrow")
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2913955 entries, 0 to 2913954
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [31]:
validation["duration"] = calculate_trip_duration(data=validation)
# Same inclusive 1-60 minute window that was applied to the training data.
validation = validation[validation["duration"].between(1, 60)]

y_valid = validation["duration"].values

valid_dicts = make_dicts(data=validation)
# transform (not fit_transform): reuse the feature columns learned from training.
X_valid = dv.transform(valid_dicts)

In [32]:
y_valid_pred = lr.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred). RMSE is symmetric, so the value
# is unchanged, but this matches the documented order and the training-set call.
rmse_valid = root_mean_squared_error(y_valid, y_valid_pred)
rmse_valid

7.8118162035401735