In [1]:
# pip install pyarrow  (parquet engine for pd.read_parquet below; run only if it is missing)


The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in this module.


In [45]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


## Q1. Downloading the data
We'll use [the same NYC taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
but instead of "**Green** Taxi Trip Records", we'll use "**Yellow** Taxi Trip Records".

Download the data for January and February 2022.

In [77]:
# Load the raw January 2022 yellow-taxi trips. The directory is spelled 'Data/'
# exactly as in the read_dataframe(...) calls later in this notebook, so that
# every read resolves to the same folder on case-sensitive filesystems too.
df_jan = pd.read_parquet('Data/yellow_tripdata_2022-01.parquet')


In [78]:
# Summarise the January frame: row count, column names and dtypes.
df_jan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463931 entries, 0 to 2463930
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [79]:
# Q1: the number of columns is the length of the column index.
n_columns = len(df_jan.columns)
print('Question 1: Number of columns in Jan 2022 Yellow Taxi Trip data:', n_columns)

Question 1: Number of columns in Jan 2022 Yellow Taxi Trip data: 19


## Q2. Computing duration

Now let's compute the `duration` variable. It should contain the duration of a ride in minutes. 

In [80]:
# Trip length in minutes (dropoff minus pickup). The vectorised
# `.dt.total_seconds()` accessor replaces a Python-level lambda applied to
# every row, which is far slower on millions of rows and yields the same floats.
trip_length = df_jan['tpep_dropoff_datetime'] - df_jan['tpep_pickup_datetime']
df_jan['duration'] = trip_length.dt.total_seconds() / 60



In [81]:
# Q2: sample standard deviation of the duration column, rounded to 2 decimals.
duration_std = df_jan['duration'].std()
print('Question 2:  Standard deviation of the trips duration in Jan 2022 Yellow Taxi Trip data:', np.round(duration_std, 2))

Question 2:  Standard deviation of the trips duration in Jan 2022 Yellow Taxi Trip data: 46.45


## Q3. Dropping outliers

Next, we need to check the distribution of the `duration` variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).


In [82]:
# Remember how many rows exist before outliers are dropped (used for Q3).
init_rec_count = len(df_jan)
init_rec_count


2463931

In [83]:
# Keep only rides lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` turns the filtered slice into an independent frame, so later
# column assignments on df_jan do not raise SettingWithCopyWarning.
df_jan = df_jan[df_jan['duration'].between(1, 60)].copy()

In [84]:
# Q3: share of rows that survived the filter, truncated to a whole percent.
kept_pct = int(100 * (len(df_jan) / init_rec_count))
print('Question 3: Fraction of the records left after dropping the outliers:', f'{kept_pct}%')

Question 3: Fraction of the records left after dropping the outliers: 98%


## Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model. 

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer 
* Get a feature matrix from it

In [85]:
categorical = ['PULocationID', 'DOLocationID']

# Cast the location IDs to strings so DictVectorizer one-hot encodes them
# rather than passing them through as numbers. Reassigning the result of
# `astype` (instead of writing columns into the filtered slice) avoids
# pandas' SettingWithCopyWarning.
df_jan = df_jan.astype({col: str for col in categorical})

In [86]:
# Target vector: the ride duration column.
target = 'duration'
y_train = df_jan[target].values

# One dict per ride holding only the pickup/dropoff location features.
train_dicts = df_jan[categorical].to_dict(orient='records')

# Fit the vectorizer on those dicts and build the training feature matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [42]:
# Q4: number of columns in the encoded feature matrix.
n_features = X_train.shape[1]
print('Question 4: Dimensionality after OHE:', n_features)

Question 4: Dimensionality after OHE: 515


## Q5. Training a model

Now let's use the feature matrix from the previous step to train a model. 

* Train a plain linear regression model with default parameters 
* Calculate the RMSE of the model on the training data


In [88]:
# Fit a default linear regression and predict on the same training matrix.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)


In [51]:
# Q5: RMSE on the training data, computed as sqrt(MSE). The `squared=False`
# argument of mean_squared_error is deprecated since scikit-learn 1.4 and
# scheduled for removal, so the root is taken explicitly; this works on every
# scikit-learn version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
print('Question 5: RMSE on train:', np.round(rmse_train, 2))

Question 5: RMSE on train: 6.99


## Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2022). 


In [89]:
def read_dataframe(filename):
    """Load a yellow-taxi trip file and prepare it for modelling.

    Reads a ``.csv`` or ``.parquet`` file, adds a ``duration`` column holding
    the trip length in minutes (dropoff minus pickup), keeps only trips that
    lasted between 1 and 60 minutes inclusive, and casts the pickup/dropoff
    location IDs to strings.

    Parameters
    ----------
    filename : str
        Path to the trip file; its extension selects the reader.

    Returns
    -------
    pandas.DataFrame
        Independent, filtered frame with the extra ``duration`` column.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV carries no dtypes, so timestamps arrive as strings. Convert the
        # tpep_* columns themselves: the duration below is computed from them,
        # and assigning to a differently named attribute would leave them as
        # strings and make the subtraction fail.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `df`.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised timedelta -> minutes (no per-row Python lambda).
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # `.copy()` detaches the filtered rows so the column assignment below does
    # not raise SettingWithCopyWarning.
    df = df[df['duration'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [90]:
# Load and preprocess the February 2022 trips.
# NOTE(review): df_feb is not referenced in the cells visible below, which load
# the same file again as df_val -- confirm whether this cell is still needed.
df_feb = read_dataframe('Data/yellow_tripdata_2022-02.parquet')



In [106]:
# January is the training month, February the validation month.
train_path = 'Data/yellow_tripdata_2022-01.parquet'
val_path = 'Data/yellow_tripdata_2022-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [107]:

# Turn both frames into per-ride dicts of the location features.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training dicts only, then reuse it for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [109]:
# Target vectors for both splits, taken from the duration column.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [111]:
# Train on January, then evaluate on the February validation matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Q6: RMSE computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and scheduled for
# removal, so the root is taken explicitly.
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
print('Question 6: RMSE on validation:', np.round(rmse_val, 2))

Question 6: RMSE on validation: 7.79
