In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
!pip install pyarrow
!pip install fastparquet



In [3]:
# Yellow-taxi trip records for January 2023 (training month).
train_path = './data/yellow_tripdata_2023-01.parquet'
df = pd.read_parquet(train_path)

### Q1. Read the data for January. How many columns are there?

In [4]:
# Number of columns in the raw January frame (second entry of (rows, cols)).
df.shape[1]

19

### Q2. Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trip durations in January?

In [5]:
# Ride duration in minutes: drop-off minus pick-up gives a timedelta column,
# which is converted to seconds with the vectorized `.dt` accessor (no
# per-row Python lambda) and then divided by 60.
trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_time.dt.total_seconds() / 60

# Standard deviation of the duration, rounded to two decimals.
round(df.duration.std(), 2)

42.59

### Q3. Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).<br/>
What fraction of the records is left after you drop the outliers?

In [6]:
# Row count before filtering; len(df) avoids materialising the whole
# duration column as a Python list just to count it.
tot_records = len(df)

# Keep rides lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not raise SettingWithCopyWarning.
# NOTE: this cell rebinds `df`; re-running it without reloading the data
# makes tot_records the already-filtered count.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Share of records kept, expressed as a percentage with two decimals.
round((df.shape[0] / tot_records) * 100, 2)

98.12

### Q4. One-hot encoding
What's the dimensionality of this matrix (number of columns)?

In [9]:
# Pick-up and drop-off location IDs are cast to strings so that
# DictVectorizer one-hot encodes each ID instead of keeping it as a number.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

# One {column: value} dict per ride.
train_dicts = df[categorical].to_dict(orient='records')

# Learn the feature vocabulary from the training rides and encode them.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# (number of rides, number of one-hot columns)
X_train.shape

(3009173, 515)

### Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

1. Train a plain linear regression model with default parameters
2. Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [10]:
# Regression target: the ride duration column computed earlier (minutes).
target = 'duration'
y_train = df[target].values

# Plain linear regression with default parameters, fitted on the one-hot
# location features.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
mean_squared_error(y_train, y_pred) ** 0.5

7.649261812064437

### Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [11]:
# Yellow-taxi trip records for February 2023 (validation month).
val_path = './data/yellow_tripdata_2023-02.parquet'
test_df = pd.read_parquet(val_path)

In [12]:
# Ride duration in minutes for the validation month, computed the same way
# as for training: timedelta -> seconds via the vectorized `.dt` accessor
# (no per-row Python lambda) -> minutes.
val_trip_time = test_df.tpep_dropoff_datetime - test_df.tpep_pickup_datetime
test_df['duration'] = val_trip_time.dt.total_seconds() / 60

In [13]:
# Same outlier rule as for training: keep rides of 1 to 60 minutes inclusive.
# .copy() makes the result an independent frame, so the column assignment in
# the next cell does not raise SettingWithCopyWarning.
test_df = test_df[(test_df.duration >= 1) & (test_df.duration <= 60)].copy()

In [14]:
# Encode the validation rides with the vectorizer fitted on the training
# data: `transform` (not `fit_transform`) keeps the same feature columns.
categorical = ['PULocationID', 'DOLocationID']
test_df[categorical] = test_df[categorical].astype(str)
val_dicts = test_df[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Validation target and predictions from the already-trained model.
# Note that `y_pred` is rebound here and no longer holds the training
# predictions.
target = 'duration'
y_val = test_df[target].values
y_pred = lr.predict(X_val)

# RMSE on the validation data. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
mean_squared_error(y_val, y_pred) ** 0.5

7.811822042099508