In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Questions

## Q1 - Download data
- Use the Yellow Taxi trip dataset
- Download the data for January and February 2023
- For the January data, how many columns are there?

In [3]:
# Read the January 2023 yellow-taxi trips and report how many columns the frame has
df = pd.read_parquet('./dataset/yellow_tripdata_2023-01.parquet')
df.shape[1]

19

## Q2 - Computing duration
- What is the standard deviation of the trip durations in January?

In [4]:
# Make sure both timestamp columns are datetime64 (a no-op when they already are)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)

# Trip duration in minutes, stored in a new column.
# The vectorized `.dt.total_seconds()` accessor is used instead of
# `.apply(lambda td: ...)`, which calls a Python function once per row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

# Show the standard deviation of the durations
df['duration'].std()

42.594351241920904

## Q3 - Dropping outliers
Remove the outliers, keeping only trips with a duration between 1 and 60 minutes (inclusive).

- What fraction of the records is left after dropping the outliers?

In [5]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
clean_df = df[df.duration.between(1, 60)]

# Share of the records kept, expressed as a percentage of the original frame
len(clean_df) / len(df) * 100

98.1220282212598

## Q4 - One-hot encoding
Apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.
- Turn the dataframe into a list of dictionaries (remember to re-cast the IDs to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [12]:
# Work on an independent copy of the cleaned data. `clean_df` was produced by
# boolean-mask indexing, so assigning columns to it directly raises
# SettingWithCopyWarning; the copy also leaves `clean_df` itself unmodified.
df = clean_df.copy()

# Columns used as categorical features
categorical = ['PULocationID','DOLocationID']

# Cast the IDs to strings so DictVectorizer one-hot encodes them
# instead of treating them as numeric values
df[categorical] = df[categorical].astype(str)

# Create one dictionary per record, holding only the categorical features
train_dicts = df[categorical].to_dict(orient='records')

# Fit a dictionary vectorizer and get the feature matrix in one step
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [13]:
# Number of columns (one-hot features) in the training matrix
n_train_rows, n_train_columns = X_train.shape
n_train_columns

515

## Q5 - Training a Model
Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters, where `duration` is the response variable
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [8]:
# `duration` is the response variable; pull it out as a NumPy array
target = 'duration'
y_train = df[target].to_numpy()

# Fit a plain linear regression with default parameters
# (`fit` returns the estimator itself, which is then displayed)
lr = LinearRegression().fit(X_train, y_train)
lr

LinearRegression()

In [9]:
# Use the regression to predict on the training matrix, so the estimates
# can be compared with the real durations
y_train_pred = lr.predict(X_train)

# RMSE = sqrt(MSE). The square root is taken explicitly because the `squared`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6; this form works on every version.
mean_squared_error(y_train, y_train_pred) ** 0.5

7.649261027780877

## Q6 - Evaluating the model

Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [10]:
# Load the validation dataset (Feb 2023)
df_val = pd.read_parquet('./dataset/yellow_tripdata_2023-02.parquet')

# Make sure both timestamp columns are datetime64 (a no-op when they already are)
df_val.tpep_dropoff_datetime = pd.to_datetime(df_val.tpep_dropoff_datetime)
df_val.tpep_pickup_datetime = pd.to_datetime(df_val.tpep_pickup_datetime)

# Trip duration in minutes, stored in a new column.
# The vectorized `.dt.total_seconds()` accessor is used instead of
# `.apply(lambda td: ...)`, which calls a Python function once per row.
df_val['duration'] = (df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime).dt.total_seconds() / 60

# Filter outliers, keeping 1 <= duration <= 60 minutes.
# `.copy()` turns the filtered result into an independent frame, so assigning
# columns to it afterwards does not raise SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

In [14]:
# Cast the categorical ID columns to strings
df_val[categorical] = df_val[categorical].astype(str)

# Build one dictionary per record from the categorical columns only
val_categorical = df_val[categorical]
validation_dicts = val_categorical.to_dict(orient='records')

# Encode with the existing vectorizer `dv` (transform only, no refit)
X_validation = dv.transform(validation_dicts)

In [23]:
# Get predictions for the validation set
y_val_pred = lr.predict(X_validation)

# Real values in the validation set
y_val = df_val[target].values

# RMSE = sqrt(MSE). The square root is taken explicitly because the `squared`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6; this form works on every version.
mean_squared_error(y_val, y_val_pred) ** 0.5

7.811832644833854