In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Q1. Downloading the data

In [3]:
# Read the two monthly fhv_tripdata parquet files (2021-01 and 2021-02) from the
# local data/ folder; January is read first, then February.
df_january, df_february = (
    pd.read_parquet('data/fhv_tripdata_2021-01.parquet'),
    pd.read_parquet('data/fhv_tripdata_2021-02.parquet'),
)

In [4]:
# Row counts of the raw monthly frames (len(df) is the same number as df.shape[0]).
print("Number of elements in January:", len(df_january))
print("Number of elements in February:", len(df_february))

Number of elements in January: 1154112
Number of elements in February: 1037692


# Q2. Computing duration

In [5]:
def get_duration(dropoff, pickup):
    """Return the elapsed time between pickup and dropoff, in minutes.

    Parameters
    ----------
    dropoff, pickup : pandas.Series
        Datetime-typed series; ``dropoff - pickup`` must yield a timedelta series.

    Returns
    -------
    pandas.Series
        ``dropoff - pickup`` expressed as float minutes. Missing timestamps
        produce NaN.
    """
    # Use the vectorised .dt accessor instead of a per-row Python lambda: same
    # values, far less overhead on million-row frames, and the result has a
    # float dtype even when the input is empty.
    return (dropoff - pickup).dt.total_seconds() / 60

In [6]:
def filter_by_duration(duration, min_duration=1, max_duration=60):
    """Build a boolean mask selecting durations inside an inclusive range.

    Parameters
    ----------
    duration : pandas.Series
        Duration values; this notebook passes minutes from ``get_duration``.
    min_duration, max_duration : float, optional
        Inclusive lower and upper bounds, in the same unit as ``duration``.
        The defaults (1 and 60) keep the original behaviour.

    Returns
    -------
    pandas.Series
        True where ``min_duration <= duration <= max_duration``. NaN durations
        compare False on both sides and are therefore excluded.
    """
    return (duration >= min_duration) & (duration <= max_duration)

In [7]:
# Add a 'duration' column to each monthly frame: dropoff minus pickup, as
# computed by get_duration.
df_january['duration'] = get_duration(
    dropoff=df_january['dropOff_datetime'],
    pickup=df_january['pickup_datetime'],
)
df_february['duration'] = get_duration(
    dropoff=df_february['dropOff_datetime'],
    pickup=df_february['pickup_datetime'],
)

In [8]:
# Average of the 'duration' column per month, computed on the unfiltered frames.
mean_duration_january = df_january['duration'].mean()
mean_duration_february = df_february['duration'].mean()

print("Mean duration in January:", mean_duration_january)
print("Mean duration in February:", mean_duration_february)

Mean duration in January: 19.167224093791006
Mean duration in February: 20.706986225199763


In [9]:
# Keep only the rows whose duration passes filter_by_duration. The .copy() makes
# each filtered frame an independent object, so columns can be assigned on it later.
january_mask = filter_by_duration(df_january['duration'])
february_mask = filter_by_duration(df_february['duration'])

df_january_filtered = df_january.loc[january_mask].copy()
df_february_filtered = df_february.loc[february_mask].copy()

In [10]:
# Row counts that remain after the duration filter.
print("Number of elements in January after filtering:", len(df_january_filtered))
print("Number of elements in February after filtering:", len(df_february_filtered))

Number of elements in January after filtering: 1109826
Number of elements in February after filtering: 990113


# Q3. Missing values

In [11]:
# Replace missing location IDs with the placeholder -1.
# The result is assigned back to the column instead of calling
# `.fillna(..., inplace=True)` on `df.loc[:, col]`: that form mutates a temporary
# object and, under pandas Copy-on-Write, leaves the frame unchanged.
df_january_filtered['PUlocationID'] = df_january_filtered['PUlocationID'].fillna(-1)
df_january_filtered['DOlocationID'] = df_january_filtered['DOlocationID'].fillna(-1)

In [12]:
# Share of January rows whose location ID equals the -1 placeholder.
# The values are fractions of the row count (0..1), not multiplied by 100.
n_rows_january = len(df_january_filtered)
perc_pickup_january = (df_january_filtered['PUlocationID'] == -1).sum() / n_rows_january
perc_dropoff_january = (df_january_filtered['DOlocationID'] == -1).sum() / n_rows_january

print("Percentage of Nones in pickup location ID (January):", perc_pickup_january)
print("Percentage of Nones in dropoff location ID (January):", perc_dropoff_january)

Percentage of Nones in pickup location ID (January): 0.8352732770722617
Percentage of Nones in dropoff location ID (January): 0.13327044059158824


In [13]:
# Replace missing location IDs with the placeholder -1.
# The result is assigned back to the column instead of calling
# `.fillna(..., inplace=True)` on `df.loc[:, col]`: that form mutates a temporary
# object and, under pandas Copy-on-Write, leaves the frame unchanged.
df_february_filtered['PUlocationID'] = df_february_filtered['PUlocationID'].fillna(-1)
df_february_filtered['DOlocationID'] = df_february_filtered['DOlocationID'].fillna(-1)

In [14]:
# Share of February rows whose location ID equals the -1 placeholder.
# The values are fractions of the row count (0..1), not multiplied by 100.
n_rows_february = len(df_february_filtered)
perc_pickup_february = (df_february_filtered['PUlocationID'] == -1).sum() / n_rows_february
perc_dropoff_february = (df_february_filtered['DOlocationID'] == -1).sum() / n_rows_february

print("Percentage of Nones in pickup location ID (February):", perc_pickup_february)
print("Percentage of Nones in dropoff location ID (February):", perc_dropoff_february)

Percentage of Nones in pickup location ID (February): 0.8571354986754037
Percentage of Nones in dropoff location ID (February): 0.13610567682678643


# Q4. One-hot encoding

In [15]:
# Turn the two location-ID columns into one {column: value} dict per row.
# The IDs are cast to str first, so each distinct ID is a categorical value
# rather than a number.
categorical = ['PUlocationID', 'DOlocationID']

dict_january = df_january_filtered[categorical].astype(str).to_dict(orient='records')
dict_february = df_february_filtered[categorical].astype(str).to_dict(orient='records')

In [16]:
# Fit the vectorizer on the January records only; February is only transformed
# with it later.
dv = DictVectorizer()
dv = dv.fit(dict_january)  # fit returns the fitted vectorizer itself

In [17]:
# Encode both months with the vectorizer fitted on January, so the train and
# test matrices share the same columns. January is transformed first.
X_train, X_test = map(dv.transform, (dict_january, dict_february))

In [18]:
# Regression targets: the duration column of each filtered frame as a NumPy array.
y_train = df_january_filtered['duration'].to_numpy()
y_test = df_february_filtered['duration'].to_numpy()

In [19]:
# Number of feature columns in the encoded training matrix (its second axis).
n_features = X_train.shape[1]
print("The dimensionality of the data is: ", n_features)

The dimensionality of the data is:  525


# Q5. Training a model

In [20]:
# Fit a linear regression on the January features and durations. fit() returns
# the estimator, so it is bound directly; the trailing `lr` keeps the fitted
# estimator as the cell's displayed value, as before.
lr = LinearRegression().fit(X_train, y_train)
lr

In [21]:
# Predictions on the same January data the model was fitted on (training set).
y_pred = lr.predict(X=X_train)

In [22]:
# RMSE on the training set. The square root is taken explicitly rather than via
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both old and new releases.
rmse = mean_squared_error(y_train, y_pred) ** 0.5

In [23]:
# Report the training-set RMSE stored in `rmse`.
print("The RMSE in the training set is: ", rmse)

The RMSE in the training set is:  10.528519388232237


# Q6. Evaluating the model

In [24]:
# Predictions for the February matrix, which the model was not fitted on.
y_pred_test = lr.predict(X=X_test)

In [25]:
# RMSE on the February (test) set. The square root is taken explicitly rather
# than via `squared=False`, which was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on both old and new releases.
rmse_test = mean_squared_error(y_test, y_pred_test) ** 0.5

In [26]:
# Report the test-set RMSE stored in `rmse_test`.
print("The RMSE in the test set is: ", rmse_test)

The RMSE in the test set is:  11.014286813221993
