In [1]:
import os
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [3]:
# Shell escape: runs the get_data.sh script that sits next to this notebook.
# NOTE(review): presumably this downloads the yellow-taxi parquet files that the
# cells below read from ../data/ -- confirm against the script itself.
!bash get_data.sh

In [2]:
def prepare_file(file, data_dir='../data/', min_duration=1, max_duration=60):
    """Load a yellow-taxi trip file and keep only trips with a plausible duration.

    The parquet file is read, a ``duration`` column (minutes elapsed between
    ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``) is added, the mean
    and standard deviation of the *unfiltered* durations are printed, and rows
    whose duration falls outside ``[min_duration, max_duration]`` are dropped.

    Parameters
    ----------
    file : str
        Name of the parquet file, resolved relative to ``data_dir``.
    data_dir : str, default '../data/'
        Directory that contains ``file``.
    min_duration, max_duration : int or float, default 1 and 60
        Inclusive bounds, in minutes, of the durations that are kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, including the ``duration``
        column, so callers can add columns without a SettingWithCopyWarning.
    """
    df = pd.read_parquet(os.path.join(data_dir, file))

    # Vectorised timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row, which is slow on multi-million-row trip files.
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Summary statistics are reported before any filtering takes place.
    print(f"The average duration of trips in {file} is {df.duration.mean():.2f} minutes.")
    print(f"The standard deviation of the trip durations in {file} is {df.duration.std():.2f} minutes.")

    entries_total = df.shape[0]
    # Series.between is inclusive on both ends by default, matching
    # (duration >= min_duration) & (duration <= max_duration).
    df = df[df['duration'].between(min_duration, max_duration)].copy()
    print(f"{entries_total - df.shape[0]} (potential) outliers were dropped.", '\n')

    return df


In [3]:
### Question 1: how many columns does the January 2022 file have?
file = 'yellow_tripdata_2022-01.parquet'
df = pd.read_parquet('../data/'+file)
print(f"Q1: The number of columns in {file} is {df.shape[1]}", '\n')

### Question 2: standard deviation of the trip duration (minutes), before filtering.
# Vectorised timedelta -> minutes conversion via the .dt accessor; avoids
# invoking a Python lambda once per row on a multi-million-row frame.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
print(f"Q2: The standard deviation of trip durations in {file} is {df.duration.std()} minutes.", '\n')

### Question 3: share of records kept after dropping trips outside 1-60 minutes.
entries_total = df.shape[0]
# .copy() detaches the filtered rows from the full frame, so later cells can
# modify `df` without triggering SettingWithCopyWarning.
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
print(f"Q3: {df.shape[0]/entries_total:.0%} is the fraction of the records left after outliers were dropped.")


Q1: The number of columns in yellow_tripdata_2022-01.parquet is 19 

Q2: The standard deviation of trip durations in yellow_tripdata_2022-01.parquet is 46.44530513776802 minutes. 

Q3: 98% is the fraction of the records left after outliers were dropped.


In [4]:
# Only the pickup and drop-off location IDs are used as model inputs.
features = ['PULocationID', 'DOLocationID']
# Column holding the trip duration in minutes (computed in the cell above).
target = 'duration'
# Cast the IDs to str before building one dict per row: DictVectorizer one-hot
# encodes string values, whereas numeric values would be passed through as-is.
train_dicts = df[features].astype(str).to_dict(orient='records')

In [5]:
# Validation data: the February 2022 file, loaded through prepare_file so it
# gets the same duration column and the same 1-60 minute filter as training.
file_val = 'yellow_tripdata_2022-02.parquet'
df_val = prepare_file(file_val)
# Same str-cast + records conversion as for the training dicts.
val_dicts = df_val[features].astype(str).to_dict(orient='records')

The average duration of trips in yellow_tripdata_2022-02.parquet is 15.65 minutes.
The standard deviation of the trip durations in yellow_tripdata_2022-02.parquet is 47.26 minutes.
61244 (potential) outliers were dropped. 



In [6]:
# Learn the feature vocabulary from the training records only.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
# Re-use the fitted vocabulary for validation; per the scikit-learn docs,
# feature values not seen during fit are silently ignored by transform().
X_val = dv.transform(val_dicts)

In [7]:
# Question 4: number of columns in the encoded training matrix.
print(f"Q4: The dimensionality of the feature matrix is {X_train.shape[1]}.")

Q4: The dimensionality of the feature matrix is 515.


In [8]:
# Plain linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [9]:
# Target vectors (trip duration in minutes) taken from the already-filtered
# training and validation frames, so they line up row-for-row with X_train / X_val.
y_train = df[target].values
y_val = df_val[target].values

In [10]:
# Fit on the training matrix, then predict on both the training and validation sets.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
y_pred_val = lr.predict(X_val)

# RMSE is the square root of the mean squared error. The root is taken
# explicitly because mean_squared_error's `squared=False` flag was deprecated
# in scikit-learn 1.4 and removed in 1.6; this form works on every version.
RMSE = mean_squared_error(y_train, y_pred) ** 0.5
RMSE_val = mean_squared_error(y_val, y_pred_val) ** 0.5

In [11]:
# Question 5: error of the model on the data it was fitted on (January 2022).
print(f"Q5: The RMSE on the training set is {RMSE}.")

Q5: The RMSE on the training set is 6.9861910730902625.


In [12]:
# Question 6: error of the same model on the held-out February 2022 data.
print(f"Q6: The RMSE on the validation set is {RMSE_val}.")

Q6: The RMSE on the validation set is 7.786412673537908.
