In [1]:
!python -V

Python 3.12.10


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [10]:
import subprocess
from pathlib import Path
import ipywidgets as widgets

In [11]:
data_link_pattern = "https://d37ci6vzurychx.cloudfront.net/trip-data/{}_tripdata_{}-{}.parquet"
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)

In [12]:
combinations = [
    ("green", "2023", "01"),
    ("green", "2023", "02"),
    ("yellow", "2023", "01"),
    ("yellow", "2023", "02"),
]
for combination in combinations:
    subprocess.run(
        [
            "wget",
            "-q",
            "-P",
            "data",
            data_link_pattern.format(*combination),
        ],
        check=True
    )
print("data downloaded successfully")

data downloaded successfully


In [13]:
dataset_colour = widgets.Dropdown(
    options=["green", "yellow"],
    value="green",
    description="Dataset 'colour':"
)
display(dataset_colour)

Dropdown(description="Dataset 'colour':", options=('green', 'yellow'), value='green')

In [19]:
print(f"loading {dataset_colour.value} data")
df = pd.read_parquet(f'./data/{dataset_colour.value}_tripdata_2023-02.parquet')

loading yellow data


In [44]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    print(f"loaded {filename}")
    print(f"no. of columns in the file: {len(df.columns)}")

    if dataset_colour.value == "green":
        df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    else:
        df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    print(f"duration stdev: {df.duration.std()}")

    original_dataset_length = df.shape[0]

    df = df[(df.duration >= 1) & (df.duration <= 60)]
    print(f"pct of data not outliers: {df.shape[0] / original_dataset_length}")

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [45]:
df_train = read_dataframe(f'./data/{dataset_colour.value}_tripdata_2023-01.parquet')
df_val = read_dataframe(f'./data/{dataset_colour.value}_tripdata_2023-02.parquet')

loaded ./data/yellow_tripdata_2023-01.parquet
no. of columns in the file: 19
duration stdev: 42.59435124195458
pct of data not outliers: 0.9812202822125979
loaded ./data/yellow_tripdata_2023-02.parquet
no. of columns in the file: 19
duration stdev: 42.84210176105113
pct of data not outliers: 0.9800944077722545


In [26]:
len(df_train), len(df_val)

(3009173, 2855951)

In [27]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [46]:
# categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
print(f"number of dimensions in the one-hot encoded sparse matrix: {X_train.ndim}")

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

number of dimensions in the one-hot encoded sparse matrix: 2


In [47]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [50]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [51]:
y_pred = lr.predict(X_train)

print(f"the RMSE on the train dataset is: {root_mean_squared_error(y_train, y_pred)}")

the RMSE on the train dataset is: 7.658406582175197


In [52]:
y_pred = lr.predict(X_val)

print(f"the RMSE on the validation dataset is: {root_mean_squared_error(y_val, y_pred)}")

the RMSE on the validation dataset is: 7.820096870991671


In [53]:
model_dir = Path("models")
model_dir.mkdir(exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
lr = Lasso(0.01)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)