In [7]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2025-06-10 20:59:30--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.178, 65.8.245.50, 65.8.245.51, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-01.parquet’


2025-06-10 20:59:32 (31.9 MB/s) - ‘yellow_tripdata_2023-01.parquet’ saved [47673370/47673370]

--2025-06-10 20:59:32--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.178, 65.8.245.50, 65.8.245.51, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application/x-www

In [60]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet

--2025-06-10 21:31:27--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.50, 65.8.245.51, 65.8.245.171, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54222699 (52M) [binary/octet-stream]
Saving to: ‘yellow_tripdata_2023-04.parquet’


2025-06-10 21:31:28 (68.8 MB/s) - ‘yellow_tripdata_2023-04.parquet’ saved [54222699/54222699]



In [8]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_squared_error

In [9]:
def preprocess_trip_data(data):

  data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])
  data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])

  categorical = ['PULocationID', 'DOLocationID']
  data[categorical] = data[categorical].astype(str)

  return data

In [10]:
def add_trip_duration(data):

  data["Duration"] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
  data["Duration"] = data["Duration"].apply(lambda td: td.total_seconds() / 60)

  return data

In [11]:
def filter(data):

  data = data[(data["Duration"] >= 1) & (data["Duration"] <= 60)]
  return data

In [12]:
data_train = pd.read_parquet('/content/yellow_tripdata_2023-01.parquet')
data_val = pd.read_parquet('/content/yellow_tripdata_2023-02.parquet')

In [56]:
data_val.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee', 'Duration',
       'year', 'month'],
      dtype='object')

In [13]:
data_train = preprocess_trip_data(data_train)
data_val = preprocess_trip_data(data_val)

In [14]:
data_train.shape, data_val.shape

((3066766, 19), (2913955, 19))

In [15]:
data_train = add_trip_duration(data_train)
data_val = add_trip_duration(data_val)

In [16]:
data_train['Duration'].std()

42.594351241920904

In [17]:
data_train = filter(data_train)
data_val = filter(data_val)

In [18]:
dv = DictVectorizer()
categorical = ['PULocationID', 'DOLocationID']

train_dict = data_train[categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dict)
y_train = data_train['Duration'].values
print(len(dv.feature_names_))

val_dict = data_val[categorical].to_dict(orient="records")
X_val = dv.transform(val_dict)
y_val = data_val['Duration'].values

515


In [19]:
reg = LinearRegression().fit(X_train, y_train)

In [20]:
y_pred = reg.predict(X_val)

In [21]:
rmse = root_mean_squared_error(y_val, y_pred)
rmse

7.811818743246608

In [26]:
print(f"Standard deviation of predicted duration: {y_pred.std():.2f}")

Standard deviation of predicted duration: 6.15


In [28]:
df_result = pd.DataFrame()

In [34]:
data_val['year'] = data_val['tpep_pickup_datetime'].dt.year
data_val['month'] = data_val['tpep_pickup_datetime'].dt.month

In [40]:
df_result['ride_id'] = (
    data_val['year'].astype(str).str.zfill(4) + '/' +
    data_val['month'].astype(str).str.zfill(2) + '_' +
    data_val.index.astype(str)
)

In [42]:
df_result["pred"] = y_pred

In [43]:
df_result.head()

Unnamed: 0,ride_id,pred
0,2023/02_0,11.367797
3,2023/02_3,49.694937
4,2023/02_4,15.187994
5,2023/02_5,11.220791
6,2023/02_6,18.294678


In [46]:
output_file = "pred_file.parquet"

df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [61]:
data_apr = pd.read_parquet('/content/yellow_tripdata_2023-04.parquet')

In [62]:
data_apr = preprocess_trip_data(data_apr)

In [63]:
data_apr = add_trip_duration(data_apr)

In [64]:
data_apr['year'] = data_apr['tpep_pickup_datetime'].dt.year
data_apr['month'] = data_apr['tpep_pickup_datetime'].dt.month

In [65]:
def run(year, month):

    val_filtered = data_apr[(data_val['year'] == year) & (data_apr['month'] == month)]

    categorical = ['PULocationID', 'DOLocationID']

    dv = DictVectorizer()
    train_dict = data_train[categorical].to_dict(orient="records")
    X_train = dv.fit_transform(train_dict)
    y_train = data_train['Duration'].values

    val_dict = val_filtered[categorical].to_dict(orient="records")
    X_val = dv.transform(val_dict)

    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_val)

    print(round(y_pred.mean(), 2))


# Run for April 2023 as per the assinment
run(2023, 4)

14.61


In [59]:
print(data_apr[['year', 'month']].drop_duplicates().sort_values(['year', 'month']))


         year  month
1387271  2008     12
1312010  2009      1
169      2023      1
0        2023      2
2736451  2023      3
