## MLOps Zoomcamp - Homework 1
The data: NYC Taxi, Jan - Feb 2023, Yellow Taxis


In [28]:
import pandas as pd
import numpy as np
import os
import datetime

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


### Question 1 - Downloading the data

In [29]:
#uploading the data

jan_df = pd.read_parquet("data/yellow_tripdata_2023-01.parquet")
feb_df = pd.read_parquet("data/yellow_tripdata_2023-02.parquet")

#the dimensions of the jan_df
jan_df.shape

(3066766, 19)

### Question 2 - Computing duration

In [30]:
#calculate duration in minutes

jan_df['duration_in_minutes'] = (jan_df['tpep_dropoff_datetime'] - jan_df['tpep_pickup_datetime']).dt.total_seconds() / 60
np.std(jan_df['duration_in_minutes'])

42.5943442974141

### Question 3 - Dropping Outliers

In [31]:
filtered_jan_df = jan_df[(jan_df['duration_in_minutes'] >= 1) & (jan_df['duration_in_minutes'] <= 60)]
filtered_jan_df.shape[0]/jan_df.shape[0] * 100

98.1220282212598

### Question 4 - One-hot encoding

In [32]:
location_ids = filtered_jan_df[['PULocationID', 'DOLocationID']]

location_ids['PULocationID'] = location_ids['PULocationID'].astype(str)
location_ids['DOLocationID'] = location_ids['DOLocationID'].astype(str)

#to save the time i will use only the slice of the data, as it will then result into too high computational time
location_ids = location_ids.head(100000)

data_dicts = location_ids.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X = dv.fit_transform(data_dicts)


print(X)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_ids['PULocationID'] = location_ids['PULocationID'].astype(str)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_ids['DOLocationID'] = location_ids['DOLocationID'].astype(str)


In [33]:
X.shape

(100000, 482)

### Question 5 - Training a model. RMSE on train

In [36]:
#to save the time i will use only the slice of the data, as it will then result into too high computational time
y = filtered_jan_df['duration_in_minutes'].head(100000).values
model = LinearRegression()
model.fit(X, y)


In [37]:
y_pred = model.predict(X)

rmse = np.sqrt(mean_squared_error(y, y_pred))

print(f'The RMSE on training is {rmse}')

The RMSE on training is 7.6005433498131145


### Question 6 - Evaluating the model

In [40]:
feb_df['duration_in_minutes'] = (feb_df['tpep_dropoff_datetime'] - feb_df['tpep_pickup_datetime']).dt.total_seconds() / 60
feb_df = feb_df[(feb_df['duration_in_minutes'] >= 1) & (feb_df['duration_in_minutes'] <= 60)]
#to save the time i will use only the slice of the data, as it will then result into too high computational time
y_val = feb_df['duration_in_minutes'].head(100000).values

location_ids_feb = feb_df[['PULocationID', 'DOLocationID']]

location_ids_feb['PULocationID'] = location_ids_feb['PULocationID'].astype(str)
location_ids_feb['DOLocationID'] = location_ids_feb['DOLocationID'].astype(str)

#to save the time i will use only the slice of the data, as it will then result into too high computational time
location_ids_feb = location_ids_feb.head(100000)
data_dicts_feb = location_ids_feb.to_dict(orient='records')

X_val = dv.fit_transform(data_dicts)

y_pred_feb = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred_feb))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_ids_feb['PULocationID'] = location_ids_feb['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_ids_feb['DOLocationID'] = location_ids_feb['DOLocationID'].astype(str)


In [41]:
print(f'The RMSE on evaluating is {rmse}')

The RMSE on evaluating is 11.883100400209337
