**This notebook shows the training process for the Linear Regression model.**

In [None]:
from src import data_utils, preprocessing
import matplotlib.pyplot as plt
import numpy as np

Run the cell below (after removing the leading `#` from the download call) only if you haven't downloaded the zones data yet.

In [None]:
# uncomment the call below and run this cell to download the taxi zones data file

# data_utils.download_zones_data('https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip')

In [None]:
# Run the project's clean_trip_data helper on the 2022-05 yellow-trip parquet
# file; the result is the cleaned data set the following cells build on.
df_og = data_utils.clean_trip_data(
    'yellow_tripdata_2022-05.parquet'
)

In [None]:
# Separate the cleaned data into the features (X) and the two regression
# targets: travel time and fare amount.
(
    X,
    y_travel_time,
    y_fare_amount,
) = data_utils.get_feature_target(df_og)

In [None]:
# Split into training and testing sets. Two targets are predicted, so each
# split yields one label set for travel_time and another for fare_amount.
(
    X_train,
    X_test,
    y_train_travel_time,
    y_test_travel_time,
    y_train_fare_amount,
    y_test_fare_amount,
) = data_utils.get_train_test_sets(X, y_travel_time, y_fare_amount)

In [None]:
# Carve a validation set out of the training set using the same split helper.
# NOTE: X_train and the y_train_* names are both inputs and outputs here, so
# re-running this cell on its own splits the already-reduced training set again.
(
    X_train,
    X_val,
    y_train_travel_time,
    y_val_travel_time,
    y_train_fare_amount,
    y_val_fare_amount,
) = data_utils.get_train_test_sets(
    X_train,
    y_train_travel_time,
    y_train_fare_amount,
)

In [None]:
# Preprocess the training, validation, and testing feature sets together.
# NOTE: the inputs are overwritten by the outputs, so running this cell twice
# feeds already-preprocessed data back into preprocess_data.
(
    X_train,
    X_val,
    X_test,
) = preprocessing.preprocess_data(X_train, X_val, X_test)

In [None]:
# Feature column names in a fixed order.
# NOTE(review): presumably the column order of the preprocessed X arrays — confirm.
column_names_order = [
    'PULocationID',
    'DOLocationID',
    'improvement_surcharge',
    'congestion_surcharge',
    'airport_fee',
    'day',
    'month',
    'is_weekend',
    'distance_between_zones',
    'morning',
    'afternoon',
    'night',
]

**Initial Results with no finetuning**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

In [None]:
# FARE AMOUNT LINEAR REGRESSION
# Despite the `_logreg` suffix, this is a LinearRegression estimator, fitted on
# the training features against the fare amount labels.
fare_amount_logreg = LinearRegression()
fare_amount_logreg.fit(X=X_train, y=y_train_fare_amount)

In [None]:
# Evaluate the fare amount model.
# Predictions and true labels must come from the same split, otherwise the
# metrics are meaningless (or sklearn raises on mismatched sample counts).
# The test split is used here, as in the travel-time evaluation cell.
preds_fare_logreg = fare_amount_logreg.predict(X_test)
# measure the results
mse = mean_squared_error(y_test_fare_amount, preds_fare_logreg)
mae = mean_absolute_error(y_test_fare_amount, preds_fare_logreg)
rmse = root_mean_squared_error(y_test_fare_amount, preds_fare_logreg)
r2 = r2_score(y_test_fare_amount, preds_fare_logreg)
# printed in order: MSE, MAE, RMSE, R^2
print(mse)
print(mae)
print(rmse)
print(r2)

In [None]:
# TRAVEL TIME LINEAR REGRESSION
# Despite the `_logreg` suffix, this is a LinearRegression estimator, fitted on
# the training features against the travel time labels.
travel_time_logreg = LinearRegression()
travel_time_logreg.fit(X=X_train, y=y_train_travel_time)

In [None]:
# Evaluate the travel time model on the test split: predictions from X_test
# are scored against the matching y_test_travel_time labels.
preds_travel_logreg = travel_time_logreg.predict(X_test)
# measure the results
mse = mean_squared_error(y_test_travel_time, preds_travel_logreg)
mae = mean_absolute_error(y_test_travel_time, preds_travel_logreg)
rmse = root_mean_squared_error(y_test_travel_time, preds_travel_logreg)
r2 = r2_score(y_test_travel_time, preds_travel_logreg)
# printed in order: MSE, MAE, RMSE, R^2
print(mse)
print(mae)
print(rmse)
print(r2)