In [19]:
# Libraries
import os
import pickle
import matplotlib.pyplot as plt
import requests
import pandas as pd
import seaborn as sns

# Scikit-Learn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, mean_squared_error

In [9]:
# Import Data
# Green-taxi trip records for January 2024, read straight from the remote parquet file
source_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet'
df = pd.read_parquet(source_url)

# Top 5
df.head(n=5)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.8,1.0,0.5,3.61,0.0,,1.0,21.66,1.0,1.0,2.75
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.3,1.0,0.5,7.11,0.0,,1.0,42.66,1.0,1.0,2.75
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,N,1.0,74,262,1.0,3.08,19.8,1.0,0.5,3.0,0.0,,1.0,28.05,1.0,1.0,2.75
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,N,1.0,74,116,1.0,2.4,14.2,1.0,1.5,0.0,0.0,,1.0,16.7,2.0,1.0,0.0
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,N,1.0,74,243,1.0,5.14,22.6,1.0,0.5,6.28,0.0,,1.0,31.38,1.0,1.0,0.0


In [10]:
# Data Preparation
# Trip duration in minutes, computed with the vectorized `.dt` accessor
# instead of a per-row Python lambda
df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not trigger SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Location IDs are cast to strings so they are handled as categories, not numbers
df[categorical] = df[categorical].astype(str)

In [17]:
# Train Model
# Target vector
target = 'duration'
y_train = df[target].values

# Feature matrix: one dict per row, vectorized by DictVectorizer
feature_columns = categorical + numerical
train_dicts = df[feature_columns].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Fit and predict on the same (training) data
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

# NOTE: despite the variable name, this value is the ROOT mean squared error
mse_calculated = root_mean_squared_error(y_train, y_pred)
print(f'Error Metric: {mse_calculated:.2f}')

Error Metric: 7.12


**Download Data**

In [48]:
# Function to check if path exists
def check_path(load_path: str) -> str:
    """Make sure ``load_path`` exists, creating it when it does not.

    Missing parent folders are created as well, so nested paths work.

    Parameters
    ----------
    load_path : str
        Path of the folder that must exist.

    Returns
    -------
    str
        ``load_path``, unchanged, so the call can be used inline.
    """
    if not os.path.exists(load_path):
        # makedirs creates intermediate folders too; exist_ok avoids a failure
        # if the folder appears between the check above and this call.
        os.makedirs(load_path, exist_ok=True)

    return load_path

# Function to download parquet file
def download_parquet_file(load_path, year, month, timeout=60):
    """Download one monthly green-taxi parquet file into ``load_path``.

    Parameters
    ----------
    load_path : str
        Destination folder; created through ``check_path`` when missing.
    year : int
        Year used in the remote file name.
    month : int
        Month used in the remote file name (formatted with two digits).
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled
        connection cannot hang the notebook forever. Defaults to 60.

    Raises
    ------
    Exception
        If the server does not answer with HTTP 200 for the requested file.
    """
    # Check Path
    load_path = check_path(load_path)

    # Make request
    file_name = f'green_tripdata_{year}-{month:02d}.parquet'
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{file_name}'
    response = requests.get(url, timeout=timeout)

    # `status_code` is an integer such as 200 or 404 and is therefore always
    # truthy; compare it explicitly so an error page is never saved as data.
    if response.status_code != 200:
        raise Exception(f'{url} is not available')

    file_path = f'{load_path}/{file_name}'
    with open(file_path, "wb") as file:
        file.write(response.content)

    # Report the file name itself rather than a fixed position in the path,
    # which only matched the file name for one specific folder depth.
    print(f"{file_name} downloaded successfully")

In [49]:
# `data` folder located in the parent of the current working directory
PARENT_FOLDER = f'{os.path.dirname(os.getcwd())}/data'

# Download 2024-01 and 2024-02
for month in (1, 2):
    download_parquet_file(PARENT_FOLDER, 2024, month)

green_tripdata_2024-01.parquet downloaded successfully
green_tripdata_2024-02.parquet downloaded successfully


**Data Preparation**

In [63]:
import glob

def load_parquet_file(folder_data:str):
    """Read the parquet data found at ``folder_data`` and return it as a DataFrame."""
    loaded = pd.read_parquet(folder_data)
    return loaded

def data_preparation(dataset: pd.DataFrame) -> pd.DataFrame:
    """Prepare raw trip data for modeling.

    Adds a ``duration`` column (minutes between ``lpep_pickup_datetime`` and
    ``lpep_dropoff_datetime``), keeps only trips whose duration lies in
    [1, 60] minutes, and casts ``PULocationID`` / ``DOLocationID`` to ``str``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Raw data; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new, filtered frame with the extra ``duration`` column.
    """
    # Duration in minutes, vectorized through the `.dt` accessor
    elapsed = dataset.lpep_dropoff_datetime - dataset.lpep_pickup_datetime
    duration = elapsed.dt.total_seconds() / 60

    # Filter Data on duration variable (both bounds inclusive)
    keep = (duration >= 1) & (duration <= 60)

    # Copy only the surviving rows: the input stays untouched and the
    # assignments below work on an independent frame (no SettingWithCopyWarning)
    df = dataset[keep].copy()
    df['duration'] = duration[keep].to_numpy()

    # Cast Variables --> str
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [71]:
# Parquet files
# glob.glob returns matches in arbitrary order, so the listing is sorted and the
# two months are loaded by explicit file name instead of by list position;
# otherwise train and validation data could be silently swapped.
parquet_files = sorted(glob.glob(f'{PARENT_FOLDER}/*.parquet'))

# Load Data
file_202401 = load_parquet_file(f'{PARENT_FOLDER}/green_tripdata_2024-01.parquet')
file_202402 = load_parquet_file(f'{PARENT_FOLDER}/green_tripdata_2024-02.parquet')

# Transform Data
df_train = data_preparation(file_202401)
df_val = data_preparation(file_202402)


f'Dim File 2024-01: {df_train.shape}', f'Dim File 2024-02: {df_val.shape}'

('Dim File 2024-01: (54373, 21)', 'Dim File 2024-02: (51497, 21)')

**Data Modeling**

In [72]:
# Add Variable
# Pickup/drop-off pair combined into a single string feature on both frames
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

# Only the combined pair is used as categorical feature here
# (not 'PULocationID' and 'DOLocationID' separately)
categorical = ['PU_DO']
numerical = ['trip_distance']
model_columns = categorical + numerical

# The vectorizer is fitted on the training rows and only applied to the validation rows
dv = DictVectorizer()

train_dicts = df_train[model_columns].to_dict(orient='records')
val_dicts = df_val[model_columns].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [75]:
# Target
# Same target column for the training and the validation frame
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [77]:
# Linear Regression
# Fit on the training matrix, score on the validation matrix
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# NOTE: despite the variable name, this value is the ROOT mean squared error
mse_calculated = root_mean_squared_error(y_val, y_pred)
print(f'Error Metric: {mse_calculated:.2f}')

Error Metric: 5.99


In [81]:
# Lasso Regression
# The Lasso model gets its own name: rebinding `lr` here would replace the fitted
# LinearRegression, and any later cell using `lr` would silently get the Lasso model.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

y_pred = lasso.predict(X_val)

# NOTE: despite the variable name, this value is the ROOT mean squared error
mse_calculated = root_mean_squared_error(y_val, y_pred)
print(f'Error Metric: {mse_calculated:.2f}')

Error Metric: 8.03


In [None]:
# Serialize the best model: Linear Regression
# NOTE(review): this pickles whatever `lr` references when the cell runs. Make sure
# `lr` is still the fitted LinearRegression (re-run that cell if `lr` was reassigned).
models_parent = os.path.dirname(PARENT_FOLDER)
path_model = check_path(f'{models_parent}/models')

# The vectorizer and the model are stored together as one tuple in a single file
with open(f'{path_model}/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)