In [1]:
# import
import os
import numpy as np
import pandas as pd
import pyarrow.parquet as pa
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# # Lets plot
# import seaborn as sns
# import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# setting path to the data directory
# DATA_PATH resolves to "<parent of the working directory>/_data". Everything is
# derived from os.getcwd(), so the result depends on where the kernel was started.
CURRENT_DIRECTORY = os.getcwd()
PARENT_DIRECTORY = os.path.dirname(CURRENT_DIRECTORY)
DATA_PATH = os.path.join(PARENT_DIRECTORY, '_data')

In [3]:
# read the data
def read_data(data):
    """Load a tabular file into a DataFrame with lower-cased column names.

    Parameters
    ----------
    data : str or os.PathLike
        Path to a ``.parquet`` or ``.csv`` file. The extension is matched
        case-insensitively.

    Returns
    -------
    pandas.DataFrame
        The file contents, with every column label converted to lower case.

    Raises
    ------
    ValueError
        If the path ends with neither ``.parquet`` nor ``.csv``.
    """
    path = os.fspath(data)
    lowered = path.lower()

    if lowered.endswith('.parquet'):
        df = pa.read_table(path).to_pandas()  # Arrow table -> pandas DataFrame
    elif lowered.endswith('.csv'):
        df = pd.read_csv(path)
    else:
        # Fail loudly: handing back a sentinel string would only surface later
        # as a confusing error once the caller treats it as a DataFrame.
        raise ValueError(f'Not valid format: {path!r} (expected .parquet or .csv)')

    df.columns = df.columns.str.lower()
    return df

In [4]:
# Compute each trip's duration (drop-off minus pick-up, in minutes) and the standard deviation of those durations
def standard_deviation(data):
    """Add a ``duration`` column (minutes) to ``data`` and measure its spread.

    The column is written onto ``data`` itself, so the caller's frame is
    modified; that same object is handed back.

    Returns
    -------
    tuple
        ``(data, std)`` where ``std`` is ``data['duration'].std()``.
    """
    dropoff = pd.to_datetime(data['tpep_dropoff_datetime'])
    pickup = pd.to_datetime(data['tpep_pickup_datetime'])

    # timedelta -> total seconds -> minutes
    data['duration'] = (dropoff - pickup).dt.total_seconds() / 60

    spread = data['duration'].std()
    return data, spread

In [5]:
def outliers(data, lower=1, upper=60):
    """Drop trips whose duration falls outside ``[lower, upper]`` minutes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    lower, upper : number, optional
        Inclusive bounds on ``duration``. Defaults are 1 and 60.

    Returns
    -------
    tuple
        ``(kept, percent_kept)``: the rows inside the bounds and the share of
        the input rows they represent, as a percentage (0-100). An empty input
        gives ``0.0`` rather than a division by zero.
    """
    total = data.shape[0]
    # Series.between is inclusive on both ends, matching `>= lower` and `<= upper`.
    kept = data[data['duration'].between(lower, upper)]
    if total == 0:
        return kept, 0.0
    return kept, (kept.shape[0] / total) * 100

In [6]:
# One-hot encoding
def One_hot_encoding(data, vectorise=None, sparse=False):
    """One-hot encode the pick-up and drop-off location ids.

    The ids are turned into strings so that ``DictVectorizer`` treats them as
    categories instead of numeric values. The conversion is done on a copy of
    the two columns; ``data`` itself is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``pulocationid`` and ``dolocationid`` columns.
    vectorise : DictVectorizer, optional
        An already fitted vectoriser. When given, it is only used to transform
        ``data``; when omitted, a new one is created and fitted on ``data``.
    sparse : bool, optional
        Passed to the new ``DictVectorizer`` when ``vectorise`` is omitted; it
        is ignored when a fitted vectoriser is supplied. ``False`` (the
        default) keeps the dense output. ``True`` returns a sparse matrix,
        which avoids materialising every cell of a matrix with millions of
        rows and hundreds of columns.

    Returns
    -------
    (X, vectorise) when ``vectorise`` is omitted, otherwise just ``X``.
    """
    location_columns = ['pulocationid', 'dolocationid']

    # astype() on the column selection produces a new frame, so the caller's
    # columns keep their original dtype (and no chained assignment happens
    # when `data` is a filtered slice of another frame).
    records = data[location_columns].astype(str).to_dict(orient='records')

    if vectorise is None:
        vectorise = DictVectorizer(sparse=sparse)
        X = vectorise.fit_transform(records)
        return X, vectorise

    return vectorise.transform(records)

In [7]:
# Define RMSE function
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as ``sqrt(MSE)`` rather than through the ``squared=False`` keyword
    of ``mean_squared_error``: that keyword is deprecated in recent
    scikit-learn releases and removed in later ones, whereas this form works
    regardless of the installed version. For single-target data the two are
    the same quantity.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

In [8]:
def training(data, X_train):
    """Fit a linear regression of trip duration on ``X_train``.

    Targets are read from ``data['duration']``. The error is measured on the
    same rows the model was fitted on, i.e. it is a training error.

    Returns
    -------
    tuple
        ``(model, targets, fitted_values, training_rmse)``.
    """
    targets = data['duration'].values

    # fit() returns the estimator itself, so the call can be chained
    model = LinearRegression().fit(X_train, targets)
    fitted_values = model.predict(X_train)

    return model, targets, fitted_values, rmse(targets, fitted_values)

-----

In [9]:
# READ JANUARY DATA
# January is the month the model is fitted on later in the notebook.
january_data_path = os.path.join(DATA_PATH, 'yellow_tripdata_2023-01.parquet')
january = read_data(january_data_path)
# READ FEBRUARY DATA
# February is only used for validation further down.
february_data_path = os.path.join(DATA_PATH, 'yellow_tripdata_2023-02.parquet')
february = read_data(february_data_path)

In [10]:
# Adds a `duration` column (minutes) to the January frame and reports the
# standard deviation of that column over all rows (no filtering yet).
january, january_duriation_std_dev = standard_deviation(january)
print('Standard Deviation of Pick and Drop time for the month of January (time in minutes)', january_duriation_std_dev)

Standard Deviation of Pick and Drop time for the month of January (time in minutes) 42.59435124195458


In [11]:
# Keep only January trips lasting between 1 and 60 minutes. The second value
# returned by `outliers` is the share of rows kept expressed as a percentage
# (0-100), so the label says "Percentage" rather than "Fraction".
january_df, records_left = outliers(january)
print('Percentage of the records left after dropping the outliers = ', records_left)

Fraction of the records left after dropping the outliers =  98.1220282212598


In [12]:
# Fit DictVectorizer and transform January data
X_train, vectorise = One_hot_encoding(january_df, vectorise=None)
print('*******************************************')
print('*******************************************')
print(f'Feature Matrix size = {X_train.shape}')

*******************************************
*******************************************
Feature Matrix size = (3009173, 515)


In [13]:
# Fit the regression on January. The RMSE reported here is computed on the
# same rows the model was fitted on, so it is a training error.
linear_regression, y_train, y_prediction, RMSE = training(january_df, X_train)
print(f'Training Root Mean Square Error = {RMSE}')

Training Root Mean Square Error = 7.649265256581374


In [14]:
# Same preparation as for January: adds a `duration` column (minutes) to the
# February frame and reports its standard deviation over all rows.
february, february_duriation_std_dev = standard_deviation(february)
print('Standard Deviation of Pick and Drop time for the month of FEbruary (time in minutes)', february_duriation_std_dev)

Standard Deviation of Pick and Drop time for the month of FEbruary (time in minutes) 42.84210176105113


In [15]:
# Keep only February trips lasting between 1 and 60 minutes. The second value
# returned by `outliers` is the share of rows kept expressed as a percentage
# (0-100), so the label says "Percentage" rather than "Fraction".
february_df, records_left = outliers(february)
print('Percentage of the records left after dropping the outliers = ', records_left)

Fraction of the records left after dropping the outliers =  98.00944077722545


In [16]:
# Transform February data using the fitted DictVectorizer
X_val = One_hot_encoding(february_df, vectorise)
print('*******************************************')
print('*******************************************')
print(f'Validation Feature Matrix size = {X_val.shape}')

*******************************************
*******************************************
Validation Feature Matrix size = (2855951, 515)


In [17]:
# Predict on February data
y_val = february_df['duration'].values
y_pred = linear_regression.predict(X_val)
print(f'Validation RMSE: {rmse(y_val, y_pred)}')

Validation RMSE: 12365689.451843796


-----