In [1]:
import pandas as pd
import numpy as np

## Answering Question 1

In [2]:
df = pd.read_parquet('./fhv_tripdata_2021-01.parquet')
print(f'Shape of the data: {df.shape}')
df.head(3)

Shape of the data: (1154112, 7)


Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013


## Answering Question 2

In [3]:
df.pickup_datetime = pd.to_datetime(df.pickup_datetime)
df.dropOff_datetime = pd.to_datetime(df.dropOff_datetime)

In [4]:
df['duration'] = (df.dropOff_datetime-df.pickup_datetime)
df.duration = df.duration.apply(lambda y: y.total_seconds()/60)

In [5]:
print(f'Mean of total duration: {df.duration.mean():.2f}')

Mean of total duration: 19.17


## Answering Question 3

In [6]:
print(f'Percentage of missing values in the Pick up location ID: {df.PUlocationID.isna().sum()/df.shape[0]*100:.2f}%')

Percentage of missing values in the Pick up location ID: 83.03%


## Answering Question 4

In [7]:
df.fillna(-1, inplace=True)
df.head(1)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,-1,B00009,17.0


In [8]:
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].astype(str)
df = df[(df.duration >= 1) & (df.duration <= 60)]

#### Importing DictVector to One hot encode the data

In [9]:
from sklearn.feature_extraction import DictVectorizer

In [10]:
train_dict = df[categorical].to_dict(orient='records')
dv = DictVectorizer()

In [11]:
train_dict[0]

{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'}

In [12]:
X_train = dv.fit_transform(train_dict)
print(f'Dimention after OHE: {X_train.shape[1]}')

Dimention after OHE: 525


In [13]:
y_train = df['duration'].values
y_train

array([17.        , 17.        ,  8.28333333, ..., 16.2       ,
       19.43333333, 36.        ])

## Answering Question 5

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [15]:
lr =  LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [16]:
y_pred = lr.predict(X_train)
print(f'RMSE in training: {mean_squared_error(y_train, y_pred, squared=False):.2f}')

RMSE in training: 10.53


## Answering Question 6

In [17]:
dx = pd.read_parquet('./fhv_tripdata_2021-02.parquet')
dx.head(1)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014


In [18]:
dx.pickup_datetime = pd.to_datetime(dx.pickup_datetime)
dx.dropOff_datetime = pd.to_datetime(dx.dropOff_datetime)

dx['duration'] = (dx.dropOff_datetime-dx.pickup_datetime)
dx.duration = dx.duration.apply(lambda y: y.total_seconds()/60)

dx = dx[(dx.duration >= 1) & (dx.duration <= 60)]

In [19]:
valid_dict = dx[categorical].to_dict(orient='records')
dx.fillna(-1, inplace=True)
X_valid = dv.transform(valid_dict)
X_valid.shape

<990113x525 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [20]:
y_valid = dx['duration'].values

In [21]:
y_valid_pred = lr.predict(X_valid)
print(f'RMSE Validation: {mean_squared_error(y_valid, y_valid_pred, squared=False):.2f}')

RMSE Validation: 12.86
