In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [8]:
# Load the January 2021 for-hire-vehicle trip records; this frame is the training data.
df_jan = pd.read_parquet('../input/forhire-vehicle-trip-records-mlops/fhv_tripdata_2021-01.parquet')
df_jan
# On the author's run this file contained 1154112 rows.

In [9]:
# List the column names available in the January frame.
df_jan.columns

In [24]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df_jan.info()

In [10]:
# Trip length as a timedelta: drop-off timestamp minus pick-up timestamp.
df_jan['duration'] = df_jan['dropOff_datetime'] - df_jan['pickup_datetime']


In [11]:
# Trip duration in minutes, derived from the timestamp columns with the
# vectorised `.dt` accessor rather than a per-row Python lambda.
# Computing it from the source columns (not from `duration` itself) keeps the
# cell re-runnable: it never calls timedelta methods on an already-converted
# float column.
df_jan['duration'] = (df_jan.dropOff_datetime - df_jan.pickup_datetime).dt.total_seconds() / 60

In [12]:
# Mean trip duration in minutes for January (this cell precedes the outlier filter).
df_jan.duration.mean() # average trip duration in January

In [13]:
# Quick line plot of every trip's duration (minutes) to eyeball extreme values.
df_jan.duration.plot()

In [14]:
# Box plot of the duration column (minutes) on a 10x8 figure.
ax = df_jan.duration.plot.box(figsize=(10, 8)) # to visualise the outliers better

In [15]:
# Boolean mask: True where the trip lasted between 1 and 60 minutes
# (`between` includes both bounds by default).
check_range = df_jan['duration'].between(1, 60)
check_range

In [16]:
# Count the trips inside (True) and outside (False) the 1-60 minute window.
check_range.value_counts()
# On the author's run, 44286 trips fell outside the 1-60 minute limit and are treated as outliers.

In [17]:
# Drop the outliers: keep only trips lasting 1-60 minutes (both bounds inclusive).
# `.copy()` makes the result an independent frame, so the later column
# assignments on `df_jan` do not raise SettingWithCopyWarning.
df_jan = df_jan[(df_jan.duration >= 1) & (df_jan.duration <= 60)].copy()

In [18]:
# Sanity check on the filtered frame: count the rows inside the 1-60 minute window.
check_range2 = df_jan['duration'].between(1, 60)
check_range2.value_counts()

In [19]:
# Feature columns used by the model: the two location-ID columns.
categorical = ['PUlocationID', 'DOlocationID']

In [20]:

# Multiple-columns approach: replace missing location IDs with the
# placeholder -1 in both feature columns at once.
df_jan[categorical] = df_jan[categorical].fillna(-1)

In [21]:
# Percentage share of each pick-up location ID.
df_jan.PUlocationID.value_counts(normalize=True) * 100
# On the author's run about 83% of pick-up IDs were NaN (now the -1 placeholder).

In [22]:
# One-hot encoding of the location IDs.
# The IDs are cast to str because DictVectorizer one-hot encodes string
# values but passes numeric values through as plain numbers.
df_jan[categorical] = df_jan[categorical].astype(str)

train_dicts = df_jan[categorical].to_dict(orient='records')

# Fit the vectorizer on the training rows; the fitted `dv` is reused later to
# transform the validation rows into the same feature columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)


In [23]:
# Shape of the encoded training matrix: (rows, one-hot feature columns).
X_train.shape

In [26]:
# Distribution of trip durations (minutes) in the filtered frame.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling is its documented replacement. The trailing `;` hides the Axes repr.
sns.histplot(df_jan.duration, kde=True, stat='density');

In [27]:
# Descriptive statistics of the filtered January frame.
df_jan.describe()

In [31]:
# Regression target: the trip duration in minutes, pulled out as a NumPy array.
target = 'duration'
y_train = df_jan[target].to_numpy()
y_train

In [34]:
# Fit a linear regression on the one-hot encoded location features.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [40]:
# In-sample predictions, compared against the actual durations.
y_pred = lr.predict(X_train)

# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# scaling is its documented replacement. Explicit colours keep the two series
# distinguishable on the shared axes.
sns.histplot(y_pred, label='predictions', kde=True, stat='density', color='C0')
sns.histplot(y_train, label='Actual', kde=True, stat='density', color='C1')
plt.legend();

In [37]:
# Training RMSE in minutes. The `squared=False` flag of `mean_squared_error`
# is deprecated and removed in newer scikit-learn releases, so the square root
# is taken explicitly; this works on every version.
np.sqrt(mean_squared_error(y_train, y_pred))

In [50]:
def read_dataframe(filename):
    """Load one month of trip records and prepare it for the duration model.

    Steps: read the parquet file, compute the trip ``duration`` in minutes,
    keep only trips lasting 1-60 minutes (inclusive), replace missing
    location IDs with ``-1`` and cast the location IDs to strings so they can
    be one-hot encoded.

    Parameters
    ----------
    filename : str or path-like
        Parquet file containing ``pickup_datetime``, ``dropOff_datetime``,
        ``PUlocationID`` and ``DOlocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with an added ``duration`` column (float minutes)
        and string-typed ``PUlocationID`` / ``DOlocationID`` columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

    # .copy() detaches the filtered rows so the column assignment below does
    # not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PUlocationID', 'DOlocationID']
    # Missing IDs must get the same -1 placeholder the training data uses.
    # Without the fill they turn into the string 'nan', a category the fitted
    # DictVectorizer has never seen and therefore silently drops.
    df[categorical] = df[categorical].fillna(-1).astype(str)

    return df

In [51]:
# Load February 2021 as the validation set on which the model is evaluated.
df_val = read_dataframe('../input/forhire-vehicle-trip-records-mlops/fhv_tripdata_2021-02.parquet')


In [53]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fit) so X_val has the same columns as X_train.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [54]:
# Actual trip durations (minutes) of the validation rows.
y_val = df_val[target].values

In [55]:
# Predict durations for the validation rows.
y_pred = lr.predict(X_val)

# Validation RMSE in minutes. The `squared=False` flag of `mean_squared_error`
# is deprecated and removed in newer scikit-learn releases, so the square root
# is taken explicitly; this works on every version.
np.sqrt(mean_squared_error(y_val, y_pred))