In [None]:
%pwd
%ls
%cd ..
%cd /content/drive/MyDrive

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.read_parquet('yellow_tripdata_2022-01.parquet')

In [None]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [None]:
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)

df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = df.duration.apply(lambda td: td.total_seconds() / 60)

In [None]:
df.duration.describe().count

<bound method Series.count of count    2.463931e+06
mean     1.421220e+01
std      4.644531e+01
min     -3.442400e+03
25%      6.316667e+00
50%      1.018333e+01
75%      1.616667e+01
max      8.513183e+03
Name: duration, dtype: float64>

In [None]:
with_outliers = len(df) # get the count from the describe()
no_outliers   = ((df.duration >= 1) & (df.duration <= 60)).sum()

no_outliers / with_outliers

0.9827547930522406

In [None]:
df1 = df[(df.duration >= 1) & (df.duration <= 60)].copy()

categorias = ['PULocationID', 'DOLocationID'] 

df1[categorias] = df1[categorias].fillna(-1).astype('int')
df1[categorias] = df1[categorias].astype('str')

In [None]:
training_dictionaries = df1[categorias].to_dict(orient = 'records')

In [None]:
dv = DictVectorizer()
X_train = dv.fit_transform(training_dictionaries)

In [None]:
X_train.shape

(2421440, 515)

In [None]:
y_train = df1.duration.values

In [None]:
lr = LinearRegression()
lr.fit(X_train ,y_train )

In [None]:
y_pred = lr.predict(X_train)
mean_squared_error(y_train, y_pred, squared = False)

6.986190742248472

In [None]:
categorias = ['PULocationID', 'DOLocationID'] 

def new_data(filename):
  df = pd.read_parquet(filename)
  df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
  df['duration'] = df.duration.apply(lambda td: td.total_seconds() / 60)
  
  df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
  categorias = ['PULocationID', 'DOLocationID'] 
  df[categorias] = df[categorias].fillna(-1).astype('int')
  df[categorias] = df[categorias].astype('str')

  return df





In [71]:
df_val = new_data('yellow_tripdata_2022-02.parquet')

In [73]:

val_dict = df_val[categorias].to_dict(orient='records')

In [74]:
X_val = dv.transform(val_dict)

In [79]:
y_val = df_val.duration.values
y_pred = lr.predict(X_val)

In [80]:
mean_squared_error(y_val, y_pred, squared = False)

7.78640662117552