In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import GradientBoostingRegressor

import pickle
import os

In [8]:
def read_dataframe(link):
    df=pd.read_parquet(link)

    df.lpep_dropoff_datetime=pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime=pd.to_datetime(df.lpep_pickup_datetime)

    df['duration']=df.lpep_dropoff_datetime-df.lpep_pickup_datetime
    df.duration=df.duration.apply(lambda td : td.total_seconds()/60)

    df=df[(df.duration>=1) & (df.duration<=60)]

    categorical=['PULocationID','DOLocationID']
    numerical=['trip_distance']

    df[categorical]=df[categorical].astype(str)
    
    return df

In [9]:
train_link="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet"
val_link="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet"

In [10]:
categorical=['PULocationID','DOLocationID']
numerical=['trip_distance']

In [11]:
df_train=read_dataframe(train_link)
df_val=read_dataframe(val_link)

In [12]:
len(df_train),len(df_val)

(73908, 61921)

In [17]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

def train_model(model, df_train, df_val, categorical, numerical, target="duration"):
    # Initialize DictVectorizer
    dv = DictVectorizer()

    # Convert training data to dictionary format and transform using DictVectorizer
    train_dicts = df_train[categorical + numerical].to_dict(orient="records")
    X_train = dv.fit_transform(train_dicts)

    # Convert validation data to dictionary format and transform using DictVectorizer
    val_dicts = df_val[categorical + numerical].to_dict(orient="records")
    X_val = dv.transform(val_dicts)

    # Define target variable
    y_train = df_train[target].values
    y_val = df_val[target].values

    # Train the model
    model.fit(X_train, y_train)

    # Predict on validation data
    y_pred = model.predict(X_val)

    # Calculate and return Root Mean Squared Error (RMSE)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    print(f"RMSE: {rmse}")

    return dv, model


In [18]:
model=GradientBoostingRegressor()

dv,model=train_model(model=GradientBoostingRegressor(),
            df_train=df_train,
            df_val=df_val,categorical=categorical,numerical=numerical)

RMSE: 6.586400128565974


In [19]:
import os
import pickle

# Ensure the 'models' directory exists
os.makedirs('models', exist_ok=True)

# Save the model and dictionary vectorizer
with open('models/xgboost_model.bin', 'wb') as file_out:
    pickle.dump((dv, model), file_out)


## Homework starts here

In [20]:
yello_taxi_link_jan_23="https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
yello_taxi_link_feb_23="https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"

In [21]:
train_df=pd.read_parquet(yello_taxi_link_jan_23)
val_df=pd.read_parquet(yello_taxi_link_feb_23)

In [22]:
train_df.shape

(3066766, 19)

### Answer to Q1 is 19

In [23]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [24]:
train_df.tpep_dropoff_datetime=pd.to_datetime(train_df.tpep_dropoff_datetime)
train_df.tpep_pickup_datetime=pd.to_datetime(train_df.tpep_pickup_datetime)

In [25]:
train_df['duration']=train_df.tpep_dropoff_datetime-train_df.tpep_pickup_datetime

In [26]:
train_df.duration=train_df.duration.apply(lambda td : td.total_seconds()/60)

In [27]:
train_df['duration'].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

### Answer to Q2 is 42.59

In [28]:
r,c=df_train.shape

In [29]:
df_train=df_train[(df_train.duration>=1) & (df_train.duration<=60)]

In [30]:
# Assuming df_train is your DataFrame
filtered_df = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]

# Display the filtered DataFrame
print(filtered_df.shape)


(73908, 21)


In [31]:
(73908/3066766)*100

2.4099654163376014

### Answer to Q3 is 98%

In [32]:
model=LinearRegression()

NameError: name 'LinearRegression' is not defined

In [None]:
train_model(model=model,
            df_train=df_train,
            df_val=df_val,
            categorical=categorical,numerical=numerical)