In [1]:
!python -V

Python 3.9.12


In [2]:
import pandas as pd

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [4]:
# Q1. Downloading the data

df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

print(f"Number of columns: {len(df.columns)}")

Number of columns: 19


In [5]:
# Q2. Computing duration

df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

print(f"Standard deviation of trips duration: {round(df.duration.std(axis=0), 2)}")

Standard deviation of trips duration: 42.59


In [6]:
# Q3. Dropping outliers

df_train = df[(df.duration >= 1) & (df.duration <= 60)]

print(f"Percantage of records left after dropping outliers: {round((df_train.shape[0]/df.shape[0])*100)}%")

Percantage of records left after dropping outliers: 98%


In [7]:
# Q4. One-hot encoding

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df_train[categorical] = df_train[categorical].astype(str)

train_dicts = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print(f"Number of columns of feature matrix: {X_train.shape[1]}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[categorical] = df_train[categorical].astype(str)


Number of columns of feature matrix: 516


In [8]:
# Q5. Training a model

target = 'duration'
y_train = df_train[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

print(f"RMSE on training set: {round(mean_squared_error(y_train, y_pred, squared=False), 3)}")

RMSE on training set: 7.649


In [9]:
# Q6. Evaluating the model

def read_dataframe(filename, categorical):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    df[categorical] = df[categorical].astype(str)
    
    return df

df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet', categorical)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
dv.transform(val_dicts)

X_val = dv.transform(val_dicts)

y_val = df_val[target].values
y_pred = lr.predict(X_val)

print(f"RMSE on validation set: {round(mean_squared_error(y_val, y_pred, squared=False), 3)}")

RMSE on validation set: 7.811
