In [None]:
!pip install pyarrow pandas matplotlib seaborn scikit-learn

In [None]:
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [2]:
# Read both monthly files in one pass; January is loaded first, then February.
january_df, february_df = map(
    pd.read_parquet,
    ("fhv_tripdata_2021-01.parquet", "fhv_tripdata_2021-02.parquet"),
)
january_df  # Question 1


Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285


In [3]:
def add_duration_column(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_datetime`` and ``dropOff_datetime`` columns whose
        difference is a timedelta Series.
    """
    trip_length = df["dropOff_datetime"] - df["pickup_datetime"]
    # The vectorised ``.dt`` accessor avoids one Python-level call per row and
    # always yields a float column, including for an empty frame.
    df["duration"] = trip_length.dt.total_seconds() / 60


# Attach the duration (in minutes) to both months before any filtering.
for trips in (january_df, february_df):
    add_duration_column(trips)

january_df["duration"].mean()  # Question 2


19.167224093791006

In [4]:
# Optional: plot the distribution of trip durations (left disabled).
# sns.displot(january_df.duration)

In [5]:
def drop_outliers(df, min_minutes=1, max_minutes=60):
    """Return the rows whose ``duration`` lies within the given bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    min_minutes, max_minutes : number, optional
        Inclusive lower and upper bounds on ``duration``. The defaults (1 and
        60) reproduce the original hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows within the bounds. Rows whose
        duration is NaN are dropped because both comparisons are False for NaN.
    """
    durations = df["duration"]
    in_range = (durations >= min_minutes) & (durations <= max_minutes)
    # .copy() detaches the result from ``df`` so that later column assignments
    # on it do not raise SettingWithCopyWarning or touch the input frame.
    return df[in_range].copy()


# Record the January row count first so the number of removed trips can be reported.
initial_len = january_df.shape[0]
january_df, february_df = drop_outliers(january_df), drop_outliers(february_df)

initial_len - january_df.shape[0]


44286

In [7]:
def prepare_location_ids(df, columns=("PUlocationID", "DOlocationID")):
    """Convert location-ID columns to strings, marking missing IDs as ``'-1'``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : iterable of str, optional
        Columns to convert. Defaults to the pickup and drop-off location IDs.
    """
    for column in columns:
        # Fill before casting: casting first would turn NaN into the string
        # 'nan', which fillna would then no longer recognise as missing.
        df[column] = df[column].fillna('-1').astype(str)

for trips in (january_df, february_df):
    prepare_location_ids(trips)

# Question 3: share of January trips with no pickup location recorded
missing_pickup = january_df["PUlocationID"] == '-1'
len(january_df[missing_pickup]) / len(january_df)


0.8352732770722617

In [8]:
feature_columns = ["PUlocationID", "DOlocationID"]

# Turn each trip into a {column: value} record that DictVectorizer can encode.
train_dicts = january_df[feature_columns].to_dict("records")
test_dicts = february_df[feature_columns].to_dict("records")

# The vectorizer is fitted on January only; February reuses that fitted encoding.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_test = dv.transform(test_dicts)

# Question 4
X_train.shape


(1109826, 525)

In [9]:
# Fit a linear regression on the January trips and score it on that same data.
target = "duration"
y_train = january_df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# Question 5: RMSE on the training data.
# The square root is taken explicitly because the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_train, y_pred) ** 0.5


10.528519425310185

In [10]:
# Score the fitted model on the February trips.
y_test = february_df[target].values
y_test_pred = lr.predict(X_test)

# Question 6: RMSE on the February data.
# The square root is taken explicitly because the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_test_pred) ** 0.5


11.014285828610237