ref : https://www.kaggle.com/aiswaryaramachandran/eda-baseline-model-0-40-rmse

Previous kernel(EDA) : https://www.kaggle.com/samkim7351/200603-ny-taxi-study-01-eda?scriptVersionId=35969066

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', '{:.5f}'.format)

from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import pickle
from haversine import haversine_vector
import pyproj

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip
/kaggle/input/nyc-taxi-trip-duration/train.zip
/kaggle/input/nyc-taxi-trip-duration/test.zip


## Load data

In [2]:
train = pd.read_csv('../input/nyc-taxi-trip-duration/train.zip')
test = pd.read_csv('../input/nyc-taxi-trip-duration/test.zip')

print(train.shape)
display(train.head(2))

print(test.shape)
test.head(2)

(1458644, 11)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.98215,40.76794,-73.96463,40.7656,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98042,40.73856,-73.99948,40.73115,N,663


(625134, 9)


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.98813,40.73203,-73.99017,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.9642,40.67999,-73.95981,40.6554,N


## Feature Engineering
- I will drop unneeded column : dropoff_datetime. (from train dataset)
- I will only keep lat lng rounded to third decimal places.

In [3]:
# wday_list = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# wday_dict = {key : idx for idx, key in enumerate(wday_list)}
kmeans_pickup = KMeans(n_clusters=8, random_state=0).fit(train[['pickup_latitude', 'pickup_longitude']].values)
kmeans_dropoff = KMeans(n_clusters=8, random_state=0).fit(train[['dropoff_latitude', 'dropoff_longitude']].values)

def ftr_engineer(df, kmeans_pickup=kmeans_pickup, kmeans_dropoff=kmeans_dropoff):
    # Drop unneeded column
    if 'dropoff_datetime' in df.columns:
        print('>>> Droping column - dropoff_datetime')
        df = df.drop(columns='dropoff_datetime')
        pass
    
    # Convert column to datetime
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    
    # Generate datetime related features
    df['pickup_date'] = df['pickup_datetime'].dt.date
    df['pickup_day'] = df['pickup_datetime'].dt.day
    df['pickup_hour'] = df['pickup_datetime'].dt.hour
    df['pickup_dayofweek'] = df['pickup_datetime'].dt.dayofweek
    
    # Round lat, lng columns to the third decimal place
    for prefix in ['pickup', 'dropoff']:
        df[f'{prefix}_longitude'] = df[f'{prefix}_longitude'].round(3)
        df[f'{prefix}_latitude'] = df[f'{prefix}_latitude'].round(3)
        pass
    
    # Generate trip_distance column
    df['trip_distance'] = haversine_vector( df[['pickup_latitude', 'pickup_longitude']].to_numpy(),
                                            df[['dropoff_latitude', 'dropoff_longitude']].to_numpy() )
    
    # Generate bearing column
    df['bearing'] = pyproj.Geod(ellps='WGS84').inv( df['pickup_longitude'].to_numpy(), df['pickup_latitude'].to_numpy(),
                                                    df['dropoff_longitude'].to_numpy(), df['dropoff_latitude'].to_numpy() )[0]
    
    # Generate cluster column
    df['pickup_cluster'] = kmeans_pickup.predict(df[['pickup_latitude', 'pickup_longitude']].values)
    df['dropoff_cluster'] = kmeans_dropoff.predict(df[['dropoff_latitude', 'dropoff_longitude']].values)
    
    return df


bfr_shape = train.shape
train = ftr_engineer(train)
test = ftr_engineer(test)

print(f'>>> shape of train dataset changed from {train.shape} to {train.shape}')

>>> Droping column - dropoff_datetime
>>> shape of train dataset changed from (1458644, 18) to (1458644, 18)
