In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data load

In [None]:
train=pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/train.zip")
test=pd.read_csv("/kaggle/input/nyc-taxi-trip-duration/test.zip")

In [None]:
weather = pd.read_csv("../input/weather-data-in-new-york-city-2016/weather_data_nyc_centralpark_2016(1).csv")
weather.head(30)

In [None]:
train.head()

In [None]:
print(sorted(weather['precipitation'].unique()))
print(sorted(weather['snow fall'].unique()))
print(sorted(weather['snow depth'].unique()))

# Variable assignment and column drop

Formula definitions

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
from math import radians, cos, sin, asin, sqrt

In [None]:
# Haversine formula for distance

def haversine(row, radius_km=6369):
    """Great-circle (haversine) distance between pickup and dropoff.

    Parameters
    ----------
    row : mapping, pandas.Series or pandas.DataFrame
        Must provide 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude' and 'dropoff_latitude' in decimal degrees.
        A single row yields a scalar; a DataFrame yields one distance per
        row, because only NumPy ufuncs are used.
    radius_km : float, default 6369
        Sphere radius the central angle is multiplied by. The default keeps
        the value this notebook has always used.

    Returns
    -------
    float or pandas.Series
        Distance in the same unit as ``radius_km``.
    """
    lon1 = np.radians(row['pickup_longitude'])
    lat1 = np.radians(row['pickup_latitude'])
    lon2 = np.radians(row['dropoff_longitude'])
    lat2 = np.radians(row['dropoff_latitude'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points); clamp it so the arcsine stays defined.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

In [None]:
# Direction formula
def ft_degree(row):
    """Initial bearing from the pickup point to the dropoff point, in degrees.

    Parameters
    ----------
    row : mapping, pandas.Series or pandas.DataFrame
        Must provide 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude' and 'dropoff_latitude' in decimal degrees.
        Only NumPy ufuncs are used, so a whole DataFrame may be passed to get
        one bearing per row.

    Returns
    -------
    float or pandas.Series
        Bearing in the range [-180, 180]: 0 is due north, 90 due east,
        -90 due west.
    """
    lat1 = np.radians(row['pickup_latitude'])
    lat2 = np.radians(row['dropoff_latitude'])
    lon_delta_rad = np.radians(row['dropoff_longitude'] - row['pickup_longitude'])

    # Standard forward-azimuth components on a sphere.
    y = np.sin(lon_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lon_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [None]:
# manhattan distance
# def manhattan(row):
#     lon1 = row['pickup_longitude']
#     lat1 = row['pickup_latitude']
#     lon2 = row['dropoff_longitude']
#     lat2 = row['dropoff_latitude']
#     lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
#     dlon = lon2 - lon1 
#     dlat = lat2 - lat1
#     a = sin(dlon/2)**2
#     c = 2 * asin(sqrt(a))
#     lon_dist = 6369 * c
#     a = sin(dlat/2)**2
#     c = 2 * asin(sqrt(a))
#     lat_dist = 6369 * c
#     km = abs(lat_dist) + abs(lon_dist)
#     return km

In [None]:
#Rush Hour Formula
#NYC Rush Hour reference from Staten Island Ferry Schedule
#6am - 10am Morning Rush
#10am - 4pm Day
#4pm - 8pm Evening Rush
#8pm - 6am Night
def rush_hour_f(row):
    """Map a trip's fractional pickup hour to a time-of-day bucket.

    Reads ``row['real_hour']`` and returns:
      1 -- morning rush, 6 <= hour <= 10
      2 -- daytime,      10 < hour < 16
      3 -- evening rush, 16 <= hour <= 20
      0 -- anything else (night)
    """
    hour_of_day = row['real_hour']
    if 6 <= hour_of_day <= 10:
        bucket = 1
    elif 10 < hour_of_day < 16:
        bucket = 2
    elif 16 <= hour_of_day <= 20:
        bucket = 3
    else:
        bucket = 0
    return bucket

In [None]:
weather.head(20)

In [None]:
# Force the three precipitation-related columns to numeric dtype. Entries that
# cannot be parsed as numbers become NaN, and every NaN in the frame is then
# replaced by 0.
for weather_col in ('precipitation', 'snow fall', 'snow depth'):
    weather[weather_col] = pd.to_numeric(weather[weather_col], errors='coerce')
weather = weather.fillna(0)

In [None]:
weather.head(20)

In [None]:
weather.info()

Train

In [None]:
# Drop outliers: implausibly long trips and coordinates outside a bounding box
# of longitude [-75, -73] and latitude [40, 42].
in_bounds = (
    (train['trip_duration'] < 1000000)
    & train['pickup_longitude'].between(-75, -73)
    & train['pickup_latitude'].between(40, 42)
    & train['dropoff_longitude'].between(-75, -73)
    & train['dropoff_latitude'].between(40, 42)
)
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered view of the raw data.
train = train[in_bounds].copy()

# Raw durations in seconds. This Series keeps the index labels `train` has at
# this point.
duration = train['trip_duration'].copy()

# The model is trained on log(seconds). Re-running this cell without reloading
# `train` would apply the log a second time.
train['trip_duration'] = np.log(train['trip_duration'].values)

In [None]:
#encoding binary categorical
#train = pd.concat([train, pd.get_dummies(train['store_and_fwd_flag'])], axis=1)
#test = pd.concat([test, pd.get_dummies(test['store_and_fwd_flag'])], axis=1)

#train.drop(['store_and_fwd_flag'], axis=1, inplace=True)
#test.drop(['store_and_fwd_flag'], axis=1, inplace=True)

#train = pd.concat([train, pd.get_dummies(train['vendor_id'])], axis=1)
#test = pd.concat([test, pd.get_dummies(test['vendor_id'])], axis=1)

#train.drop(['vendor_id'], axis=1, inplace=True)
#test.drop(['vendor_id'], axis=1, inplace=True)

# dropoff_datetime has no counterpart in the test set, so it cannot be used as a feature.
# Reassigning (rather than dropping in place) avoids mutating a filtered slice.
train = train.drop(columns=['dropoff_datetime'])
train.head()

In [None]:
#Rush hour formula takes a moment ~30s

# Encode the store-and-forward flag as integer class labels.
train['store_and_fwd_flag'] = encoder.fit_transform(train['store_and_fwd_flag'])

# Calendar and time-of-day features derived from the pickup timestamp.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
pickup_dt = train['pickup_datetime'].dt
train['month'] = pickup_dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so cast back to a plain integer (the cast raises if a timestamp is missing).
train['week'] = pickup_dt.isocalendar().week.astype('int64')
train['weekday'] = pickup_dt.weekday
train['minute_of_day'] = pickup_dt.hour * 60 + pickup_dt.minute
train['real_hour'] = train['minute_of_day'] / 60  # fractional hour of the day

train['rush_hour'] = train.apply(rush_hour_f, axis=1)
# weekday runs 0 (Monday) .. 6 (Sunday), so values above 4 are Saturday/Sunday.
train['is_weekend'] = (train['weekday'] > 4).astype(int)

#for weather: year/month/day are recombined into a date for the weather join
train['year'] = pickup_dt.year
train['day'] = pickup_dt.day

train = train.drop(columns=['pickup_datetime'])

In [None]:
# NOTE(review): pd.to_datetime guesses the day/month order here -- confirm the
# format of the weather file's date strings and pass format=/dayfirst= if needed.
weather["date"] = pd.to_datetime(weather["date"])
weather['year'] = weather['date'].dt.year
# drop() returns a new frame, so nothing is mutated on a filtered slice.
weather_2016 = weather[weather["year"] == 2016].drop(columns=["year"])

# Attach the daily weather to every trip via its calendar date.
train['date'] = pd.to_datetime(train[['year', 'month', 'day']], errors='coerce')
# validate= makes the merge fail loudly if a date occurs more than once in the
# weather table, which would otherwise silently duplicate trips.
# The merged frame gets a fresh 0..n-1 index, so Series captured before this
# cell (e.g. `duration`) no longer line up with `train` by label.
left_merge = pd.merge(left=train, right=weather_2016, on="date", how="left",
                      validate="many_to_one")
train = left_merge.drop(columns=['date'])

In [None]:
train.info()

In [None]:
train.head(20)

In [None]:
print(sorted(train['precipitation'].unique()))
print(sorted(train['snow fall'].unique()))
print(sorted(train['snow depth'].unique()))

In [None]:
# Distance (km) and bearing (degrees) between pickup and dropoff.
coord_cols = ['pickup_longitude', 'pickup_latitude',
              'dropoff_longitude', 'dropoff_latitude']
# haversine is applied row by row; handing it only the four float columns
# avoids building a mixed-dtype object row for every trip.
train['distance'] = train[coord_cols].apply(haversine, axis=1)
# ft_degree uses only NumPy ufuncs, so it can process the whole frame in one
# vectorised call instead of a per-row apply.
train['direction'] = ft_degree(train[coord_cols])
train.head()

In [None]:
train.head(10)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

train.boxplot(column='distance', return_type='axes');

In [None]:
train.boxplot(column='direction', return_type='axes');

In [None]:
#Speed in mph
# train['trip_duration'] holds log(seconds) at this point, so exponentiate to
# recover seconds. Taking the duration from `train` itself keeps distance and
# duration on the same rows: the `duration` Series saved before the weather
# merge still carries the pre-merge index labels, and dividing by it would
# align on those stale labels (wrong trips paired, NaN where labels are missing).
trip_seconds = np.exp(train['trip_duration'])
# distance is in km and duration in seconds; 1 km/s = 2236.936292 mph.
train['speed'] = train['distance'] / trip_seconds * 2236.936292
train.head()

In [None]:
train.boxplot(column='speed', return_type='axes');

In [None]:
# Discard trips with an implausible average speed. The New York State maximum
# speed limit is 55 mph; the cut-off used here is 65 mph.
# Rows whose speed is NaN are removed as well, because NaN < 65 is False.
# Chaining drop() returns a new frame instead of mutating a filtered slice.
train = train[train['speed'] < 65].drop(columns=['speed'])

In [None]:
import seaborn as sns
#Correlations between variables
# Restrict to numeric/boolean columns explicitly: from pandas 2.0 on,
# DataFrame.corr() no longer drops non-numeric columns such as the string `id`
# and raises instead.
numeric_corr = train.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(14,5))
sns.heatmap(data=numeric_corr, annot=True, cmap = plt.cm.RdYlBu_r, linewidths=.1, ax=ax).set_title('Correlations between variables');

In [None]:
train.describe()

In [None]:
# Split the prepared training frame into target and features.
# Target: trip_duration, which was replaced by its natural log in the outlier cell.
# Features: every remaining column except the trip identifier.
#extra var: 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 
# NOTE(review): the list above is presumably a set of columns that could also be dropped -- confirm.
Y_var = train['trip_duration']
X_var = train.drop(columns=['id', 'trip_duration'])
X_var.head()

Test

In [None]:
test.head()

In [None]:
#Rush hour formula takes a moment ~15s

# Encode the store-and-forward flag as integer class labels.
# NOTE(review): the encoder is refitted on the test set; the codes match the
# training codes only if both sets contain the same flag values -- confirm.
test['store_and_fwd_flag'] = encoder.fit_transform(test['store_and_fwd_flag'])

# Calendar and time-of-day features derived from the pickup timestamp.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
pickup_dt_test = test['pickup_datetime'].dt
test['month'] = pickup_dt_test.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so cast back to a plain integer (the cast raises if a timestamp is missing).
test['week'] = pickup_dt_test.isocalendar().week.astype('int64')
test['weekday'] = pickup_dt_test.weekday
test['minute_of_day'] = pickup_dt_test.hour * 60 + pickup_dt_test.minute
test['real_hour'] = test['minute_of_day'] / 60  # fractional hour of the day

test['rush_hour'] = test.apply(rush_hour_f, axis=1)
# weekday runs 0 (Monday) .. 6 (Sunday), so values above 4 are Saturday/Sunday.
test['is_weekend'] = (test['weekday'] > 4).astype(int)

#for weather: year/month/day are recombined into a date for the weather join
test['year'] = pickup_dt_test.year
test['day'] = pickup_dt_test.day

test = test.drop(columns=['pickup_datetime'])

In [None]:
# NOTE(review): pd.to_datetime guesses the day/month order here -- confirm the
# format of the weather file's date strings and pass format=/dayfirst= if needed.
weather["date"] = pd.to_datetime(weather["date"])
weather['year'] = weather['date'].dt.year
# drop() returns a new frame, so nothing is mutated on a filtered slice.
weather_2016 = weather[weather["year"] == 2016].drop(columns=["year"])

# Attach the daily weather to every test trip via its calendar date.
test['date'] = pd.to_datetime(test[['year', 'month', 'day']], errors='coerce')
# validate= makes the merge fail loudly if a date occurs more than once in the
# weather table, which would otherwise silently duplicate trips and change the
# number of submission rows.
left_merge_test = pd.merge(left=test, right=weather_2016, on="date", how="left",
                           validate="many_to_one")
test = left_merge_test.drop(columns=['date'])

In [None]:
# Distance (km) and bearing (degrees) between pickup and dropoff.
coord_cols = ['pickup_longitude', 'pickup_latitude',
              'dropoff_longitude', 'dropoff_latitude']
# haversine is applied row by row; handing it only the four float columns
# avoids building a mixed-dtype object row for every trip.
test['distance'] = test[coord_cols].apply(haversine, axis=1)
# ft_degree uses only NumPy ufuncs, so it can process the whole frame in one
# vectorised call instead of a per-row apply.
test['direction'] = ft_degree(test[coord_cols])
test.head()

In [None]:
# Feature frame for the test set: every column except the trip identifier
# (`test` keeps `id`, which the submission cell reads).
#extra var: , 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'
# NOTE(review): the list above is presumably a set of columns that could also be dropped -- confirm.
X_var_test = test.drop(columns=['id'])

In [None]:
X_var_test.shape

In [None]:
# Hold-out split of the labelled data. An integer test_size is an absolute
# number of held-out rows, not a fraction.
# NOTE(review): 625134 presumably mirrors the number of rows in the test set -- confirm.
# NOTE(review): the only later references to these four frames in this notebook
# are in commented-out lines; the model below is fitted on X_var / Y_var.
from sklearn.model_selection import train_test_split

trainXsplit, testXsplit, trainYsplit, testYsplit = train_test_split(X_var, Y_var, test_size=625134, random_state=42)
trainXsplit.shape, trainYsplit.shape, testXsplit.shape, testYsplit.shape

# Model LightGBM

In [None]:
train.info()

In [None]:
# Fit the LightGBM regressor on the full labelled set (X_var -> log trip duration).
#~317s
# NOTE(review): MSE is imported but not used anywhere in the visible notebook.
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb

# Booster hyperparameters passed straight to lgb.train.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is also set to a non-zero value -- confirm whether
# row subsampling is actually intended here.
lgb_params = {
    #'metric' : 'rmse',
    'learning_rate': 0.1,
    'max_depth': 25,
    'num_leaves': 1000, 
    'objective': 'regression',
    'feature_fraction': 0.9,
    'bagging_fraction': 0.5,
    'max_bin': 1000
#     ,'verbosity': 2
    }

#Training on all labeled data using the best parameters
# The target Y_var is log(seconds), so the model's predictions are on the log scale too.
lgb_df = lgb.Dataset(X_var, Y_var)
#lgb_df = lgb.Dataset(trainXsplit, trainYsplit)
# Fixed number of boosting rounds; no validation set or early stopping is supplied.
lgb_model = lgb.train(lgb_params, lgb_df, num_boost_round=1500)

#modelX.fit(trainXsplit, trainYsplit, verbose=True)

In [None]:
#~60s
# Select the test columns by the training feature names so the prediction
# input has the same columns, in the same order, as the training features.
test_col = X_var.columns
#y_pred = np.exp(lgb_model.predict(test[test_col]))
# y_pred stays on the log scale here; it is exponentiated when the submission is built.
y_pred = lgb_model.predict(X_var_test[test_col])

# Submission

In [None]:
# The model was trained on log(trip_duration), so exponentiate the predictions
# to get durations back on the original scale before writing the file.
submission = pd.DataFrame({'id': test.id, 'trip_duration': np.exp(y_pred)})
# Written to the current working directory, without the DataFrame index.
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
submission.shape

# Visualization and accuracy

In [None]:
#continuous df doesn't work
# lgb_cross = lgb.cv(lgb_params, lgb_df, num_boost_round=1500)

In [None]:
#from sklearn.metrics import accuracy_score
#modelX.score(X_var, Y_var)

# Sources
Tutorials
* https://python-visualization.github.io/folium/quickstart.html
* https://www.kaggle.com/dcstang/create-table-of-contents-in-a-notebook#Introduction
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop_duplicates.html
* https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
* https://pandas.pydata.org/docs/reference/api/pandas.Series.between.html
* https://stackoverflow.com/questions/32980087/pandas-not-in-in-and-between
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
* https://stackoverflow.com/questions/53872905/iterate-over-first-n-rows-in-pandas
* https://stackoverflow.com/questions/15891038/change-column-type-in-pandas
* https://pandas.pydata.org/docs/reference/api/pandas.to_numeric.html
* https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html


Datasets
* https://www.kaggle.com/c/nyc-taxi-trip-duration/data
* https://www.kaggle.com/datasets/mathijs/weather-data-in-new-york-city-2016

Other notebooks and reference data
* https://www.kaggle.com/alexisbcook/manipulating-geospatial-data
* https://www.kaggle.com/camnugent/geospatial-feature-engineering-and-visualization
* https://www.kaggle.com/quentinmonmousseau/ml-workflow-lightgbm-0-37-randomforest-0-39
* https://www.kaggle.com/brianfong192/predicting-trip-durations-with-xgboost-lb-0-433?scriptVersionId=1509647
* https://www.gps-coordinates.net/
* https://www.siferry.com/schedules.html


Extra
* https://www.sheknows.com/living/articles/1126288/man-get-hit-nyc-taxi-playing-pokemon-go/
* https://abcnews.go.com/Travel/ny-la-taxi-ride-friends-celebrating-birthday/story?id=13452198