In [None]:
import numpy as np 
import pandas as pd 
import zipfile

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv straight out of the competition archive. The context managers
# close both the archive and the member handle as soon as the frame is loaded,
# instead of leaving them open for the lifetime of the kernel.
with zipfile.ZipFile('/kaggle/input/nyc-taxi-trip-duration/train.zip') as train_z:
    with train_z.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv)
train.head()


In [None]:
train.shape

In [None]:
train.info()

`pickup_datetime` and `dropoff_datetime` have to be converted from the `object` dtype to datetime. The `store_and_fwd_flag` column is categorical; we'll convert it to a numeric value.

## Data Preprocessing
Converting categorical feature to numeric

In [None]:
train["store_and_fwd_flag"].value_counts()

In [None]:
def f(x):
    """Encode one store_and_fwd_flag value as 0/1.

    'N' maps to 0 and every other value maps to 1. An already-encoded 0 is
    also mapped to 0, so re-running this cell leaves the column unchanged
    instead of turning every row into 1.
    """
    return 0 if x in ('N', 0) else 1


# Series.map calls f once per element; no extra lambda wrapper is needed.
train['store_and_fwd_flag'] = train['store_and_fwd_flag'].map(f)
train.head()

Converting object type to datetime.

In [None]:
# Parse both timestamp columns into datetime values using one explicit
# 'YYYY-MM-DD HH:MM:SS' layout.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for ts_col in ("dropoff_datetime", "pickup_datetime"):
    train[ts_col] = pd.to_datetime(train[ts_col], format=timestamp_format)

In [None]:
train.info()

## Feature Engineering
Creating features such as month, day of month, weekday, hour and minute from the `pickup_datetime` column.

In [None]:
# Split the pickup timestamp into calendar/time components.
# The tuple order fixes the order in which the new columns are appended.
pickup_dt = train["pickup_datetime"].dt
for part in ("month", "day", "weekday", "hour", "minute"):
    train[f"pickup_{part}"] = getattr(pickup_dt, part)

Calculating the latitudinal and longitudinal differences between the pickup and dropoff locations, and then the geodesic (shortest-path) distance between each pair of points.
The signed latitudinal and longitudinal differences also give a sense of the direction of travel.

In [None]:
# Signed coordinate deltas, computed as dropoff minus pickup.
for axis in ("latitude", "longitude"):
    train[f"{axis}_difference"] = train[f"dropoff_{axis}"] - train[f"pickup_{axis}"]

In [None]:
from geopy.distance import geodesic
def get_distance(source_lat, source_long, dest_lat, dest_long):
    """Return the geodesic distance in miles between source and destination.

    Each point is given as a latitude followed by a longitude.
    """
    origin = (source_lat, source_long)
    destination = (dest_lat, dest_long)
    return geodesic(origin, destination).miles


In [None]:
# Geodesic miles for every trip. Walking the four coordinate columns in
# lockstep yields the same values as DataFrame.apply(axis=1), but avoids
# building a full row Series for every trip, which is far slower on a large
# frame.
train['distance'] = [
    get_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        train['pickup_latitude'],
        train['pickup_longitude'],
        train['dropoff_latitude'],
        train['dropoff_longitude'],
    )
]
train.head()

## Anomaly Detection

In [None]:
import seaborn as sns
# First look at trip duration against geodesic distance, before any filtering.
sns.scatterplot(x="distance", y="trip_duration", data=train)

In [None]:
from sklearn.ensemble import IsolationForest
# Label each trip from its (distance, trip_duration) pair. The contamination
# setting asks the forest to treat about 1% of the rows as outliers; later
# cells treat -1 as an outlier and 1 as a normal trip.
anomaly_inputs = train[['distance', 'trip_duration']]
clf = IsolationForest(contamination=0.01, random_state=42)
train['Anomaly'] = clf.fit_predict(anomaly_inputs)
train['Anomaly'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Set the figure size BEFORE the first pyplot call. plt.title() implicitly
# creates the figure, so changing rcParams after it has no effect on this plot.
plt.rcParams['figure.figsize'] = [15, 7]

outlier_trips = train.loc[train.Anomaly == -1]
normal_trips = train.loc[train.Anomaly == 1]

plt.title("Outlier vs. Normal Trips")
# Outliers are drawn first and normal trips on top, as before.
plt.scatter(outlier_trips['distance'], outlier_trips['trip_duration'],
            c='red', label='Outlier')
plt.scatter(normal_trips['distance'], normal_trips['trip_duration'],
            c='green', label='Normal')
plt.xlabel("distance (miles)")
plt.ylabel("trip_duration")
plt.legend()
plt.show()

In [None]:
# Keep only the trips labelled as normal (Anomaly == 1); copy so that later
# column assignments do not write into a view of the unfiltered frame.
is_normal_trip = train['Anomaly'].eq(1)
train = train[is_normal_trip].copy()

## Including New York weather data in prediction

In [None]:
# Load the KNYC METAR weather observations.
weather_csv_path = '../input/knycmetars2016/KNYC_Metars.csv'
weather_df = pd.read_csv(weather_csv_path)
weather_df.head()

In [None]:
# Parse the observation time, then expose its parts under the same
# "pickup_*" names used on the trip frame so the two can be joined on them.
weather_df["Time"] = pd.to_datetime(weather_df["Time"])
observation_dt = weather_df["Time"].dt
for part in ("year", "month", "day", "hour"):
    weather_df[f"pickup_{part}"] = getattr(observation_dt, part)

In [None]:
# Restrict to 2016 observations. The join with the trips uses only month, day
# and hour, so rows from any other year would collide on those keys.
is_2016 = weather_df["pickup_year"].eq(2016)
weather_df = weather_df.loc[is_2016]

In [None]:
weather_df.head()

Merging the weather dataframe with the train dataframe 

In [None]:
# Attach the weather observation for each trip's pickup month/day/hour.
weather_keys = ["pickup_month", "pickup_day", "pickup_hour"]
weather_features = ["Temp.", "Windchill", "Humidity", "Pressure", "Dew Point", "Visibility",
                    "Wind Dir", "Wind Speed", "Gust Speed", "Precip", "Conditions"]

# A left join repeats a trip once per matching weather row, so keep a single
# observation per (month, day, hour) before joining.
# NOTE(review): when an hour has several observations the first one is kept -
# confirm that is the preferred choice.
hourly_weather = (
    weather_df[weather_keys + weather_features]
    .drop_duplicates(subset=weather_keys)
)

# merge() returns a new frame, so `train` itself is left untouched.
final_df = train.merge(hourly_weather, how="left", on=weather_keys)
assert len(final_df) == len(train), "weather merge changed the number of trips"

In [None]:
final_df.head()

Preprocessing the categorical feature of Conditions

In [None]:
final_df["Conditions"].unique()

Most of these weather conditions are very similar, so it does not make sense to encode them separately. We'll group them into 5 weather categories (cloudy/hazy, unknown, clear, rain and snow). For the same reason, we'll group the wind directions into 11 categories, including one for missing values.

In [None]:
# Trips with no weather condition get an explicit 'Unknown' label.
conditions = final_df["Conditions"]
final_df["Conditions"] = conditions.where(conditions.notna(), 'Unknown')

In [None]:
# Weather condition labels grouped by the integer code they are encoded as.
weather_groups = {
    0: ('Overcast', 'Haze', 'Partly Cloudy', 'Mostly Cloudy',
        'Scattered Clouds', 'Light Freezing Fog'),
    1: ('Unknown',),
    2: ('Clear',),
    3: ('Heavy Rain', 'Rain', 'Light Freezing Rain', 'Light Rain'),
    4: ('Heavy Snow', 'Light Snow', 'Snow'),
}
weather_dict = {
    label: code
    for code, labels in weather_groups.items()
    for label in labels
}

# Direct indexing is deliberate: a label missing from weather_dict raises
# KeyError rather than being silently turned into NaN.
final_df["Conditions"] = [weather_dict[label] for label in final_df["Conditions"]]

In [None]:
final_df["Wind Dir"].unique()

In [None]:
# Missing wind directions get their own 'Unknown' label.
final_df["Wind Dir"] = final_df["Wind Dir"].fillna('Unknown')

# Each tuple is one category; its position in the list is the integer code.
wind_groups = [
    ('East', 'ENE', 'ESE'),
    ('West', 'WSW', 'WNW'),
    ('South', 'SSE', 'SSW'),
    ('North', 'NNE', 'NNW'),
    ('Variable',),
    ('Calm',),
    ('SW',),
    ('NW',),
    ('NE',),
    ('SE',),
    ('Unknown',),
]
wind_dir_dict = {
    direction: code
    for code, directions in enumerate(wind_groups)
    for direction in directions
}

# Direct indexing is deliberate: an unlisted direction raises KeyError.
final_df["Wind Dir"] = [wind_dir_dict[direction] for direction in final_df["Wind Dir"]]

## LightGBM Model
We'll compare model performance with and without the weather data. We'll use the Root Mean Squared Logarithmic Error (RMSLE) as our evaluation metric.

In [None]:
# Separate the target from the features; identifiers and raw timestamps are
# excluded from the feature matrix.
# NOTE(review): 'Anomaly' is not dropped here, so it stays in X even though
# the earlier inlier filter leaves it holding only the value 1 - confirm this
# is intended.
target_col = "trip_duration"
excluded_cols = [target_col, "id", "vendor_id", "pickup_datetime", "dropoff_datetime"]
X = final_df.drop(columns=excluded_cols)
y = final_df[target_col]

In [None]:
# Split the data into training, validation, and test sets
from sklearn.model_selection import train_test_split
# Hold out 30% for testing, then carve 25% of the remaining rows off as the
# validation set.
holdout_parts = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_test, y_train, y_test = holdout_parts

validation_parts = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)
X_train, X_val, y_train, y_val = validation_parts

In [None]:
def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Sequences of equal length (lists, NumPy arrays or pandas Series).
        Values are compared position by position.

    Returns
    -------
    float
        sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)).
    """
    # Coerce to plain arrays first. Two pandas Series would otherwise be
    # aligned by index label instead of by position (giving NaN when the
    # indexes differ), and plain lists do not support `+ 1` at all.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert len(y_true) == len(y_pred)
    # log1p(x) computes log(1 + x) and stays accurate for values close to 0.
    squared_log_error = np.square(np.log1p(y_pred) - np.log1p(y_true))
    return np.sqrt(squared_log_error.mean())

In [None]:
import lightgbm as lgb
# LightGBM training parameters.
params = {
    'boosting':'gbdt',
    'learning_rate':      0.05,
    'max_depth':          14,
    # Row subsampling (bagging). LightGBM ignores `subsample` unless
    # `subsample_freq` is non-zero, so resample on every iteration.
    'subsample':          0.9,
    'subsample_freq':     1,
    'colsample_bytree':   0.7,
}
# Number of boosting rounds passed to lgb.train.
nrounds = 2000

In [None]:
# The model is fit on log(duration + 1) rather than on the raw duration.
log_y_train = np.log(y_train + 1)
log_y_val = np.log(y_val + 1)

dtrain = lgb.Dataset(X_train, label=log_y_train)
# `reference=dtrain` ties the validation set to the training Dataset.
dval = lgb.Dataset(X_val, label=log_y_val, reference=dtrain)

# (dataset, name) pairs for the training and validation sets.
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [None]:
# Look up the categorical columns by name instead of hard-coding positions.
# Positions silently drift whenever a column is added to or removed from X.
# NOTE(review): the previous literal [20, 24] does not appear to land on these
# two integer-encoded columns with the current feature layout - confirm.
categorical_cols = ['Wind Dir', 'Conditions']
categorical_idx = [X_train.columns.get_loc(col) for col in categorical_cols]

# Train the booster, reporting metrics on both the training and validation sets.
bst = lgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                valid_sets = [dtrain, dval],
                valid_names = ['train', 'valid'],
                categorical_feature = categorical_idx
                )

In [None]:
# The model predicts log(duration + 1); invert that transform to get durations.
log_pred = bst.predict(X_test)
pred = np.exp(log_pred) - 1