# Setup

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
import matplotlib.pyplot as plt

import seaborn as sns
# Load the competition data; the timestamp columns are parsed into datetimes.
train = pd.read_csv('../input/nyc-taxi-trip-duration/train.zip',parse_dates=['pickup_datetime','dropoff_datetime'])
test = pd.read_csv('../input/nyc-taxi-trip-duration/test.zip',parse_dates=['pickup_datetime'])

# Box-Cox is only defined for strictly positive inputs, so unusable durations
# must be removed BEFORE the transform. (Filtering `!= 0` afterwards would
# instead discard rows whose *transformed* value is 0, i.e. raw duration 1.)
train = train[train['trip_duration'] > 0].reset_index(drop=True)

# boxcox() returns the transformed values and the fitted lambda; the lambda is
# needed later to map predictions back to the original scale.
train['trip_duration'], boxcox_lambda = boxcox(train['trip_duration'])
# The back-transform cell reads the lambda from `_`, so keep that name bound too.
_ = boxcox_lambda

# Target vector (Box-Cox scale), taken after the row filter so it lines up with `train`.
y = train.trip_duration

# Remove the target and the columns that are not used as features.
train = train.drop(['trip_duration','vendor_id','store_and_fwd_flag'],axis=1)
test = test.drop(['vendor_id','store_and_fwd_flag'],axis=1)

print ('SETUP COMPLETE')

In [None]:
# Preview the first rows of the training frame after the setup cell's column drops.
train.head()

# Feature Engineering

In [None]:
# Column dtypes and non-null counts of the training frame before any features are added.
train.info()

## Creating Distance feature

In [None]:
# 'distance' is the straight-line separation between pickup and dropoff in raw
# coordinate space: sqrt(d_lon^2 + d_lat^2), scaled by 1000. The coordinates are
# used as-is (presumably degrees), so this is a relative measure, not km/miles.
train = train.assign(
    distance=np.sqrt(
        (train.pickup_longitude - train.dropoff_longitude) ** 2
        + (train.pickup_latitude - train.dropoff_latitude) ** 2
    ) * 1000
)
test = test.assign(
    distance=np.sqrt(
        (test.pickup_longitude - test.dropoff_longitude) ** 2
        + (test.pickup_latitude - test.dropoff_latitude) ** 2
    ) * 1000
)

# Row count is unchanged by this step; printed as a sanity check.
print(len(train))

## Creating Month, Day, Hour features

In [None]:
# Split the pickup timestamp into month / day-of-month / hour columns.
train = train.assign(
    pmonth=lambda frame: frame.pickup_datetime.dt.month,
    pday=lambda frame: frame.pickup_datetime.dt.day,
    phour=lambda frame: frame.pickup_datetime.dt.hour,
)
test = test.assign(
    pmonth=lambda frame: frame.pickup_datetime.dt.month,
    pday=lambda frame: frame.pickup_datetime.dt.day,
    phour=lambda frame: frame.pickup_datetime.dt.hour,
)
print(len(train))

## Dropping pickup/dropoff time and id columns

In [None]:
# Feature matrices: the raw timestamps and the row identifier are not model inputs.
# `train` / `test` themselves are left intact (`test.id` is needed for the submission).
X_train = train.drop(columns=['pickup_datetime', 'dropoff_datetime', 'id'])
X_test = test.drop(columns=['pickup_datetime', 'id'])

# Both lengths should match: only columns were removed.
print(len(train), len(X_train), sep='\n')

In [None]:
# Inspect the feature matrix schema after dropping the timestamp and id columns.
X_train.info()

## Creating Pickup Borough for train dataset

In [None]:
# Label every training pickup point with a borough using axis-aligned bounding
# boxes. The boxes overlap, so order matters: np.select keeps the FIRST matching
# box, and points outside all four boxes are labelled 'Queens'.
# Vectorised replacement for a per-row Python loop; `between` is inclusive on
# both ends, matching the original `>=` / `<=` comparisons.
# NOTE(review): the 'Staten_island' east bound (-73.057562) is far wider than
# the other boxes - confirm it is not a typo.
pickup_lon = X_train.pickup_longitude
pickup_lat = X_train.pickup_latitude
train_pickup_borough = np.select(
    [
        pickup_lon.between(-74.040269, -73.865036) & pickup_lat.between(40.574031, 40.736728),
        pickup_lon.between(-74.245856, -73.057562) & pickup_lat.between(40.502863, 40.647234),
        pickup_lon.between(-74.011645, -73.913772) & pickup_lat.between(40.703384, 40.876058),
        pickup_lon.between(-73.931573, -73.781473) & pickup_lat.between(40.797766, 40.912460),
    ],
    ['Brooklyn', 'Staten_island', 'Manhattan', 'Bronx'],
    default='Queens',
)
# Plain column assignment is positional and idempotent: re-running the cell
# overwrites the column instead of appending a duplicate via concat.
X_train['pickup_borough'] = train_pickup_borough

## Creating Pickup Borough for test dataset

In [None]:
# Label every test pickup point with a borough using the same ordered bounding
# boxes as the training data: first matching box wins, everything else is 'Queens'.
# Vectorised replacement for a per-row Python loop; `between` is inclusive on
# both ends, matching the original `>=` / `<=` comparisons.
# NOTE(review): the 'Staten_island' east bound (-73.057562) is far wider than
# the other boxes - confirm it is not a typo.
pickup_lon = X_test.pickup_longitude
pickup_lat = X_test.pickup_latitude
test_pickup_borough = np.select(
    [
        pickup_lon.between(-74.040269, -73.865036) & pickup_lat.between(40.574031, 40.736728),
        pickup_lon.between(-74.245856, -73.057562) & pickup_lat.between(40.502863, 40.647234),
        pickup_lon.between(-74.011645, -73.913772) & pickup_lat.between(40.703384, 40.876058),
        pickup_lon.between(-73.931573, -73.781473) & pickup_lat.between(40.797766, 40.912460),
    ],
    ['Brooklyn', 'Staten_island', 'Manhattan', 'Bronx'],
    default='Queens',
)
# Plain column assignment is positional and idempotent: re-running the cell
# overwrites the column instead of appending a duplicate via concat.
X_test['pickup_borough'] = test_pickup_borough
X_test.info()

## Creating Dropoff Borough for train dataset 

In [None]:
# Label every training dropoff point with a borough using ordered, overlapping
# bounding boxes: the first matching box wins, everything else is 'Queens'.
# Vectorised replacement for a per-row Python loop; `between` is inclusive on
# both ends, matching the original `>=` / `<=` comparisons.
# NOTE(review): the 'Staten_island' east bound (-73.057562) is far wider than
# the other boxes - confirm it is not a typo.
dropoff_lon = X_train.dropoff_longitude
dropoff_lat = X_train.dropoff_latitude
train_dropoff_borough = np.select(
    [
        dropoff_lon.between(-74.040269, -73.865036) & dropoff_lat.between(40.574031, 40.736728),
        dropoff_lon.between(-74.245856, -73.057562) & dropoff_lat.between(40.502863, 40.647234),
        dropoff_lon.between(-74.011645, -73.913772) & dropoff_lat.between(40.703384, 40.876058),
        dropoff_lon.between(-73.931573, -73.781473) & dropoff_lat.between(40.797766, 40.912460),
    ],
    ['Brooklyn', 'Staten_island', 'Manhattan', 'Bronx'],
    default='Queens',
)
# Plain column assignment is positional and idempotent: re-running the cell
# overwrites the column instead of appending a duplicate via concat.
X_train['dropoff_borough'] = train_dropoff_borough
X_train.info()

## Creating Dropoff Borough for test dataset

In [None]:
# Label every test dropoff point with a borough using ordered, overlapping
# bounding boxes: the first matching box wins, everything else is 'Queens'.
# Vectorised replacement for a per-row Python loop; `between` is inclusive on
# both ends, matching the original `>=` / `<=` comparisons.
# NOTE(review): the 'Staten_island' east bound (-73.057562) is far wider than
# the other boxes - confirm it is not a typo.
dropoff_lon = X_test.dropoff_longitude
dropoff_lat = X_test.dropoff_latitude
test_dropoff_borough = np.select(
    [
        dropoff_lon.between(-74.040269, -73.865036) & dropoff_lat.between(40.574031, 40.736728),
        dropoff_lon.between(-74.245856, -73.057562) & dropoff_lat.between(40.502863, 40.647234),
        dropoff_lon.between(-74.011645, -73.913772) & dropoff_lat.between(40.703384, 40.876058),
        dropoff_lon.between(-73.931573, -73.781473) & dropoff_lat.between(40.797766, 40.912460),
    ],
    ['Brooklyn', 'Staten_island', 'Manhattan', 'Bronx'],
    default='Queens',
)
# Plain column assignment is positional and idempotent: re-running the cell
# overwrites the column instead of appending a duplicate via concat.
X_test['dropoff_borough'] = test_dropoff_borough
X_test.info()

## Creating avg borough speed feature

In [None]:
# Exploratory per-trip "speed": the distance feature divided by the target.
# NOTE: `y` is the Box-Cox-transformed duration, not raw seconds, so this ratio
# is not a physical speed. The column exists only on the training frame and is
# dropped again before the model is fitted.
X_train['speed'] = X_train['distance'].div(y)

In [None]:
from typing import List
def assign_borough_speed(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``borough_speed`` column appended.

    The value is a fixed constant looked up from each row's
    ``dropoff_borough`` label.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``dropoff_borough`` column holding the string labels
        produced earlier in this notebook ('Brooklyn', 'Staten_island',
        'Manhattan', 'Bronx', 'Queens').

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows and index as ``df`` plus a float
        ``borough_speed`` column. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``dropoff_borough`` column.
    """
    speed_by_borough = {
        'Staten_island': 20.179187,
        'Bronx': 14.042448,
        'Queens': 12.183943,
        # Was spelled 'Brooklin', which never matched the 'Brooklyn' label and
        # silently gave every Brooklyn row the fallback speed.
        'Brooklyn': 4.583516,
    }
    # Fallback for any label not listed above; with this notebook's labels that
    # is 'Manhattan'.
    default_speed = 3.450198

    borough_speed = (
        df['dropoff_borough']
        .map(speed_by_borough)
        .fillna(default_speed)
        .astype(float)
    )
    # assign() keeps df's own index, so rows cannot be misaligned the way an
    # index-based concat with a freshly built RangeIndex frame could be.
    return df.assign(borough_speed=borough_speed)

In [None]:
# Attach the per-borough constant speed to both feature matrices so train and
# test end up with the same column set.
X_train = assign_borough_speed(X_train)
X_test = assign_borough_speed(X_test)

## Transforming borough from characters to numbers

In [None]:
# Borough columns still hold string labels at this point.
X_train.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the complete, fixed set of borough labels rather than on
# whatever happens to occur in the training pickup column. transform() raises
# on labels it has not seen, so fitting on one column and reusing the encoder
# on three others only works if that column contains every label.
# LabelEncoder sorts its classes, so the integer codes are the same as before
# whenever all five labels are present.
BOROUGH_LABELS = ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten_island']
encoder = LabelEncoder().fit(BOROUGH_LABELS)

# Replace the string labels with their integer codes in both feature matrices.
for frame in (X_train, X_test):
    for borough_col in ('pickup_borough', 'dropoff_borough'):
        frame[borough_col] = encoder.transform(frame[borough_col])
X_train.info()

In [None]:
# Confirm the borough columns now hold integer codes instead of strings.
X_train.head()

In [None]:
# 'speed' is derived from the target and does not exist in X_test, so it must
# not reach the model.
X_train = X_train.drop(columns=['speed'])
X_train.head()

# Model

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor with library-default hyperparameters, trained on
# the Box-Cox-scaled target; fit() returns the fitted estimator itself.
model = XGBRegressor().fit(X_train, y)

# Predictions are on the Box-Cox scale and still need to be back-transformed.
preds = model.predict(X_test)

In [None]:
from scipy.special import inv_boxcox
# Map predictions from the Box-Cox scale back to the original trip_duration scale.
# `_` is the lambda returned by boxcox() in the setup cell.
# NOTE(review): `_` is also IPython's "last output" name, so relying on it is
# fragile - consider reading the lambda from a descriptively named variable.
preds = inv_boxcox(preds,_)

In [None]:
# Build the submission table from the untouched `test` ids and the
# back-transformed predictions, then write it without the index column.
# NOTE(review): the header is written as 'Id' while the source column is 'id' -
# confirm which spelling the submission format expects.
output = pd.DataFrame({'Id': test['id'], 'trip_duration': preds})
output.to_csv('submission.csv', index=False)
output.head()