# imports

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from geopy import distance
import math

from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint

# Data Load

In [None]:
#base_path =  "./dataset/"
base_path = "../input/nyc-taxi-trip-duration/"

In [None]:
os.listdir(base_path)

# Data Exploration

In [None]:
data_df = pd.read_csv(base_path+"train.csv")

In [None]:
data_df.head()

In [None]:
data_df.describe()

### vendor_id

In [None]:
sns.boxplot(x='vendor_id', y='trip_duration', data=data_df)

Data points with trip_duration greater than 500000

In [None]:
data_df[(data_df['trip_duration']>500000)].count()

**Therefore permanently dropping these rows**

In [None]:
data_df.drop(data_df[data_df['trip_duration'] > 500000].index, inplace=True)

### Distribution of trip_duration

In [None]:
sns.distplot(data_df['trip_duration'], bins=20000)

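Since the distribution is right-skewed (a long tail of large durations), we take the log of it. The natural log is used here, and predictions are mapped back with exp at the end.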

In [None]:
sns.distplot(np.log(data_df['trip_duration']), bins=20000)

In [None]:
data_df['trip_duration'] = data_df['trip_duration'].apply(math.log)

### vendor_id

In [None]:
# NOTE(review): trip_duration was log-transformed in the cell above (and raw
# values above 500000 were dropped before that), so every value here is far
# below 100000 and this filter keeps all non-NaN rows. If the intent was to cap
# raw durations at 100000 s, compare against math.log(100000) or move this
# filter before the log step — confirm.
data_df = data_df[(data_df['trip_duration']<100000)]

In [None]:
data_df.reset_index(drop = True, inplace=True)

In [None]:
sns.countplot(x = 'vendor_id', data=data_df)

In [None]:
sns.boxplot(x='vendor_id', y='trip_duration', data=data_df)

From the above graph we can see **vendor_id** can play a key role

### passenger_count

In [None]:
sns.countplot(x = 'passenger_count', data=data_df)

In [None]:
sns.boxplot(x = 'passenger_count', y = 'trip_duration', data=data_df)

**passenger_count** is an important feature

### store_and_fwd_flag

In [None]:
sns.countplot(x = 'store_and_fwd_flag', data=data_df)

In [None]:
data_df[data_df['store_and_fwd_flag']=='Y']['trip_duration'].describe()

In [None]:
sns.distplot(data_df[data_df['store_and_fwd_flag']=='Y']['trip_duration'], bins=1000)

In [None]:
data_df[data_df['store_and_fwd_flag']=='N']['trip_duration'].describe()

In [None]:
sns.distplot(data_df[data_df['store_and_fwd_flag']=='N']['trip_duration'], bins=1000)

In [None]:
sns.boxplot(x = 'store_and_fwd_flag', y = 'trip_duration', data=data_df)

### Time

In [None]:
def strtodatetime(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.strptime(x, timestamp_format)
    return parsed

In [None]:
data_df['pickup_datetime'] = data_df['pickup_datetime'].apply(strtodatetime)

Only the pickup time is considered for the day of week.

In [None]:
def dayofweek(x):
    """Return ``x.weekday()``; for datetime objects Monday is 0 and Sunday is 6."""
    weekday_index = x.weekday()
    return weekday_index

In [None]:
data_df['day_of_week'] = data_df['pickup_datetime'].apply(dayofweek)

In [None]:
sns.boxenplot(x='day_of_week', y='trip_duration', data=data_df)

Will see with and without this feature.

### Distance

In [None]:
distance.distance((data_df['pickup_latitude'].iloc[0],
                  data_df['pickup_longitude'].iloc[0]),
                 (data_df['dropoff_latitude'].iloc[0],
                 data_df['dropoff_longitude'].iloc[0])).m

In [None]:
def dist(x):
    """Return the distance in metres between a pickup and a dropoff point.

    x: sequence (list, tuple, or DataFrame row) whose first four items are, in
       order: pickup latitude, pickup longitude, dropoff latitude, dropoff
       longitude. Any further items are ignored.
    """
    # Unpack by iteration instead of x[0]..x[3]: integer keys on a
    # label-indexed pandas row rely on a deprecated positional fallback.
    lat_from, lon_from, lat_to, lon_to, *_ = x
    return distance.distance((lat_from, lon_from), (lat_to, lon_to)).m

In [None]:
# Pickup-to-dropoff distance for every trip, computed row by row with dist().
data_df['dist'] = data_df[
    ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
].apply(dist, axis=1)

In [None]:
sns.distplot(data_df['dist'], bins=1000)

In [None]:
data_df[(data_df['pickup_latitude']==data_df['dropoff_latitude']) &
        (data_df['pickup_longitude']==data_df['dropoff_longitude'])].count()

Some rows have the same pickup and drop-off location, so their dist is 0. Since log(0) is undefined, we replace a dist of 0 with 1 (metre) and then take the log.

In [None]:
#data_df.drop(data_df[(data_df['pickup_latitude']==data_df['dropoff_latitude']) &
#        (data_df['pickup_longitude']==data_df['dropoff_longitude'])].index,
#            inplace=True)

In [None]:
# Identical pickup/dropoff points give a distance of 0; map it to 1 so the
# log transform that follows yields 0 instead of failing on log(0).
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: pandas deprecates that chained in-place form, and under
# copy-on-write it leaves data_df unchanged.
data_df['dist'] = data_df['dist'].replace(to_replace=0, value=1)

In [None]:
#log
data_df['dist'] = data_df['dist'].apply(math.log)

In [None]:
sns.distplot(data_df['dist'], bins=1000)

### Time Conversion

In [None]:
# Only the time of day is encoded here (the period is one day).
MAX_SECONDS_IN_DAY = 24*60*60
def timetosectosincosday(x):
    """Encode the time of day of *x* as a point on the unit circle.

    x: datetime-like object exposing year/month/day and supporting
       subtraction of a datetime.
    Returns a (sin, cos) tuple of the angle
    2*pi * seconds_since_midnight / MAX_SECONDS_IN_DAY.
    """
    midnight = datetime(x.year, x.month, x.day)
    seconds_since_midnight = (x - midnight).total_seconds()
    angle = 2*math.pi*(seconds_since_midnight/MAX_SECONDS_IN_DAY)
    return math.sin(angle), math.cos(angle)

In [None]:
data_df['pickup_sin_sec'] ,data_df['pickup_cos_sec'] = zip(*data_df['pickup_datetime'].map(timetosectosincosday))

In [None]:
# Cyclical encoding of the day of week.
MAX_DAY_IN_WEEK = 6 # largest weekday index; days run 0-6
def dayofweektosincosday(day):
    """Encode a weekday index as a point on the unit circle.

    day: weekday index in 0..MAX_DAY_IN_WEEK (0-6).
    Returns a (sin, cos) tuple of the angle 2*pi * day / 7.
    """
    # The cycle has MAX_DAY_IN_WEEK + 1 = 7 distinct days. Dividing by 6 made
    # day 6 complete a full turn and land on the same point as day 0, so the
    # first and last day of the week were indistinguishable.
    days_in_week = MAX_DAY_IN_WEEK + 1
    angle = 2*math.pi*(day/days_in_week)
    return math.sin(angle), math.cos(angle)

In [None]:
data_df['sin_day'] ,data_df['cos_day'] = zip(*data_df['day_of_week'].map(dayofweektosincosday))

# Normalize

In [None]:
df_processed =  data_df[['vendor_id',
                        'passenger_count',
                        'pickup_sin_sec',
                        'pickup_cos_sec',
                        'pickup_longitude',
                        'pickup_latitude',
                        'dropoff_longitude',
                        'dropoff_latitude',
                        'sin_day',
                        'cos_day',
                        'dist',
                        'trip_duration']]

In [None]:
X = data_df[['vendor_id',
            'passenger_count',
            'pickup_sin_sec',
            'pickup_cos_sec',
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'sin_day',
            'cos_day',
            'dist']]

In [None]:
y = data_df[['trip_duration']]

In [None]:
standardScalarX = StandardScaler().fit(X)
X  = standardScalarX.transform(X)

In [None]:
standardScalarY = StandardScaler().fit(y)
y = standardScalarY.transform(y)

In [None]:
X.shape, y.shape

# Model

In [None]:
# Fully connected regressor: four ReLU hidden layers (128, 128, 64, 64 units)
# followed by a single linear output unit.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1),
])

In [None]:
# Mean squared error is both the training loss and the reported metric.
# The optimizer takes `learning_rate`: the `lr` alias is deprecated and is
# rejected by newer Keras releases.
model.compile(optimizer=Adam(learning_rate=0.00001),
              metrics=['mean_squared_error'],
              loss='mean_squared_error')

In [None]:
filepath="weights.best.hdf5"
# Keep only the weights with the lowest validation loss. The model is fitted
# with a validation split, so 'val_loss' is available each epoch. Monitoring
# the training MSE with mode='max' would instead overwrite the file whenever
# the error got *worse*.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [None]:
history = model.fit(X, y,
                   batch_size=32,
                   validation_split=0.2,
                   epochs=50, 
                   callbacks=callbacks_list)

In [None]:
history.history.keys()

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title("Model Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["train", "validation"])
plt.savefig('loss.png')

In [None]:
model.save('model.hdf5')

# Test Set

In [None]:
os.listdir(base_path)

In [None]:
test_df = pd.read_csv(base_path+"test.csv")

In [None]:
test_df.head()

Features used:
**['vendor_id',
            'passenger_count',
            'pickup_sin_sec',
            'pickup_cos_sec',
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'sin_day',
            'cos_day',
            'dist']**

In [None]:
test_df['pickup_datetime'] = test_df['pickup_datetime'].apply(strtodatetime)

In [None]:
test_df['day_of_week'] = test_df['pickup_datetime'].apply(dayofweek)

In [None]:
test_df['sin_day'] ,test_df['cos_day'] = zip(*test_df['day_of_week'].map(dayofweektosincosday))

In [None]:
# Pickup-to-dropoff distance for every test trip, computed row by row with dist().
test_df['dist'] = test_df[
    ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
].apply(dist, axis=1)

In [None]:
# Map a distance of 0 to 1 so the log transform that follows yields 0 instead
# of failing on log(0), matching how the training data is prepared.
# The result is assigned back rather than using replace(..., inplace=True) on
# the column selection: pandas deprecates that chained in-place form, and under
# copy-on-write it leaves test_df unchanged.
test_df['dist'] = test_df['dist'].replace(to_replace=0, value=1)

In [None]:
test_df['dist'] = test_df['dist'].apply(math.log)

In [None]:
test_df['pickup_sin_sec'] ,test_df['pickup_cos_sec'] = zip(*test_df['pickup_datetime'].map(timetosectosincosday))

In [None]:
test_df_processed =  test_df[['vendor_id',
                            'passenger_count',
                            'pickup_sin_sec',
                            'pickup_cos_sec',
                            'pickup_longitude',
                            'pickup_latitude',
                            'dropoff_longitude',
                            'dropoff_latitude',
                            'sin_day',
                            'cos_day',
                            'dist']]

In [None]:
test_data =  standardScalarX.transform(test_df_processed)

In [None]:
y_pred = model.predict(test_data)

In [None]:
y_pred =standardScalarY.inverse_transform(y_pred)
y_pred = np.exp(y_pred)

In [None]:
# Take an explicit copy of the id column so that adding the prediction column
# to `result` afterwards does not trigger SettingWithCopyWarning and can never
# write through to test_df.
result = test_df[['id']].copy()

In [None]:
result['trip_duration'] = pd.DataFrame(data=y_pred, columns=['trip_duration'])

In [None]:
result.head()

In [None]:
result.to_csv("submission.csv", index = False)