In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: `dirname`, `filenames` and `filename` stay defined in the notebook
# namespace after this cell, so later cells can pick them up by accident.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Number of rows of train.csv to load (used as the `nrows` limit when reading).
TRAIN_DATA = 6_000_000

In [None]:
# Load the first TRAIN_DATA rows of the training file and the whole test file.
INPUT_DIR = "../input/new-york-city-taxi-fare-prediction"
train = pd.read_csv(INPUT_DIR + "/train.csv", nrows=TRAIN_DATA)
test = pd.read_csv(INPUT_DIR + "/test.csv")

In [None]:
# Preview the top rows of the training frame as loaded.
train.head()

In [None]:
# (rows, columns) of the training frame right after loading.
train.shape

In [None]:
# (rows, columns) of the test frame right after loading.
test.shape

In [None]:
# Print the column overview (dtypes, non-null counts) of the training frame.
train.info()

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Count the missing values in each column of the test frame.
test.isnull().sum()

In [None]:
# Summary statistics of the training columns, used to spot implausible values.
train.describe()

## Data Cleaning

In [None]:
# Show every training row that has a missing value in at least one column.
train[train.isnull().any(axis=1)]

### Missing values can arise for two reasons
- The data was never entered
- Since the missing fields relate to the drop-off location, the passenger might have booked and later cancelled.

In any case it is safe to remove these rows, since they make up only (39/TRAIN_DATA)*100 = 0.00065% of the total rows.

In [None]:
# Discard every training row that contains a missing value.
train = train.dropna()

In [None]:
# Shape of the training frame after the rows with missing values were dropped.
train.shape

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# Print the coordinate extremes of the training set so out-of-range locations
# are visible. Each block reports max/min longitude, then max/min latitude.
print('Pick-up boundaries of training set')
print('max value of longitude:', train['pickup_longitude'].max())
print('min value of longitude:', train['pickup_longitude'].min())
print('max value of latitude:', train['pickup_latitude'].max())
print('min value of latitude:', train['pickup_latitude'].min())
print("\n*******\n")
print('Drop-off boundaries of training set')
print('max value of longitude:', train['dropoff_longitude'].max())
print('min value of longitude:', train['dropoff_longitude'].min())
print('max value of latitude:', train['dropoff_latitude'].max())
print('min value of latitude:', train['dropoff_latitude'].min())

In [None]:
# Print the coordinate extremes of the test set, for comparison with the
# training set. Each block reports max/min longitude, then max/min latitude.
print('Pick-up boundaries of test set')
print('max value of longitude:', test['pickup_longitude'].max())
print('min value of longitude:', test['pickup_longitude'].min())
print('max value of latitude:', test['pickup_latitude'].max())
print('min value of latitude:', test['pickup_latitude'].min())
print("\n*******\n")
print('Drop-off boundaries of test set')
print('max value of longitude:', test['dropoff_longitude'].max())
print('min value of longitude:', test['dropoff_longitude'].min())
print('max value of latitude:', test['dropoff_latitude'].max())
print('min value of latitude:', test['dropoff_latitude'].min())

#### The coordinates of New York City are 40.7128° N (+40.7128), 74.0060° W (-74.0060).
- So there are outliers in the training set, and we should remove those rows
- Let's keep only the following range
- longitude: -76 to -72
- latitude: 38 to 42

In [None]:
# Keep only trips whose pick-up and drop-off both lie inside the
# longitude [-76, -72] / latitude [38, 42] window (bounds inclusive).
inside_window = (
    train['pickup_longitude'].between(-76, -72)
    & train['dropoff_longitude'].between(-76, -72)
    & train['pickup_latitude'].between(38, 42)
    & train['dropoff_latitude'].between(38, 42)
)
train = train[inside_window]

In [None]:
# Shape of the training frame after the coordinate-window filter.
train.shape

In [None]:
# Report the largest and the smallest fare present in the training set.
fare_column = train['fare_amount']
print("Max fare value:", fare_column.max())
print("Min fare value:", fare_column.min())

In [None]:
# Number of training rows with a negative fare.
int((train['fare_amount'] < 0).sum())

#### Fare amount cannot be negative. There are 235 entries with a negative fare value. Since the count is small, I am not making any assumptions or imputations; I am simply dropping those rows.

In [None]:
# Keep only the rows whose fare is zero or positive.
train = train.loc[train['fare_amount'].ge(0)]

In [None]:
# Shape of the training frame after the negative fares were removed.
train.shape

In [None]:
# Checking outliers in fare_amount
# The fares are passed as `x` explicitly: the meaning of the first positional
# argument of `sns.boxplot` differs between seaborn releases, and naming it
# keeps the box horizontal (matching the wide figure) on all of them.
plt.figure(figsize=(12,4))
sns.boxplot(x=train['fare_amount'])

In [None]:
# Number of training rows with a fare above 200.
int((train['fare_amount'] > 200).sum())

200 seems to be a high value, and there are 159 entries with a fare of more than 200.

In [None]:
# Keep only the rows whose fare does not exceed 200.
train = train.loc[train['fare_amount'].le(200)]

In [None]:
# Shape of the training frame after the fares above 200 were removed.
train.shape

### Feature Engineering

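#### 1. 'pickup_datetime' is in UTC. Converting it into NY time
 - Coordinated Universal Time is 4 hours ahead of New York during daylight saving time (EDT) and 5 hours ahead during standard time (EST).
 - Here a fixed 4 hours is subtracted from the given time as an approximation of local NY time, so timestamps that fall in the standard-time months are off by one hour.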

In [None]:
import datetime

In [None]:
# Parse the pickup timestamp and move it back by a fixed four hours.
# NOTE(review): New York is UTC-4 only while daylight saving time is in effect
# (UTC-5 otherwise), so the constant offset is an approximation. The test set
# gets the same shift; if this is ever made DST-aware, change both together.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime']) - pd.Timedelta(hours=4)

In [None]:
# Split the shifted pickup timestamp of the training set into calendar features.
for feature, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minutes', 'minute'),
    ('Day of Week', 'dayofweek'),
):
    train[feature] = getattr(train['pickup_datetime'].dt, attribute)

In [None]:
# Check the new calendar columns on the first two training rows.
train.head(2)

In [None]:
# Parse the pickup timestamp and move it back by a fixed four hours.
# NOTE(review): New York is UTC-4 only while daylight saving time is in effect
# (UTC-5 otherwise), so the constant offset is an approximation. The training
# set gets the same shift; if this is ever made DST-aware, change both together.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime']) - pd.Timedelta(hours=4)

In [None]:
# Split the shifted pickup timestamp of the test set into calendar features.
for feature, attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
    ('Minutes', 'minute'),
    ('Day of Week', 'dayofweek'),
):
    test[feature] = getattr(test['pickup_datetime'].dt, attribute)

#### 2. Distance Calculation
- Here pickup and drop off locations are given as coordinates.
- Then, distance between places can be calculated by using Haversine formula
- Here distance is calculated in miles


In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [None]:
def haversine(df, radius=3956):
    """
    Great-circle (haversine) distance between the pickup and the drop-off
    point of every row.

    Parameters
    ----------
    df : DataFrame (or a single row of one)
        Must provide "pickup_latitude", "pickup_longitude",
        "dropoff_latitude" and "dropoff_longitude" in decimal degrees.
    radius : float, optional
        Radius of the sphere; it fixes the unit of the result. The default
        3956 yields miles, 6371 yields kilometres.

    Returns
    -------
    The distance(s), in the unit of `radius`, one value per input row.
    """
    lat1 = np.radians(df["pickup_latitude"])
    lat2 = np.radians(df["dropoff_latitude"])
    dlat = lat2 - lat1
    dlong = np.radians(df["dropoff_longitude"] - df["pickup_longitude"])

    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlong/2)
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlong / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would turn sqrt(1 - a) into NaN.
    a = np.minimum(np.maximum(a, 0.0), 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return c * radius

In [None]:
# Add the pickup-to-drop-off distance as a new column on both data sets.
for frame in (train, test):
    frame['Total distance'] = haversine(frame)

In [None]:
# Check the new 'Total distance' column on the first two test rows.
test.head(2)

### Plots

In [None]:
# Fare vs distance on the first 1000 rows of the training set.
# The slice starts at position 0 so that exactly 1000 rows, including the
# very first one, are plotted.
sns.lmplot(x='Total distance', y='fare_amount', data=train.iloc[:1000])

#### Fare amount will be more during:
- Peak hours: 8-10 am or 4-6 pm (Monday - Friday)
- Early morning: 0-5 am (all days)
- Late night: 10-11.59 pm (all days)

- PS: Day of Week: 0 is Monday and 6 is Sunday

In [None]:
def is_peak_hour(df):
    """
    Tell whether a single trip record falls into a weekday rush hour.

    `df` is one record exposing 'Day of Week' and 'Hour'. The result is True
    when the day is 0..4 and the hour is within 8..10 or 16..18 (all bounds
    inclusive), and False otherwise.
    """
    # Days outside 0..4 never count as peak.
    if not (df['Day of Week'] >= 0 and df['Day of Week'] <= 4):
        return False
    morning_rush = df['Hour'] >= 8 and df['Hour'] <= 10
    evening_rush = df['Hour'] >= 16 and df['Hour'] <= 18
    return True if (morning_rush or evening_rush) else False

In [None]:
# Evaluate the rush-hour rule row by row and store the flag on both data sets.
for dataset in (train, test):
    dataset['Peak hours'] = dataset.apply(is_peak_hour, axis=1)

In [None]:
# Flag the trips that start in the early-morning (0-5) or late-night (22-23) hours.
# `isin` does the membership test in one vectorised call instead of running a
# Python lambda once per row; the resulting boolean column is the same.
early_late_hours = [0,1,2,3,4,5,22,23]
train['Early late hours'] = train['Hour'].isin(early_late_hours)
test['Early late hours'] = test['Hour'].isin(early_late_hours)

In [None]:
# Check the new flag columns on the top rows of the training frame.
train.head()

In [None]:
# Check the new flag columns on the first two test rows.
test.head(2)

In [None]:
# Just to make sure, everything will be in numbers
# Cast the boolean flags straight to integers (True -> 1, False -> 0). A direct
# cast states the intent and avoids the dtype-downcasting warning that a
# dict-based `replace` on booleans can emit in recent pandas versions.
train['Peak hours'] = train['Peak hours'].astype(int)
train['Early late hours'] = train['Early late hours'].astype(int)

In [None]:
# Cast the boolean flags of the test set straight to integers (True -> 1,
# False -> 0). A direct cast states the intent and avoids the dtype-downcasting
# warning that a dict-based `replace` on booleans can emit in recent pandas versions.
test['Peak hours'] = test['Peak hours'].astype(int)
test['Early late hours'] = test['Early late hours'].astype(int)

In [None]:
# Check the flag columns of the test frame after the conversion to numbers.
test.head(2)

In [None]:
# Check the flag columns of the training frame after the conversion to numbers.
train.head()

### Feature Selection

In [None]:
# Columns removed before modelling: the row key, the raw timestamp, the raw
# coordinates and the minute of the pickup.
drop_columns = [
    'key',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'Minutes',
]

In [None]:
# Look at the first two training rows before the columns are dropped.
train.head(2)

In [None]:
# Remove the columns listed in `drop_columns` from both data sets.
train = train.drop(columns=drop_columns)
test = test.drop(columns=drop_columns)

In [None]:
# Trying by dropping more columns
#drop_columns_addtnl = ['Day', 'Hour', 'Day of Week']

In [None]:
#train.drop(drop_columns_addtnl, axis = 1, inplace=True)
#test.drop(drop_columns_addtnl, axis = 1, inplace=True)

In [None]:
# Training frame after the column drop.
train.head()

In [None]:
# Test frame after the column drop.
test.head(2)

### Prepare the data sets for model training

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target. Target vector: the fare.
X = train.drop(columns='fare_amount').to_numpy()
y = train['fare_amount'].to_numpy()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

### Model Training

#### 1. XG Boost Regressor

In [None]:
import xgboost

In [None]:
# Create an XGBoost regressor with the library's default settings and fit it
# on the training split.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)

In [None]:
# Predict fares for the validation split with the fitted XGBoost model.
xgb_pred = xgb_reg.predict(X_val)

### Model Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Validation errors of the XGBoost model: MAE, MSE, and RMSE derived from the MSE.
xgb_mae, xgb_mse = (
    metric(y_val, xgb_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)
xgb_rmse = np.sqrt(xgb_mse)

In [None]:
# Print the three validation metrics of the XGBoost model.
print('XG Boost Regressor Performance:-')
print('MAE: {}\nMSE:{}\nRMSE:{}'.format(xgb_mae, xgb_mse, xgb_rmse))

In [None]:
# Distribution of the validation residuals (actual minus predicted fare).
# `distplot` is deprecated in seaborn; a density-scaled `histplot` with a KDE
# overlay is its replacement for this kind of figure.
sns.histplot(y_val - xgb_pred, kde=True, stat="density")

#### 2. ANN

In [None]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
from sklearn import preprocessing

In [None]:
# Scale data
# Note: Scaling is needed for DL models
# The scaler is fitted on the training split only and then reused unchanged
# for the validation split and the test set.
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# `X_train` is a plain array, so the test frame is passed as an array as well.
# This keeps the input type identical to what the scaler was fitted on (no
# feature-name mismatch) and relies on `test` having its columns in the same
# order as `X`.
test_scaled = scaler.transform(test.values)

In [None]:
# First network: a 128-unit input layer, three 256-unit hidden layers (all
# ReLU) and a single linear output unit for the fare.
NN_model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='linear'),
])

In [None]:
# Configure training: Adam optimizer, mean squared error as the loss, and
# mean absolute error reported as an extra metric.
NN_model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae'],
)

In [None]:
# Print the layer overview of the first network.
NN_model.summary()

In [None]:
# Train the first network for 50 epochs in shuffled batches of 512 rows,
# evaluating on the scaled validation split after every epoch.
NN_history = NN_model.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=512,
    shuffle=True,
)

In [None]:
from keras.layers import Dropout, BatchNormalization

In [None]:
# Second network: same idea as the first one but deeper (five 256-unit hidden
# layers) and with every layer using the 'normal' kernel initializer.
NN_model1 = Sequential(
    # Input layer
    [Dense(128, kernel_initializer='normal', activation='relu', input_dim=X_train_scaled.shape[1])]
    # Hidden layers
    + [Dense(256, kernel_initializer='normal', activation='relu') for _ in range(5)]
    # Output layer
    + [Dense(1, kernel_initializer='normal', activation='linear')]
)

# Same training configuration as the first network.
NN_model1.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Print the layer overview of the second network.
NN_model1.summary()

In [None]:
# Train the second network for 50 epochs in shuffled batches of 512 rows,
# evaluating on the scaled validation split after every epoch.
NN_history1 = NN_model1.fit(
    x=X_train_scaled,
    y=y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=50,
    batch_size=512,
    shuffle=True,
)

In [None]:
# Training and validation loss per epoch for the first network.
for history_key, legend_text in (('loss', 'train loss'), ('val_loss', 'valdn loss')):
    plt.plot(NN_history.history[history_key], label=legend_text)
plt.legend()
plt.show()

In [None]:
# Lowest training and validation loss reached by the first network.
nn_losses = NN_history.history
print('Model:NN_history')
print('Min value of training Loss:', min(nn_losses['loss']))
print('Min value of validation Loss:', min(nn_losses['val_loss']))

In [None]:
# Training and validation loss per epoch for the second network.
for history_key, legend_text in (('loss', 'train loss'), ('val_loss', 'valdn loss')):
    plt.plot(NN_history1.history[history_key], label=legend_text)
plt.legend()
plt.show()

In [None]:
# Lowest training and validation loss reached by the second network.
nn1_losses = NN_history1.history
print('Model:NN_history1')
print('Min value of training Loss:', min(nn1_losses['loss']))
print('Min value of validation Loss:', min(nn1_losses['val_loss']))

In [None]:
# Predict fares for the scaled test set with the second network (NN_model1),
# showing the progress output.
NN_prediction = NN_model1.predict(test_scaled, verbose=1)

In [None]:
# Display the raw predictions returned by the network.
NN_prediction

In [None]:
# Start from the sample submission, overwrite its 'fare_amount' column with the
# network's predictions and save the result to the working directory.
# NOTE(review): the rows of sample_submission.csv are presumably in the same
# order as test.csv; confirm before relying on the file.
submission = pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')
submission['fare_amount'] = NN_prediction
submission.to_csv('submission_NN.csv', index=False)
# Preview the top rows of what was written.
submission.head()

In [None]:
# Report the submission file written by this notebook. The name is spelled out
# here because `filename` is only a leftover loop variable from the input
# directory listing in the first cell, not the file that was saved.
print('Saved file: ' + 'submission_NN.csv')