In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from haversine import haversine
from sklearn import model_selection

**Data Cleaning, Exploratory Data Analysis, and Feature Engineering**

In [None]:
# Load the training data, capping the read at the first 10 million rows,
# then preview the first two records.
train_df = pd.read_csv(
    '../input/train.csv',
    nrows=10 ** 7,
)
train_df.head(2)


In [None]:
# Inspect the dtype pandas assigned to each column of the loaded frame.
train_df.dtypes

In [None]:
# (rows, columns) of the loaded training frame.
train_df.shape

In [None]:
# Count the missing values in every column.
print(train_df.isna().sum())

In [None]:
# Only a few rows contain nulls, so discard every row that has at least one
# missing value and report the row count before and after.
print('Old size: %d' % len(train_df))
train_df = train_df.dropna(axis=0, how='any')
print('New size: %d' % len(train_df))

In [None]:
# Summary statistics of the numeric columns, used to spot implausible values.
train_df.describe()

In [None]:
# A fare must be strictly positive.
# A ride needs at least one passenger, and counts such as 208 are not credible,
# so keep only rides with 1 to 8 passengers.
has_valid_fare = train_df['fare_amount'] > 0
has_valid_passengers = (train_df['passenger_count'] > 0) & (train_df['passenger_count'] < 9)
train_df = train_df[has_valid_fare & has_valid_passengers]
train_df.shape

Latitude of NYC: 40.730610
Longitude of NYC: -73.935242
Source: https://www.latlong.net/place/new-york-city-ny-usa-1848.html
NYC is 13.4 miles long and 2.3 miles wide at its widest part, and a 1-degree change in longitude or latitude corresponds to a distance of 50+ miles. Hence valid latitudes are restricted to (39, 42) and valid longitudes to (-75, -72).

In [None]:
# Drop trips whose pick-up or drop-off lies outside the NYC bounding box:
# longitude in (-75, -72) and latitude in (39, 42), bounds exclusive.
# The drop-off latitude upper bound is 42, the same as for the pick-up
# latitude (it was previously 72, which let far-away drop-offs through).
train_df = train_df[(train_df.pickup_longitude > -75) & (train_df.pickup_longitude < -72)]
train_df = train_df[(train_df.dropoff_longitude > -75) & (train_df.dropoff_longitude < -72)]
train_df = train_df[(train_df.pickup_latitude > 39) & (train_df.pickup_latitude < 42)]
train_df = train_df[(train_df.dropoff_latitude > 39) & (train_df.dropoff_latitude < 42)]
train_df.shape

The following factors can impact the taxi fare:
1. Distance travelled 
2. Time and day of the week to take busy hours and weekday vs weekend into consideration. Weekday holiday can also be one factor but we can ignore this for now.

So let's add features which can take above into consideration.

To add distance, we will use the ‘haversine’ formula. Source - https://www.movable-type.co.uk/scripts/latlong.html

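Haversine formula:

$$a = \sin^2(\Delta\varphi/2) + \cos\varphi_1 \cdot \cos\varphi_2 \cdot \sin^2(\Delta\lambda/2)$$

$$c = 2 \cdot \operatorname{atan2}\left(\sqrt{a},\ \sqrt{1-a}\right)$$

$$d = R \cdot c$$

where $\varphi$ is latitude, $\lambda$ is longitude, $R$ is the earth’s radius (mean radius = 6,371 km), and all angles are in radians.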

In [None]:
def add_distance_feature(df):
    """Add a ``distance`` column holding the pick-up to drop-off distance.

    The great-circle (haversine) distance in kilometres is computed for all
    rows at once with NumPy instead of looping over the frame row by row,
    which is far too slow for millions of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude`` in decimal degrees.

    Returns
    -------
    None
        ``df`` is modified in place; the new column is assigned positionally,
        so any index (including a non-contiguous one left by filtering) works.
    """
    # Mean earth radius in km. NOTE(review): chosen to match what the
    # `haversine` package is believed to use by default - confirm if exact
    # parity with the previous per-row results matters.
    earth_radius_km = 6371.0088

    # Plain float arrays in radians; np.asarray also copes with empty frames
    # whose columns have object dtype.
    pickup_lat = np.radians(np.asarray(df['pickup_latitude'].values, dtype=float))
    pickup_lon = np.radians(np.asarray(df['pickup_longitude'].values, dtype=float))
    dropoff_lat = np.radians(np.asarray(df['dropoff_latitude'].values, dtype=float))
    dropoff_lon = np.radians(np.asarray(df['dropoff_longitude'].values, dtype=float))

    half_dlat = (dropoff_lat - pickup_lat) / 2.0
    half_dlon = (dropoff_lon - pickup_lon) / 2.0
    a = (np.sin(half_dlat) ** 2
         + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(half_dlon) ** 2)
    # Rounding can push `a` marginally outside [0, 1], which would make
    # arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    df['distance'] = 2.0 * earth_radius_km * np.arcsin(np.sqrt(a))

In [None]:
# Add the `distance` column to the training frame in place, then preview it.
add_distance_feature(train_df)
train_df.head(2)

Now let's add features that capture the time and day of travel.

In [None]:
def add_time_and_day_feature(df):
    """Parse ``pickup_datetime`` and derive calendar features from it, in place.

    The ``pickup_datetime`` column is replaced by its parsed datetime form and
    two columns are added:

    * ``day_of_week`` - pandas ``dt.dayofweek`` (Monday is 0, Sunday is 6)
    * ``hour_of_day`` - pandas ``dt.hour`` (0 to 23)

    Nothing is returned; ``df`` itself is modified.
    """
    pickup_ts = pd.to_datetime(df['pickup_datetime'])
    df['pickup_datetime'] = pickup_ts
    df['day_of_week'] = pickup_ts.dt.dayofweek
    df['hour_of_day'] = pickup_ts.dt.hour
    
# Add `day_of_week` and `hour_of_day` to the training frame in place, then preview it.
add_time_and_day_feature(train_df)
train_df.head(2)

In [None]:
# Re-check the summary statistics now that the engineered columns are present.
train_df.describe()

In [None]:
# Discard near-zero-length trips: keep only rows whose computed distance
# exceeds 0.25, then review the summary statistics again.
train_df = train_df.loc[train_df['distance'] > 0.25]
train_df.describe()

**Modelling**

Let's first break the training data into train and test sets for validation, then try different standard models, using RMSE as the evaluation metric as specified on the contest info page.

In [None]:
# Model inputs: raw coordinates, passenger count and the engineered features.
feature_columns = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'distance',
    'day_of_week',
    'hour_of_day',
]
X = train_df[feature_columns]
y = train_df['fare_amount'].values

# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

print(X_train.shape, y_train.shape)

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least-squares baseline and report its RMSE on the hold-out split.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

**Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest and report its RMSE on the hold-out split.
# random_state is fixed (same seed as the train/test split) so that the
# reported RMSE and the later submission are reproducible across runs;
# without it each run builds a different forest.
reg = RandomForestRegressor(random_state=1).fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

**Making Prediction on the test dataset**

In [None]:
# Load the competition test set and give it the same engineered columns as the training data.
test_df = pd.read_csv('../input/test.csv')
add_distance_feature(test_df)
add_time_and_day_feature(test_df)

# Same feature columns, in the same order, as the model was trained on.
test_feature_columns = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
    'distance',
    'day_of_week',
    'hour_of_day',
]
test_X = test_df[test_feature_columns]

# `reg` is whichever model was fitted most recently.
test_y_predictions = reg.predict(test_X)

# Two-column submission: the row key and the predicted fare.
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': test_y_predictions},
    columns=['key', 'fare_amount'],
)

submission.to_csv('submission.csv', index=False)

# List the working directory to confirm the file was written.
print(os.listdir('.'))