In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect every file path below the Kaggle input directory, then print them one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Case Study 1: New York City Taxi Fare Prediction



The dataset here consists of historical data pertaining to fare amounts (inclusive of tolls) for taxi rides in New York City. There are various attributes here including the fare amount, pickup times, pickup and dropoff co-ordinates and the passenger count! 

The key idea here is to see whether we can build a model on this dataset to predict the potential taxi fare for a future taxi ride in NYC, given that we know all the other attributes except the fare.

__Main Objective:__ Given a ride's pickup time, pickup and dropoff coordinates along with the total passengers riding, build a model to predict the fare for a NYC taxi (regression)


In [None]:

#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from warnings import filterwarnings
filterwarnings('ignore')


# Load and View the Dataset

There are over __50 million__ datapoints in this dataset! We load around __10 million__ datapoints for this case study 

The data is available at https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data from where you can download it.

We recommend using the kaggle API and the following command via CLI to get it.

__`kaggle competitions download -c new-york-city-taxi-fare-prediction`__

In [None]:
%%time

# Load the first 10 million rows of the competition training file.
TRAIN_CSV = '/kaggle/input/new-york-city-taxi-fare-prediction/train.csv'
N_ROWS = 10_000_000

df = pd.read_csv(TRAIN_CSV, nrows=N_ROWS)
df.shape

In [None]:
df.head()

# EDA

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# key
#it is duplicate of pickup_datetime, we can drop the column

# Rebind instead of mutating in place; the resulting frame is the same.
df = df.drop(columns=['key'])
df.shape

### Fare amount (target)

In [None]:
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and density scaling is its replacement. bins=50 keeps the bar count bounded on
# this heavy-tailed, multi-million-row column.
sns.histplot(x=df['fare_amount'], bins=50, stat='density', kde=True)

In [None]:
# Pass the series as x= explicitly: the meaning of the first positional argument
# differs between seaborn releases (x in older versions, data in newer ones),
# which changes the orientation of the plot.
sns.boxplot(x=df['fare_amount'])

In [None]:
# Shape statistics of the target column.
fare = df['fare_amount']
fare_skewness = fare.skew()
fare_kurtosis = fare.kurt()
print('Skewness :', fare_skewness)
print('Kurtosis :', fare_kurtosis)

In [None]:
#The variable is highly right skewed and highly peaked.
#There are outliers as seen in the box plot; we need to treat the outliers

### pickup_datetime

In [None]:
df['pickup_datetime'].dtype

In [None]:
#the datatype is object instead of datetime, we change the datatype and further analyse

# Parse the raw timestamp strings with an explicit format (the literal " UTC" suffix is part of it).
PICKUP_FORMAT = '%Y-%m-%d %H:%M:%S UTC'
pickup_raw = df['pickup_datetime']
df['pickup_datetime'] = pd.to_datetime(pickup_raw, format=PICKUP_FORMAT)


In [None]:
df.info()

In [None]:
df['pickup_datetime'].min(), df['pickup_datetime'].max()
#the dataset contains almost 6.5years of data

### passenger_count

In [None]:
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and density scaling is its replacement.
sns.histplot(x=df['passenger_count'], bins=50, stat='density', kde=True)

#the data is right skewed and it contains outliers

In [None]:
df['passenger_count'].value_counts()

#some rides report passenger counts of up to about 129, which is not realistic for a single taxi.
#we can restrict the data to trips with 1 to 8 passengers

## Correlation


In [None]:
# Correlate numeric columns only: pickup_datetime is datetime64 at this point, and
# pandas >= 2.0 no longer drops non-numeric columns from DataFrame.corr() by default.
numeric_cols = df.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), annot=True)

#none of the variables are correlated with the target variable

## Data pre-processing

In [None]:
#There are no redundant variables present, we can check for duplicates

df.duplicated().value_counts()

In [None]:
#there are 58 duplicate records, we can drop them by keeping the first record

# Keep the first occurrence of each duplicated row and renumber the index.
df = df.drop_duplicates(ignore_index=True)
df.shape

### Null values

In [None]:
df.isnull().sum()

#there are 69 null values, which is less than 0.001% of the 10 million rows
#we can remove those records from the dataframe

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()
df.shape

In [None]:
#we can remove the records where passengers are more than 8

# Keep rides with a passenger count strictly above 0 and at most 8.
passengers = df['passenger_count']
valid_passengers = passengers.gt(0) & passengers.le(8)
df = df.loc[valid_passengers]
df.shape

In [None]:
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and density scaling is its replacement.
sns.histplot(x=df['passenger_count'], bins=50, stat='density', kde=True)

## Outlier treatment

In [None]:
#we can use the IQR (quantile) method to identify outliers in the target column
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay
# and density scaling is its replacement.
sns.histplot(x=df['fare_amount'], bins=50, stat='density', kde=True)

In [None]:
# Quartiles of the fare and the 1.5 * IQR fences derived from them.
q1, q3 = df['fare_amount'].quantile([0.25, 0.75])

iqr = q3 - q1
ll, ul = q1 - 1.5*iqr, q3 + 1.5*iqr

In [None]:
# Rows whose fare lies outside the [ll, ul] fences.
fare = df['fare_amount']
outside_fences = fare.gt(ul) | fare.lt(ll)
df[outside_fences].shape

#there are around 8-9% of outliers; instead of removing them, we keep fares within a fixed range (0.1 to 1000)

In [None]:
# Keep fares in the closed interval [0.1, 1000].
fare = df['fare_amount']
df = df.loc[(fare >= 0.1) & (fare <= 1000)]
df.shape

In [None]:
#we keep only the rides whose pickup and drop-off locations are within the US
#US latitudes = 30 to 50
#US longitudes = -125 to -65

In [None]:
# Keep pickups whose latitude is in the closed interval [30, 50].
pickup_lat = df['pickup_latitude']
df = df.loc[(pickup_lat >= 30) & (pickup_lat <= 50)]
df.shape

In [None]:
# Keep drop-offs whose latitude is in the closed interval [30, 50].
dropoff_lat = df['dropoff_latitude']
df = df.loc[(dropoff_lat >= 30) & (dropoff_lat <= 50)]
df.shape

In [None]:
#we can do the same for longitude

# Keep pickups whose longitude is in the closed interval [-125, -65].
pickup_lng = df['pickup_longitude']
df = df.loc[(pickup_lng >= -125) & (pickup_lng <= -65)]
df.shape

In [None]:
# Keep drop-offs whose longitude is in the closed interval [-125, -65].
dropoff_lng = df['dropoff_longitude']
df = df.loc[(dropoff_lng >= -125) & (dropoff_lng <= -65)]
df.shape

In [None]:
df_copy = df.copy()

In [None]:
# Separate the target column from the predictor columns.
TARGET = 'fare_amount'
X = df.drop(columns=[TARGET])
y = df[TARGET]

In [None]:
#train test split

from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 1. Base model

### 1.1 Linear regression

In [None]:
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_squared_error

In [None]:
#we can remove pickup date from X_train and X_test

# Copies of the train/test features without the pickup timestamp column.
X_train1, X_test1 = (
    part.drop(columns=['pickup_datetime']) for part in (X_train, X_test)
)

In [None]:
lir = LinearRegression()
lir.fit(X_train1, y_train)

In [None]:
y_test_pred = lir.predict(X_test1)

In [None]:
r2 = r2_score(y_test, y_test_pred)

print('R-sq:', r2)

In [None]:
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
print('RMSE:', rmse)

## XGBRegressor

In [None]:
import xgboost as xgb

In [None]:
%%time

# 'reg:squarederror' is the current name of the squared-error regression objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, max_depth=5, n_jobs=-1, random_state=42)
xgr.fit(X_train1, y_train)

In [None]:
y_test_pred = xgr.predict(X_test1)

In [None]:
r2 = r2_score(y_test, y_test_pred)
print('R-sq:', r2)

In [None]:
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
print('RMSE:', rmse)

#### We have __R2 = 0.7726__ and __RMSE = 4.63__

# 2. We can create new features from the Pickup DateTime feature
==> Year, Month, Day, Hour, Day_of_week

In [None]:
import datetime as dt

In [None]:
# New column name -> attribute of the .dt accessor it is read from.
# The insertion order fixes the column order of the resulting frames.
DATETIME_PARTS = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'hour': 'hour',
    'day_of_week': 'weekday',
}

# Apply the identical extraction to the train and test partitions.
for frame in (X_train, X_test):
    pickup_dt = frame['pickup_datetime'].dt
    for column, attribute in DATETIME_PARTS.items():
        frame[column] = getattr(pickup_dt, attribute)

In [None]:
#we can remove pickup date from X_train and X_test

# Copies of the train/test features without the pickup timestamp column.
X_train1, X_test1 = (
    part.drop(columns=['pickup_datetime']) for part in (X_train, X_test)
)


In [None]:
%%time

# 'reg:squarederror' is the current name of the squared-error regression objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, max_depth=5, n_jobs=-1, random_state=42)
xgr.fit(X_train1, y_train)

In [None]:
y_test_pred = xgr.predict(X_test1)

In [None]:
r2 = r2_score(y_test, y_test_pred)
print('R-sq:', r2)

In [None]:
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
print('RMSE:', rmse)

#### We have __R2 = 0.7911__ and __RMSE = 4.43__

#### This is an improvement of __2% R-sq__ and __0.2__ drop in RMSE

# 3. Calculating the trip distance - Haversine Distance

The Earth is round but big, so we can consider it flat for short distances. However, flat-Earth formulas for calculating the distance between two points start showing noticeable errors when the distance is more than about 20 kilometers

Therefore, calculating distances on a sphere needs to consider spherical geometry

The haversine formula is a very accurate way of computing distances between two points on the surface of a sphere using the latitude and longitude of the two points



Haversine Formula
The word "Haversine" comes from the function: haversine(θ) = sin²(θ/2)

We can further derive the following:

a = sin²((φB - φA)/2) + cos φA * cos φB * sin²((λB - λA)/2)

c = 2 * atan2( √a, √(1−a) )

d = R ⋅ c

where:

- φ is latitude
- λ is longitude
- R is the Earth’s radius
- d is the haversine distance

Note that angles must be converted to radians before being passed to the trigonometric functions.

Source: https://community.esri.com/groups/coordinate-reference-systems/blog/2017/10/05/haversine-formula 

In [None]:
# Retained so these names stay available to any other cell; haversine itself now uses NumPy.
from math import radians, cos, sin, asin, sqrt

AVG_EARTH_RADIUS_KM = 6371.0088
AVG_EARTH_RADIUS_MI = 3958.7613

def haversine(start_coord, end_coord, miles=False):
    """Great-circle distance between two points given in decimal degrees.

    Implemented with NumPy ufuncs, so each coordinate may be a scalar, a NumPy
    array or a pandas Series; non-scalar inputs are broadcast element-wise,
    which lets whole columns be processed in one call instead of row by row.

    Parameters
    ----------
    start_coord : tuple
        ``(latitude, longitude)`` of the start point(s), in decimal degrees.
    end_coord : tuple
        ``(latitude, longitude)`` of the end point(s), in decimal degrees.
    miles : bool, default False
        Return the distance in miles instead of kilometres.

    Returns
    -------
    float, numpy.ndarray or pandas.Series
        Haversine distance(s), matching the container type of the inputs.
    """
    # get earth radius in required units
    avg_earth_radius = AVG_EARTH_RADIUS_MI if miles else AVG_EARTH_RADIUS_KM

    # unpack latitude/longitude
    lat1, lng1 = start_coord
    lat2, lng2 = end_coord

    # convert all latitudes/longitudes from decimal degrees to radians
    lat1, lng1, lat2, lng2 = (np.radians(value) for value in (lat1, lng1, lat2, lng2))

    # calculate haversine
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2

    # Floating-point rounding can push d marginally outside [0, 1] (e.g. for
    # near-antipodal points); clamp so the square root / arcsine stay defined.
    d = np.minimum(np.maximum(d, 0.0), 1.0)

    return 2 * avg_earth_radius * np.arcsin(np.sqrt(d))

In [None]:
# The timestamp has been expanded into separate columns; remove the raw column.
X_train = X_train.drop(columns=['pickup_datetime'])
X_test = X_test.drop(columns=['pickup_datetime'])

In [None]:
def _row_haversine(row):
    """Haversine distance between the pickup and drop-off point of one ride."""
    pickup = (row['pickup_latitude'], row['pickup_longitude'])
    dropoff = (row['dropoff_latitude'], row['dropoff_longitude'])
    return haversine(start_coord=pickup, end_coord=dropoff)

# Same row-wise computation for the train and the test partition.
for features in (X_train, X_test):
    features['haversine_dist'] = features.apply(_row_haversine, axis=1)

X_train.head()

In [None]:
%%time

# 'reg:squarederror' is the current name of the squared-error regression objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, max_depth=5, n_jobs=-1, random_state=42)
xgr.fit(X_train, y_train)

In [None]:
y_test_pred = xgr.predict(X_test)

r2 = r2_score(y_test, y_test_pred)
print('R-sq:', r2)

rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
print('RMSE:', rmse)

#### We have __R2 = 0.8209__ and __RMSE = 4.11__

#### This is an improvement of __3% R-sq__ and __0.32__ drop in RMSE

# 4. PolynomialFeatures

In [None]:
from sklearn.preprocessing import  PolynomialFeatures

In [None]:
# NOTE(review): degree=1 generates only a constant bias column plus the original
# features (no squared or interaction terms) — confirm this is the intended degree.
poly = PolynomialFeatures(degree=1)

In [None]:
polyXtrain = poly.fit_transform(X_train)
polyXtest = poly.transform(X_test)

In [None]:
%%time

# 'reg:squarederror' is the current name of the squared-error regression objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, max_depth=5, n_jobs=-1, random_state=42)
xgr.fit(polyXtrain, y_train)

In [None]:
y_test_pred = xgr.predict(polyXtest)

r2 = r2_score(y_test, y_test_pred)
print('R-sq:', r2)

rmse = mean_squared_error(y_test, y_test_pred) ** 0.5
print('RMSE:', rmse)

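#### We have __R2 = 0.8209__ and __RMSE = 4.11__
#### There is no difference: with `degree=1`, `PolynomialFeatures` only adds a constant bias column to the existing features, so the model receives no new information. We can consider this as our final model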