# TAXI Fare Prediction - EDA Parts

## 0. Data Load

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import geopandas as gpd
from shapely.geometry import Point, Polygon

import ssl

context = ssl._create_unverified_context()
plt.style.use('fivethirtyeight')

### Load data
As the entire dataset is about 55M rows, only part of the dataset is used for EDA

In [None]:
# Read at most 500k rows from each CSV so the exploration stays fast.
train_df = pd.read_csv(
    '../input/new-york-city-taxi-fare-prediction/train.csv',
    nrows=500_000,
)
test_df = pd.read_csv(
    '../input/new-york-city-taxi-fare-prediction/test.csv',
    nrows=500_000,
)

# Show the (rows, columns) shape of what was actually loaded.
print(f'Number of train: {train_df.shape}')
print(f'Number of test: {test_df.shape}')

In [None]:
train_df.head(5)

In [None]:
test_df.head(5)

### Match the datetime format
This process is time-consuming, so it would be more efficient to convert the datetime columns after data cleaning.<br>
However, I perform it up front so that the datetime values can be checked during the EDA and cleaning.

In [None]:
# Convert the two timestamp-like columns of both frames to pandas datetimes.
for frame in (train_df, test_df):
    for column in ('key', 'pickup_datetime'):
        frame[column] = pd.to_datetime(frame[column])

### Check missing value
Before checking the data, the basic level of data cleaning is performed. <br>
First of all, we should check the missing value.

In [None]:
train_df.isnull().sum().sort_values(ascending=False)

In [None]:
test_df.isnull().sum().sort_values(ascending=False)

The number of missing values depends on how many rows were selected in the data loading step. <br>
The proportion of missing values is small enough to be negligible. <br>
Therefore, simply <code>dropna</code> the missing values.

In [None]:
# Remove every row that contains a missing field. The test set is passed
# through dropna as well, although it has no missing values.
train_df, test_df = train_df.dropna(), test_df.dropna()

In [None]:
print('Number of Missing values in train: {}'.format(train_df.isnull().sum().sum()))
print('Number of Missing values in test: {}'.format(test_df.isnull().sum().sum()))

In conclusion, there are no remaining missing or null values in the dataset.

## 1. Data Cleaning
Basic level of EDA is performed using the train dataset above.

### Fare_amount

In [None]:
# Keep only the trips with a strictly positive fare, reporting the row count
# before and after the filter.
print(f'Previous dataset: {len(train_df)}')
train_df = train_df.loc[train_df['fare_amount'] > 0]
print(f'Corrected dataset: {len(train_df)}')

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10,4))
sns.kdeplot(train_df['fare_amount'].values, ax=axs[0]).set_title("distribution of fare amount")
sns.kdeplot(np.log(train_df['fare_amount'].values), ax=axs[1]).set_title("Distribution of log-scaled fare_amount")

### Passenger_count

In [None]:
fig, axs = plt.subplots(figsize=(8,4))
plt.hist(train_df['passenger_count'].values)
plt.title("distribution of passenger")

In [None]:
# Keep the rows whose passenger count is strictly between 0 and 13.
print(f'Previous dataset: {len(train_df)}')
train_df = train_df[
    (train_df['passenger_count'] > 0) & (train_df['passenger_count'] < 13)
]
print(f'Corrected dataset: {len(train_df)}')

In [None]:
fig, axs = plt.subplots(figsize=(8,4))
plt.hist(train_df['passenger_count'].values)
plt.title("Corrected distribution of passenger")

### Location of pickup and dropoff
The location of New York city is -74.0063889 (longitude) and 40.7141667 (latitude).

In [None]:
# Summarise the coordinate ranges. The Series reductions (.max/.min/.mean) run
# vectorised, whereas the builtin max()/min() walk the Series one element at a
# time in Python. The labels now say "dropoff", matching the columns reported.
print('Max and Min pickup longitude: {} and {}'.format(
    train_df.pickup_longitude.max(), train_df.pickup_longitude.min()))
print('Max and Min dropoff longitude: {} and {}'.format(
    train_df.dropoff_longitude.max(), train_df.dropoff_longitude.min()))

print('Max and Min pickup latitude: {} and {}'.format(
    train_df.pickup_latitude.max(), train_df.pickup_latitude.min()))
print('Max and Min dropoff latitude: {} and {}'.format(
    train_df.dropoff_latitude.max(), train_df.dropoff_latitude.min()))

print('Mean pickup latitude: {}'.format(train_df.pickup_latitude.mean()))
print('Mean dropoff latitude: {}'.format(train_df.dropoff_latitude.mean()))

In [None]:
# Raw scatter of every pickup (blue) and dropoff (red) to expose outliers.
fig, ax = plt.subplots()
ax.scatter(x=train_df.pickup_longitude, y=train_df.pickup_latitude, color='blue')
# Dropoffs use (longitude, latitude) like the pickups; plotting longitude on
# both axes would place every dropoff on a diagonal line.
ax.scatter(x=train_df.dropoff_longitude, y=train_df.dropoff_latitude, color='red')
plt.show()

Since there are some outliers, we should remove them.<br>
##### Reference -  [NYC Taxi Fare - Data Exploration](https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration)

In [None]:
def select_within_boundingbox(df, BB):
    """Return a boolean mask of rows whose pickup and dropoff both lie inside ``BB``.

    ``BB`` is indexed as (min longitude, max longitude, min latitude,
    max latitude) and every bound is inclusive. ``df`` must expose the
    ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude`` and
    ``dropoff_latitude`` columns.
    """
    lon_lo, lon_hi, lat_lo, lat_hi = BB[0], BB[1], BB[2], BB[3]

    pickup_inside = (
        df.pickup_longitude.between(lon_lo, lon_hi)
        & df.pickup_latitude.between(lat_lo, lat_hi)
    )
    dropoff_inside = (
        df.dropoff_longitude.between(lon_lo, lon_hi)
        & df.dropoff_latitude.between(lat_lo, lat_hi)
    )
    return pickup_inside & dropoff_inside

Based on the coordinates of the test dataset, a bounding box can be created. <br>
Max and Min <b>longitude</b> of test dataset

In [None]:
min(test_df.pickup_longitude.min(), test_df.dropoff_longitude.min()), \
max(test_df.pickup_longitude.max(), test_df.dropoff_longitude.max())

Max and Min <b>latitude</b> of test dataset

In [None]:
min(test_df.pickup_latitude.min(), test_df.dropoff_latitude.min()), \
max(test_df.pickup_latitude.max(), test_df.dropoff_latitude.max())

Define the NYC bounding boxes, each given as (min longitude, max longitude, min latitude, max latitude)

In [None]:
BB = (-74.3, -72.9, 40.5, 41.7)
BB_zoom = (-74.1, -73.75, 40.6, 40.9)

In [None]:
# Drop every trip with a pickup or dropoff outside the BB bounding box.
print(f'Old size: {len(train_df)}')
inside_bb = select_within_boundingbox(train_df, BB)
train_df = train_df[inside_bb]
print(f'New size: {len(train_df)}')

In [None]:
def plot_on_map(df, BB, s=10, alpha=0.2):
    """Scatter the pickup and dropoff coordinates of ``df`` in three panels.

    The first panel shows pickups (red), the second dropoffs (blue) and the
    third overlays both with labels for the legend. All panels are limited
    to ``BB``, indexed as (min longitude, max longitude, min latitude,
    max latitude). ``s`` and ``alpha`` are the marker size and opacity
    passed to every scatter call.
    """
    fig, axs = plt.subplots(1, 3, figsize=(20, 5))
    pickup_style = dict(zorder=1, alpha=alpha, c='r', s=s)
    dropoff_style = dict(zorder=1, alpha=alpha, c='b', s=s)

    axs[0].scatter(df.pickup_longitude, df.pickup_latitude, **pickup_style)
    axs[0].set_title('Pickup locations')

    axs[1].scatter(df.dropoff_longitude, df.dropoff_latitude, **dropoff_style)
    axs[1].set_title('Dropoff locations')

    axs[2].scatter(df.pickup_longitude, df.pickup_latitude,
                   label='pickup', **pickup_style)
    axs[2].scatter(df.dropoff_longitude, df.dropoff_latitude,
                   label='dropoff', **dropoff_style)

    # Every panel is cropped to the same bounding box.
    for panel in axs:
        panel.set_xlim((BB[0], BB[1]))
        panel.set_ylim((BB[2], BB[3]))

    # The legend is requested through pyplot, i.e. on the current axes.
    plt.legend(loc='upper left')

In [None]:
plot_on_map(train_df, BB, s=1, alpha=0.3)

In [None]:
plot_on_map(train_df, BB_zoom, s=1, alpha=0.3)

Location Data cleaning

In [None]:
# Keep the trips whose pickup and dropoff longitudes lie strictly inside
# (-75, -73) and whose latitudes lie strictly inside (40, 42).
print(f'Previous dataset: {len(train_df)}')
lon_cols = ['pickup_longitude', 'dropoff_longitude']
lat_cols = ['pickup_latitude', 'dropoff_latitude']
lon_ok = ((train_df[lon_cols] > -75) & (train_df[lon_cols] < -73)).all(axis=1)
lat_ok = ((train_df[lat_cols] > 40) & (train_df[lat_cols] < 42)).all(axis=1)
train_df = train_df[lon_ok & lat_ok]
print(f'Corrected dataset: {len(train_df)}')

## 2. Data creating

### Distance parameter

In [None]:
# Accurate form
def cal_dist(plo, pla, dlo, dla):
    """Write a haversine ``dist`` column into ``train_df`` and ``test_df``.

    Parameters are column names: ``plo``/``pla`` for the pickup longitude and
    latitude, ``dlo``/``dla`` for the dropoff longitude and latitude, all in
    degrees. Both module-level frames are modified in place. The return
    value is the distance Series computed for the last frame processed
    (``test_df``), scaled by a sphere radius of 6373.0.
    """
    sphere_radius = 6373.0

    for frame in (train_df, test_df):
        # Convert the degree columns to radians.
        phi_start = np.radians(frame[pla])
        phi_end = np.radians(frame[dla])
        lam_start = np.radians(frame[plo])
        lam_end = np.radians(frame[dlo])

        delta_lam = abs(lam_end - lam_start)
        delta_phi = abs(phi_end - phi_start)

        # Haversine term followed by the central angle between the two points.
        hav = (np.sin(delta_phi / 2) ** 2
               + np.cos(phi_start) * np.cos(phi_end) * np.sin(delta_lam / 2) ** 2)
        central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

        distance = sphere_radius * central_angle
        frame['dist'] = distance
    return distance

# Simple form
def dist(plo, pla, dlo, dla):
    """Return ``|dlo - plo| + |dla - pla|``, in the same units as the inputs.

    Works element-wise on anything ``np.abs`` accepts, so scalars can be
    mixed with arrays or Series.
    """
    lon_gap = np.abs(dlo - plo)
    lat_gap = np.abs(dla - pla)
    return lon_gap + lat_gap

In [None]:
# One call is enough: cal_dist writes the 'dist' column of both train_df and
# test_df itself, so neither a pre-initialised column nor a second call is
# needed.
cal_dist('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude')

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
fig, axs = plt.subplots(1,2, figsize=(15,4))
sns.kdeplot(train_df['dist'].values, ax=axs[0]).set_title("Distance distribution of Train data")
sns.kdeplot(test_df['dist'].values, ax=axs[1]).set_title("Distance distribution of Test data")

### Distance from Airports
Reference locations: the NYC city center and the three main airports. <br>
1. NYC - city center
2. JFK
3. EWR
4. LGA (LaGuardia, stored as `lgr` in the code)

In [None]:
nyc = (-74.0063889, 40.7141667)
jfk = (-73.7822222222, 40.6441666667)
ewr = (-74.175, 40.69)
lgr = (-73.87, 40.77)

In [None]:
#def dist(plo, pla, dlo, dla):
# pla plo dla dlo
def cal_airport(plo, pla, dlo, dla):
    """Add distance-to-landmark columns to ``train_df`` and ``test_df``.

    For each module-level landmark (``nyc``, ``jfk``, ``ewr``, ``lgr``) and
    each trip end, the ``dist`` helper is applied between the landmark and
    the trip coordinates, filling ``pickup_<name>`` and ``dropoff_<name>``.
    The four parameters are accepted but not used: the coordinate column
    names are fixed inside the function.
    """
    landmarks = (('nyc', nyc), ('jfk', jfk), ('ewr', ewr), ('lgr', lgr))
    for frame in (train_df, test_df):
        for tag, place in landmarks:
            for end in ('pickup', 'dropoff'):
                frame[f'{end}_{tag}'] = dist(
                    place[0], place[1],
                    frame[f'{end}_longitude'], frame[f'{end}_latitude'],
                )

In [None]:
train_df[['pickup_nyc', 'dropoff_nyc', 'pickup_jfk', 'dropoff_jfk', 'pickup_ewr', 'dropoff_ewr', 'pickup_lgr', 'dropoff_lgr']] = 0.0
test_df[['pickup_nyc', 'dropoff_nyc', 'pickup_jfk', 'dropoff_jfk', 'pickup_ewr', 'dropoff_ewr', 'pickup_lgr', 'dropoff_lgr']] = 0.0

cal_airport('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude')
train_df.head()

In [None]:
test_df.head()

In [None]:
fig, axs = plt.subplots(1,4, figsize=(20,4))
sns.kdeplot(train_df['pickup_nyc'].values, ax=axs[0]).set_title("Pickup from NYC")
sns.kdeplot(train_df['pickup_jfk'].values, ax=axs[1]).set_title("Pickup from JFK")
sns.kdeplot(train_df['pickup_ewr'].values, ax=axs[2]).set_title("Pickup from EWR")
sns.kdeplot(train_df['pickup_lgr'].values, ax=axs[3]).set_title("Pickup from LGR")

In [None]:
fig, axs = plt.subplots(1,4, figsize=(20,4))
sns.kdeplot(np.log(train_df['pickup_nyc'].values), ax=axs[0]).set_title("Pickup from NYC(log)")
sns.kdeplot(np.log(train_df['pickup_jfk'].values), ax=axs[1]).set_title("Pickup from JFK(log)")
sns.kdeplot(np.log(train_df['pickup_ewr'].values), ax=axs[2]).set_title("Pickup from EWR(log)")
sns.kdeplot(np.log(train_df['pickup_lgr'].values), ax=axs[3]).set_title("Pickup from LGR(log)")

### Datetime

In [None]:
# Split the pickup timestamp of the training set into calendar features.
pickup_time = train_df['pickup_datetime'].dt
for part in ('hour', 'day', 'month', 'year'):
    train_df[part] = getattr(pickup_time, part)

In [None]:
train_df.head()

In [None]:
# Derive the same calendar features for the test set.
for part in ('hour', 'day', 'month', 'year'):
    test_df[part] = getattr(test_df['pickup_datetime'].dt, part)

In [None]:
test_df.head()

## 3. Modeling
#### Reference: [Stacked Regressions: Top 4% on LeaderBoard - House Prices](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)

In [None]:
import lightgbm as lgbm
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
train = train_df.drop(['key','pickup_datetime'], axis=1)
test = test_df.drop(['key', 'pickup_datetime'], axis=1)

In [None]:
# Hold out 30% of the cleaned training rows. The fixed seed keeps the split,
# and every score computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop('fare_amount', axis=1),
    train['fare_amount'],
    test_size=0.3,
    random_state=42,
)

## Cross validate model

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Validation function
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold RMSE of ``model`` under shuffled K-fold validation.

    ``model`` is cross-validated on ``X_train`` / ``y_train`` with
    ``n_folds`` folds and the ``neg_mean_squared_error`` scorer. The scores
    are negated and square-rooted, so despite the function name the result
    is a plain RMSE array with one entry per fold.
    """
    # Pass the splitter itself. Calling .get_n_splits() on it yields only the
    # integer fold count, so cross_val_score would build its own unshuffled
    # split and the shuffle/random_state settings would be lost.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X_train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

#### LASSO Regression

In [None]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

#### Elastic Net Regression

In [None]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

#### XGBoost

In [None]:
# XGBoost regressor with the hyper-parameters of the referenced notebook.
# `verbosity=0` and `n_jobs=-1` are the current names of the deprecated
# `silent=1` and `nthread=-1` arguments.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state=42, n_jobs=-1)

#### LightGBM

In [None]:
# LightGBM regressor: 100 boosting rounds of trees with at most 5 leaves.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.005,
    n_estimators=100,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

### Base models scores

In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())) # mean & the standard deviation

In [None]:
score = rmsle_cv(ENet)
print("\nENet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())) # mean & the standard deviation

In [None]:
#score = rmsle_cv(KRR)
#print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
#score = rmsle_cv(GBoost)
#print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

#### Stacking models

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that predicts the unweighted mean of its base models.

    ``models`` is the collection of estimators to average. ``fit`` trains a
    clone of each one and stores the clones in ``models_``; ``predict``
    averages their predictions row by row.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)``, return ``self``."""
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the per-row mean of the fitted clones' predictions for ``X``."""
        per_model = [fitted.predict(X) for fitted in self.models_]
        # One column per model, then average across the columns.
        return np.column_stack(per_model).mean(axis=1)

In [None]:
averaged_models = AveragingModels(models = (ENet,lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
averaged_models.fit(X_train.values, y_train)
stacked_train_pred = averaged_models.predict(X_train.values)
# fare_amount is never log-transformed before fitting, so the predictions are
# already on the original fare scale and must not go through np.expm1.
stacked_pred = averaged_models.predict(test.values)
# The RMSE is computed inline because the rmsle() helper is only defined in a
# later cell, so calling it here fails on a top-to-bottom run.
print(np.sqrt(mean_squared_error(y_train, stacked_train_pred)))

#### NEW XGB and LGBM Models for Ensemble
The model parameters are taken from [this notebook](https://www.kaggle.com/madhurisivalenka/cleansing-eda-modelling-lgbm-xgboost-starters).

In [None]:
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y``.

    No logarithm is applied here, so the result is a plain RMSE on whatever
    scale the two inputs are given in.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
# Parameters for the native xgb.train API.
# - `eta` and `learning_rate` name the same XGBoost setting, so the previous
#   `'eta': 1` conflicted with `'learning_rate': 0.05`; only the latter is kept.
# - `reg:squarederror` is the current name of the deprecated `reg:linear`.
params = {'max_depth': 7,
          'objective': 'reg:squarederror',
          'eval_metric': 'rmse',
          'learning_rate': 0.05
         }
num_rounds = 50

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(test)
model_xgbm = xgb.train(params, dtrain, num_rounds)

In [None]:
# Training-set predictions must come from the training DMatrix, not dtest.
xgb_train_pred = model_xgbm.predict(dtrain)
# The model is fitted on the raw fare_amount, so its output is already on the
# fare scale and np.expm1 must not be applied.
xgb_pred = model_xgbm.predict(dtest)

In [None]:
params = {
    'boosting_type':'gbdt',
    'objective': 'regression',
    'nthread': -1,
    'verbose': 0,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'reg_lambda': 0.001,
    'metric': 'rmse',
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight':1,
    'force_col_wise':True
    }
train_set = lgb.Dataset(X_train, y_train, silent=True)
model_lgbm = lgb.train(params, train_set = train_set)

In [None]:
lgb_train_pred = model_lgbm.predict(X_train)
# The model is fitted on the raw fare_amount, so its output is already on the
# fare scale and np.expm1 must not be applied.
lgb_pred = model_lgbm.predict(test.values)
print(rmsle(y_train, lgb_train_pred))

In [None]:
print('Number of test: {}'.format(lgb_pred.shape))

#### Ensemble

In [None]:
'''RMSE on the entire Train data when averaging'''

#print('RMSLE score on train data:')
#print(rmsle(y_train,stacked_train_pred*0.70 +
#               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))

In [None]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15
print(len(ensemble))
ensemble

## 4. Submission

In [None]:
sub=pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')

In [None]:
sub.head()

In [None]:
sub['fare_amount'] = ensemble
sub.to_csv('submission.csv',index=False)
sub.head()