In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import necessary libraries**

In [None]:
import time
notebookstart= time.time()

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings("ignore")

from scipy import stats
from scipy.stats import norm, skew
# Simply works with Numpy, Matplotlib, Pandas, Sympy etc. 
# SciPy provides numerical integral routines and differential equations interpreters, algorithms to root out equations, standard continuous/differentiated probability distributions, and various statistical tools.

# **Collecting Data**

In [None]:
df_train =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows = 100000, parse_dates=["pickup_datetime"])

df_train.head()

In [None]:
df_test =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv', parse_dates=["pickup_datetime"])

df_test.head()

# **Exploratory data analysis**

In [None]:
df_train.info()

In [None]:
df_test.info()

In [None]:
df_train.shape, df_test.shape

* **Target Variable**

**Fare amount** is the variable we need to predict. So let's do some analysis on this variable first.

In [None]:
# Histogram of the raw fare with a fitted normal curve overlaid.
sns.distplot(df_train['fare_amount'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(df_train['fare_amount'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: "\m" and "\s" are not valid Python escapes; the backslashes are
# meant for matplotlib's mathtext ($\mu$, $\sigma$).
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Fare amount distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(df_train['fare_amount'], plot=plt)
plt.show()

The target variable is skewed. Since (linear) models work best with normally distributed data, we need to transform this variable to make it more normally distributed.

* **Log-transformation of the target variable**

In [None]:
# We use the numpy function log1p, which applies log(1+x) to all elements of the column.
# The result is kept in a separate Series instead of overwriting df_train["fare_amount"]:
# the following exploratory cells inspect raw fares (e.g. fares > 600 or < 0), which
# would be meaningless on log-scale values. Negative fares are excluded because log1p
# is undefined (NaN / -inf) for values <= -1.
fare_log = np.log1p(df_train.loc[df_train["fare_amount"] >= 0, "fare_amount"])

#Check the new distribution 
sns.distplot(fare_log , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(fare_log)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
# Raw string: "\m" and "\s" are not valid Python escapes; the backslashes are
# meant for matplotlib's mathtext.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Fare amount distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(fare_log, plot=plt)
plt.show()

* **Missing values**

In [None]:
df_train.isnull().sum()

There are very few missing values, so they should have little impact on the predictions. It is therefore safe to remove the affected rows from the dataset.

In [None]:
df_train = df_train.dropna()

In [None]:
df_train.isnull().sum()

In [None]:
df_train.shape

* **Fare amount** : dollar amount of the cost of the taxi ride. 
* **Passenger count** :  indicating the number of passengers in the taxi ride.

In [None]:
fig, ax = plt.subplots()
ax.scatter(x = df_train['passenger_count'], y = df_train['fare_amount'])
plt.ylabel('fare_amount', fontsize=13)
plt.xlabel('passenger_count', fontsize=13)
plt.show()

In [None]:
df_train['passenger_count'].value_counts()

In [None]:
df_train[df_train['fare_amount'] > 600]['fare_amount'].value_counts()

In [None]:
df_train[df_train['fare_amount'] < 0]['fare_amount'].value_counts()

We can identify the following outliers:
1. The point at the bottom right with passenger_count == 200 is an outlier.
2. The very high fares at the upper left lie far from the rest of the distribution, so they are treated as outliers.
3. A fare below zero cannot exist, so negative fares are treated as outliers.


**Note** :

Eliminating outliers helps to build robust models. Therefore, we will remove the anomalies found above later.

* **Fare amount**
* **pickup datetime** : value indicating when the taxi ride started.

Let's look at the fare for each time period!
The pickup time will be split into year, month, day of month, hour, and day of week.

In [None]:
# Split the pickup timestamp into calendar components, one column per component.
pickup_dt = df_train["pickup_datetime"].dt
for part in ("year", "month", "day", "hour", "dayofweek"):
    df_train[part] = getattr(pickup_dt, part)

In [None]:
df_train

In [None]:
# One point plot of the fare per time component, stacked vertically.
# (column in df_train, panel title) -- the panels are drawn in this order.
panels = [
    ("year", "Annual Fare"),
    ("month", "Monthly Fare"),
    ("day", "Daily Fare"),
    ("hour", "hourly Fare"),
    ("dayofweek", "Fare by Day"),
]

fig, axes = plt.subplots(nrows=len(panels))
fig.set_size_inches(18,14)

for ax, (column, title) in zip(axes, panels):
    plt.sca(ax)
    plt.xticks(rotation=30, ha='right')
    ax.set(ylabel='fare_amount', title=title)
    sns.pointplot(data = df_train, x=column, y="fare_amount", ax=ax)

Let's interpret the results!

1. Fare increases over the years.
2. Fare is high at the beginning of the month (4th to 6th)

Please let me know if you have any other information you can find out.

* **pickup_longitude** - float for longitude coordinate of where the taxi ride started.
* **pickup_latitude** - float for latitude coordinate of where the taxi ride started.
* **dropoff_longitude** - float for longitude coordinate of where the taxi ride ended.
* **dropoff_latitude** - float for latitude coordinate of where the taxi ride ended.

**Note** :

Latitude and longitude are a pair of numbers (coordinates) used to describe a position on the plane of a geographic coordinate system. The numbers are in decimal degrees format and range from -90 to 90 for latitude and -180 to 180 for longitude.

In [None]:
df_train[(df_train['pickup_longitude'] > 180) | (df_train['pickup_longitude'] < -180)]['pickup_longitude'].value_counts()

In [None]:
df_train[(df_train['pickup_latitude'] > 90) | (df_train['pickup_latitude'] < -90)]['pickup_latitude'].value_counts()

In [None]:
df_train[(df_train['dropoff_longitude'] > 180) | (df_train['dropoff_longitude'] < -180)]['dropoff_longitude'].value_counts()

In [None]:
df_train[(df_train['dropoff_latitude'] > 90) | (df_train['dropoff_latitude'] < -90)]['dropoff_latitude'].value_counts()

Values outside the latitude and longitude range can be determined as outliers, so we decided to remove them.

# **Data Cleaning and Feature Engineering**

* Reference : https://www.kaggle.com/nicapotato/taxi-rides-time-analysis-and-oof-lgbm

This section builds on the well-organized kernel referenced above.

In [None]:
# Load a fresh copy of the data for modelling, indexed by the "key" column.
# Only the first 5000 training rows are read.
train = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows = 5000, index_col = "key")
# Drop training rows that contain any missing value.
train = train.dropna()
test_df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv', index_col = "key")
# Keep the test keys in a separate variable.
testdex = test_df.index

In [None]:
def prepare_distance_features(df):
    """Add coordinate-based distance and bearing features to ``df``.

    The frame must contain ``pickup_longitude``, ``pickup_latitude``,
    ``dropoff_longitude`` and ``dropoff_latitude`` in decimal degrees.
    Columns are added in place and the same frame is returned.

    Added columns:
        longitude_distance, latitude_distance: absolute coordinate differences (degrees).
        distance_travelled: Euclidean distance in degree space.
        distance_travelled_sin / _cos / _sin_sqrd / _cos_sqrd: trigonometric
            transforms of |dlon| * |dlat|.
        haversine: great-circle distance between pickup and dropoff, in metres.
        bearing: initial bearing from pickup to dropoff, in radians in (-pi, pi],
            0 = north, positive = east.
    """
    # Distance is expected to have an impact on the fare
    df['longitude_distance'] = abs(df['pickup_longitude'] - df['dropoff_longitude'])
    df['latitude_distance'] = abs(df['pickup_latitude'] - df['dropoff_latitude'])

    # Straight distance
    df['distance_travelled'] = (df['longitude_distance'] ** 2 + df['latitude_distance'] ** 2) ** .5
    # sqrt(dlon^2 * dlat^2) == |dlon| * |dlat|; computed once and reused below.
    lon_lat_product = (df['longitude_distance'] ** 2 * df['latitude_distance'] ** 2) ** .5
    df['distance_travelled_sin'] = np.sin(lon_lat_product)
    df['distance_travelled_cos'] = np.cos(lon_lat_product)
    df['distance_travelled_sin_sqrd'] = np.sin(lon_lat_product) ** 2
    df['distance_travelled_cos_sqrd'] = np.cos(lon_lat_product) ** 2

    # Haversine formula for distance
    # Haversine formula:	a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
    R = 6371e3 # Metres
    phi1 = np.radians(df['pickup_latitude'])
    phi2 = np.radians(df['dropoff_latitude'])
    phi_chg = phi2 - phi1
    # Δλ is dropoff minus pickup so that the bearing below points from pickup to dropoff.
    delta_chg = np.radians(df['dropoff_longitude'] - df['pickup_longitude'])
    # Both sine terms must be squared; without the squares ``a`` can go negative and
    # the square roots below produce NaN.
    a = np.sin(phi_chg / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_chg / 2) ** 2
    # Guard against rounding pushing ``a`` marginally outside [0, 1].
    a = a.clip(0, 1)
    c = 2 * np.arctan2(a ** .5, (1 - a) ** .5)
    df['haversine'] = R * c

    # Bearing
    # Formula:	θ = atan2( sin Δλ ⋅ cos φ2 , cos φ1 ⋅ sin φ2 − sin φ1 ⋅ cos φ2 ⋅ cos Δλ )
    y = np.sin(delta_chg) * np.cos(phi2)
    x = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(delta_chg)
    df['bearing'] = np.arctan2(y, x)

    return df

def prepare_time_features(df):
    """Parse ``pickup_datetime`` and add calendar features to ``df``.

    ``df['pickup_datetime']`` must hold strings such as
    ``"2015-01-27 13:08:24 UTC"``. The column is replaced by parsed
    datetimes, so calling this twice on the same frame fails (the ``.str``
    accessor is not available on a datetime column). Columns are added in
    place and the same frame is returned.

    Added columns: hour_of_day, week, month, year, day_of_year, week_of_year,
    weekday, quarter, day_of_month. ``week`` and ``week_of_year`` both hold
    the ISO week number.
    """
    # Strip the literal " UTC" suffix (plain substring, not a regex) so the fixed
    # format below can parse the timestamps.
    df['pickup_datetime'] = df['pickup_datetime'].str.replace(" UTC", "", regex=False)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')

    pickup = df['pickup_datetime'].dt
    # ``.dt.week`` / ``.dt.weekofyear`` were removed in pandas 2.0; ``isocalendar()``
    # is the replacement. Fall back to the old accessor on versions that predate it.
    if hasattr(pickup, 'isocalendar'):
        iso_week = pickup.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert to a plain numpy
        # dtype (float when missing timestamps are present, like the old accessor).
        iso_week = iso_week.astype('int64' if iso_week.notna().all() else 'float64')
    else:
        iso_week = pickup.week

    df['hour_of_day'] = pickup.hour
    df['week'] = iso_week
    df['month'] = pickup.month
    df["year"] = pickup.year
    df['day_of_year'] = pickup.dayofyear
    df['week_of_year'] = iso_week
    df["weekday"] = pickup.weekday
    df["quarter"] = pickup.quarter
    df["day_of_month"] = pickup.day

    return df

# Airport Features - By Albert van Breemen
# https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
def dist(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return the Manhattan (L1) distance between two coordinate pairs.

    The result is ``|dropoff_lat - pickup_lat| + |dropoff_long - pickup_long|``,
    in the same units as the inputs. Scalars and array-likes are both accepted;
    numpy broadcasting applies.
    """
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    return lat_gap + long_gap

def airport_feats(train,test_df):
    """Add Manhattan distances to fixed reference points to both frames.

    For each frame this adds ``distance_to_center`` (pickup to the city-centre
    reference point) and, for each of the three airports (jfk, ewr, lgr),
    ``pickup_distance_to_<code>`` and ``dropoff_distance_to_<code>``.
    Distances come from ``dist`` and are in coordinate degrees. Both frames are
    modified in place and returned as ``(train, test_df)``.
    """
    # Reference points as (longitude, latitude).
    nyc = (-74.0063889, 40.7141667)
    airports = (
        ('jfk', (-73.7822222222, 40.6441666667)),
        ('ewr', (-74.175, 40.69)),
        ('lgr', (-73.87, 40.77)),
    )

    for frame in (train, test_df):
        frame['distance_to_center'] = dist(nyc[1], nyc[0],
                                           frame['pickup_latitude'], frame['pickup_longitude'])
        for code, (lon, lat) in airports:
            for leg in ('pickup', 'dropoff'):
                frame['%s_distance_to_%s' % (leg, code)] = dist(
                    lat, lon, frame[leg + '_latitude'], frame[leg + '_longitude'])

    return train, test_df

# Percentile
def percentile(n):
    """Build an aggregation callable that returns the ``n``-th percentile.

    The returned function is renamed to ``percentile_<n>``, which gives it a
    distinct name when several percentiles are used in one aggregation.
    """
    def percentile_(values):
        # Delegate to numpy; ``n`` is captured from the enclosing call.
        return np.percentile(values, n)

    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

# Build Time Aggregate Features
def time_agg(train, test_df, vars_to_agg, vars_be_agg):
    """Attach per-group aggregate statistics of ``vars_be_agg`` to both frames.

    For every entry of ``vars_to_agg`` (a column name or a list of column
    names) the training frame is grouped by it and sum, mean, std, skew and
    the 80th / 20th percentiles of ``vars_be_agg`` are computed. The
    statistics are named ``fare_by_<group>_<stat>`` (list entries are joined
    with ``_``) and left-merged onto both ``train`` and ``test_df``.

    Returns the merged ``(train, test_df)`` pair; the statistics always come
    from ``train`` as it stands at that iteration.
    """
    stat_funcs = ["sum","mean","std","skew",percentile(80),percentile(20)]

    for var in vars_to_agg:
        agg = train.groupby(var)[vars_be_agg].agg(stat_funcs)

        # A list of grouping columns is flattened into a single label.
        group_label = "_".join(var) if isinstance(var, list) else var
        agg.columns = pd.Index(
            ["fare_by_" + group_label + "_" + str(e) for e in agg.columns.tolist()])

        train = pd.merge(train, agg, on=var, how= "left")
        test_df = pd.merge(test_df, agg, on=var, how= "left")

    return train, test_df

# Clean dataset from https://www.kaggle.com/gunbl4d3/xgboost-ing-taxi-fares
def clean_df(df):
    """Return the rows of ``df`` with a positive fare and plausible coordinates.

    A row is kept when ``fare_amount > 0`` and both pickup and dropoff lie
    strictly inside the box longitude (-80, -70), latitude (35, 45).
    """
    keep = df.fare_amount > 0
    # (column, exclusive lower bound, exclusive upper bound)
    bounds = (
        ('pickup_longitude', -80, -70),
        ('pickup_latitude', 35, 45),
        ('dropoff_longitude', -80, -70),
        ('dropoff_latitude', 35, 45),
    )
    for column, low, high in bounds:
        keep = keep & (df[column] > low) & (df[column] < high)
    return df[keep]
print("Cleaning Functions Defined..")

In [None]:
# Report the share of rows outside the accepted fare range, then keep 0 < fare <= 200.
# NOTE(review): the second report counts fare >= 200 but the filter keeps fare == 200,
# so the printed percentage is not exactly the share that is removed -- confirm intent.
print("Percent of Training Set with Zero and Below Fair: ", round(((train.loc[train["fare_amount"] <= 0, "fare_amount"].shape[0]/train.shape[0]) * 100),5))
print("Percent of Training Set 200 and Above Fair: ", round((train.loc[train["fare_amount"] >= 200, "fare_amount"].shape[0]/train.shape[0]) * 100,5))
train = train.loc[(train["fare_amount"] > 0) & (train["fare_amount"] <= 200),:]
# Same pattern for the passenger count: report, then keep 0 < passenger_count <= 9.
# NOTE(review): the report counts passenger_count >= 9 but the filter keeps 9 -- confirm intent.
print("\nPercent of Training Set with Zero and Below Passenger Count: ", round((train.loc[train["passenger_count"] <= 0, "passenger_count"].shape[0]/train.shape[0]) * 100,5))
print("Percent of Training Set with Nine and Above Passenger Count: ", round((train.loc[train["passenger_count"] >= 9, "passenger_count"].shape[0]/train.shape[0]) * 100,5))
train = train.loc[(train["passenger_count"] > 0) & (train["passenger_count"] <= 9),:]

# Clean Training Set (positive fare, coordinates inside the bounding box)
train = clean_df(train)

# Distance Features (added to both frames)
train = prepare_distance_features(train)
test_df = prepare_distance_features(test_df)
train,test_df = airport_feats(train,test_df)

# Time Features (parses pickup_datetime, so this cell cannot be re-run on the same frames)
train = prepare_time_features(train)
test_df = prepare_time_features(test_df)

# Ratios -- computed on the training frame only, because they need fare_amount.
# The 0.0001 offset avoids division by zero for zero-distance rides.
train["fare_to_dist_ratio"] = train["fare_amount"] / ( train["distance_travelled"]+0.0001)
train["fare_npassenger_to_dist_ratio"] = (train["fare_amount"] / train["passenger_count"]) /( train["distance_travelled"]+0.0001)

# Time Aggregate Features: fare statistics per group, computed on train and merged
# onto both frames. List entries group by several columns at once.
train, test_df = time_agg(train, test_df,
                          vars_to_agg  = ["passenger_count", "weekday", "quarter", "month", "year", "hour_of_day",
                                          ["weekday", "month", "year"], ["hour_of_day", "weekday", "month", "year"]],
                          vars_be_agg = "fare_amount")

* **Time Range**

In [None]:
train_time_start = train.pickup_datetime.min()
train_time_end = train.pickup_datetime.max()
print("Train Time Starts: {}, Ends {}".format(train_time_start,train_time_end))
test_time_start = test_df.pickup_datetime.min()
test_time_end = test_df.pickup_datetime.max()
print("Test Time Starts: {}, Ends {}".format(test_time_start,test_time_end))

* **missing values**

In [None]:
train_data_na = (train.isnull().sum() / len(train)) * 100
train_data_na = train_data_na.drop(train_data_na[train_data_na == 0].index).sort_values(ascending=False)[:30]

In [None]:
missing_data = pd.DataFrame({'Missing Ratio' :train_data_na})
missing_data.head(20)

In [None]:
# Bar chart of the percentage of missing values per feature.
f, ax = plt.subplots(figsize=(15, 12))
# The rotation must be a number (or 'vertical' / 'horizontal'); the string '45'
# raises a ValueError on current matplotlib versions.
plt.xticks(rotation=45)
sns.barplot(x=train_data_na.index, y=train_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

In [None]:
test_data_na = (test_df.isnull().sum() / len(test_df)) * 100
test_data_na = test_data_na.drop(test_data_na[test_data_na == 0].index).sort_values(ascending=False)[:30]

missing_data = pd.DataFrame({'Missing Ratio' :test_data_na})
missing_data.head(20)

* **Imputing missing values**

In [None]:
# df.mode() : The value that occurs most frequently

# train['haversine'] = train['haversine'].fillna(train['haversine'].mode()[0])
# test_df['haversine'] = test_df['haversine'].fillna(test_df['haversine'].mode()[0])

# Fill the gaps in the weekday/month/year aggregates with each frame's own most
# frequent value of that column.
for column in ('fare_by_weekday_month_year_skew', 'fare_by_weekday_month_year_std'):
    train[column] = train[column].fillna(train[column].mode()[0])
    test_df[column] = test_df[column].fillna(test_df[column].mode()[0])

# train['fare_by_hour_of_day_weekday_month_year_skew'] = train['fare_by_hour_of_day_weekday_month_year_skew'].fillna(train['fare_by_hour_of_day_weekday_month_year_skew'].mode()[0])
# test_df['fare_by_hour_of_day_weekday_month_year_skew'] = test_df['fare_by_hour_of_day_weekday_month_year_skew'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_skew'].mode()[0])

# train['fare_by_hour_of_day_weekday_month_year_std'] = train['fare_by_hour_of_day_weekday_month_year_std'].fillna(train['fare_by_hour_of_day_weekday_month_year_std'].mode()[0])
# test_df['fare_by_hour_of_day_weekday_month_year_std'] = test_df['fare_by_hour_of_day_weekday_month_year_std'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_std'].mode()[0])

# test_df['fare_by_hour_of_day_weekday_month_year_sum'] = test_df['fare_by_hour_of_day_weekday_month_year_sum'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_sum'].mode()[0])

# test_df['fare_by_hour_of_day_weekday_month_year_mean'] = test_df['fare_by_hour_of_day_weekday_month_year_mean'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_mean'].mode()[0])

# test_df['fare_by_hour_of_day_weekday_month_year_percentile_80'] = test_df['fare_by_hour_of_day_weekday_month_year_percentile_80'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_percentile_80'].mode()[0])

# test_df['fare_by_hour_of_day_weekday_month_year_percentile_20'] = test_df['fare_by_hour_of_day_weekday_month_year_percentile_20'].fillna(test_df['fare_by_hour_of_day_weekday_month_year_percentile_20'].mode()[0])

In [None]:
test_data_na = (test_df.isnull().sum() / len(test_df)) * 100
test_data_na = test_data_na.drop(test_data_na[test_data_na == 0].index).sort_values(ascending=False)[:30]

missing_data = pd.DataFrame({'Missing Ratio' :test_data_na})
missing_data.head(20)

In [None]:
train_data_na = (train.isnull().sum() / len(train)) * 100
train_data_na = train_data_na.drop(train_data_na[train_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :train_data_na})
missing_data.head(20)

* Target Variable

In [None]:
# Histogram of the training fare with a fitted normal curve overlaid.
sns.distplot(train['fare_amount'] , fit=norm);

(mu, sigma) = norm.fit(train['fare_amount'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escapes; the backslashes are
# meant for matplotlib's mathtext.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Fare amount distribution')

# QQ-plot against the normal distribution
fig = plt.figure()
res = stats.probplot(train['fare_amount'], plot=plt)
plt.show()

In [None]:
# Replace the target by log(1 + fare); predictions are mapped back with expm1 later.
train["fare_amount"] = np.log1p(train["fare_amount"])

sns.distplot(train['fare_amount'] , fit=norm);

(mu, sigma) = norm.fit(train['fare_amount'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Raw string: "\m" and "\s" are not valid Python escapes; the backslashes are
# meant for matplotlib's mathtext.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Fare amount distribution')

# QQ-plot against the normal distribution
fig = plt.figure()
res = stats.probplot(train['fare_amount'], plot=plt)
plt.show()

In [None]:
train.head(5)

* **Keep Relevant Variables**

In [None]:
# Target vector (already log1p-transformed at this point).
y_train = train.fare_amount.values

# Columns removed from the test frame before modelling.
features_drop = [
    'pickup_datetime',
    'fare_by_hour_of_day_weekday_month_year_skew',
    'fare_by_hour_of_day_weekday_month_year_std',
    'fare_by_hour_of_day_weekday_month_year_sum',
    'fare_by_hour_of_day_weekday_month_year_mean',
    'fare_by_hour_of_day_weekday_month_year_percentile_80',
    'fare_by_hour_of_day_weekday_month_year_percentile_20',
    'haversine',
]
test_df = test_df.drop(columns=features_drop)

# Restrict train to exactly the test columns, in the same order.
train = train[test_df.columns]
print("Does Train feature equal test feature?: ", all(train.columns == test_df.columns))

In [None]:
train.shape, test_df.shape, len(y_train)

# **Modelling**

I thank the kernel for allowing me to study improved modeling.
(https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)

In [None]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

**Define a cross validation strategy**

We use the cross_val_score function of scikit-learn. However, this function does not have a shuffle attribute, so we add one line of code to shuffle the dataset prior to cross-validation.

In [None]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    """Cross-validated RMSE of ``model`` on the (log-scale) training target.

    Uses the module-level ``train``, ``y_train`` and ``n_folds``. Returns an
    array with one RMSE per fold.
    """
    # Pass the splitter itself to cross_val_score. Calling ``.get_n_splits()`` on it
    # returns just the integer number of folds, which silently drops ``shuffle=True``
    # and ``random_state`` and falls back to unshuffled K-fold.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse

# **Base models**

* **LASSO Regression**

This model may be very sensitive to outliers, so we need to make it more robust to them. For that we use scikit-learn's RobustScaler() in a pipeline.

In [None]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=45))

* **Elastic Net Regression**

In [None]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

* **Kernel Ridge Regression**

In [None]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

* **Gradient Boosting Regression**

In [None]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

* **XGBoost**

In [None]:
# XGBoost regressor with the hyper-parameters taken from the referenced stacking kernel.
# ``verbosity=0`` replaces the removed ``silent`` flag and ``n_jobs`` replaces the
# deprecated ``nthread`` alias (-1 = use all available cores).
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state =7, n_jobs = -1)

* **LightGBM**

In [None]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

* **Base models scores**

In [None]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

**Stacking models**

Simplest Stacking approach : Averaging base models

We begin with this simple approach of averaging base models. We build a new class to extend scikit-learn with our model and also to leverage encapsulation and code reuse (inheritance).

* **Averaged base models class**

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble that predicts the plain average of its base models.

    ``models`` is a sequence of unfitted estimators. ``fit`` trains a clone
    of each one, so the originals are left untouched.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every base model, fit each clone on ``(X, y)`` and return self."""
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)

        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions for ``X``."""
        # One column per model, one row per sample.
        per_model = np.column_stack([fitted.predict(X) for fitted in self.models_])
        return per_model.mean(axis=1)

In [None]:
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

* **Stacking averaged Models Class**

In [None]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked ensemble: a meta-model trained on out-of-fold base predictions.

    ``base_models`` is a sequence of unfitted estimators, ``meta_model`` the
    estimator fitted on their out-of-fold predictions, and ``n_folds`` the
    number of folds used to produce those predictions.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit per-fold clones of every base model, then the meta-model.

        ``X`` and ``y`` are indexed with integer index arrays, so they must
        support numpy-style fancy indexing.
        """
        # base_models_[i] collects the fold-wise fitted clones of base model i.
        self.base_models_ = [[] for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        splitter = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # Column i holds base model i's prediction for each sample, made by the
        # clone that did not see that sample during training.
        oof = np.zeros((X.shape[0], len(self.base_models)))
        for col, prototype in enumerate(self.base_models):
            fold_models = self.base_models_[col]
            for fit_idx, holdout_idx in splitter.split(X, y):
                fitted = clone(prototype)
                fold_models.append(fitted)
                fitted.fit(X[fit_idx], y[fit_idx])
                oof[holdout_idx, col] = fitted.predict(X[holdout_idx])

        # The out-of-fold predictions are the meta-model's training features.
        self.meta_model_.fit(oof, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on fold-averaged base predictions."""
        averaged = []
        for fold_models in self.base_models_:
            # Average the predictions of all fold clones of one base model.
            fold_preds = np.column_stack([fitted.predict(X) for fitted in fold_models])
            averaged.append(fold_preds.mean(axis=1))
        meta_features = np.column_stack(averaged)
        return self.meta_model_.predict(meta_features)

**Stacking Averaged models Score**

To make the two approaches comparable (by using the same number of models) , we just average Enet KRR and Gboost, then we add lasso as meta-model.

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

* **Ensembling StackedRegressor, XGBoost and LightGBM**

In [None]:
def rmsle(y, y_pred):
    """Return the root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is applied here; the result is a log-scale error only when
    the inputs are already log-transformed.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# **Final Training and Prediction**

* **StackedRegressor**

In [None]:
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test_df.values))
print(rmsle(y_train, stacked_train_pred))

* **XGBoost**

In [None]:
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test_df))
print(rmsle(y_train, xgb_train_pred))

* **LightGBM**

In [None]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test_df.values))
print(rmsle(y_train, lgb_train_pred))

In [None]:
'''RMSE on the entire Train data when averaging'''

print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.70 +
               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))

* **Ensemble prediction**

In [None]:
# Blend the TEST-set predictions for the submission. These were already mapped back
# to the fare scale with expm1 when they were computed. The *_train_pred arrays are
# log-scale predictions for the training rows, so they have the wrong length and the
# wrong scale for a submission.
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15

In [None]:
# Build the submission file: the keys come from the sample submission and the
# fares from the blended predictions in ``ensemble``.
# NOTE(review): assumes ``ensemble`` has one value per sample-submission row, in the
# same order as test.csv -- confirm.
sub = pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')
submission = pd.DataFrame()
submission['key'] = sub['key']
submission['fare_amount'] = ensemble
# Written to the current working directory, without the DataFrame index.
submission.to_csv('submission_ensemble_1.csv',index=False)