# Taxi Fare Predictions

Here we use the New York Taxi Fare Prediction dataset to evaluate the performance of different ML models for taxi fare predictions.

First of all, let us import the libraries we are going to use.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### Import data

Load the first 500,000 rows of the training dataset.

In [None]:
# Read only the first 500,000 rows of the training CSV.
train_df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows=500_000)

# Preview the first rows of the loaded frame.
train_df.head()

This dataset contains information on taxi rides, including the taxi fare, the pickup datetime, the pickup and dropoff locations, and the number of passengers per ride. The taxi fare is the target variable, and the remaining columns are used to build the predictors.

### Remove NaN values

First, remove the rows that contain NaN values.


In [None]:
def drop_nan_values(df):
    """Return a new frame without the rows of ``df`` that hold any missing value."""
    return df.dropna(axis=0, how='any')

# Apply the NaN filter, then print the remaining per-column null counts and the new shape.
train_df = drop_nan_values(train_df)
print(train_df.isnull().sum())
print('shape:', train_df.shape)

### Remove outliers





Now, we detect and remove outliers based on computing percentiles.

First, review the statistics of data

In [None]:
# Summary statistics of the columns, used to spot implausible minima and maxima.
train_df.describe()

Second, define a function that produces scatter plots of the target variable against the input variables, so that we can inspect the cuts in the data volume.

In [None]:
# Helper that draws a single scatter panel
def _scatter_panel(ax, df, xlabel, ylabel):
    """Scatter ``df[ylabel]`` against ``df[xlabel]`` on ``ax`` and label both axes."""
    ax.scatter(df[xlabel], df[ylabel], alpha=.3)
    ax.set(xlabel=xlabel, ylabel=ylabel)


# Function to produce multiple scatter plots
def multi_scatter_plot(df, features, target):
    """Scatter-plot the target column against each feature column.

    The features are laid out two per row in one figure (12 x 4 inches per
    row). When the number of features is odd, the last feature is drawn in a
    separate 6 x 4 inch figure. Every figure is shown with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    features : sequence of str
        Names of the columns to put on the x axis, in plotting order.
    target : str
        Name of the column to put on the y axis.

    Returns
    -------
    None
        Nothing is plotted when ``features`` is empty.
    """
    x_labels = list(features)
    if not x_labels:
        # An empty grid cannot be created, so there is nothing to draw.
        return

    n_rows = len(x_labels) // 2
    if n_rows:
        # squeeze=False keeps ``axes`` two-dimensional even for a single row,
        # so one code path serves 2, 3, 4, ... features.
        _, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        # zip stops at the last complete pair; a trailing odd feature is
        # handled below.
        for ax, xlabel in zip(axes.flat, x_labels):
            _scatter_panel(ax, df, xlabel, target)
        plt.tight_layout()
        plt.show()

    if len(x_labels) % 2:
        _, ax = plt.subplots(figsize=(6, 4))
        _scatter_panel(ax, df, x_labels[-1], target)
        plt.tight_layout()
        plt.show()
            

# Raw input columns to plot against the target column.
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
target = 'fare_amount'
# Left commented out: scatter plots of the data before the percentile cuts.
#multi_scatter_plot(train_df, features, target)

By computing percentiles, we drop data in the outermost region of the data volume as well as zero points.

In [None]:
# (lower, upper) quantile fractions per column. drop_percentiles keeps only the
# rows whose value lies strictly between the two corresponding quantile values.
# NOTE: an upper fraction of 1. maps to the column maximum, so rows holding the
# maximum passenger_count are removed by the strict comparison.
percentiles = {'fare_amount': (.001,.9995),
               'pickup_longitude': (0.0005, 0.98),
               'pickup_latitude': (0.02, 0.999),
               'dropoff_longitude': (0.001, 0.98),
               'dropoff_latitude': (0.02, 0.98),
               'passenger_count':(0.001, 1.),}

In [None]:
def drop_percentiles(df, percentiles):
    """Keep only the rows lying strictly inside per-column quantile bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    percentiles : dict
        Maps a column name to a ``(lower, upper)`` pair of quantile fractions.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose value in every listed column is strictly
        greater than the lower quantile value and strictly smaller than the
        upper quantile value of that column.
    """
    # Derive the bounds from the frame being filtered (not from a global) and
    # before any row is removed, so every column's thresholds refer to the
    # same unfiltered input.
    bounds = {
        key: (df[key].quantile(lower), df[key].quantile(upper))
        for key, (lower, upper) in percentiles.items()
    }
    for key, (low, high) in bounds.items():
        df = df[(df[key] > low) & (df[key] < high)]
    return df

# Report the number of rows before and after the percentile cuts.
print('Old size: %d' % len(train_df))
train_df = drop_percentiles(train_df, percentiles)
print('New size: %d' % len(train_df))

In [None]:
# Plot the target against the raw input columns after the percentile cuts.
multi_scatter_plot(train_df, features, target)

### Absolute distance

The pickup and dropoff locations carry critical information, so we use them to build new columns with the absolute longitude and latitude differences and the straight-line distance traveled (computed in coordinate space).

In [None]:
def add_abs_distances(df):
    """Return ``df`` extended with coordinate-difference and distance columns.

    Three columns are appended, in this order:

    - ``abs_diff_longitude``: ``|dropoff_longitude - pickup_longitude|``
    - ``abs_diff_latitude``: ``|dropoff_latitude - pickup_latitude|``
    - ``abs_distance``: square root of the sum of the two squared differences

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the four pickup/dropoff coordinate columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched. This avoids writing into a
        filtered slice of another frame.
    """
    diff_longitude = (df.dropoff_longitude - df.pickup_longitude).abs()
    diff_latitude = (df.dropoff_latitude - df.pickup_latitude).abs()
    return df.assign(
        abs_diff_longitude=diff_longitude,
        abs_diff_latitude=diff_latitude,
        abs_distance=np.sqrt(np.square(diff_longitude) + np.square(diff_latitude)),
    )

# Append the coordinate-difference and distance columns to the training frame.
train_df = add_abs_distances(train_df)

### Pickup date and time

Let us extract from the "pickup_datetime" the year, month, day and hour and place them in new columns.

In [None]:
def add_date_time(df):
    """Replace ``pickup_datetime`` with integer year, month, day and hour columns.

    The ``pickup_datetime`` strings are split on the first space into a
    ``YYYY-MM-DD`` date token and a time token; the hour is the part of the
    time token before the first ``:``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``pickup_datetime`` and a ``key`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``key`` and ``pickup_datetime`` and with the
        ``int64`` columns ``year``, ``month``, ``day`` and ``hour`` appended.
        The input frame is not modified.
    """
    # Split the datetime strings once and reuse the tokens for date and hour.
    tokens = df.pickup_datetime.str.split(' ', expand=True)
    date_parts = tokens.iloc[:, 0].str.split('-', expand=True).astype('int64')
    hour = tokens.iloc[:, 1].str.split(':', expand=True).iloc[:, 0].astype('int64')
    # Dropping first yields a fresh frame, so the new columns never touch the
    # caller's data.
    return df.drop(columns=['key', 'pickup_datetime']).assign(
        year=date_parts.iloc[:, 0],
        month=date_parts.iloc[:, 1],
        day=date_parts.iloc[:, 2],
        hour=hour,
    )
    
# Add the year/month/day/hour columns and drop 'key' and 'pickup_datetime'.
train_df = add_date_time(train_df)

Now, plot these new columns.

In [None]:
# Plot the target against the engineered distance columns and the passenger count.
features = ['abs_diff_longitude', 'abs_diff_latitude', 'abs_distance', 'passenger_count']
target = 'fare_amount'
multi_scatter_plot(train_df, features, target)

## Feature selection

First, split the train dataset into train and test datasets

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
# From here on train_df refers to the remaining 80%.
train_df, test_df = train_test_split(train_df, test_size=.2, random_state=1)

print(train_df.shape)
print(test_df.shape)

### Linear correlation

Next, let's compute the correlation matrix to identify those features that are most correlated with the target and those that are most correlated to each other. The former type corresponds to the best predictors and the latter type those that introduce multicollinearity. For two variables, $x$ and $y$, the formula is as follows

$r_{xy} = \dfrac{\sum_{i}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i}(x_i-\bar{x})^2}\,\sqrt{\sum_{i}(y_i-\bar{y})^2}}$

We do this just after separating a test data set, and use the train data set only, so as not to involve the test data set in any feature selection procedures.

In [None]:
def corr_heatmap(df):
    """Show the pairwise correlation matrix of ``df`` as an annotated heatmap."""
    correlations = df.corr()
    _, axis = plt.subplots(figsize=(10, 8))
    # Every cell is annotated with its coefficient, formatted to two decimals.
    sns.heatmap(
        correlations,
        ax=axis,
        annot=correlations,
        fmt='.2f',
        cmap='Blues',
        cbar=True,
    )
    # Slant the x tick labels, anchored at their right end.
    for tick_label in axis.get_xticklabels():
        tick_label.set(rotation=45, ha='right', rotation_mode='anchor')
    plt.show()
    
# Correlations are computed on the training split only.
corr_heatmap(train_df)

The correlation heatmap shows that there is a sizeable correlation between the target and some of the features. The correlation is high between the target and the absolute distances traveled, moderate between the target and the pickup and dropoff longitudes, and low between the target and the pickup and dropoff latitudes, the passenger count and the time variables.

However, high correlation is also found between feature variables. "The performance of some algorithms can deteriorate if two or more variables are tightly related, called multicollinearity".

Thus, we can safely reduce the number of columns, using only the features with high correlation with the target but uncorrelated with each other.

## Modeling and Validation

We will do the model evaluation in two different ways:
- i) by using the train and test datasets, 
- ii) by cross-validation, using the whole train dataset reloaded again

First of all, define the models

In [None]:
# Factory for the custom neural-network model
def NNRegressor():
    """Build the MLP regressor with one hidden layer of four tanh units, trained with SGD."""
    settings = dict(
        hidden_layer_sizes=(4,),
        activation='tanh',
        solver='sgd',
        tol=0.01,
        n_iter_no_change=50,
        verbose=False,
    )
    return MLPRegressor(**settings)


# Estimators to compare, keyed by the short label used in the printed results
# and in the final plot. All but 'NNR' are created with the library defaults.
models = {'DTR': DecisionTreeRegressor(),
          'RFR': RandomForestRegressor(),
          'LR': LinearRegression(),
          'SGDR': SGDRegressor(),
          'NNR': NNRegressor()}

Then, define the target and the feature variables.

In [None]:
# Define the target and the feature variables in decreasing level of importance
sel_features = ['abs_distance', 'abs_diff_latitude', 'pickup_longitude', 'dropoff_longitude']
# Inputs and target for the training split and for the held-out test split.
X_train, y_train = train_df[sel_features], train_df.fare_amount
X_test, y_test = test_df[sel_features], test_df.fare_amount

### Generalization or test error

"Test error, also referred to as generalization error, is the prediction error
over an independent test sample". Let us compute here the root mean squared error.

In [None]:
starttime = time.time()

# Root mean squared error on the held-out test set, one entry per model and in
# the iteration order of `models`.
rmse_errt = []
for estimator in models.values():
    # Always scale the input. The most convenient way is to use a pipeline.
    pipeline = make_pipeline(StandardScaler(), estimator)
    # fit the model
    pipeline.fit(X_train, y_train)
    # make predictions and compute the root mean squared error
    predictions = pipeline.predict(X_test)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is no longer accepted by newer scikit-learn releases.
    rmse_errt.append(np.sqrt(mean_squared_error(y_test, predictions)))

# Pair every label with its error directly instead of indexing by position.
for name, rmse in zip(models, rmse_errt):
    print('%s RMSE = %.2f' % (name, rmse))

print('Time: {:0.2f} seconds'.format(time.time() - starttime))

### Cross-validation

Now we validate the models by means of cross-validation. Here, we use the entire training dataset

In [None]:
# define a function that includes all previous preprocessing steps
def preprocessing(df):
    """Run the full preprocessing chain on a freshly loaded frame.

    The steps are applied in this order: drop rows with NaN values, apply the
    percentile cuts from the module-level ``percentiles``, add the distance
    columns, then add the date/time columns.
    """
    cleaned = drop_percentiles(drop_nan_values(df), percentiles)
    return add_date_time(add_abs_distances(cleaned))

In [None]:
# Reload the training data, preprocess it, and define the input and output variables
train_df = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows=500_000)
train_df = preprocessing(train_df)
X_train, y_train = train_df[sel_features], train_df.fare_amount

# define model evaluation method (n_splits = 1/test_size)
# A single repeat of 5-fold cross-validation: each fold holds out 20% of the rows.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=0)

starttime = time.time()

# evaluate the models
# rmse_per_model collects one array of per-fold RMSE values per model, in the
# iteration order of `models`.
rmse_per_model = []
for model in models.values():
    # Scaling is part of the pipeline that is passed to cross_val_score.
    model = make_pipeline(StandardScaler(), model)
    rmse = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    # The scorer reports negated RMSE values; keep their magnitude.
    rmse = np.abs(rmse)
    rmse_per_model.append(rmse)
    
print('Time: {:0.2f} seconds'.format(time.time() - starttime))

## Results

Let us show the results of error measurements in a box whisker plot

In [None]:
# define a custom box whisker plot
def box_whisker_plot(data, labels, user_errt):
    """Draw horizontal box plots of per-model errors with two overlaid markers.

    Parameters
    ----------
    data : sequence of array-like
        One collection of error values per model; all must have equal length
        because their row-wise mean is taken.
    labels : iterable of str
        One label per model, in the same order as ``data``. Any iterable is
        accepted (for example ``dict.keys()``).
    user_errt : sequence of float
        One externally computed error per model, drawn as 'Test error'.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    labels = list(labels)
    mean = np.mean(data, axis=1)
    # Box i is drawn at position i + 1, the default used by boxplot.
    y = np.arange(1, len(mean) + 1)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.set_title('RMSE per model')
    ax.set_xlabel('RMSE')
    ax.set_ylabel('Model')
    # whis=(0, 100) extends the whiskers to the minimum and maximum values.
    ax.boxplot(data, vert=False, whis=(0, 100))
    # Label the boxes through the axis ticks rather than the `labels=` keyword
    # of boxplot, which recent Matplotlib releases deprecate.
    ax.set_yticks(y)
    ax.set_yticklabels(labels)
    ax.scatter(user_errt, y, marker='^', label='Test error')
    ax.scatter(mean, y, marker='^', label='C-V error')
    ax.legend()
    plt.show()

# Compare the per-fold cross-validation errors (boxes) with the test errors of each model.
box_whisker_plot(rmse_per_model, models.keys(), rmse_errt)

As final remarks:

- Cross-validation estimates are in agreement with the generalization error.
- The Random Forest Regressor performs better in both validation schemes.