## First version based on the starter code published at __[kaggle](https://www.kaggle.com/dster/nyc-taxi-fare-starter-kernel-simple-linear-model)__

In [2]:
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to

print(os.listdir('../datasets/kaggle/new-york-city-taxi-fare-prediction'))

['submission.csv', 'train.csv.zip', 'train.csv', 'GCP-Coupons-Instructions.rtf', 'sample_submission.csv', 'test.csv']


### Load training data

Loading all 55 million rows which does require a lot of RAM, about 18GB for the whole notebook to run. If you have <=16GB, loading a subset should work.

In [3]:
train_df =  pd.read_csv('../datasets/kaggle/new-york-city-taxi-fare-prediction/train.csv')
train_df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

### Add features
Add the absolute differences between longitude and latitude values as features. They are a measure of the distance between the start and end of the trips.

In [4]:
def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_travel_vector_features(train_df)

### Prune rows with null values

In [5]:
print(train_df.isnull().sum())

key                     0
fare_amount             0
pickup_datetime         0
pickup_longitude        0
pickup_latitude         0
dropoff_longitude     376
dropoff_latitude      376
passenger_count         0
abs_diff_longitude    376
abs_diff_latitude     376
dtype: int64


In [6]:
print('Old size: %d' % len(train_df))
train_df = train_df.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(train_df))

Old size: 55423856
New size: 55423480


### Prune rows that whose longitude and latitude diff values are outliers

In [7]:
import matplotlib.pyplot as plt
plot = train_df.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')

In [8]:
print('Old size: %d' % len(train_df))
train_df = train_df[(train_df.abs_diff_longitude < 5.0) & (train_df.abs_diff_latitude < 5.0)]
print('New size: %d' % len(train_df))

Old size: 55423480
New size: 55308916


### Construct training set with the selected features

In [9]:
# Construct and return an Nx3 input matrix for our linear model
# using the travel vector, plus a 1.0 for a constant bias term.
def get_input_matrix(df):
    return np.column_stack((df.abs_diff_longitude, df.abs_diff_latitude, np.ones(len(df))))

train_X = get_input_matrix(train_df)
train_y = np.array(train_df['fare_amount'])

print(train_X.shape)
print(train_y.shape)

(55308916, 3)
(55308916,)


### Compute weights using numpy's lstsq

In [10]:
(w, _, _, _) = np.linalg.lstsq(train_X, train_y, rcond = None)
print(w)

[179.69009171   9.24874474   6.64366403]


### Computer weights using the Ordinary Least Squares method

In [11]:
w_OLS = np.matmul(np.matmul(np.linalg.inv(np.matmul(train_X.T, train_X)), train_X.T), train_y)
print(w_OLS)

[147.19164103  76.41483685   6.41059403]


In [12]:
from sklearn.metrics import mean_squared_error
from math import sqrt

### Compute RMSE using the weights obtained from both the methods

In [13]:
train_y_w = np.matmul(train_X, w).round(decimals = 2)
rmse_w = sqrt(mean_squared_error(train_y, train_y_w))
print("rmse linalg.lstsq =", rmse_w)

rmse linalg.lstsq = 19.55202121145499


In [14]:
train_y_wOLS = np.matmul(train_X, w_OLS).round(decimals = 2)
rmse_wOLS = sqrt(mean_squared_error(train_y, train_y_wOLS))
print("rmse ordinary least squares =", rmse_wOLS)

rmse ordinary least squares = 19.47256206805313


### Use the weights obtained from OLS to make predictions

In [15]:
test_df = pd.read_csv('../datasets/kaggle/new-york-city-taxi-fare-prediction/test.csv')
test_df.dtypes

key                   object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [16]:
# Reuse the above helper functions to add our features and generate the input matrix.
add_travel_vector_features(test_df)
test_X = get_input_matrix(test_df)
# Predict fare_amount on the test set using our model (w) trained on the training set.
test_y_predictions = np.matmul(test_X, w_OLS).round(decimals = 2)

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv('../datasets/kaggle/new-york-city-taxi-fare-prediction/submission.csv', index = False)

print(os.listdir('.'))

['.ipynb_checkpoints', 'NewYorkCityTaxiFarePrediction.ipynb']


### On submission, a score of 5.74367 was received on the test set which puts it at 532 out of 628 submissions