In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# List the files available in the input directory.
print(os.listdir('../input'))

['.DS_Store', 'test.csv', 'GCP-Coupons-Instructions.rtf', 'train.csv', 'sample_submission.csv']


### Setup training data

In [3]:
# Read only the first 10,000,000 data rows of train.csv rather than the whole file.
train_df = pd.read_csv('../input/train.csv', nrows=10000000)

In [4]:
# Inspect the dtype pandas inferred for each column.
train_df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [5]:
# (rows, columns) of the loaded training sample.
train_df.shape

(10000000, 8)

Create two new features representing the "travel vector" between the start and end points of the taxi ride, in both longitude and latitude coordinates.

In [6]:
def add_travel_vector_features(df):
    """Add absolute pickup-to-dropoff coordinate differences to ``df`` in place.

    Two columns are created (or overwritten): ``abs_diff_longitude`` and
    ``abs_diff_latitude``, each the absolute value of ``dropoff_* - pickup_*``
    for the corresponding coordinate. The frame is modified in place and
    nothing is returned.
    """
    # Longitude first, then latitude, so the new columns are appended in
    # that order.
    for axis in ('longitude', 'latitude'):
        delta = df['dropoff_' + axis] - df['pickup_' + axis]
        df['abs_diff_' + axis] = delta.abs()

In [7]:
# Adds the abs_diff_longitude / abs_diff_latitude columns to train_df in place.
add_travel_vector_features(train_df)

In [8]:
# Preview the first rows, which now include the two new feature columns.
train_df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.002701,0.009041
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.03678,0.070701
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,0.008504,0.010708
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.004437,0.024949
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.01144,0.015754


### Explore and prune outliers

First, let's see if there are any `NaN`s in the dataset.

In [9]:
# Count the missing values in each column.
print(train_df.isnull().sum())

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude     69
dropoff_latitude      69
passenger_count        0
abs_diff_longitude    69
abs_diff_latitude     69
dtype: int64


There are only a few, so let's remove those rows from the dataset.

In [14]:
# Drop every row that has a missing value in any column, reporting the row
# count before and after. Note: train_df is rebound here, so re-running this
# cell reports identical old/new sizes.
print('Old size: %d' % len(train_df))
train_df = train_df.dropna(how='any', axis='rows')
print('New size: %d' % len(train_df))

Old size: 9999931
New size: 9999931


Now let's quickly plot a subset of our travel vector features to see their distribution.

In [17]:
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Scatter-plot the two travel-vector features for the first 2000 rows only.
plot = train_df.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')

In [None]:
# Render the pending matplotlib figure (requires the matplotlib import to have succeeded).
plt.show()

The values should be very small, since they should all be differences between GPS coordinates within one city. For reference, one degree of latitude is about 69 miles. However, we can see the dataset has extreme values which do not make sense. Let's remove those values from our training set. Based on the scatterplot, it looks like we can safely exclude values above 5 (though remember that the scatterplot only shows the first 2000 rows).

In [None]:
# Keep only rides whose longitude AND latitude differences are both below
# 5 degrees, reporting the row count before and after.
print('Old size: %d' % len(train_df))
train_df = train_df[(train_df.abs_diff_longitude < 5.0) & (train_df.abs_diff_latitude < 5.0)]
print('New size: %d' % len(train_df))

### Train our model

Our model will take the form $X*w = y$ where $X$ is a matrix of input features, and $y$ is a column of the target variable, `fare_amount`, for each row. The weight column $w$ is what we will "learn".

In [None]:
def get_input_matrix(df):
    """Build the N x 3 design matrix for the linear fare model.

    Columns, in order: ``abs_diff_longitude``, ``abs_diff_latitude`` and a
    constant 1.0 that acts as the bias (intercept) term. ``N`` is ``len(df)``.
    """
    lon_diff = df['abs_diff_longitude']
    lat_diff = df['abs_diff_latitude']
    bias = np.ones(len(df))
    return np.column_stack([lon_diff, lat_diff, bias])

In [None]:
# Feature matrix (one row per ride, bias column included) and the matching
# vector of fares to fit against.
train_X = get_input_matrix(train_df)
train_y = np.array(train_df['fare_amount'])

# Sanity check: both should have the same number of rows.
print(train_X.shape)
print(train_y.shape)

Now let's use `numpy`'s `lstsq` library function to find the optimal weight column $w$.

In [None]:
# Least-squares fit of train_X * w = train_y. lstsq returns four values;
# only the first one (the weight vector) is kept.
(w, _, _, _) = np.linalg.lstsq(train_X, train_y, rcond=None)
print(w)

These weights pass a quick sanity check, since we'd expect the first two values -- the weights for the absolute longitude and latitude differences -- to be positive, as more distance should imply a higher fare, and we'd expect the bias term to loosely represent the cost of a very short ride.

Side note: we can actually calculate the weight column $w$ directly using the Ordinary Least Squares method:

$w = (X^T X)^{-1} X^T y$

In [None]:
# Closed-form OLS: solve the normal equations (X^T X) w = X^T y for w.
# Solving the small 3x3 system directly avoids forming an explicit inverse
# (numerically less stable) and avoids the 3 x N intermediate that
# inv(X^T X) @ X^T would materialise before being multiplied by y.
w_OLS = np.linalg.solve(np.matmul(train_X.T, train_X), np.matmul(train_X.T, train_y))

In [None]:
# Show the closed-form weights so they can be compared with the lstsq result.
print(w_OLS)

### Make predictions on the test set

In [None]:
# Load the test set and show its shape and per-column dtypes.
test_df = pd.read_csv('../input/test.csv')
print(test_df.shape)
print(test_df.dtypes)

In [None]:
# Reuse the above helper functions to add our features and generate the input matrix.
# add_travel_vector_features modifies test_df in place (it returns nothing).
add_travel_vector_features(test_df)
test_X = get_input_matrix(test_df)
# Predict fare_amount on the test set using our model (w) trained on the training set.
# Predictions are rounded to two decimal places.
test_y_predictions = np.matmul(test_X, w).round(decimals=2)

In [None]:
# Assemble a two-column frame (key, predicted fare) and save it as the
# competition submission file, then list the working directory to confirm
# the file was written.
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': test_y_predictions},
    columns=['key', 'fare_amount'],
)
submission.to_csv('submission.csv', index=False)
print(os.listdir('.'))

Check the RMSE on other training data

In [None]:
# Hold out the 5,000,000 rows that immediately follow the training sample.
#
# Column names are taken from the CSV header rather than train_df.columns:
# by this point train_df also carries the derived abs_diff_* columns, and
# passing those extra names would create two all-NaN columns that make a
# later dropna(how='any') discard every row.
raw_columns = pd.read_csv('../input/train.csv', nrows=0).columns
# Skip the header line plus the 10,000,000 rows already used for training
# (skipping only 10,000,000 lines would re-read the last training row).
val_df = pd.read_csv('../input/train.csv', skiprows=10000001, nrows=5000000,
                     names=raw_columns, header=None)
print(val_df.shape)

In [None]:
# Preview the first validation rows to confirm the columns were parsed as expected.
val_df.head()

In [None]:
# Count the missing values in each validation column.
print(val_df.isnull().sum())

In [None]:
# Drop every validation row that has a missing value in any column.
val_df = val_df.dropna(how='any', axis='rows')

In [None]:
# Build the validation features and targets the same way as for training
# (add_travel_vector_features modifies val_df in place), then predict with
# the weights w fitted on the training set, rounded to two decimals.
add_travel_vector_features(val_df)
val_X = get_input_matrix(val_df)
val_y = np.array(val_df['fare_amount'])
val_y_predictions = np.matmul(val_X, w).round(decimals=2)

In [None]:
def RMSE(y, yp):
    """Return the root-mean-square error between targets ``y`` and predictions ``yp``.

    The inputs are subtracted element-wise, so they must support that
    (e.g. NumPy arrays of matching shape).
    """
    residuals = y - yp
    return np.mean(residuals ** 2) ** 0.5

In [None]:
# RMSE on the validation rows before any outlier filtering.
RMSE(val_y, val_y_predictions)

In [None]:
# First ten actual fares in the validation set.
val_y[:10]

In [None]:
# First ten predicted fares, for comparison with the actual values.
val_y_predictions[:10]

In [None]:
# Number of validation targets.
val_y.shape

In [None]:
# Number of predictions; should match the number of validation targets.
val_y_predictions.shape

In [None]:
# Largest predicted fare in the validation set.
val_y_predictions.max()

In [None]:
# Largest actual fare in the validation set.
val_y.max()

In [None]:
# Positions of the largest prediction and of the largest actual fare.
val_y_predictions.argmax(), val_y.argmax()

The RMSE is extremely large, so we need to drop the outliers from the validation set first.

In [None]:
# Apply the same 5-degree cutoff that was used on the training set.
# .copy() makes the filtered frame independent of the original, so a later
# in-place column assignment (as add_travel_vector_features does) does not
# trigger pandas' SettingWithCopyWarning.
print('Old size: %d' % len(val_df))
val_df = val_df[(val_df.abs_diff_latitude < 5.0) & (val_df.abs_diff_longitude < 5.0)].copy()
print('New size: %d' % len(val_df))

In [None]:
# Rebuild the validation matrix, targets and predictions from the
# outlier-filtered val_df.
# NOTE(review): val_df already carries the abs_diff_* columns (the filter
# above reads them), so this call recomputes the same per-row values.
add_travel_vector_features(val_df)
val_X = get_input_matrix(val_df)
val_y = np.array(val_df['fare_amount'])
val_y_predictions = np.matmul(val_X, w).round(decimals=2)

In [None]:
# RMSE on the validation rows after removing the outliers.
RMSE(val_y, val_y_predictions)

As expected, the large RMSE was mainly caused by the outliers.