In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
%matplotlib inline

In [2]:
train_df = pd.read_csv('./all/train.csv', nrows=2000000)

In [3]:
# Given a dataframe, add two new features 'abs_diff_longitude' and
# 'abs_diff_latitude' reprensenting the "Manhattan vector" from
# the pickup location to the dropoff location.
def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    
    return df

In [4]:
def extract_date_features(df):
    df['dates'] = pd.to_datetime(df['pickup_datetime'])
    
    df['year'] = df['dates'].dt.year
    df['month'] = df['dates'].dt.month
    df['day'] = df['dates'].dt.day
    
    df['hour'] = df['dates'].dt.hour
    df['minute'] = df['dates'].dt.minute
    return df

In [5]:
def date_to_cat(df, features):
    return pd.get_dummies(df, drop_first=True, columns=features)

In [6]:
new_train_df = add_travel_vector_features(train_df)
new_train_df = extract_date_features(new_train_df)
new_train_df = date_to_cat(new_train_df, ['year', 'month', 'day', 'hour', 'minute'])

In [7]:
#new_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Columns: 140 entries, key to minute_59
dtypes: datetime64[ns](1), float64(7), int64(1), object(2), uint8(129)
memory usage: 413.9+ MB


In [8]:
new_train_df.head(n=3)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,...,minute_50,minute_51,minute_52,minute_53,minute_54,minute_55,minute_56,minute_57,minute_58,minute_59
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,0.002701,0.009041,...,0,0,0,0,0,0,0,0,0,0
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,0.03678,0.070701,...,0,0,1,0,0,0,0,0,0,0
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,0.008504,0.010708,...,0,0,0,0,0,0,0,0,0,0
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,0.004437,0.024949,...,0,0,0,0,0,0,0,0,0,0
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,0.01144,0.015754,...,0,1,0,0,0,0,0,0,0,0
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1,0.028072,0.026603,...,1,0,0,0,0,0,0,0,0,0
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1,0.0062,0.01318,...,0,0,0,0,0,0,0,0,0,0
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1,0.038795,0.02309,...,0,0,0,0,0,0,0,0,0,0
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1,0.013384,0.004915,...,0,0,0,0,0,0,0,0,0,0
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2,0.010882,0.024265,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#check for missing values
print(new_train_df.isnull().sum())

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude     14
dropoff_latitude      14
passenger_count        0
abs_diff_longitude    14
abs_diff_latitude     14
dates                  0
year_2010              0
year_2011              0
year_2012              0
year_2013              0
year_2014              0
year_2015              0
month_2                0
month_3                0
month_4                0
month_5                0
month_6                0
month_7                0
month_8                0
month_9                0
month_10               0
month_11               0
month_12               0
day_2                  0
day_3                  0
                      ..
minute_30              0
minute_31              0
minute_32              0
minute_33              0
minute_34              0
minute_35              0
minute_36              0
minute_37              0
minute_38              0


In [10]:
#drop missing values
new_train_df.dropna(how = 'any', axis = 'rows', inplace=True)

In [11]:
#drop outliers with a difference of 5 degrees and above
new_train_df = new_train_df[(new_train_df.abs_diff_longitude < 5.0) & (new_train_df.abs_diff_latitude < 5.0)]

In [12]:
new_train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,abs_diff_longitude,abs_diff_latitude,year_2010,year_2011,...,minute_50,minute_51,minute_52,minute_53,minute_54,minute_55,minute_56,minute_57,minute_58,minute_59
count,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,...,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0,1995956.0
mean,11.34078,-72.58671,39.96007,-72.58582,39.96041,1.684127,0.02254066,0.02111638,0.1508826,0.1590135,...,0.01687863,0.01688765,0.01675187,0.0167163,0.01669476,0.01673734,0.0167644,0.01668524,0.01649385,0.01631599
std,9.832107,10.53077,6.222508,10.53066,6.222546,1.314879,0.03936737,0.02887887,0.3579345,0.3656888,...,0.1288167,0.1288505,0.1283404,0.1282064,0.1281251,0.1282857,0.1283875,0.1280892,0.1273649,0.1266878
min,-62.0,-2647.971,-1185.391,-2647.971,-1185.391,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,-73.99209,40.73498,-73.99142,40.73406,1.0,0.005794,0.00658,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.5,-73.98183,40.75266,-73.98019,40.75316,1.0,0.012415,0.013835,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,12.5,-73.9672,40.76712,-73.96377,40.76811,2.0,0.02361,0.026833,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1273.31,40.85036,1963.516,40.88638,1963.516,208.0,4.951542,4.991325,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
wtf = new_train_df[new_train_df.passenger_count > 10]
wtf[['passenger_count']]

Unnamed: 0,passenger_count
929022,208


In [14]:
#drop any records with more than 3 passengers
new_train_df = new_train_df[new_train_df.passenger_count < 4]

# Neural Network

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import math

import tensorflow as tf
from tensorflow import keras

In [16]:
# Data needs to be scaled to a small range like 0 to 1 for the neural
# network to work well.
scaler = MinMaxScaler(feature_range=(0, 1))

In [17]:
# 
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'abs_diff_longitude', 'abs_diff_latitude', 'year', 'month', 'day', 'hour', 'minute']
scaled_train_df = scaler.fit_transform(new_train_df.drop(['key', 'fare_amount', 'pickup_datetime', 'dates'], axis=1))

In [18]:
#scaled_train_df[:10][:] #check fare amount

In [19]:
# Print out the adjustment that the scaler applied to the fare_amount column of data
#print("Note: total_earnings values were scaled by multiplying by {:.10f} and subtracting {:.6f}".format(scaler.scale_[0], scaler.min_[0]))

In [20]:
print(scaled_train_df.shape)

(1770281, 136)


In [21]:
X = scaled_train_df #all features from column 1
y = new_train_df['fare_amount'] #column 0

In [22]:
print(X.shape)
print(y.shape)

(1770281, 136)
(1770281,)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.1)

In [127]:
sgd = keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=0.1, nesterov=True)
adam = keras.optimizers.Adam(lr=0.01) # current best performing

model = keras.Sequential()
model.add(keras.layers.Dense(400, input_dim=136, activation='relu'))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(300, activation='relu'))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(400, activation='relu'))
model.add(keras.layers.Dropout(0.1))
model.add(keras.layers.Dense(1, activation='linear'))

model.compile(loss="mean_squared_error", optimizer=adam)

model.fit(X_train, y_train, epochs=3, shuffle=True, verbose=2, batch_size=128)

Epoch 1/3
 - 223s - loss: 70.3071
Epoch 2/3
 - 197s - loss: 52.6930
Epoch 3/3
 - 182s - loss: 51.5308


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a34ff89b0>

In [59]:
training_error_rate = model.evaluate(X_train, y_train, verbose=0)
test_error_rate = model.evaluate(X_test, y_test, verbose=0)

print('Training RMSE: {}'.format(math.sqrt(training_error_rate)))
print('Test RMSE: {}'.format(math.sqrt(test_error_rate)))

Training RMSE: 9.80479892144378
Test RMSE: 9.744348343258665


In [118]:
test_df = pd.read_csv('./all/test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 7 columns):
key                  9914 non-null object
pickup_datetime      9914 non-null object
pickup_longitude     9914 non-null float64
pickup_latitude      9914 non-null float64
dropoff_longitude    9914 non-null float64
dropoff_latitude     9914 non-null float64
passenger_count      9914 non-null int64
dtypes: float64(4), int64(1), object(2)
memory usage: 542.2+ KB


In [119]:
new_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1770281 entries, 0 to 1999999
Columns: 140 entries, key to minute_59
dtypes: datetime64[ns](1), float64(7), int64(1), object(2), uint8(129)
memory usage: 379.9+ MB


In [120]:
new_test_df = add_travel_vector_features(test_df)
new_test_df = extract_date_features(new_test_df)
new_test_df = date_to_cat(new_test_df, ['year', 'month', 'day', 'hour', 'minute'])
#new_test_df['fare_amount'] = 0 #add dummy column for scaling

scaled_test_df = scaler.transform(new_test_df.drop(['key', 'pickup_datetime', 'dates'], axis=1))

In [121]:
#scaled_test_df[:10, :]

In [122]:
test_X = scaled_test_df

In [123]:
predictions = model.predict(test_X)
print(predictions[:5])
#predictions = predictions + scaler.min_[0]
#predictions = predictions / scaler.scale_[0]

[[14.826575]
 [15.49303 ]
 [ 9.046192]
 [12.993279]
 [19.887224]]


In [114]:
sub = new_test_df[['key']]
sub['fare_amount'] = predictions[:,0]
print(sub.shape)

(9914, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [116]:
sub.to_csv('./predictions/nn_136_400_300_300_1.csv', index=False)

In [128]:
mse = mean_squared_error(y_test, model.predict(X_test))
print('Test RMSE: {}'.format(math.sqrt(mse)))

Test RMSE: 26.267525848936096
