# New York City Taxi Fare Prediction

## Team members:
### Haoun Guo          hg1483
### Da Cai             dc4069
### Naisheng Zhang     nz862

In [92]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from keras import optimizers
from keras import regularizers

## Step 1 : Data load and clean

In [93]:
# Input CSV locations and the name of the submission file written at the end.
TRAIN_PATH = '../input/train.csv'
TEST_PATH = '../input/test.csv'
SUBMISSION_NAME = 'submission.csv'
# Number of leading rows of the training file to load (passed as nrows below).
DATASET_SIZE = 200000
# Optional explicit dtypes for read_csv; currently disabled.
# datatypes = {'key': 'str', 
#               'fare_amount': 'float32',
#               'pickup_datetime': 'str', 
#               'pickup_longitude': 'float32',
#               'pickup_latitude': 'float32',
#               'dropoff_longitude': 'float32',
#               'dropoff_latitude': 'float32',
#               'passenger_count': 'uint8'}
# The training set is truncated to DATASET_SIZE rows; the test set is read in full.
train = pd.read_csv(TRAIN_PATH, nrows=DATASET_SIZE)
test = pd.read_csv(TEST_PATH)
# train.head()
# test=test.drop('key', axis=1)
# NOTE(review): this is not the last expression of the cell, so its result is
# not displayed.
train.head()
print('Old size: %d' % len(train))
# Keep rows whose fare is non-negative; zero fares survive this filter.
train = train[train.fare_amount>=0]
print('New size: %d' % len(train))


Old size: 200000
New size: 199987


Remove rows with missing values.

In [94]:
print('Old size: %d' % len(train))
# Drop every row that has a missing value in at least one column.
train = train.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(train))

Old size: 199987
New size: 199986


## step 2: Data clean

### After analyzing the data, we need to remove some noise from it:
* Latitudes and longitudes that do not belong to New York.
* Negative or zero fares.
* Fares greater than 250 (these seem to be noisy data).
* Rides that begin and end in the same location.

Restrict the map to a small bounding box around New York City (this cell is currently disabled).

In [95]:
# def select_within_boundingbox(df, BB):
#     return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
#            (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
#            (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
#            (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])
            
# # load image of NYC map
# BB = (-74.5, -72.8, 40.5, 41.8)
# print('Old size: %d' % len(train))
# train = train[select_within_boundingbox(train, BB)]
# print('New size: %d' % len(train))

In [64]:
# Row count before the coordinate / fare filters of clean() are applied.
print('Old size: %d' % len(train))
def clean(df):
    """Return the rows of ``df`` that look like plausible New York taxi rides.

    A row is kept only when all of the following hold:

    * pickup and dropoff longitudes lie in [-76, -72] and latitudes in [38, 42];
    * ``fare_amount`` lies in (0, 250];
    * the ride actually moves, i.e. pickup and dropoff differ in longitude
      or in latitude.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude`` and ``fare_amount``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df``; the original index labels are preserved.
    """
    # Delimit lats and lons to the New York area only.
    in_bounds = (
        (-76 <= df['pickup_longitude']) & (df['pickup_longitude'] <= -72)
        & (-76 <= df['dropoff_longitude']) & (df['dropoff_longitude'] <= -72)
        & (38 <= df['pickup_latitude']) & (df['pickup_latitude'] <= 42)
        & (38 <= df['dropoff_latitude']) & (df['dropoff_latitude'] <= 42)
    )
    # Remove possible fare outliers (non-positive or above 250).
    plausible_fare = (0 < df['fare_amount']) & (df['fare_amount'] <= 250)
    # A ride is "same location" only when BOTH coordinates match. Filtering
    # each coordinate separately would also discard rides that merely share
    # a longitude or a latitude, so combine the two tests with OR.
    moved = (
        (df['dropoff_longitude'] != df['pickup_longitude'])
        | (df['dropoff_latitude'] != df['pickup_latitude'])
    )
    # Copy so that columns added later do not write into a slice of the input.
    return df[in_bounds & plausible_fare & moved].copy()
# Replace the training frame with its filtered version and report the new size.
# NOTE(review): the captured output of this cell shows identical old/new sizes,
# which suggests it was last run on an already-cleaned frame — confirm with a
# fresh Restart & Run All.
train = clean(train)
print('New size: %d' % len(train))
train.head()

Old size: 193666
New size: 193666


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


## step 3.1 : Time features


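* Year, Month, Day, Hour, Weekday
* Night (after 16h and up to 20h, from Monday to Friday)
* Late night (after 20h, or up to and including 6h)
* Day time (after 6h and up to 16h)
* Season flags: winter (months 1-3), spring (4-6), summer (7-9), fall (10-12)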

In [96]:
def late_night (row):
    """Return 1 when the row's ``hour`` is at most 6 or greater than 20, else 0."""
    hour = row['hour']
    return 1 if (hour <= 6 or hour > 20) else 0
def night (row):
    """Return 1 for a weekday (``weekday`` < 5) with 16 < ``hour`` <= 20, else 0."""
    hour = row['hour']
    if not (hour <= 20 and hour > 16):
        # Outside the hour window: 'weekday' is never consulted.
        return 0
    return 1 if row['weekday'] < 5 else 0
def day_time(row):
    """Return 1 when 6 < ``hour`` <= 16 for the given row, else 0."""
    hour = row['hour']
    return 1 if (hour <= 16 and hour > 6) else 0
def spring(row):
    """Return 1 when the row's ``month`` is in [4, 7), else 0."""
    month = row['month']
    return 1 if (month >= 4 and month < 7) else 0
def summer(row):
    """Return 1 when the row's ``month`` is in [7, 10), else 0."""
    month = row['month']
    return 1 if (month >= 7 and month < 10) else 0
def fall(row):
    """Return 1 when the row's ``month`` is in [10, 12], else 0."""
    month = row['month']
    return 1 if (month >= 10 and month <= 12) else 0
def winter(row):
    """Return 1 when the row's ``month`` is in [1, 4), else 0."""
    month = row['month']
    return 1 if (month >= 1 and month < 4) else 0

def add_time_features(df):
    """Derive calendar and time-of-day features from ``pickup_datetime``.

    The input frame is not modified: the features are built on a copy, so a
    filtered slice can be passed in without triggering chained-assignment
    warnings or leaving half-processed columns behind in the caller's frame.

    Added columns, in this order:

    * ``year``, ``month``, ``day``, ``hour``, ``weekday`` (Monday == 0)
    * ``night``      -- 1 when 16 < hour <= 20 on a weekday (weekday < 5)
    * ``late_night`` -- 1 when hour <= 6 or hour > 20
    * ``day_time``   -- 1 when 6 < hour <= 16
    * ``spring`` / ``summer`` / ``fall`` / ``winter`` -- 1 for months
      4-6 / 7-9 / 10-12 / 1-3 respectively

    The flag definitions mirror the scalar helpers ``night``, ``late_night``,
    ``day_time``, ``spring``, ``summer``, ``fall`` and ``winter``, but are
    evaluated on whole columns instead of one row at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column of strings formatted as
        ``'%Y-%m-%d %H:%M:%S %Z'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns above appended and ``pickup_datetime``
        removed.
    """
    df = df.copy()
    pickup = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')

    # Calendar components via the vectorized datetime accessor.
    df['year'] = pickup.dt.year
    df['month'] = pickup.dt.month
    df['day'] = pickup.dt.day
    df['hour'] = pickup.dt.hour
    df['weekday'] = pickup.dt.weekday

    hour = df['hour']
    weekday = df['weekday']
    month = df['month']

    # Time-of-day flags (0/1 integers).
    df['night'] = ((hour > 16) & (hour <= 20) & (weekday < 5)).astype(int)
    df['late_night'] = ((hour <= 6) | (hour > 20)).astype(int)
    df['day_time'] = ((hour > 6) & (hour <= 16)).astype(int)

    # Season flags (0/1 integers).
    df['spring'] = ((month >= 4) & (month < 7)).astype(int)
    df['summer'] = ((month >= 7) & (month < 10)).astype(int)
    df['fall'] = ((month >= 10) & (month <= 12)).astype(int)
    df['winter'] = ((month >= 1) & (month < 4)).astype(int)

    # Drop 'pickup_datetime' as we won't need it anymore
    return df.drop('pickup_datetime', axis=1)
# Apply the same time-feature step to both frames so their columns stay aligned.
train = add_time_features(train)
test = add_time_features(test)
# Preview the first 100 rows of the training frame with the new columns.
train.head(100)

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour,weekday,night,late_night,day_time,spring,summer,fall,winter
0,2009-06-15 17:26:21.0000001,4.50,-73.844311,40.721319,-73.841610,40.712278,1,2009,6,15,17,0,1,0,0,1,0,0,0
1,2010-01-05 16:52:16.0000002,16.90,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16,1,0,0,1,0,0,0,1
2,2011-08-18 00:35:00.00000049,5.70,-73.982738,40.761270,-73.991242,40.750562,2,2011,8,18,0,3,0,1,0,0,1,0,0
3,2012-04-21 04:30:42.0000001,7.70,-73.987130,40.733143,-73.991567,40.758092,1,2012,4,21,4,5,0,1,0,1,0,0,0
4,2010-03-09 07:51:00.000000135,5.30,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7,1,0,0,1,0,0,0,1
5,2011-01-06 09:50:45.0000002,12.10,-74.000964,40.731630,-73.972892,40.758233,1,2011,1,6,9,3,0,0,1,0,0,0,1
6,2012-11-20 20:35:00.0000001,7.50,-73.980002,40.751662,-73.973802,40.764842,1,2012,11,20,20,1,1,0,0,0,0,1,0
7,2012-01-04 17:22:00.00000081,16.50,-73.951300,40.774138,-73.990095,40.751048,1,2012,1,4,17,2,1,0,0,0,0,0,1
8,2012-12-03 13:10:00.000000125,9.00,-74.006462,40.726713,-73.993078,40.731628,1,2012,12,3,13,0,0,0,1,0,0,1,0
9,2009-09-02 01:11:00.00000083,8.90,-73.980658,40.733873,-73.991540,40.758138,2,2009,9,2,1,2,0,1,0,0,1,0,0


In [91]:
# print(train.year.min(),train.year.max())
# print(test.year.min(),test.year.max())
def distance(lat1, lon1, lat2, lon2):
    """Haversine-style distance between two (lat, lon) points given in degrees.

    Accepts scalars or array-likes supported by numpy and returns the result
    of ``0.6213712 * 12742 * arcsin(sqrt(a))``.
    NOTE(review): 12742 is presumably Earth's diameter in km and 0.6213712 the
    km-to-mile factor, which would make the result miles — confirm.
    """
    deg_to_rad = 0.017453292519943295 # Pi/180
    half_cos_dlat = np.cos((lat2 - lat1) * deg_to_rad)/2
    lon_term = np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad) * (1 - np.cos((lon2 - lon1) * deg_to_rad)) / 2
    a = 0.5 - half_cos_dlat + lon_term
    central = np.arcsin(np.sqrt(a))
    return 0.6213712 * 12742 * central

2009 2015
2009 2015


## step 3.2 : Coordinate features

* Latitude difference (pickup latitude minus dropoff latitude)
* Longitude difference (pickup longitude minus dropoff longitude)

In [None]:
def add_coordinate_features(df):
    """Add signed coordinate deltas to ``df`` in place and return it.

    Creates ``latdiff`` (pickup latitude minus dropoff latitude) and
    ``londiff`` (pickup longitude minus dropoff longitude). The same frame
    object that was passed in is returned.
    """
    # Add new features
    df['latdiff'] = df['pickup_latitude'] - df['dropoff_latitude']
    df['londiff'] = df['pickup_longitude'] - df['dropoff_longitude']
    return df

## step 3.3 : Distances features

In [None]:
def manhattan(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Sum of the absolute latitude and longitude differences between two points.

    Works on scalars or numpy/pandas array-likes; the result is in the same
    units as the inputs.
    """
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    return lat_gap + long_gap
def add_distances_features(df):
    """Add trip-length and landmark-distance columns to ``df`` in place.

    Requires the ``latdiff`` / ``londiff`` columns as well as the four raw
    pickup/dropoff coordinate columns. Adds ``euclidean`` and ``manhattan``
    for the trip itself, then ``<place>_pickup_distance`` and
    ``<place>_dropoff_distance`` for each reference point, all computed with
    ``manhattan`` on raw coordinate values. Returns the same frame object.
    """
    # Reference points as (longitude, latitude): downtown and the airports.
    landmarks = {
        'downtown': (-74.0063889, 40.7141667),
        'jfk': (-73.7822222222, 40.6441666667),
        'ewr': (-74.175, 40.69),
        'lgr': (-73.87, 40.77),
    }

    # Trip endpoints as (column label, latitude series, longitude series).
    endpoints = (
        ('pickup', df['pickup_latitude'], df['pickup_longitude']),
        ('dropoff', df['dropoff_latitude'], df['dropoff_longitude']),
    )
    (_, start_lat, start_lon), (_, end_lat, end_lon) = endpoints

    df['euclidean'] = (df['latdiff'] ** 2 + df['londiff'] ** 2) ** 0.5
    df['manhattan'] = manhattan(start_lat, start_lon, end_lat, end_lon)

    # One pickup and one dropoff distance column per landmark, in table order.
    for place, (place_lon, place_lat) in landmarks.items():
        for label, lat, lon in endpoints:
            df['%s_%s_distance' % (place, label)] = manhattan(place_lat, place_lon, lat, lon)
    return df

## step 4 : Process data

In [None]:

# Every feature builder returns the frame it produced; always keep that return
# value instead of relying on the input being modified in place, so the
# pipeline keeps working if a builder switches to returning a copy.
train = add_coordinate_features(train)
test = add_coordinate_features(test)
# The distance features read 'latdiff' / 'londiff', so they must come second.
train = add_distances_features(train)
test = add_distances_features(test)
train.head()

In [None]:
# Raw coordinates are replaced by the engineered difference/distance features.
dropped_columns = ['pickup_longitude', 'pickup_latitude', 
                   'dropoff_longitude', 'dropoff_latitude']
# 'key' is a string identifier that cannot be scaled, and 'passenger_count' is
# excluded from the test features; drop both from the training frame as well
# so the scaler and the network see the same columns for train and test.
non_feature_columns = dropped_columns + ['key', 'passenger_count']
train_clean = train.drop(non_feature_columns, axis=1)
test_clean = test.drop(non_feature_columns, axis=1)

# Apart from the label, train and test must expose identical feature columns
# in identical order, since the scaler works on positional arrays.
assert list(train_clean.drop('fare_amount', axis=1).columns) == list(test_clean.columns), \
    'train/test feature columns differ'

# peek data
train_clean.head()

## step 5 : Split data in train and validation

In [None]:
# Hold out 10% of the cleaned training rows for validation; random_state fixes
# the shuffle so the split is repeatable.
train_df, validation_df = train_test_split(train_clean, test_size=0.10, random_state=1)
# Get labels
train_labels = train_df['fare_amount'].values
validation_labels = validation_df['fare_amount'].values
# Remove the label column so only features remain in the two frames.
train_df = train_df.drop(['fare_amount'], axis=1)
validation_df = validation_df.drop(['fare_amount'], axis=1)

In [None]:
# Fit the min-max scaler on the training split only, then reuse the fitted
# scaler for the validation and test features.
scaler = preprocessing.MinMaxScaler()
train_df_scaled = scaler.fit_transform(train_df)
validation_df_scaled = scaler.transform(validation_df)
test_scaled = scaler.transform(test_clean)

## step 6 : Create Model

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 256
EPOCHS = 20
LEARNING_RATE = 0.001

# Fully connected regression network: 256 -> 128 -> 64 -> 32 -> 8 -> 1.
# Every hidden layer uses ReLU and is followed by batch normalization; the
# first layer additionally carries an L1 penalty on its activations.
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=train_df_scaled.shape[1], activity_regularizer=regularizers.l1(0.01)))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(8, activation='relu'))
model.add(BatchNormalization())
# Single linear output unit: the predicted fare.
model.add(Dense(1))

# Pass the learning rate positionally: older Keras names this argument `lr`,
# newer releases name it `learning_rate` and reject `lr`; in both it is the
# first positional parameter of Adam.
adam = optimizers.Adam(LEARNING_RATE)
# Optimize mean squared error and report mean absolute error alongside it.
model.compile(loss='mse', optimizer=adam, metrics=['mae'])

## step 7 : Training Model

In [None]:
# Train for EPOCHS epochs with shuffling, evaluating on the held-out
# validation split after each epoch; the returned history object records the
# per-epoch metrics.
history = model.fit(x=train_df_scaled, y=train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    verbose=1, validation_data=(validation_df_scaled, validation_labels), 
                    shuffle=True)

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(20,10))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is the loss on the validation split (val_loss), not on the
# competition test set, so label it accordingly.
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Predict fares for the scaled test features (batch size 128 here, set
# independently of the training BATCH_SIZE).
prediction = model.predict(test_scaled, batch_size=128, verbose=1)

In [None]:
def output_submission(raw_test, prediction, id_column, prediction_column, file_name):
    """Write a two-column submission CSV pairing each test id with its prediction.

    Parameters
    ----------
    raw_test : pandas.DataFrame
        Frame holding the identifier column, in the same row order as
        ``prediction``.
    prediction : array-like
        Predictions, one row per row of ``raw_test`` (a single column).
    id_column : str
        Name of the identifier column in ``raw_test`` and in the output.
    prediction_column : str
        Name to give the prediction column in the output.
    file_name : str
        Destination path of the CSV file.

    Raises
    ------
    ValueError
        If ``raw_test`` and ``prediction`` have different numbers of rows.
    """
    df = pd.DataFrame(prediction, columns=[prediction_column])
    # Pair ids with predictions by position. Assigning the Series directly
    # would align on index labels and yield NaN ids whenever raw_test does not
    # carry a default 0..n-1 index (e.g. after filtering rows).
    df[id_column] = raw_test[id_column].values
    df[[id_column, prediction_column]].to_csv((file_name), index=False)
    print('Output complete')
# Write the predictions for the test set, keyed by 'key', to SUBMISSION_NAME.
output_submission(test, prediction, 'key', 'fare_amount', SUBMISSION_NAME)