### Dependencies

Below we set up the paths to the datasets and import the libraries that will be used.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LSTM
from keras.callbacks import EarlyStopping
from keras import optimizers
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras import backend
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot


# Kaggle competition training file.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
TRAIN_PATH = '../input/new-york-city-taxi-fare-prediction/train.csv'
# File name the final Kaggle submission is written to.
SUBMISSION_NAME = 'submissiontry_water.csv'

# Labelled train / hold-out files that the notebook actually loads.
TRAINMEHRAK_PATH = '../input/taxipredictiondata8m/trainMehrak.csv'
TESTMEHRAK_PATH = '../input/taxipredictiondata8m/testMehrak.csv'
# Kaggle test file used to build the submission.
TEST_PATH = '../input/new-york-city-taxi-fare-prediction/test.csv'

# Model parameters
BATCH_SIZE = 1000
EPOCHS = 100
# NOTE(review): 0.5 is several orders of magnitude above the usual Adam step size - confirm this is intentional.
LEARNING_RATE = 0.5
# Maximum number of rows read from each of the labelled CSV files.
DATASET_SIZE = 8000000



This is the function to clean the outliers

In [None]:
def clean(df):
    """Filter a taxi-trip frame down to plausible trips and return the result.

    The filters run in sequence and the remaining row count is printed after
    each stage:

    1. rows with any missing value,
    2. trips whose pickup and dropoff share a longitude or a latitude,
    3. zero coordinates,
    4. coordinates outside the ``MinMax`` longitude/latitude box,
    5. trips that move 0.001 degrees or less in latitude or in longitude,
    6. fares outside ``(0, 50]`` and passenger counts outside ``(0, 50)``,
    7. trips that start or end exactly on one of a few fixed reference points,
    8. trips that ``remove_datapoints_from_water`` reports as not on land.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``fare_amount``, ``passenger_count`` and the four
        ``pickup_*`` / ``dropoff_*`` longitude / latitude columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows. Every step builds a new frame, so the input
        frame itself is not modified.
    """
    coordinate_columns = ('pickup_longitude', 'pickup_latitude',
                          'dropoff_longitude', 'dropoff_latitude')

    def _drop_trips_at_point(frame, point):
        """Drop trips whose pickup or dropoff is exactly at ``point`` = (lat, lon)."""
        lat, lon = point
        # A location matches only when BOTH coordinates match, so a row is
        # kept when either one differs. (The earlier `&` form dropped every
        # row that merely shared a single coordinate with the point.)
        frame = frame[(frame['pickup_longitude'] != lon) | (frame['pickup_latitude'] != lat)]
        frame = frame[(frame['dropoff_longitude'] != lon) | (frame['dropoff_latitude'] != lat)]
        return frame

    print(' Old size: %d' % len(df))
    # remove any null data
    df = df.dropna(how = 'any', axis = 'rows')
    print(' New size after dropna: %d' % len(df))

    # Remove inconsistent values: keep trips that moved in both longitude and
    # latitude (the 0.001-degree filter further down enforces the same rule).
    df = df[(df['dropoff_longitude'] != df['pickup_longitude']) & (df['dropoff_latitude'] != df['pickup_latitude'])]
    print(' New size after removing same long lat: %d' % len(df))

    df = df[(df['dropoff_longitude'] != 0) & (df['pickup_longitude'] != 0) & (df['dropoff_latitude'] != 0) & (df['pickup_latitude'] != 0)]
    print(' New size after removing 0 long lat: %d' % len(df))

    # (lon_min, lon_max, lat_min, lat_max); a wider alternative that was tried
    # is (-74.5, -72.8, 40.5, 41.8).
    MinMax = (-74.1, -73.87, 40.6, 41.0)

    # Delimiter lats and lons to NY only
    df = df[(MinMax[0] <= df['pickup_longitude']) & (df['pickup_longitude'] <= MinMax[1])]
    df = df[(MinMax[0] <= df['dropoff_longitude']) & (df['dropoff_longitude'] <= MinMax[1])]
    df = df[(MinMax[2] <= df['pickup_latitude']) & (df['pickup_latitude'] <= MinMax[3])]
    df = df[(MinMax[2] <= df['dropoff_latitude']) & (df['dropoff_latitude'] <= MinMax[3])]
    print(' New size after NYC lang lot: %d' % len(df))

    # Zero check on all four coordinates (previously pickup_latitude was
    # tested twice and pickup_longitude not at all).
    for column in coordinate_columns:
        df = df[df[column] != 0]
    print(' New size after lang lot > 0: %d' % len(df))

    df = df[((df['pickup_latitude'] - df['dropoff_latitude']).abs() > 0.001)]
    df = df[((df['pickup_longitude'] - df['dropoff_longitude']).abs() > 0.001)]
    print(' New size after lang - lot > 0.001: %d' % len(df))

    print(' New size after only NYC: %d' % len(df))

    df = df[(0 < df['fare_amount']) & (df['fare_amount'] <= 50)]
    print(' New size after removing outliers: %d' % len(df))

    # NOTE(review): the message mentions 6 passengers but the bound applied is < 50.
    df = df[(df['passenger_count'] > 0) & (df['passenger_count'] < 50)]
    print(' New size after removing 6=>passenger_count > 0 : %d' % len(df))

    # Remove trips that sit exactly on a fixed reference point, given as (lat, lon).
    reference_points = (
        (' New size after NY airport: %d', (40.7141667, -74.0063889)),   # nyc_coord
        (' New size after jfk airport: %d', (40.639722, -73.778889)),    # jfk
        (' New size after ewr airport: %d', (40.6925, -74.168611)),      # ewr
        (' New size after lgr airport: %d', (40.77725, -73.872611)),     # lga
        (' New size after sol removed: %d', (40.6892, -74.0445)),        # Statue of Liberty
    )
    for message, point in reference_points:
        df = _drop_trips_at_point(df, point)
        print(message % len(df))

    print('Old size: %d' % len(df))
    df = remove_datapoints_from_water(df)
    print('New size: %d' % len(df))

    print(' New size: %d' % len(df))

    return df
    
   


Helper methods

In [None]:
# Helper methods
def remove_datapoints_from_water(df):
    """Return only the rows whose pickup and dropoff both fall on land.

    A mask image covering a fixed longitude/latitude box is downloaded and
    thresholded; each coordinate is mapped to a pixel of that mask and the
    row is kept when both pixels are flagged as land.
    """
    # Box covered by the mask: (lon_min, lon_max, lat_min, lat_max).
    bounds = (-74.5, -72.8, 40.5, 41.8)

    # Boolean map from the first image channel: land = True, water = False.
    mask_image = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')
    land = mask_image[:,:,0] > 0.9
    height, width = land.shape[0], land.shape[1]

    def to_pixel(longitude, latitude):
        """Map lon/lat to (column, row) indices; rows count down from the top edge."""
        col = (width*(longitude - bounds[0])/(bounds[1]-bounds[0])).astype('int')
        row = (height - height*(latitude - bounds[2])/(bounds[3]-bounds[2])).astype('int')
        return col, row

    pickup_col, pickup_row = to_pixel(df.pickup_longitude, df.pickup_latitude)
    dropoff_col, dropoff_row = to_pixel(df.dropoff_longitude, df.dropoff_latitude)

    # A trip survives only when both of its ends are on land.
    on_land = land[pickup_row, pickup_col] & land[dropoff_row, dropoff_col]
    return df[on_land]
    
def late_night(row):
    """Return 1 when ``row['hour']`` lies in the 0-3 range (inclusive), else 0.

    The previous condition joined the two bounds with ``or``, which is true
    for every non-negative hour, so the flag was always 1.
    """
    return 1 if 0 <= row['hour'] <= 3 else 0


def night(row):
    """Return 1 for a row with hour above 20 and weekday below 5, else 0."""
    hour = row['hour']
    # Both hour tests are kept from the original; `hour > 0` is implied by `hour > 20`.
    if not (hour > 20 and hour > 0):
        return 0
    return 1 if row['weekday'] < 5 else 0
    
def rush_hour(row):
    """Return 1 for a row with hour in 16..20 (inclusive) and weekday below 5, else 0."""
    hour = row['hour']
    in_window = (hour <= 20) and (hour >= 16)
    if in_window and row['weekday'] < 5:
        return 1
    return 0
    
def manhattan(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Return |latitude difference| + |longitude difference| between pickup and dropoff.

    Works element-wise on scalars, numpy arrays or pandas Series; the result
    is in the same units as the inputs.
    """
    lat_leg = np.abs(dropoff_lat - pickup_lat)
    long_leg = np.abs(dropoff_long - pickup_long)
    return lat_leg + long_leg


def add_time_features(df):
    """Parse ``pickup_datetime`` and add its calendar/clock parts as columns.

    ``pickup_datetime`` is converted in place with the format
    ``'%Y-%m-%d %H:%M:%S %Z'``; the columns ``year``, ``month``, ``day``,
    ``hour``, ``minute`` and ``second`` are then added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``pickup_datetime`` column of strings in that format.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place.
    """
    df['pickup_datetime'] =  pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    # The vectorised .dt accessor replaces six per-row Python lambdas.
    stamps = df['pickup_datetime'].dt
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(stamps, part)

    return df


def add_coordinate_features(df):
    """Add signed pickup-minus-dropoff differences as ``latdiff`` and ``londiff``.

    The frame is modified in place and also returned.
    """
    # Signed on purpose: the absolute value is not taken.
    df['latdiff'] = df['pickup_latitude'] - df['dropoff_latitude']
    df['londiff'] = df['pickup_longitude'] - df['dropoff_longitude']
    return df


def add_distances_features(df):
    """Add ``manhattan`` and ``distance`` columns computed from the trip endpoints.

    Both columns come from the module-level ``manhattan`` and ``distance``
    helpers, called with (pickup lat, pickup lon, dropoff lat, dropoff lon).
    The frame is modified in place and also returned.
    """
    pickup_lat, pickup_lon = df['pickup_latitude'], df['pickup_longitude']
    dropoff_lat, dropoff_lon = df['dropoff_latitude'], df['dropoff_longitude']

    df['manhattan'] = manhattan(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    df['distance'] = distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    return df

# Haversine great-circle distance, adapted from
# https://stackoverflow.com/questions/27928/
# calculate-distance-between-two-latitude-longitude-points-haversine-formula
def distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in miles between two points given in degrees.

    Accepts scalars, numpy arrays or pandas Series and works element-wise.
    """
    p = 0.017453292519943295 # Pi/180
    lat_term = 0.5 - np.cos((lat2 - lat1) * p)/2
    lon_term = np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    a = lat_term + lon_term
    # 12742 km is 2*R; 0.6213712 converts kilometres to miles.
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))
# Same haversine formula as `distance`, adapted from
# https://stackoverflow.com/questions/27928/
# calculate-distance-between-two-latitude-longitude-points-haversine-formula
def distanceP(lat1, lon1, lat2, lon2):
    """Return the haversine distance in miles between two points given in degrees."""
    deg_to_rad = 0.017453292519943295 # Pi/180
    half_cos_dlat = np.cos((lat2 - lat1) * deg_to_rad)/2
    cos_product = np.cos(lat1 * deg_to_rad) * np.cos(lat2 * deg_to_rad)
    one_minus_cos_dlon = (1 - np.cos((lon2 - lon1) * deg_to_rad))
    a = 0.5 - half_cos_dlat + cos_product * one_minus_cos_dlon / 2
    diameter_miles = 0.6213712 * 12742 # km-to-miles factor times 2*R in km
    return diameter_miles * np.arcsin(np.sqrt(a))
    
def output_submission(raw_test, prediction, id_column, prediction_column, file_name):
    """Write a two-column submission CSV (id, prediction) to ``file_name``.

    Parameters
    ----------
    raw_test : pandas.DataFrame
        Frame that provides ``id_column``; its rows must be in the same order
        as ``prediction``.
    prediction : array-like
        One predicted value per row of ``raw_test``.
    id_column, prediction_column : str
        Column names used in the output file.
    file_name : str
        Destination path of the CSV file (written without the index).
    """
    df = pd.DataFrame(prediction, columns=[prediction_column])
    # Assign ids by position. Assigning the Series itself would align on the
    # index and produce NaN ids whenever raw_test is not indexed 0..n-1.
    df[id_column] = raw_test[id_column].values
    df[[id_column, prediction_column]].to_csv((file_name), index=False)
    print('Output complete')
    
    
def plot_loss_accuracy_rmse(history):
    """Draw two figures from a fit history: loss and rmse, train vs. validation.

    ``history.history`` must contain the keys ``loss``, ``val_loss``,
    ``rmse`` and ``val_rmse``.
    """
    # (train key, validation key, figure title, y-axis label)
    curves = (
        ('loss', 'val_loss', 'model loss', 'loss'),
        ('rmse', 'val_rmse', 'Model rmse', 'rmse'),
    )
    for train_key, validation_key, title, y_label in curves:
        plt.figure(figsize=(20,10))
        plt.plot(history.history[train_key])
        plt.plot(history.history[validation_key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper right')
        plt.show()
    
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickup and dropoff locations side by side over a map image.

    ``BB`` is (lon_min, lon_max, lat_min, lat_max); it sets the axis limits
    and the extent of ``nyc_map``. ``s`` and ``alpha`` are passed to the
    scatter calls as marker size and opacity.
    """
    fig, axs = plt.subplots(1, 2, figsize=(16,10))
    panels = (('pickup', 'Pickup locations'), ('dropoff', 'Dropoff locations'))
    for ax, (prefix, title) in zip(axs, panels):
        longitudes = getattr(df, prefix + '_longitude')
        latitudes = getattr(df, prefix + '_latitude')
        ax.scatter(longitudes, latitudes, zorder=1, alpha=alpha, c='r', s=s)
        ax.set_xlim((BB[0], BB[1]))
        ax.set_ylim((BB[2], BB[3]))
        ax.set_title(title)
        # The map is drawn underneath the points (zorder 0 vs 1).
        ax.imshow(nyc_map, zorder=0, extent=BB)
    

def rmse(y_true, y_pred):
    """Root-mean-squared error over the whole batch, for use as a Keras metric.

    The mean is taken over every element before the square root. Averaging
    over ``axis=-1`` first, as before, reduces each single-output sample to
    ``sqrt(err**2) == |err|``, so the reported value was really the MAE.
    (This definition also replaces an identical duplicate ``def rmse``.)
    """
    return backend.sqrt(backend.mean(backend.square(y_pred - y_true)))
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean of ``|y_true - y_pred| / |y_true|`` expressed as a percentage (Keras metric)."""
    # Floor the denominator at the backend epsilon so that a zero target
    # cannot turn the metric into inf/nan.
    denominator = backend.maximum(backend.abs(y_true), backend.epsilon())
    return backend.mean(backend.abs(y_true - y_pred) / denominator) * 100
    

Import data

In [None]:
# Load values in a more compact form: 32-bit floats and an 8-bit passenger
# count instead of the pandas defaults.
datatypes = {'key': 'str', 
              'fare_amount': 'float32',
              'pickup_datetime': 'str', 
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

# Columns are selected by position (1..7) and at most DATASET_SIZE rows are read.
# NOTE(review): presumably position 0 is a key/index column that is skipped - confirm against the CSV header.
train_df = pd.read_csv(TRAINMEHRAK_PATH, nrows=DATASET_SIZE, dtype=datatypes, usecols=[1,2,3,4,5,6,7])
test_df = pd.read_csv(TESTMEHRAK_PATH, nrows=DATASET_SIZE, dtype=datatypes, usecols=[1,2,3,4,5,6,7])
# The Kaggle test file is read in full with default dtypes; it is only used for the submission.
testKaggle = pd.read_csv(TEST_PATH)   

print('Done with importing data')


Run some summary statistics on the train and test sets, and draw scatter plots and histograms.

In [None]:
# Summary statistics of the training frame as loaded (before cleaning).
train_df.describe()

In [None]:
# Summary statistics of the hold-out test frame as loaded (before cleaning).
test_df.describe()

In [None]:
# Drop every row that has at least one missing value so the plots below work
# on complete rows only (clean() repeats this step later).
train_df = train_df.dropna(how = 'any', axis = 'rows')
test_df = test_df.dropna(how = 'any', axis = 'rows')

In [None]:
# Scatter every raw training column against the fare.
for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
               'dropoff_latitude', 'passenger_count', 'fare_amount'):
    plot = train_df.plot.scatter(column, 'fare_amount')

In [None]:
# Scatter every raw test column against the fare.
for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
               'dropoff_latitude', 'passenger_count', 'fare_amount'):
    plot = test_df.plot.scatter(column, 'fare_amount')

In [None]:
# Histogram of every raw column, training set first and then the test set.
# One loop replaces twelve copy-pasted cells that each re-imported matplotlib
# (and one of which mislabelled its x axis as "droppoff_longitude").
for set_name, frame in (('train', train_df), ('test', test_df)):
    for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                   'dropoff_latitude', 'passenger_count', 'fare_amount'):
        plt.figure()
        plt.hist(frame[column])
        plt.title('%s set' % set_name)
        plt.xlabel(column)
        plt.ylabel("Count")
        plt.show()

Calling the function to clean the outliers

In [None]:
# Apply the outlier filters of clean() to both labelled frames.
# The Kaggle submission frame (testKaggle) is not filtered here.

train_df = clean(train_df)
test_df = clean(test_df)

print('Done with cleaning data')


Another group of scatter plots and histograms to display the data after cleanup

In [None]:
# Scatter every cleaned training column against the fare.
for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
               'dropoff_latitude', 'passenger_count', 'fare_amount'):
    plot = train_df.plot.scatter(column, 'fare_amount')

In [None]:
# Scatter every cleaned test column against the fare.
for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
               'dropoff_latitude', 'passenger_count', 'fare_amount'):
    plot = test_df.plot.scatter(column, 'fare_amount')

In [None]:
# Histogram of every column after cleaning, training set first and then the
# test set. One loop replaces twelve copy-pasted cells that each re-imported
# matplotlib (and one of which mislabelled its x axis as "droppoff_longitude").
for set_name, frame in (('train', train_df), ('test', test_df)):
    for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                   'dropoff_latitude', 'passenger_count', 'fare_amount'):
        plt.figure()
        plt.hist(frame[column])
        plt.title('%s set (cleaned)' % set_name)
        plt.xlabel(column)
        plt.ylabel("Count")
        plt.show()

Adding time features such as Year, month, day, hour, minute and second

In [None]:
# Parse pickup_datetime and add year/month/day/hour/minute/second columns to
# all three frames; the prints only mark progress.
print('train_df add_time_features')
train_df = add_time_features(train_df)
print('test_df add_time_features')
test_df = add_time_features(test_df)
print('testKaggle add_time_features')
testKaggle = add_time_features(testKaggle)

print('Done with add_time_features')
  

Adding lat and long diff features

In [None]:
# Add the signed latdiff / londiff columns to all three frames.
# add_coordinate_features mutates its argument, so the return value is not needed.
print('train_df add_coordinate_features')
add_coordinate_features(train_df)
# The old message claimed this step was "Disabled!" although it does run.
print('test_df add_coordinate_features')
add_coordinate_features(test_df)
print('testKaggle add_coordinate_features')
add_coordinate_features(testKaggle)

print('Done with add_coordinate_features')

Adding distance features

In [None]:
# Add the manhattan and haversine distance columns to all three frames.
print('train_df add_distances_features')
train_df = add_distances_features(train_df)
print('test_df add_distances_features')
test_df = add_distances_features(test_df)
print('testKaggle add_distances_features')
testKaggle = add_distances_features(testKaggle)

print('Done with add_distances_features')


In [None]:
# Progress marker only; no data is touched in this cell.
print('Done with Adding features')

plot the new engineered features

In [None]:
# Distribution of the manhattan feature on the training set.
# (matplotlib is already imported in the first cell; the repeated import was dropped.)
plt.hist(train_df['manhattan'])
plt.xlabel("manhattan")
_ = plt.ylabel("Count")

In [None]:
# Each time feature is plotted against itself, which shows the set of values it takes.
for column in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    plot = train_df.plot.scatter(column, column)


In [None]:
# Distribution of the haversine distance feature on the training set.
# (matplotlib is already imported in the first cell; the repeated import was dropped.)
plt.hist(train_df['distance'])
plt.xlabel("distance")
_ = plt.ylabel("Count")

Drop extra columns from feature set

In [None]:

# Drop unwanted columns: the parsed timestamp is no longer needed once the
# individual time features exist.
dropped_columns = ['pickup_datetime']

train_df = train_df.drop(dropped_columns, axis=1)
test_df = test_df.drop(dropped_columns, axis=1)

# The Kaggle frame additionally loses 'key'; testKaggle itself keeps it for
# the submission file written at the end.
testKaggle_clean = testKaggle.drop(dropped_columns + ['key'], axis=1)

print('Done with dropped_columns')

Data normalization

In [None]:
# NOTE: these are aliases, not copies - scaling below also rewrites the
# columns of train_df, test_df and testKaggle_clean.
train_df_scaled = train_df
test_df_scaled = test_df
testKaggle_scaled = testKaggle_clean

scaler = preprocessing.MinMaxScaler()

# Every feature is min-max scaled on its own: the scaler is re-fitted on the
# training column and then applied to the test and Kaggle columns.
# fare_amount (the label) is left untouched.
scaled_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_latitude',
                  'dropoff_longitude', 'passenger_count', 'manhattan', 'distance',
                  'latdiff', 'londiff', 'year', 'month', 'day', 'hour',
                  'minute', 'second']

for column in scaled_columns:
    train_df_scaled[[column]] = scaler.fit_transform(train_df[[column]])
    test_df_scaled[column] = scaler.transform(test_df[[column]])
    testKaggle_scaled[column] = scaler.transform(testKaggle_clean[[column]])

print('Done with data normalization')

Plot features after normalization

In [None]:
# Each feature is plotted against itself to show its value range.
# train_df holds the scaled values because train_df_scaled was created as an alias of it.
for column in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
               'dropoff_latitude', 'passenger_count', 'year', 'month', 'day',
               'hour', 'minute', 'second'):
    plot = train_df.plot.scatter(column, column)

Split Training and Validation Data

In [None]:
# Hold out 10% of the training rows for validation; random_state fixes the split.
train_df_scaled, validation_df_scaled = train_test_split(train_df_scaled, test_size=0.10, random_state=1)

print('Done with splitiong data')

Create Label data set for training, validation and test datasets

In [None]:
# Get labels: fare_amount is the regression target for all three splits.
train_labels = train_df_scaled['fare_amount'].values
validation_labels = validation_df_scaled['fare_amount'].values
test_labels = test_df_scaled['fare_amount'].values

# Remove the target from the feature frames so it cannot leak into the model input.
train_df_scaled = train_df_scaled.drop(['fare_amount'], axis=1)
validation_df_scaled = validation_df_scaled.drop(['fare_amount'], axis=1)
test_df_scaled = test_df_scaled.drop(['fare_amount'], axis=1)

print('Done with Labels')


Building and fitting the model and saving the best model

In [None]:
# Write the model to my_model.h5; with save_best_only=True the file is only
# overwritten when the monitored quantity improves.
checkpoint = ModelCheckpoint(filepath='my_model.h5', verbose=1, save_best_only=True)

# Fully connected regressor: a linear 256-unit input layer with an L1 activity
# penalty, four ReLU layers of decreasing width and one linear output unit.
model = Sequential()
model.add(Dense(256, activation='linear', input_dim=train_df_scaled.shape[1], activity_regularizer=regularizers.l1(0.01)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1,activation='linear'))

# Use the Adam class; the lowercase `optimizers.adam` alias is not available
# in every Keras release.
adam = optimizers.Adam(lr=LEARNING_RATE)
# Metric order matters: model.evaluate returns [loss, mae, rmse, mse, mape].
model.compile(loss='mean_squared_error', optimizer=adam, metrics=['mae', rmse, 'mse', mean_absolute_percentage_error])

print('Dataset size: %s' % DATASET_SIZE)
print('Epochs: %s' % EPOCHS)
print('Learning rate: %s' % LEARNING_RATE)
print('Batch size: %s' % BATCH_SIZE)
print('Input dimension: %s' % train_df_scaled.shape[1])
print('Features used: %s' % train_df_scaled.columns)
model.summary()

history = model.fit(x=train_df_scaled, y=train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS, 
                    verbose=1, callbacks=[checkpoint], validation_data=(validation_df_scaled, validation_labels), 
                    shuffle=True)

print('Done with Fit the model')    

Creating an SVG diagram of the model

In [None]:
# Render the layer graph inline as an SVG via the graphviz 'dot' program.
SVG(model_to_dot(model).create(prog='dot', format='svg'))

Plot the loss and RMSE curves

In [None]:
# Training vs. validation curves for the loss and the rmse metric.
plot_loss_accuracy_rmse(history)



Evaluate model on training set

In [None]:
# score follows the compile order: [loss, mae, rmse, mse, mape].
score = model.evaluate(train_df_scaled, train_labels, verbose=1)
print(score)
print('train mean_squared_error:', score[0])
print('train mae:', score[1])
print('train rmse:', score[2])
print('train mse:', score[3])
# score[4] is the mean absolute percentage error (it was mislabelled 'msep').
print('train mape:', score[4])

Evaluate model on validation  set

In [None]:
# score follows the compile order: [loss, mae, rmse, mse, mape].
score = model.evaluate(validation_df_scaled, validation_labels, verbose=1)
print(score)
print('Validation mean_squared_error:', score[0])
print('Validation mae:', score[1])
print('Validation rmse:', score[2])
print('Validation mse:', score[3])
# score[4] is the mean absolute percentage error (it was mislabelled 'msep').
print('Validation mape:', score[4])

Evaluate the model on the test set; this tells us how well the model predicts on data it has never seen.

In [None]:
# score follows the compile order: [loss, mae, rmse, mse, mape].
score = model.evaluate(test_df_scaled, test_labels, verbose=1)
print(score)
print('Test mean_squared_error:', score[0])
print('Test mae:', score[1])
print('Test rmse:', score[2])
print('Test mse:', score[3])
# score[4] is the mean absolute percentage error (it was mislabelled 'msep').
print('Test mape:', score[4])

Here we visualize the predictions on the validation data

In [None]:
# Flatten the model output to a 1-D vector of predicted fares.
validation_predictions = model.predict(validation_df_scaled).flatten()

# Predicted vs. true fare for every validation row.
plt.scatter(validation_labels, validation_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
# Re-apply the current limits.
# NOTE(review): presumably meant to stop autoscaling when the line below is added - confirm.
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
# Dashed y = x reference line spanning the range of the predictions.
_ = plt.plot([validation_predictions.min(), validation_predictions.max()], [validation_predictions.min(), validation_predictions.max()], 'k--', lw=4)

Here we visualize the predictions on the test data

In [None]:
# Flatten the model output to a 1-D vector of predicted fares.
test_predictions = model.predict(test_df_scaled).flatten()

# Predicted vs. true fare for every test row.
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.axis('equal')
# Re-apply the current limits.
# NOTE(review): presumably meant to stop autoscaling when the line below is added - confirm.
plt.xlim(plt.xlim())
plt.ylim(plt.ylim())
# Dashed y = x reference line spanning the range of the predictions.
_ = plt.plot([test_predictions.min(), test_predictions.max()], [test_predictions.min(), test_predictions.max()], 'k--', lw=4)

Spot checking Max and Min Prediction with actual value on test set

In [None]:
# Position of the largest prediction, computed once and reused below.
max_idx = np.argmax(test_predictions)
print('Index of max fare_amount:', max_idx)
print('value of max fare_amount predicted:',test_predictions[max_idx])
print('value of max fare_amount actual:',test_labels[max_idx])
# Show the matching test row (its feature columns hold the scaled values).
test_df.iloc[max_idx]

In [None]:
# Position of the smallest prediction, computed once and reused below.
min_idx = np.argmin(test_predictions)
print('Index of min fare_amount:',min_idx)
print('value of min fare_amount predicted:',test_predictions[min_idx])
print('value of min fare_amount actual:',test_labels[min_idx])
# Show the matching test row (its feature columns hold the scaled values).
test_df.iloc[min_idx]

In [None]:
# Measured vs. predicted fares on the test set; the dashed y = x line spans
# the range of the true labels.
fig, ax = plt.subplots()
ax.scatter(test_labels, test_predictions)
ax.plot([test_labels.min(), test_labels.max()], [test_labels.min(), test_labels.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()

Comparing the actual fare_amount and predicted values

In [None]:
# Actual vs. predicted fare for the first 100 validation rows.
plt.figure(figsize=(20,10))
plt.plot(validation_labels[:100])
plt.plot(validation_predictions[:100])
plt.title('Valdation Data Prediction vs Actual')
plt.ylabel('Fare Amount')
plt.xlabel('Transaction')
plt.legend(['Actual', 'prediction'], loc='upper right')
plt.show()


In [None]:
# Actual vs. predicted fare for the first 100 test rows.
plt.figure(figsize=(20,10))
plt.plot(test_labels[:100])
plt.plot(test_predictions[:100])
plt.title('Test Data Prediction vs Actual')
plt.ylabel('Fare Amount')
plt.xlabel('Transaction')
plt.legend(['Actual', 'prediction'], loc='upper right')
plt.show()



In [None]:
# Compare the mean of the true test fares with the mean of the predictions.
print("test actual mean:" ,np.mean(test_labels))
print("test prediction mean:" ,np.mean(test_predictions))

In [None]:
# Compare the spread of the true test fares with the spread of the predictions.
print("test actual std:" ,np.std(test_labels))
print("test prediction std:" ,np.std(test_predictions))

In [None]:
# Predict fares for the scaled Kaggle test features.
predictionKaggle = model.predict(testKaggle_scaled, batch_size=128, verbose=1)

In [None]:
# Display the raw prediction array as the cell output.
predictionKaggle

In [None]:
# output prediction: write the 'key' / 'fare_amount' pairs to SUBMISSION_NAME.
# testKaggle (not testKaggle_clean) is passed because it still has the 'key' column.

output_submission(testKaggle, predictionKaggle, 'key', 'fare_amount', SUBMISSION_NAME)
