Rohit's First Kernal - NYC Taxi Fare Prediction
===========
This is the first kernal for submission for Google Cloud Playground [New York City Taxi Fare Prediction](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)

Strategy
--------------------
1. Filter out outliers
    1. Remove data outside NYC
    2. Remove data where fare is unresonable (too high / too low)
2. Use Linear Regression ML Model On Clean Data
3. Use Linear Fit On Unclean Data

Using NYC Open Data
-------------------
NYC Open Data is stored in Google Big Query open datasets. To access this data in your notebook, check out kernal [How to Query the NYC Open Data
](https://www.kaggle.com/paultimothymooney/how-to-query-the-nyc-open-data)


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# to plot 3d scatter plots
from mpl_toolkits.mplot3d import Axes3D

import math

# to print out current time
import datetime
import os

import traceback

import tensorflow as tf
import shutil
print(tf.__version__)

1.10.0


In [2]:
BATCH_SIZE = 512

# Try to load the data. This may be an intensive process
df_train = pd.read_csv(r'M:\kaggle\NY Taxi Cab\input\train.csv', nrows = 10000, parse_dates=["pickup_datetime"]);

In [3]:
df_train.head(n=10)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45,-74.000964,40.73163,-73.972892,40.758233,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00,-73.9513,40.774138,-73.990095,40.751048,1
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00,-74.006462,40.726713,-73.993078,40.731628,1
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00,-73.980658,40.733873,-73.99154,40.758138,2


In [4]:
df_train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,512.0,512.0,512.0,512.0,512.0,512.0
mean,11.608164,-72.242612,39.795867,-72.385201,39.876019,1.632812
std,9.315499,11.20277,6.171227,10.736289,5.91453,1.270598
min,2.5,-74.035839,0.0,-74.035839,0.0,0.0
25%,6.0,-73.992777,40.735444,-73.993028,40.731372,1.0
50%,8.5,-73.982185,40.752074,-73.980713,40.752815,1.0
75%,13.0,-73.968465,40.766767,-73.964584,40.767634,2.0
max,58.0,0.0,40.828531,0.0,40.881878,6.0


In [5]:
CSV_COLUMNS = ['key',
               'fare_amount',
               'pickup_datetime',
               'pickup_longitude',
               'pickup_latitude',
               'dropoff_longitude',
               'dropoff_latitude',
               'passenger_count']

LABEL_COLUMN = 'fare_amount' # 'pickup_datetime' #

DEFAULTS = [['NoKey'],
            [0.0],
            ['BadDate'],
            [-74.0],
            [40.0],
            [-74.0],
            [40.7],
            [1.0]]

TRAIN_TEST_SPLIT_RATIO = 0.8

In [6]:
def read_dataset(filenames, mode, batch_size = BATCH_SIZE):
        
    def _input_fn():
        
        def parse_dataset(filename, header_lines = 1):
            return tf.data.TextLineDataset(filenames=filename).skip(header_lines) 
        
        def parse_batch(value_column):
            if mode == tf.estimator.ModeKeys.PREDICT:
                columns = tf.decode_csv(value_column, record_defaults = DEFAULTS[:1] + DEFAULTS[1:])
                features = dict(zip(CSV_COLUMNS[:1] + CSV_COLUMNS[1:], columns))
                label = DEFAULTS[1]
            else:
                columns = tf.decode_csv(value_column, record_defaults = DEFAULTS)
                features = dict(zip(CSV_COLUMNS, columns))
                label = features.pop(LABEL_COLUMN)
            return features, label

        # Create list of file names that match "glob" pattern (i.e. data_file_*.csv)
        filenames_dataset = tf.data.Dataset.list_files(filenames)
        
        # Read lines from text files
        dataset = filenames_dataset.flat_map(parse_dataset)
        
        # Parse text lines as comma-separated values (CSV)
        dataset = dataset.map(parse_batch)
        
        # Note:
        # use tf.data.Dataset.flat_map to apply one to many transformations (here: filename -> text lines)
        # use tf.data.Dataset.map            to apply one to one    transformations (here: text line -> feature list)
        
        if mode == tf.estimator.ModeKeys.TRAIN:
                num_epochs = None # loop indefinitely
                dataset = dataset.shuffle(buffer_size = 10 * batch_size)
        else:
                num_epochs = 1 # end-of-input after this

        dataset = dataset.repeat(num_epochs).batch(batch_size)
        
        # Skip header row
        return dataset.skip(1).make_one_shot_iterator().get_next()
    return _input_fn

In [7]:
def get_train():
    return read_dataset('../input/train/train-*.csv', mode = tf.estimator.ModeKeys.TRAIN)

def get_valid():
    return read_dataset('../input/train/test-*.csv', mode = tf.estimator.ModeKeys.EVAL)

def get_test():
    return read_dataset('../input/test.csv', mode = tf.estimator.ModeKeys.PREDICT)

In [8]:
INPUT_COLUMNS = [
    tf.feature_column.numeric_column('pickup_longitude'),
    tf.feature_column.numeric_column('pickup_latitude'),
    tf.feature_column.numeric_column('dropoff_longitude'),
    tf.feature_column.numeric_column('dropoff_latitude'),
    tf.feature_column.numeric_column('passenger_count'),
]

def add_more_features(feats):
    # Nothing to add (yet!)
    return feats

feature_cols = add_more_features(INPUT_COLUMNS)

In [9]:
def print_rmse(model, name, input_fn):
    metrics = model.evaluate(input_fn = input_fn, steps = None)
    print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))

In [10]:
OUTDIR = '../taxi_trained'

tf.logging.set_verbosity(tf.logging.INFO)

with tf.Session() as sess:
    try:
        shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time
        
        model = tf.estimator.LinearRegressor(feature_columns = feature_cols, model_dir = OUTDIR)
        
        print('Begin Training ---------------- ', datetime.datetime.now())
        model.train(input_fn = get_train(), steps = 1000)
        
        print('Begin Testing ---------------- ', datetime.datetime.now())        
        print_rmse(model, 'validation', get_valid())
        
        print('Finished Testing ---------------- ', datetime.datetime.now())   
    except:
        traceback.print_exc()

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '../taxi_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002A50AEDCBA8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Begin Training ----------------  2018-09-11 19:59:58.138549
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Savi

In [None]:
# RMSE on validation dataset = 10.006631851196289