# In [None]:
import sys
sys.path.insert(0, '/Users/raghr010/anaconda/envs/tensorflow/lib/python2.7/site-packages/')

import tensorflow as tf
import numpy as np

import pandas as pd
import datetime
from geopy.distance import vincenty


# Latitude/longitude window over which coordinates are bucketized.
MIN_LAT = 40.7
MAX_LAT = 40.81
MIN_LONG = -74
MAX_LONG = -73.75

# Bucket counts per axis.
LAT_BUCKETS = 40
LONG_BUCKETS = 40

# Input pipeline settings: batch size and number of passes over the data.
BATCH_SIZE = 2000
TRAIN_EPOCHS = 4
TEST_EPOCHS = 1

# Number of rows held out from the end of the data set for evaluation.
TEST_EXAMPLE_SIZE = 20

# Maps each column name to its integer index (0-13), in listing order.
columns = {
    column_name: position
    for position, column_name in enumerate([
        'vendor_id',
        'passenger_count',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'store_and_fwd_flag',
        'trip_duration',
        'month',
        'date',
        'hour',
        'weekday',
        'week_of_the_year',
        'distance',
    ])
}

def normalize_column(col):
    """Standardize ``col`` to zero mean and unit standard deviation.

    Args:
        col: Numeric array-like supporting element-wise arithmetic
            (for example a NumPy array or a pandas Series).

    Returns:
        ``(col - mean(col)) / std(col)``. If the standard deviation is
        exactly zero (all values identical), the centred values -- which are
        all zero -- are returned instead of dividing by zero.
    """
    centred = col - np.mean(col)
    spread = np.std(col)
    if spread == 0:
        # Constant column: 0 / 0 would fill the result with NaN.
        return centred
    return centred / spread

float_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']


def _bucket_boundaries(lower, upper, bucket_count):
    """Return ``bucket_count`` evenly spaced boundaries covering [lower, upper)."""
    # linspace fixes the number of boundaries; arange with a float step may
    # yield one element more or fewer because of rounding.
    return list(np.linspace(lower, upper, bucket_count, endpoint=False))


def _int_indicator(key, vocabulary):
    """Return a one-hot indicator column for integer feature ``key``."""
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key, list(vocabulary), dtype=tf.int32))


# NOTE: passenger_count is not currently used as a feature.

# Bucketize every coordinate with LAT_BUCKETS / LONG_BUCKETS boundaries.
pickup_latitude = tf.feature_column.numeric_column("pickup_latitude", dtype=tf.float64)
pickup_latitude_feature = tf.feature_column.bucketized_column(
    source_column=pickup_latitude,
    boundaries=_bucket_boundaries(MIN_LAT, MAX_LAT, LAT_BUCKETS))

pickup_longitude = tf.feature_column.numeric_column("pickup_longitude", dtype=tf.float64)
pickup_longitude_feature = tf.feature_column.bucketized_column(
    source_column=pickup_longitude,
    # Longitude uses its own bucket count (was LAT_BUCKETS by mistake).
    boundaries=_bucket_boundaries(MIN_LONG, MAX_LONG, LONG_BUCKETS))

dropoff_latitude = tf.feature_column.numeric_column("dropoff_latitude", dtype=tf.float64)
dropoff_latitude_feature = tf.feature_column.bucketized_column(
    source_column=dropoff_latitude,
    boundaries=_bucket_boundaries(MIN_LAT, MAX_LAT, LAT_BUCKETS))

dropoff_longitude = tf.feature_column.numeric_column("dropoff_longitude", dtype=tf.float64)
dropoff_longitude_feature = tf.feature_column.bucketized_column(
    source_column=dropoff_longitude,
    boundaries=_bucket_boundaries(MIN_LONG, MAX_LONG, LONG_BUCKETS))

distance = tf.feature_column.numeric_column("distance", dtype=tf.float64)

# Embed the (pickup latitude bucket x pickup longitude bucket) cross.
pickup_lat_x_long = tf.feature_column.embedding_column(
    tf.feature_column.crossed_column(
        keys=[pickup_latitude_feature, pickup_longitude_feature],
        hash_bucket_size=400
    ),
    dimension=20
)

# Direct columns from file
vendor = _int_indicator('vendor_id', [1, 2])
store_and_fwd_flag = _int_indicator('store_and_fwd_flag', [1, 0])

# Date columns. range() excludes its upper bound, so each bound is one past
# the largest valid value (month 12, day 31, hour 23, weekday 6, ISO week 53).
month = _int_indicator('month', range(1, 13))
date = _int_indicator('date', range(1, 32))
# Wrapped in an indicator column because input_layer only accepts dense columns.
hour = _int_indicator('hour', range(0, 24))
weekday = _int_indicator('weekday', range(0, 7))
week_of_the_year = _int_indicator('week_of_the_year', range(1, 54))

# TODO: crossed time features (month x date x hour, weekday x hour) were
# tried and disabled; they would have to cross the raw categorical columns
# rather than the indicator columns defined above.

# A list without duplicates (the original set literal listed
# week_of_the_year twice).
feature_columns = [
    pickup_lat_x_long,
    vendor,
    weekday,
    hour,
    week_of_the_year,
    distance,
]

print(feature_columns)

def create_training_and_test_data(file_name):
    """Preprocess the raw trip CSV and save shuffled train/test arrays.

    The CSV must contain ``id``, ``pickup_datetime``, ``dropoff_datetime``
    and ``store_and_fwd_flag`` columns. ``pickup_datetime`` is expanded into
    ``month``, ``date`` (day of month), ``hour``, ``weekday`` (Monday == 0)
    and ``week_of_the_year`` (ISO week number); ``store_and_fwd_flag`` is
    encoded as 1 for ``'Y'`` and 0 otherwise. The ``id`` and datetime columns
    are then dropped.

    Side effects:
        Writes ``testing_data.npy`` (``TEST_EXAMPLE_SIZE`` rows) and
        ``training_data.npy`` (all remaining rows) to the current directory.
        Rows are shuffled before the split.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The preprocessed (unshuffled) ``pandas.DataFrame``.
    """
    df = pd.read_csv(file_name, header=0)

    # Parse the timestamp once instead of splitting strings and building a
    # datetime.date per row.
    pickup = pd.to_datetime(df['pickup_datetime'])

    df['month'] = pickup.dt.month.astype(np.int32)
    df['date'] = pickup.dt.day.astype(np.int32)
    df['hour'] = pickup.dt.hour.astype(np.int32)
    df['store_and_fwd_flag'] = (df['store_and_fwd_flag'] == 'Y').astype(np.int64)
    df['weekday'] = pickup.dt.weekday
    df['week_of_the_year'] = pickup.map(lambda stamp: stamp.isocalendar()[1])

    df = df.drop(['pickup_datetime', 'dropoff_datetime', 'id'], axis=1)

    # Shuffle rows in place on the extracted array.
    np_array = df.values
    np.random.shuffle(np_array)

    # The small slice is the test set; the original code had the two file
    # names swapped, leaving only TEST_EXAMPLE_SIZE rows for training.
    np.save('testing_data.npy', np_array[:TEST_EXAMPLE_SIZE])
    np.save('training_data.npy', np_array[TEST_EXAMPLE_SIZE:])

    return df

# (column name, NumPy dtype) for every tensor fed into the input queue.
# 'trip_duration' is the label; everything else is a feature.
_INPUT_COLUMNS = (
    ('month', np.int32),
    ('date', np.int32),
    ('hour', np.int32),
    ('weekday', np.int32),
    ('week_of_the_year', np.int32),
    ('vendor_id', np.int32),
    ('passenger_count', np.int32),
    ('pickup_longitude', np.float32),
    ('pickup_latitude', np.float32),
    ('dropoff_longitude', np.float32),
    ('dropoff_latitude', np.float32),
    ('store_and_fwd_flag', np.int32),
    ('distance', np.float32),
    ('trip_duration', np.int32),
)


def _haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between point arrays."""
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=np.float64))
        for values in (lat1, lon1, lat2, lon2)
    )
    half_chord = (np.sin((lat2 - lat1) / 2.0) ** 2
                  + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    # Clip guards arcsin against rounding slightly outside [0, 1].
    return 2.0 * 6371.0088 * np.arcsin(np.sqrt(np.clip(half_chord, 0.0, 1.0)))


def _batched_features_and_labels(frame, num_epochs):
    """Build queue-backed (features, labels) batch tensors from ``frame``."""
    names = [name for name, _ in _INPUT_COLUMNS]
    arrays = [np.array(frame[name].values, dtype=dtype)
              for name, dtype in _INPUT_COLUMNS]

    # One slice producer yields a single row of every column per dequeue.
    row_tensors = tf.train.slice_input_producer(
        tensor_list=arrays,
        num_epochs=num_epochs,
        shuffle=True,
        capacity=BATCH_SIZE * 5
    )

    # Batches are assembled by background threads while the model runs.
    batch_dict = tf.train.batch(
        dict(zip(names, row_tensors)),
        BATCH_SIZE,
        num_threads=10,
        capacity=BATCH_SIZE * 5,
        enqueue_many=False,
        dynamic_pad=False,
        allow_smaller_final_batch=True
    )

    # The labels need to be returned separately
    batch_labels = batch_dict.pop('trip_duration')
    return batch_dict, tf.reshape(batch_labels, [-1, 1])


def make_input_fn(file_name, test=False):
    """Load the trip CSV and return ``(train_input_fn, test_input_fn)``.

    ``pickup_datetime`` is expanded into ``month``, ``date`` (day of month),
    ``hour``, ``weekday`` (Monday == 0) and ``week_of_the_year`` (ISO week);
    ``store_and_fwd_flag`` becomes 1 for ``'Y'`` and 0 otherwise. A
    ``distance`` feature is added because the model's feature columns
    reference it: the haversine pickup-to-dropoff distance.
    NOTE(review): the unit (kilometres) is an assumption -- confirm.

    The last ``TEST_EXAMPLE_SIZE`` rows form the test split; all earlier rows
    form the training split.

    Args:
        file_name: Path of the CSV file to read.
        test: Unused; kept so existing callers keep working.

    Returns:
        Two zero-argument callables. Each returns ``(features, labels)``
        where ``features`` is a dict of batched tensors keyed by column name
        and ``labels`` is ``trip_duration`` reshaped to ``[-1, 1]``. The
        training function iterates ``TRAIN_EPOCHS`` epochs, the test function
        ``TEST_EPOCHS``.
    """
    df = pd.read_csv(file_name, header=0)

    # Parse the timestamp once instead of splitting strings and building a
    # datetime.date per row.
    pickup = pd.to_datetime(df['pickup_datetime'])
    df['month'] = pickup.dt.month.astype(np.int32)
    df['date'] = pickup.dt.day.astype(np.int32)
    df['hour'] = pickup.dt.hour.astype(np.int32)
    df['store_and_fwd_flag'] = (df['store_and_fwd_flag'] == 'Y').astype(np.int32)
    df['weekday'] = pickup.dt.weekday
    df['week_of_the_year'] = pickup.map(lambda stamp: stamp.isocalendar()[1])

    # Vectorised in NumPy rather than calling geopy once per row.
    df['distance'] = _haversine_km(
        df['pickup_latitude'].values, df['pickup_longitude'].values,
        df['dropoff_latitude'].values, df['dropoff_longitude'].values)

    df_train = df[:-TEST_EXAMPLE_SIZE]
    df_test = df[-TEST_EXAMPLE_SIZE:]

    def train_input_fn():
        return _batched_features_and_labels(df_train, TRAIN_EPOCHS)

    def test_input_fn():
        return _batched_features_and_labels(df_test, TEST_EPOCHS)

    return train_input_fn, test_input_fn


def make_model(features, labels, mode, params, config):
    """Estimator ``model_fn``: a three-layer dense network for regression.

    Args:
        features: Dict of feature tensors consumed through the module-level
            ``feature_columns``.
        labels: Label tensor; ``None`` in PREDICT mode.
        mode: A ``tf.estimator.ModeKeys`` value.
        params: Hyper-parameters; ``params.learning_rate`` is read.
        config: Run configuration passed in by the Estimator (not used here).

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode it carries only the
        predictions; otherwise it also carries the L1 loss, plus the training
        op when ``mode`` is TRAIN.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Creates the input layer starting from the feature definitions of above
    input_layer = tf.feature_column.input_layer(
        features=features,
        feature_columns=feature_columns
    )

    # First dense layer of the neural net. The layer name (including its
    # spelling) is part of the checkpoint variable names, so it is unchanged.
    x = tf.layers.dense(
        inputs=input_layer,
        units=512,
        activation=tf.nn.relu,
        name="fisrt_fully_connected_layer"
    )

    # Dropout to lessen chances of overfitting. tf.layers.dropout defaults to
    # training=False (a no-op), so the flag must be passed explicitly.
    x = tf.layers.dropout(
        inputs=x,
        training=is_training,
        name="first_dropout"
    )

    # Second dense layer
    x = tf.layers.dense(
        inputs=x,
        units=128,
        activation=tf.nn.relu,
        name="second_fully_connected_layer"
    )

    # Third and final deep layer of the neural net
    x = tf.layers.dense(
        inputs=x,
        units=16,
        activation=tf.nn.relu,
        name="third_fully_connected_layer"
    )

    # Linear output neuron. fully_connected applies ReLU unless
    # activation_fn is overridden, so disable it explicitly.
    predictions = tf.contrib.layers.fully_connected(
        inputs=x,
        num_outputs=1,
        activation_fn=None
    )

    # No labels exist when predicting, so no loss can be built.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss is defined as the L1 distance since it is less sensitive to outliers
    loss = tf.losses.absolute_difference(
        labels=labels,
        predictions=predictions
    )

    # Export the loss to tensorboard
    tf.summary.scalar("Loss", loss)

    train_op = None
    if is_training:
        # Get the global step
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Adam optimizer driven by the configured learning rate.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=params.learning_rate,
        )

        # Running the train op also increments global_step.
        train_op = optimizer.minimize(loss, global_step=global_step)

    # Finally, wrap the tensors defined above in the format Tensorflow expects
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op
    )




# Main file
def main(_):
    """Train the model on ``train.csv`` and print the evaluation metrics.

    Args:
        _: Positional argument supplied by ``tf.app.run``; ignored.
    """
    train_input_fn, test_input_fn = make_input_fn('train.csv')

    # Creates hyperparams
    hparams = tf.contrib.training.HParams(
        learning_rate=.1,
    )

    config = tf.ConfigProto(
        # allow_soft_placement=True,
        # log_device_placement=True
    )
    # Turns on JIT Compilation through XLA for boosting performance. If crashes disable this
    # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    trainingConfig = tf.contrib.learn.RunConfig(
        # log_device_placement=True,
        save_summary_steps=500,
        save_checkpoints_steps=500,
        # Creates model dir (need to change this)
        model_dir=("/tmp/"),
        session_config=config
    )

    estimator = tf.estimator.Estimator(
        model_fn=make_model,
        params=hparams,
        config=trainingConfig
    )

    # The input functions already stop after TRAIN_EPOCHS / TEST_EPOCHS
    # passes over the data. `steps` counts batches, not epochs, so passing
    # the epoch constants would stop after a handful of batches; None runs
    # until the input is exhausted.
    estimator.train(
        input_fn=train_input_fn,
        steps=None,
    )

    print(estimator.evaluate(input_fn=test_input_fn, steps=None))


# Run the main
# Script entry point: let TensorFlow's app runner invoke main().
if __name__ == '__main__':
    tf.app.run(main=main)
    # Alternative manual steps, currently disabled:
    #   create_training_and_test_data('train1.csv')
    #   df = np.load('training_data.npy')