In [8]:
import sys
sys.path.insert(0, '/Users/raghr010/anaconda/envs/tensorflow/lib/python2.7/site-packages/')

import tensorflow as tf
import numpy as np

import pandas as pd
import datetime
from geopy.distance import vincenty


MIN_LAT=40.7
MAX_LAT=40.81
MIN_LONG=-74
MAX_LONG=-73.75
LAT_BUCKETS=40
LONG_BUCKETS=40
BATCH_SIZE = 2000
TRAIN_EPOCHS = 400

TEST_EPOCHS = 1
TEST_EXAMPLE_SIZE=20

columns = {'vendor_id' : 0,
        'passenger_count' :1,
        'pickup_longitude' : 2,
        'pickup_latitude' : 3,
        'dropoff_longitude' : 4,
        'dropoff_latitude' : 5,
        'store_and_fwd_flag' : 6,
        'trip_duration' : 7,
        'month' : 8,
        'date' : 9,
        'hour' : 10,
        'weekday' : 11,
        'week_of_the_year' : 12,
          'distance' : 13}

def normalize_column(col):
    return (col - np.mean(col)) / np.std(col)

float_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

# Real valued columns
#passenger_count    = tf.feature_column.indicator_column(normalize_column(tf.feature_column.numeric_column("passenger_count", dtype=tf.int32)))

# Create 20 bins for latitude, logitude and create 2 embedding column
pickup_latitude = tf.feature_column.numeric_column("pickup_latitude", dtype=tf.float64)
pickup_latitude_feature = tf.feature_column.bucketized_column(
    source_column=pickup_latitude,boundaries = list(np.arange( MIN_LAT, MAX_LAT, (MAX_LAT-MIN_LAT)/LAT_BUCKETS)))

pickup_longitude = tf.feature_column.numeric_column("pickup_longitude", dtype=tf.float64 )
pickup_longitude_feature = tf.feature_column.bucketized_column(
    source_column=pickup_longitude,boundaries = list(np.arange(MIN_LONG, MAX_LONG, (MAX_LONG-MIN_LONG)/LAT_BUCKETS)))


dropoff_latitude = tf.feature_column.numeric_column("dropoff_latitude", dtype=tf.float64)
dropoff_latitude_feature = tf.feature_column.bucketized_column(
    source_column=dropoff_latitude,boundaries = list(np.arange( MIN_LAT, MAX_LAT, (MAX_LAT-MIN_LAT)/LAT_BUCKETS)))

dropoff_longitude = tf.feature_column.numeric_column("dropoff_longitude", dtype=tf.float64)
dropoff_longitude_feature = tf.feature_column.bucketized_column(
    source_column=dropoff_longitude,boundaries = list(np.arange(MIN_LONG, MAX_LONG, (MAX_LONG-MIN_LONG)/LAT_BUCKETS)))


distance = tf.feature_column.numeric_column("distance", dtype=tf.float64)

pickup_lat_x_long = tf.feature_column.embedding_column(
    tf.feature_column.crossed_column(
        keys=[pickup_latitude_feature, pickup_longitude_feature],
        hash_bucket_size=400
    ),
    dimension=20
)


# Direct columns from file
vendor = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('vendor_id', [1,2], dtype=tf.int32))

store_and_fwd_flag =tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('store_and_fwd_flag', [1,0], dtype=tf.int32))

# Date columns
month = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('month', range(1,12), dtype=tf.int32))
date = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('date', range(1,31), dtype=tf.int32))
hour = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('hour', range(0,23), dtype=tf.int32))


weekday = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('weekday', range(0,6), dtype=tf.int32))
week_of_the_year = tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list('week_of_the_year', range(1,52), dtype=tf.int32))

# month_date_hour = tf.feature_column.indicator_column(
#     tf.feature_column.crossed_column(
#         keys=[month, date, hour],
#         hash_bucket_size=8928
#     )
# )
#
# weekday_hour = tf.feature_column.indicator_column(
#     tf.feature_column.crossed_column(
#         keys=[weekday, hour],
#         hash_bucket_size=364
#     )
# )


feature_columns = { pickup_lat_x_long,
                   vendor, weekday, hour, week_of_the_year, week_of_the_year, distance}

print feature_columns

set([_IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='vendor_id', vocabulary_list=(1, 2), dtype=tf.int32, default_value=-1)), _EmbeddingColumn(categorical_column=_CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='pickup_latitude', shape=(1,), default_value=None, dtype=tf.float64, normalizer_fn=None), boundaries=(40.700000000000003, 40.702750000000002, 40.705500000000001, 40.70825, 40.710999999999999, 40.713749999999997, 40.716499999999996, 40.719249999999995, 40.721999999999994, 40.724749999999993, 40.727499999999992, 40.730249999999991, 40.73299999999999, 40.735749999999989, 40.738499999999988, 40.741249999999987, 40.743999999999986, 40.746749999999984, 40.749499999999983, 40.752249999999982, 40.754999999999981, 40.75774999999998, 40.760499999999979, 40.763249999999978, 40.765999999999977, 40.768749999999976, 40.771499999999975, 40.774249999999974, 40.776999999999973, 40.779749999999972, 40.78249999999997, 40.785249999999969, 40.787999999

In [9]:
def get_data_frame(file_name, predict=False):
    df = pd.read_csv(file_name, header=0)

    def pickup_weekday(row):
        return datetime.date(int(row['year']), int(row['month']), int(row['date'])).weekday()

    def week_of_year(row):
        return datetime.date(int(row['year']), int(row['month']), int(row['date'])).isocalendar()[1]

    def store_and_forward(row):
        return 1 if row['store_and_fwd_flag'] == 'Y' else 0
    
    def distance(row):
        a = (row['pickup_latitude'], row['pickup_longitude'])
        b = (row['dropoff_latitude'], row['dropoff_longitude'])
        return (vincenty(a, b).miles)

    df['month'] = df['pickup_datetime'].str.split('-').str[1].astype(np.int32)
    df['year'] = df['pickup_datetime'].str.split('-').str[0].astype(np.int32)
    df['date'] = df['pickup_datetime'].str.split('-').str[2].str.split(' ').str[0].astype(np.int32).astype(np.int32)
    df['hour'] = df['pickup_datetime'].str.split(' ').str[1].str.split(':').str[0].astype(np.int32)
    df['store_and_fwd_flag'] = df.apply(store_and_forward, axis=1).astype(np.int32)
    df['weekday'] = df.apply(pickup_weekday, axis=1).astype(np.int32)

    df['week_of_the_year'] = df.apply(week_of_year, axis=1).astype(np.int32)
    df['distance'] = normalize_column(df.apply(distance, axis=1).astype(np.float64))
    
    del df['pickup_datetime']
    if not predict:
        del df['dropoff_datetime']

    del df['year']

    del df['id']

    return df


In [10]:
def create_training_and_test_data(file_name):

    df = get_data_frame(file_name)

    np.random.shuffle(df.values)

    df_train = df[: -1 * TEST_EXAMPLE_SIZE]

    df_test = df[-1 * TEST_EXAMPLE_SIZE:]

    df_train.to_pickle('training_data.npy')
    df_test.to_pickle('testing_data.npy')

In [15]:
def create_prediction_data(file_name):
    df = get_data_frame(file_name, predict=True)
    df['trip_duration'] = np.ones(len(df))
    print 'DF', len(df.values)
    df.to_pickle('predict_data.npy')


def normalize_column(col):
    return (col - np.mean(col)) / np.std(col)


def make_input_fn(test=False, predict=False):

    if predict:
        df = pd.read_pickle('predict_data.npy')
    else:
        df = pd.read_pickle('training_data.npy') if not test else pd.read_pickle('testing_data.npy')

    EPOCHS = TRAIN_EPOCHS if not test else TEST_EPOCHS
    if predict:
        EPOCHS = TEST_EPOCHS

    df = df.values

    x = {}
    for column in columns:
        if column not in float_columns:
            x[column] = df[:, columns[column]].astype(np.int32)
        else:
            x[column] = df[:, columns[column]].astype(np.float64)

    if not predict:
        input_fn = tf.contrib.learn.io.numpy_input_fn(

            x=x,
            y=normalize_column(df[:, columns['trip_duration']]),
            batch_size=BATCH_SIZE,
            num_epochs=EPOCHS,
        )
    else:
        print x
        input_fn = tf.contrib.learn.io.numpy_input_fn(

            x=x,
            batch_size=len(x),
            num_epochs=1,
        )

    return input_fn


In [None]:
estimator = tf.contrib.learn.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[512, 128, 16],
    optimizer=tf.train.ProximalAdagradOptimizer(
      learning_rate=0.00001,
      l1_regularization_strength=0.001
    ),
model_dir="./models/")


#create_training_and_test_data('train.csv')
#create_prediction_data('test.csv')

estimator.fit(input_fn=make_input_fn())

print estimator.evaluate(input_fn=make_input_fn(test=True))



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x13bad1c50>, '_model_dir': './models/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_evaluation_master': '', '_master': ''}
Object was never used (type <class 'tensorflow.python.framework.ops.Tensor'>):
<tf.Tensor 'report_uninitialized_variables_1/boolean_mask/Gather:0' shape=(?,) dtype=string>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses t

INFO:tensorflow:global_step/sec: 17.3102
INFO:tensorflow:loss = 0.557786, step = 501 (5.777 sec)
INFO:tensorflow:global_step/sec: 17.9965
INFO:tensorflow:loss = 0.163238, step = 601 (5.557 sec)
INFO:tensorflow:global_step/sec: 17.8731
INFO:tensorflow:loss = 0.822544, step = 701 (5.594 sec)
INFO:tensorflow:global_step/sec: 17.9479
INFO:tensorflow:loss = 0.430515, step = 801 (5.572 sec)
INFO:tensorflow:global_step/sec: 17.2528
INFO:tensorflow:loss = 0.295864, step = 901 (5.796 sec)
INFO:tensorflow:global_step/sec: 17.9678
INFO:tensorflow:loss = 0.560151, step = 1001 (5.565 sec)
INFO:tensorflow:global_step/sec: 18.0048
INFO:tensorflow:loss = 0.423784, step = 1101 (5.554 sec)
INFO:tensorflow:global_step/sec: 18.0031
INFO:tensorflow:loss = 0.294461, step = 1201 (5.555 sec)
INFO:tensorflow:global_step/sec: 17.2876
INFO:tensorflow:loss = 0.1638, step = 1301 (5.785 sec)
INFO:tensorflow:global_step/sec: 17.7636
INFO:tensorflow:loss = 0.553875, step = 1401 (5.629 sec)
INFO:tensorflow:global_step

INFO:tensorflow:global_step/sec: 17.1178
INFO:tensorflow:loss = 0.0196288, step = 8901 (5.842 sec)
INFO:tensorflow:global_step/sec: 16.8815
INFO:tensorflow:loss = 68.7931, step = 9001 (5.923 sec)
INFO:tensorflow:global_step/sec: 16.8934
INFO:tensorflow:loss = 0.273666, step = 9101 (5.919 sec)
INFO:tensorflow:global_step/sec: 17.1132
INFO:tensorflow:loss = 0.30336, step = 9201 (5.844 sec)
INFO:tensorflow:global_step/sec: 16.6241
INFO:tensorflow:loss = 0.411016, step = 9301 (6.015 sec)
INFO:tensorflow:global_step/sec: 16.7443
INFO:tensorflow:loss = 0.151659, step = 9401 (5.972 sec)
INFO:tensorflow:global_step/sec: 16.8046
INFO:tensorflow:loss = 0.280492, step = 9501 (5.951 sec)
INFO:tensorflow:global_step/sec: 16.8274
INFO:tensorflow:loss = 0.136279, step = 9601 (5.943 sec)
INFO:tensorflow:global_step/sec: 16.9322
INFO:tensorflow:loss = 0.666296, step = 9701 (5.906 sec)
INFO:tensorflow:global_step/sec: 16.7218
INFO:tensorflow:loss = 0.283519, step = 9801 (5.980 sec)
INFO:tensorflow:globa

INFO:tensorflow:loss = 0.149862, step = 17101 (5.588 sec)
INFO:tensorflow:global_step/sec: 17.9683
INFO:tensorflow:loss = 0.645971, step = 17201 (5.565 sec)
INFO:tensorflow:global_step/sec: 18.0347
INFO:tensorflow:loss = 0.685061, step = 17301 (5.545 sec)
INFO:tensorflow:global_step/sec: 17.9963
INFO:tensorflow:loss = 0.540562, step = 17401 (5.556 sec)
INFO:tensorflow:global_step/sec: 17.8628
INFO:tensorflow:loss = 0.281857, step = 17501 (5.598 sec)
INFO:tensorflow:global_step/sec: 18.0481
INFO:tensorflow:loss = 0.282758, step = 17601 (5.541 sec)
INFO:tensorflow:global_step/sec: 17.7814
INFO:tensorflow:loss = 0.4164, step = 17701 (5.624 sec)
INFO:tensorflow:global_step/sec: 17.8211
INFO:tensorflow:loss = 0.0169011, step = 17801 (5.613 sec)
INFO:tensorflow:global_step/sec: 17.9229
INFO:tensorflow:loss = 0.148997, step = 17901 (5.579 sec)
INFO:tensorflow:global_step/sec: 17.8122
INFO:tensorflow:loss = 0.800801, step = 18001 (5.614 sec)
INFO:tensorflow:global_step/sec: 17.5277
INFO:tensor

In [24]:
ids = []
with open('test.csv') as fh:
    index = 0
    for line in fh:
        if index == 0:
            index += 1
            continue
        ids.append(line.split(',')[0])


print len(ids)


with open('submission.csv', 'w') as fh:
    fh.write('id,trip_duration\n')
    index = 0
    for prediction in  estimator.predict_scores(input_fn=make_input_fn(predict=True)):
        fh.write(ids[index] + ',' + str(prediction) + '\n')
        index += 1


625134
{'distance': array([1, 1, 1, ..., 1, 1, 1], dtype=int32), 'week_of_the_year': array([0, 0, 0, ..., 1, 3, 0], dtype=int32), 'hour': array([3, 3, 3, ..., 4, 4, 4], dtype=int32), 'trip_duration': array([6, 6, 6, ..., 1, 1, 1], dtype=int32), 'vendor_id': array([1, 1, 1, ..., 1, 1, 2], dtype=int32), 'pickup_longitude': array([-73.98812866, -73.96420288, -73.99743652, ..., -73.97226715,
       -73.97650146, -73.98184967]), 'month': array([30, 30, 30, ...,  1,  1,  1], dtype=int32), 'dropoff_longitude': array([-73.99017334, -73.95980835, -73.98616028, ..., -73.87660217,
       -73.85426331, -73.96932983]), 'passenger_count': array([1, 1, 1, ..., 2, 1, 2], dtype=int32), 'weekday': array([26, 26, 26, ..., 53, 53, 53], dtype=int32), 'date': array([23, 23, 23, ...,  0,  0,  0], dtype=int32), 'store_and_fwd_flag': array([0, 0, 0, ..., 0, 0, 0], dtype=int32), 'pickup_latitude': array([ 40.73202896,  40.67999268,  40.73758316, ...,  40.75986481,
        40.73356247,  40.7168808 ]), 'dropoff_l

NotFittedError: Couldn't find trained model at ./models/.