Reference link: https://medium.com/@keshan/estimators-an-easy-way-to-work-with-tensorflow-fa0a0381906f

In [3]:
import pandas as pd
from sodapy import Socrata 

In [4]:
# Create a Socrata client for the "data.cityofnewyork.us" domain.
# NOTE(review): the second argument is passed as None — presumably the app
# token, i.e. unauthenticated access; confirm against the sodapy docs.
client = Socrata("data.cityofnewyork.us", None)
# Request at most 10000 records from the dataset with id "pqfs-mqru".
results = client.get("pqfs-mqru", limit=10000)
# Turn the returned records into a DataFrame used by the rest of the notebook.
results_df = pd.DataFrame.from_records(results)



In [5]:
# Print the fetched DataFrame to inspect its columns and values.
print(results_df)

        dropoff_latitude    dropoff_longitude extra fare_amount  \
0     40.698043823242188  -73.924278259277344   0.5           8   
1     40.761379241943359  -73.923919677734375   0.5        15.5   
2     40.646072387695313  -74.013160705566406   0.5        16.5   
3     40.689033508300781  -74.000648498535156   0.5        13.5   
4     40.663013458251953  -73.940719604492188   0.5          12   
5     40.742111206054688  -73.867744445800781   0.5           7   
6     40.745689392089844  -73.886192321777344   0.5           5   
7     40.794120788574219  -73.949150085449219   0.5           7   
8     40.679725646972656  -73.971572875976562   0.5          12   
9     40.739658355712891  -73.917549133300781   0.5           9   
10    40.763126373291016  -73.921028137207031   0.5           6   
11    40.718177795410156  -73.962753295898438   0.5         3.5   
12    40.842765808105469  -73.924903869628906   0.5        14.5   
13    40.775833129882812   -73.90240478515625   0.5           

In [6]:
import numpy as np

# Columns the model works with; the final entry is the regression target.
CSV_COLUMNS = [
    'dropoff_latitude',
    'dropoff_longitude',
    'pickup_latitude',
    'pickup_longitude',
    'passenger_count',
    'fare_amount',
]
LABEL = CSV_COLUMNS[-1]
FEATURES = CSV_COLUMNS[:-1]

# Draw one uniform random number per row with a fixed seed so the split is
# reproducible: rows drawing below 0.8 (roughly 80%) go to training and the
# remaining rows (roughly 20%) go to validation.
np.random.seed(1)
msk = np.random.rand(len(results_df)) < 0.8

df_valid = results_df[~msk]
df_train = results_df[msk]

Next, we need input functions that read the pandas DataFrame and feed it to the Estimator.

In [7]:
def make_input_fn(df, num_epochs):
  """Build a shuffled, batched input function from a pandas DataFrame.

  Args:
    df: DataFrame containing every column named in FEATURES plus the LABEL
      column; all of them are cast to float before being handed over.
    num_epochs: Number of passes over `df` the input function will produce.

  Returns:
    The input function created by `tf.estimator.inputs.pandas_input_fn`,
    yielding batches of 128 rows in shuffled order.
  """
  features = df[FEATURES].astype(float)
  labels = df[LABEL].astype(float)
  return tf.estimator.inputs.pandas_input_fn(
    x=features,
    y=labels,
    batch_size=128,
    num_epochs=num_epochs,
    shuffle=True,
    queue_capacity=1000,
    num_threads=1,
  )

In [8]:
def make_prediction_input_fn(df):
  """Build a label-free input function for running predictions on `df`.

  Args:
    df: DataFrame containing every column named in FEATURES; the columns are
      cast to float before being handed over.

  Returns:
    The input function created by `tf.estimator.inputs.pandas_input_fn`,
    yielding a single pass over `df` in batches of 128 rows, without labels.
  """
  return tf.estimator.inputs.pandas_input_fn(
    x = df[FEATURES].astype(float),
    y = None,
    batch_size = 128,
    num_epochs = 1,
    # Do not shuffle at prediction time: predictions are emitted in the order
    # the rows are read, so shuffling would make it impossible to match each
    # prediction back to its row in `df`.
    shuffle = False,
    queue_capacity = 1000,
    num_threads = 1
  )

In [9]:
def make_feature_cols():
  """Return one numeric feature column per name listed in FEATURES."""
  return [tf.feature_column.numeric_column(name) for name in FEATURES]

Let’s use a very simple DNNRegressor, a pre-made Estimator, together with the input functions and feature columns we created above.

In [10]:
import shutil
import tensorflow as tf

# Show INFO-level TensorFlow log messages while training.
tf.logging.set_verbosity(tf.logging.INFO)

# Directory where the Estimator writes its checkpoints.
OUTDIR = 'taxi_trained'

# Delete any previous run's output so training starts from scratch;
# a missing directory is not treated as an error.
shutil.rmtree(OUTDIR, ignore_errors=True)

# Pre-made DNN regressor with three hidden layers of 32, 8 and 2 units.
model = tf.estimator.DNNRegressor(
    feature_columns=make_feature_cols(),
    hidden_units=[32, 8, 2],
    model_dir=OUTDIR,
)

# Train on the training split for 100 passes over the data.
train_input_fn = make_input_fn(df_train, num_epochs=100)
model.train(input_fn=train_input_fn)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'taxi_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11dc1cc88>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into taxi_trained/model.ckpt.
INFO:tensorflow:los

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x10c195fd0>

In [11]:
def print_rmse(model, name, df):
  """Evaluate `model` on `df` and print the resulting RMSE.

  Args:
    model: Estimator exposing `evaluate(input_fn=...)` whose returned metrics
      include an 'average_loss' entry.
    name: Label for the dataset, used only in the printed message.
    df: DataFrame to evaluate on; it is read once (a single epoch).
  """
  eval_metrics = model.evaluate(input_fn=make_input_fn(df, 1))
  # The RMSE is reported as the square root of the 'average_loss' metric.
  rmse = np.sqrt(eval_metrics['average_loss'])
  print('RMSE on {} dataset = {}'.format(name, rmse))

print_rmse(model, 'validation', df_valid)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-01-02:06:53
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from taxi_trained/model.ckpt-6283
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-01-02:06:54
INFO:tensorflow:Saving dict for global step 6283: average_loss = 84.86123, global_step = 6283, loss = 10390.196
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 6283: taxi_trained/model.ckpt-6283


RMSE on validation dataset = 9.212015151977539
