<a href="https://colab.research.google.com/github/stuyml/ML-Club-Projects/blob/master/Taxi_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# authenticate_user() signs the Colab session in; the application-default
# credentials fetched afterwards are attached to the GoogleAuth object so the
# resulting `drive` client can be used by the cells below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [0]:
# Enter the ID of the Google Drive folder containing train.csv and test.csv files.
# The ID of the folder is the long string of numbers and letters in the URL of the folder in Google Drive.

# Query for every non-trashed file whose parent is that folder.
folder_query = {'q': "'1bXgGh5zrfFishI5NUSM3Zfgrbb5A9Uzj' in parents and trashed=false"}
file_list = drive.ListFile(folder_query).GetList()

# Print title and ID for each entry; the IDs are needed to download the CSVs.
for entry in file_list:
  print('title: %s, id: %s' % (entry['title'], entry['id']))

title: Taxi Prediction.ipynb, id: 1nZ7shjZNEYe53kkkYs3QQnpUlzhQOZR4
title: TFRecord file creator.ipynb, id: 1B2imMvxMaB1n9IUWH1GOXG-hS1ER6js8
title: train.csv, id: 174cLWlxIEwb7VFqYfcVFuH1j-uaqLuPF
title: GCP-Coupons-Instructions.rtf, id: 1kDN_kC5-GdO3CmN28_DEwWPFDHS-UlJM
title: sample_submission.csv, id: 1k4RmIgAQO4XujlEp5nuZWlCNvCkrcK4p
title: test.csv, id: 1pdLyx3edlI7WCBPFqurIIvO5IKzuxhEf


In [0]:
# Enter the ID of train.csv and test.csv. The IDs are printed in the output of the cell above.
def _fetch_drive_file(file_id, local_name):
  """Download the Drive file `file_id` to `local_name` and return its PyDrive handle."""
  handle = drive.CreateFile({'id': file_id})
  handle.GetContentFile(local_name)
  return handle

train_downloaded = _fetch_drive_file('174cLWlxIEwb7VFqYfcVFuH1j-uaqLuPF', 'train.csv')
test_downloaded = _fetch_drive_file('1pdLyx3edlI7WCBPFqurIIvO5IKzuxhEf', 'test.csv')

In [0]:
# See the top rows of the train.csv file
!ls
!head train.csv
!head test.csv
#!wc -l train.csv


adc.json  sample_data  test.csv  train.csv
key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
2012-12-03 13:10:00.000000125,9,2012-12-

## Data transformations:

Read input

1. read the training file in chunks of 100,000 rows and write the TFRecords for each chunk into a separate file (roughly 28MB per file)
2. ignore number of passengers
3. From pickup time, extract features: (1) year, (2) day of the year, (3) day of the week, (4) minutes since the start of the day
4. Add the straight-line (Euclidean) distance between the pickup and dropoff coordinates, in degrees scaled by 1000



In [0]:
from dateutil import parser
import datetime
from datetime import date

# Parse a timestamp in the same format as the pickup_datetime column.
dt = parser.parse("2016-12-31 19:12:13 UTC")

# Calendar fields, ending with the day of the year.
date_parts = (dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.timetuple().tm_yday)
print(" ".join(str(part) for part in date_parts))

# Day of the week as returned by weekday().
print(dt.weekday())

# Subtracting two dates gives a timedelta; .days is the whole-day difference.
d0 = date(2008, 8, 18)
d1 = date(2008, 9, 26)
delta = d1 - d0
print(delta.days)

2016 12 31 19 12 366
5
39


## Writing into TFRecord file

In [0]:
import tensorflow as tf
import pandas as pd
from dateutil import parser
import datetime
from datetime import date
import math

def get_features(row):
  """Convert one train.csv row into tf.train.Feature values for a TFRecord.

  Args:
    row: mapping (for example a pandas Series from DataFrame.iterrows()) with
      the keys fare_amount, pickup_datetime, pickup_longitude,
      pickup_latitude, dropoff_longitude and dropoff_latitude.

  Returns:
    A (features, complete) tuple. `complete` is False when any numeric input
    is NaN, the pickup time is missing or empty, or pickup_longitude is
    exactly 0; `features` is then an empty dict and the row should be skipped.
    Otherwise `features` maps each feature name to a float tf.train.Feature.
  """
  def _is_missing(value):
    # pandas represents an empty CSV cell as a float NaN, even in text columns.
    return value is None or (isinstance(value, float) and math.isnan(value))

  def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

  fare_amount = row['fare_amount']
  pickup_datetime = row['pickup_datetime']
  pickup_longitude = row['pickup_longitude']
  pickup_latitude = row['pickup_latitude']
  dropoff_longitude = row['dropoff_longitude']
  dropoff_latitude = row['dropoff_latitude']

  numeric_inputs = (fare_amount, pickup_longitude, pickup_latitude,
                    dropoff_longitude, dropoff_latitude)
  # Bail out before parsing: parser.parse() raises on a missing timestamp, so
  # an incomplete row must never reach it.
  if (any(math.isnan(value) for value in numeric_inputs)
      or _is_missing(pickup_datetime)
      or not pickup_datetime
      or pickup_longitude == 0):
    return {}, False

  dt = parser.parse(pickup_datetime)
  year = dt.year
  day_of_year = dt.timetuple().tm_yday
  day_of_week = dt.weekday()
  minutes_since_midnight = (dt.hour * 60) + dt.minute
  # Euclidean distance between the two coordinate pairs, scaled by 1000.
  distance = 1000*((pickup_latitude-dropoff_latitude)**2 + (pickup_longitude-dropoff_longitude)**2)**.5

  features = {
      "pickup_longitude": _float_feature(pickup_longitude),
      "pickup_latitude": _float_feature(pickup_latitude),
      "fare": _float_feature(fare_amount),
      "distance": _float_feature(distance),
      "year": _float_feature(year),
      "day_of_year": _float_feature(day_of_year),
      "day_of_week": _float_feature(day_of_week),
      "minutes_since_midnight": _float_feature(minutes_since_midnight),
      "dropoff_longitude": _float_feature(dropoff_longitude),
      "dropoff_latitude": _float_feature(dropoff_latitude),
  }
  return features, True

chunk_id = 0

# Stream train.csv in 100,000-row chunks and write each chunk to its own
# TFRecord file. Files are numbered from 1 (training_examples_1.tfr, ...).
for chunk_id, df in enumerate(pd.read_csv('train.csv', chunksize=100000), start=1):
  tfr_file = "training_examples_{id}.tfr".format(id=chunk_id)
  print("generating TFRecord file " + tfr_file)

  # For int features:
  # 'name': tf.train.Feature(int64_list = tf.train.Int64List(value = ...))
  # For float features:
  # 'name': tf.train.Feature(float_list = tf.train.FloatList(value = ...))

  # The context manager closes the writer even if the cell is interrupted
  # part-way through a chunk, so the current file is not left open.
  with tf.python_io.TFRecordWriter(tfr_file) as writer:
    for index, row in df.iterrows():
      features_map, is_complete = get_features(row)
      # Rows flagged incomplete by get_features are dropped.
      if is_complete:
        example = tf.train.Example(features=tf.train.Features(feature=features_map))
        writer.write(example.SerializeToString())

generating TFRecord file training_examples_1.tfr
generating TFRecord file training_examples_2.tfr
generating TFRecord file training_examples_3.tfr
generating TFRecord file training_examples_4.tfr
generating TFRecord file training_examples_5.tfr
generating TFRecord file training_examples_6.tfr
generating TFRecord file training_examples_7.tfr
generating TFRecord file training_examples_8.tfr
generating TFRecord file training_examples_9.tfr
generating TFRecord file training_examples_10.tfr
generating TFRecord file training_examples_11.tfr
generating TFRecord file training_examples_12.tfr
generating TFRecord file training_examples_13.tfr
generating TFRecord file training_examples_14.tfr
generating TFRecord file training_examples_15.tfr
generating TFRecord file training_examples_16.tfr
generating TFRecord file training_examples_17.tfr
generating TFRecord file training_examples_18.tfr


KeyboardInterrupt: ignored

In [0]:
!ls -l

total 6045216
-rw-r--r-- 1 root root       2719 Feb 12 20:47 adc.json
drwxr-xr-x 1 root root       4096 Feb 11 17:40 sample_data
-rw-r--r-- 1 root root     983020 Feb 12 20:49 test.csv
-rw-r--r-- 1 root root 5697178298 Feb 12 20:49 train.csv
-rw-r--r-- 1 root root   27760602 Feb 12 21:00 training_examples_10.tfr
-rw-r--r-- 1 root root   27778997 Feb 12 21:00 training_examples_11.tfr
-rw-r--r-- 1 root root   27774186 Feb 12 21:01 training_examples_12.tfr
-rw-r--r-- 1 root root   27765413 Feb 12 21:02 training_examples_13.tfr
-rw-r--r-- 1 root root   27761734 Feb 12 21:03 training_examples_14.tfr
-rw-r--r-- 1 root root   27771356 Feb 12 21:04 training_examples_15.tfr
-rw-r--r-- 1 root root   27760885 Feb 12 21:05 training_examples_16.tfr
-rw-r--r-- 1 root root   27759470 Feb 12 21:06 training_examples_17.tfr
-rw-r--r-- 1 root root   20033536 Feb 12 21:07 training_examples_18.tfr
-rw-r--r-- 1 root root   27756074 Feb 12 20:51 training_examples_1.tfr
-rw-r--r-- 1 root root   27775318 Feb 1

## Input function for feeding training examples into the model

In [0]:
import tensorflow as tf

# Names of the numeric model inputs, matching the keys written to the TFRecords.
_NUMERIC_COLUMN_NAMES = (
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'distance',
    'year',
    'day_of_year',
    'day_of_week',
    'minutes_since_midnight',
)

# One scalar float32 column per input name.
feature_columns = [
    tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32)
    for column_name in _NUMERIC_COLUMN_NAMES
]

# The fare is parsed like a feature and split off as the label in input_fn.
label_column = tf.feature_column.numeric_column('fare', shape=(1,), dtype=tf.float32)

# Parsing spec covering the inputs plus the label.
features_spec = tf.feature_column.make_parse_example_spec(
    feature_columns + [label_column]
)

def input_fn(file_list, num_epochs):
  """Build the (features, labels) tensors an Estimator reads from TFRecord files.

  Args:
    file_list: file name(s) or pattern(s) of the TFRecord files to read.
    num_epochs: number of passes to make over the files.

  Returns:
    (features, labels): `features` is the dict of parsed tensors without the
    'fare' entry; `labels` is the 'fare' tensor. Batches hold 128 examples and
    are shuffled with a 10000-element buffer.
  """
  batched_dataset = tf.contrib.data.make_batched_features_dataset(
      file_pattern=file_list,
      batch_size=128,
      features=features_spec,
      num_epochs=num_epochs,
      shuffle=True,
      shuffle_buffer_size=10000
  )
  parsed_batch = batched_dataset.make_one_shot_iterator().get_next()
  print(parsed_batch)
  # Split the label out of the parsed dict; what remains are the model inputs.
  fare_labels = parsed_batch.pop('fare')
  return parsed_batch, fare_labels

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


## Training the model

In [0]:
#Run once only
!rm -r model_dir
!mkdir model_dir

rm: cannot remove 'model_dir': No such file or directory


In [0]:
# Directory handed to the estimator as its model_dir.
model_dir="model_dir"

# DNN regressor over the numeric feature columns, with three hidden layers of
# 32, 16 and 8 units.
regressor = tf.estimator.DNNRegressor(
    hidden_units=[32,16,8],
    feature_columns=feature_columns,
    model_dir=model_dir
)

def get_eval_metrics(labels, predictions):
  """Return the extra evaluation metrics to attach to the estimator.

  Args:
    labels: label tensor.
    predictions: dict of estimator outputs; the 'predictions' entry is used.

  Returns:
    Dict with one entry, 'rmse', holding the root-mean-squared-error metric
    between `labels` and predictions['predictions'].
  """
  rmse_metric = tf.metrics.root_mean_squared_error(
      labels=labels,
      predictions=predictions['predictions']
  )
  return {'rmse': rmse_metric}

# Attach the RMSE metric so it is reported on every evaluation.
regressor = tf.contrib.estimator.add_metrics(regressor, get_eval_metrics)

# The TFRecord writer numbers its files from 1, so training_examples_0.tfr is
# never created; start at 1 instead of listing a file that does not exist.
# Files 1-9 are used for training and 10-14 for evaluation.
training_files = [ "training_examples_%d.tfr" % i for i in range(1, 10) ]
testing_files = [ "training_examples_%d.tfr" % i for i in range(10, 15) ]

# One pass over the training files per train_and_evaluate call.
train_spec = tf.estimator.TrainSpec(
    input_fn=lambda: input_fn(training_files, 1),
    max_steps=10000000
)

# steps=None evaluates until the evaluation input is exhausted (one pass).
eval_spec = tf.estimator.EvalSpec(
    input_fn=lambda: input_fn(testing_files, 1),
    steps=None,
    start_delay_secs=10,
    throttle_secs=0
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe323534250>, '_model_dir': 'model_dir', '_protocol': None, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_device_fn': None, '_experimental_distribute': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_evaluation_master': '', '_eval_distribute': None, '_train_distribute': None, '_master': ''}

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/te

In [0]:
# Repeatedly train and evaluate. The input functions are built with
# num_epochs=1, so each call reads the files once.
# NOTE(review): the loop has no exit condition; it runs until the kernel is
# interrupted.
while True:
  tf.estimator.train_and_evaluate(regressor, train_spec, eval_spec)

INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.data.experimental.make_batched_features_dataset(...)`.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.
{'fare': <tf.Tensor 'IteratorGetNext:5' shape=(?, 1) dtype=float32>, 'distance': <tf.Tensor 'IteratorGetNext:2' shape=(?, 1) dtype=float32>, 'day_of_year': <tf.Tensor 'IteratorGetNext:1' shape=(?, 1) dtype=float32>, 'day_of_week': <tf.Tensor '

In [0]:
import itertools
import math

testdf = pd.read_csv('test.csv')

# Inputs the regressor was built with; the keys must match feature_columns.
_MODEL_INPUT_KEYS = (
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'distance',
    'year',
    'day_of_year',
    'day_of_week',
    'minutes_since_midnight',
)


def _derive_model_inputs(df):
  """Compute the nine numeric model inputs for every row of a raw taxi frame.

  Vectorised equivalent of the per-row arithmetic in get_features, without the
  fare label (which the test rows do not need).

  Args:
    df: DataFrame with pickup_datetime, pickup_longitude, pickup_latitude,
      dropoff_longitude and dropoff_latitude columns.

  Returns:
    A float32 DataFrame with one column per entry of _MODEL_INPUT_KEYS.
  """
  pickup_time = pd.to_datetime(df['pickup_datetime'])
  lat_delta = df['pickup_latitude'] - df['dropoff_latitude']
  lon_delta = df['pickup_longitude'] - df['dropoff_longitude']
  derived = pd.DataFrame({
      'pickup_longitude': df['pickup_longitude'],
      'pickup_latitude': df['pickup_latitude'],
      'dropoff_longitude': df['dropoff_longitude'],
      'dropoff_latitude': df['dropoff_latitude'],
      # Euclidean distance between the coordinate pairs, scaled by 1000.
      'distance': 1000 * (lat_delta ** 2 + lon_delta ** 2) ** .5,
      'year': pickup_time.dt.year,
      'day_of_year': pickup_time.dt.dayofyear,
      # dayofweek counts Monday as 0, the same convention as weekday().
      'day_of_week': pickup_time.dt.dayofweek,
      'minutes_since_midnight': pickup_time.dt.hour * 60 + pickup_time.dt.minute,
  }, columns=list(_MODEL_INPUT_KEYS))
  return derived.astype('float32')


def eval_input_fn1():
  """Return all columns of testdf as a tf.data.Dataset delivered in one batch.

  Not used by the prediction code below.
  """
  eval_dataset1 = tf.data.Dataset.from_tensor_slices(dict(testdf))
  eval_dataset1 = eval_dataset1.batch(len(testdf))
  return eval_dataset1


def test_input_fn(df):
  """Build the regressor's input tensors from a raw test DataFrame.

  Args:
    df: DataFrame with the pickup/dropoff coordinate columns and
      pickup_datetime.

  Returns:
    Dict mapping each name in _MODEL_INPUT_KEYS to a float32 constant tensor
    of shape [len(df), 1].
  """
  inputs = _derive_model_inputs(df)
  row_count = len(inputs)
  return {
      key: tf.constant(inputs[key].values, shape=[row_count, 1], dtype=tf.float32)
      for key in _MODEL_INPUT_KEYS
  }


def test_fn_predictions():
  """Input function for regressor.predict: the model inputs for testdf."""
  return test_input_fn(testdf)


predictions_generator = regressor.predict(
    input_fn=test_fn_predictions,
    yield_single_examples=True)

# Take exactly one prediction per test row instead of a hard-coded count.
per_row_outputs = itertools.islice(predictions_generator, len(testdf))
predictions = [y['predictions'][0] for y in per_row_outputs]

print("%d predictions:" % len(predictions))
print(predictions[:10])

# The model is trained on the raw fare label, so the predictions are used as
# they are; no inverse transform is applied.
# NOTE(review): assumes test.csv carries the same `key` column as train.csv and
# that the submission header is key,fare_amount; confirm against
# sample_submission.csv.
submission = [["key", "fare_amount"]]
for row_key, predicted_fare in zip(testdf['key'], predictions):
  submission.append([row_key, predicted_fare])
print(submission[:5])

In [0]:
# List every variable stored in the trained model together with its value.
variable_names = regressor.get_variable_names()
print("variable names: ")
for var_name in variable_names:
  values = regressor.get_variable_value(var_name)
  print("%s %s" % (var_name, values))

In [0]:
# NOTE(review): the variable name passed here is an empty string, which looks
# like an unfinished placeholder; substitute one of the names printed by
# get_variable_names() in the previous cell.
values = regressor.get_variable_value("")


# Show tensorboard analytics on model training

In [0]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip ngrok-stable-linux-amd64.zip

In [0]:
# kill currently running tensorboard instance, if any
#!ps wx
!kill -9 1383

In [0]:
# Point TensorBoard at the directory the estimator writes to (model_dir above).
LOG_DIR = './model_dir'
# Launch TensorBoard in the background on port 6006.
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

In [0]:

get_ipython().system_raw('./ngrok http 6006 &')

In [0]:
! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"

In [0]:
#!ls -l model_dir.tar
#!gzip model_dir.tar
!ls -l model_dir.tar.gz


In [0]:
def eval_input_fn1():
  """Wrap every column of features_df1 in a tf.data.Dataset delivered as one batch.

  NOTE(review): features_df1 is not defined in the visible part of this
  notebook, and this definition replaces the earlier eval_input_fn1; confirm
  both before running.
  """
  column_tensors = dict(features_df1)
  single_batch = len(features_df1)
  return tf.data.Dataset.from_tensor_slices(column_tensors).batch(single_batch)

In [0]:
import tensorflow as tf
import pandas as pd
from dateutil import parser
import datetime
from datetime import date
import math

def get_test_features(row):
  """Convert one test.csv row into tf.train.Feature values for a TFRecord.

  Computes the same inputs as get_features, without the fare label and without
  any completeness check.

  Args:
    row: mapping (for example a pandas Series from DataFrame.iterrows()) with
      the keys pickup_datetime, pickup_longitude, pickup_latitude,
      dropoff_longitude and dropoff_latitude.

  Returns:
    Dict mapping each feature name to a float tf.train.Feature.
  """
  def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

  pickup_datetime = row['pickup_datetime']
  pickup_longitude = row['pickup_longitude']
  pickup_latitude = row['pickup_latitude']
  dropoff_longitude = row['dropoff_longitude']
  dropoff_latitude = row['dropoff_latitude']

  dt = parser.parse(pickup_datetime)
  year = dt.year
  day_of_year = dt.timetuple().tm_yday
  day_of_week = dt.weekday()
  minutes_since_midnight = (dt.hour * 60) + dt.minute
  # Euclidean distance between the two coordinate pairs, scaled by 1000.
  distance = 1000*((pickup_latitude-dropoff_latitude)**2 + (pickup_longitude-dropoff_longitude)**2)**.5

  features = {
      "pickup_longitude": _float_feature(pickup_longitude),
      "pickup_latitude": _float_feature(pickup_latitude),
      "distance": _float_feature(distance),
      "year": _float_feature(year),
      "day_of_year": _float_feature(day_of_year),
      "day_of_week": _float_feature(day_of_week),
      "minutes_since_midnight": _float_feature(minutes_since_midnight),
      "dropoff_longitude": _float_feature(dropoff_longitude),
      "dropoff_latitude": _float_feature(dropoff_latitude),
  }
  return features

reader = pd.read_csv('test.csv')
# Show only the first rows rather than dumping the whole frame.
print(reader.head())

tfr_file = "test_examples.tfr"
print("generating TFRecord file " + tfr_file)

# For int features:
# 'name': tf.train.Feature(int64_list = tf.train.Int64List(value = ...))
# For float features:
# 'name': tf.train.Feature(float_list = tf.train.FloatList(value = ...))

# The context manager closes the writer even if a row fails to convert, so the
# output file is not left open.
with tf.python_io.TFRecordWriter(tfr_file) as test_writer:
  for index, row in reader.iterrows():
    feature = get_test_features(row)
    example = tf.train.Example(
        features=tf.train.Features(feature=feature))
    test_writer.write(example.SerializeToString())

In [0]:
!ls