In [15]:
"""A dataset loader for imports85.data."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import numpy as np
import tensorflow as tf

# Number of steps passed to the estimator's train() call.
STEPS = 1000
# Divisor applied to the price labels; multiplied back in when reporting.
PRICE_NORM_FACTOR = 1000
# Fraction of the examples assigned to the training split.
train_fraction = 0.7
# Seed for the pandas-based train/test sampling.
seed = 100
# Name of the label column.
y_name = "price"

try:
  import pandas as pd  # pylint: disable=g-import-not-at-top
except ImportError:
  # Only the DataFrame-based cells need pandas, so keep going without it, but
  # say so: otherwise the first use of `pd` fails with an unexplained NameError.
  import warnings  # pylint: disable=g-import-not-at-top
  warnings.warn(
      "pandas could not be imported; cells that use `pd` will fail.",
      RuntimeWarning)


# Location of the raw imports-85 data file.
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

# (column name, default value) pairs, listed in the order the columns appear
# in the file. Order is important for the csv-readers, so everything built
# from this table is an OrderedDict.
_COLUMN_DEFAULTS = (
    ("symboling", 0),
    ("normalized-losses", 0.0),
    ("make", ""),
    ("fuel-type", ""),
    ("aspiration", ""),
    ("num-of-doors", ""),
    ("body-style", ""),
    ("drive-wheels", ""),
    ("engine-location", ""),
    ("wheel-base", 0.0),
    ("length", 0.0),
    ("width", 0.0),
    ("height", 0.0),
    ("curb-weight", 0.0),
    ("engine-type", ""),
    ("num-of-cylinders", ""),
    ("engine-size", 0.0),
    ("fuel-system", ""),
    ("bore", 0.0),
    ("stroke", 0.0),
    ("compression-ratio", 0.0),
    ("horsepower", 0.0),
    ("peak-rpm", 0.0),
    ("city-mpg", 0.0),
    ("highway-mpg", 0.0),
    ("price", 0.0),
)  # pyformat: disable

# Column name -> single-element list holding that column's default value.
defaults = collections.OrderedDict(
    (column, [default]) for column, default in _COLUMN_DEFAULTS)

# Column name -> Python type of that column's default value.
types = collections.OrderedDict(
    (column, type(default[0])) for column, default in defaults.items())

In [5]:
# Fetch the data file, naming it after the last component of the URL;
# `path` is whatever get_file returns for it.
data_filename = URL.rsplit("/", 1)[-1]
path = tf.contrib.keras.utils.get_file(fname=data_filename, origin=URL)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data

In [6]:
# Load it into a pandas dataframe. "?" entries are read as missing values.
# The column names are passed as a list: a dict keys view is a set-like
# object, and some pandas releases reject it as an unordered `names` argument.
df = pd.read_csv(path, names=list(types.keys()), dtype=types, na_values="?")

In [16]:
# Delete rows with unknowns
data = df.dropna()

# Seed NumPy's global RNG with the same seed that drives the split below, so
# the cell is reproducible. (Seeding with None re-seeds unpredictably.)
np.random.seed(seed)

# Split the data into train/test subsets: `sample` draws a random
# `train_fraction` of the rows, and the remaining rows form the test set.
x_train = data.sample(frac=train_fraction, random_state=seed)
x_test = data.drop(x_train.index)

# Extract the label from the features dataframe.
y_train = x_train.pop(y_name)
y_test = x_test.pop(y_name)


In [21]:
def has_no_question_marks(line):
    """Returns a boolean tensor that is True when `line` contains no "?"."""
    # Add a leading axis to the line and split it on the empty delimiter to
    # get its individual characters.
    characters = tf.string_split(line[tf.newaxis], "").values
    # True if at least one of the characters is a question mark.
    contains_question = tf.reduce_any(tf.equal(characters, "?"))
    return tf.logical_not(contains_question)

In [27]:
# Define how the lines of the file should be parsed
def decode_line(line):
    """Convert a csv line into a (features_dict, label) pair."""
    # Parse the line into one item per column; the per-column defaults in
    # `defaults` also determine the type of each item.
    record_defaults = list(defaults.values())
    columns = tf.decode_csv(line, record_defaults)

    # Pair each column name with its parsed item.
    features_dict = {name: item for name, item in zip(defaults.keys(), columns)}

    # The label column is returned separately, not as a feature.
    label = features_dict.pop(y_name)
    return features_dict, label

In [28]:
# Read the downloaded file line by line, keeping only the lines that contain
# no "?" entries.
lines_dataset = tf.contrib.data.TextLineDataset(path)
base_dataset = lines_dataset.filter(has_no_question_marks)

In [30]:
def in_training_set(line):
    """Returns a boolean tensor, True if the line is in the training set.

    The split is made by hashing the line rather than by drawing random
    numbers: a random split would not give the same split in both sessions if
    you stop and restart training later, and would not work with a dataset
    that's too big to `.cache()` as is done here.
    """
    num_buckets = 1000000
    # Bucket ids below this cutoff belong to the training set.
    training_cutoff = int(train_fraction * num_buckets)
    # The hash bucket id acts as a random number that is deterministic per
    # example.
    bucket_id = tf.string_to_hash_bucket_fast(line, num_buckets)
    return bucket_id < training_cutoff

def in_test_set(line):
    """Returns a boolean tensor, True if the line is in the test set."""
    # Items not in the training set are in the test set.
    # The negation has to be a tensor op because Python's `not` only works on
    # Python booleans, and here we are dealing with symbolic tensors.
    return tf.logical_not(in_training_set(line))

In [31]:
train = (
    base_dataset
    # Take only the training-set lines.
    .filter(in_training_set)
    # Cache data so you only read the file once.
    .cache()
    # Decode each line into a (features_dict, label) pair.
    .map(decode_line))

# Same steps for the lines that are not in the training set.
test = (
    base_dataset
    .filter(in_test_set)
    .cache()
    .map(decode_line))

In [34]:
def to_thousands(features, labels):
    """Divide the labels by PRICE_NORM_FACTOR; the features pass through.

    Switching the labels to units of thousands is done for better
    convergence.
    """
    scaled_labels = labels / PRICE_NORM_FACTOR
    return features, scaled_labels

In [35]:
# Rescale the labels of both splits. Note: this rebinds `train` and `test`,
# so running the cell a second time applies the rescaling again.
train, test = train.map(to_thousands), test.map(to_thousands)

In [38]:
# Build the training input_fn.
def input_train():
    """Returns the next-element tensors of the training input pipeline."""
    # Shuffle with a buffer of 1000 (meant to be larger than the data set, so
    # that the examples are well mixed), batch, and repeat forever.
    batches = train.shuffle(1000).batch(128).repeat()
    return batches.make_one_shot_iterator().get_next()

In [39]:
# Build the validation input_fn.
def input_test():
    """Returns the next-element tensors of the test input pipeline."""
    # Shuffle and batch as for training, but without repeating.
    batches = test.shuffle(1000).batch(128)
    return batches.make_one_shot_iterator().get_next()

In [40]:
# "curb-weight" and "highway-mpg" are numeric columns.
feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in ("curb-weight", "highway-mpg")
]

In [42]:
# Build the Estimator: a linear regressor over the feature columns.
model = tf.estimator.LinearRegressor(feature_columns=feature_columns)

# Train the model for STEPS steps.
# By default, the Estimators log output every 100 steps.
model.train(input_fn=input_train, steps=STEPS)

# Evaluate how the model performs on data it has not yet seen. The result is
# a Python dictionary whose "average_loss" key holds the Mean Squared
# Error (MSE).
eval_result = model.evaluate(input_fn=input_test)
average_loss = eval_result["average_loss"]

# Convert MSE to Root Mean Square Error (RMSE), undoing the label scaling.
rms_error = PRICE_NORM_FACTOR * average_loss**0.5

print("\n" + 80 * "*")
print("\nRMS error for the test set: ${:.0f}".format(rms_error))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_is_chief': True, '_master': '', '_session_config': None, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmpzuh_x2jr', '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff1326ccef0>, '_save_checkpoints_steps': None, '_task_id': 0, '_log_step_count_steps': 100, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_task_type': 'worker', '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tf_random_seed': None}


INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_is_chief': True, '_master': '', '_session_config': None, '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmpzuh_x2jr', '_save_summary_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff1326ccef0>, '_save_checkpoints_steps': None, '_task_id': 0, '_log_step_count_steps': 100, '_service': None, '_keep_checkpoint_every_n_hours': 10000, '_task_type': 'worker', '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tf_random_seed': None}


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpzuh_x2jr/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpzuh_x2jr/model.ckpt.


INFO:tensorflow:loss = 18176.3, step = 1


INFO:tensorflow:loss = 18176.3, step = 1


INFO:tensorflow:global_step/sec: 42.7102


INFO:tensorflow:global_step/sec: 42.7102


INFO:tensorflow:loss = 1393.11, step = 101 (2.346 sec)


INFO:tensorflow:loss = 1393.11, step = 101 (2.346 sec)


INFO:tensorflow:global_step/sec: 45.1422


INFO:tensorflow:global_step/sec: 45.1422


INFO:tensorflow:loss = 1214.21, step = 201 (2.213 sec)


INFO:tensorflow:loss = 1214.21, step = 201 (2.213 sec)


INFO:tensorflow:global_step/sec: 45.169


INFO:tensorflow:global_step/sec: 45.169


INFO:tensorflow:loss = 1086.35, step = 301 (2.216 sec)


INFO:tensorflow:loss = 1086.35, step = 301 (2.216 sec)


INFO:tensorflow:global_step/sec: 44.4904


INFO:tensorflow:global_step/sec: 44.4904


INFO:tensorflow:loss = 994.949, step = 401 (2.248 sec)


INFO:tensorflow:loss = 994.949, step = 401 (2.248 sec)


INFO:tensorflow:global_step/sec: 45.3581


INFO:tensorflow:global_step/sec: 45.3581


INFO:tensorflow:loss = 929.614, step = 501 (2.202 sec)


INFO:tensorflow:loss = 929.614, step = 501 (2.202 sec)


INFO:tensorflow:global_step/sec: 45.1621


INFO:tensorflow:global_step/sec: 45.1621


INFO:tensorflow:loss = 882.908, step = 601 (2.217 sec)


INFO:tensorflow:loss = 882.908, step = 601 (2.217 sec)


INFO:tensorflow:global_step/sec: 45.0061


INFO:tensorflow:global_step/sec: 45.0061


INFO:tensorflow:loss = 849.517, step = 701 (2.219 sec)


INFO:tensorflow:loss = 849.517, step = 701 (2.219 sec)


INFO:tensorflow:global_step/sec: 45.0328


INFO:tensorflow:global_step/sec: 45.0328


INFO:tensorflow:loss = 825.645, step = 801 (2.221 sec)


INFO:tensorflow:loss = 825.645, step = 801 (2.221 sec)


INFO:tensorflow:global_step/sec: 45.1945


INFO:tensorflow:global_step/sec: 45.1945


INFO:tensorflow:loss = 808.576, step = 901 (2.215 sec)


INFO:tensorflow:loss = 808.576, step = 901 (2.215 sec)


INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpzuh_x2jr/model.ckpt.


INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpzuh_x2jr/model.ckpt.


INFO:tensorflow:Loss for final step: 796.474.


INFO:tensorflow:Loss for final step: 796.474.


INFO:tensorflow:Starting evaluation at 2018-01-06-06:32:01


INFO:tensorflow:Starting evaluation at 2018-01-06-06:32:01


INFO:tensorflow:Restoring parameters from /tmp/tmpzuh_x2jr/model.ckpt-1000


INFO:tensorflow:Restoring parameters from /tmp/tmpzuh_x2jr/model.ckpt-1000


INFO:tensorflow:Finished evaluation at 2018-01-06-06:32:01


INFO:tensorflow:Finished evaluation at 2018-01-06-06:32:01


INFO:tensorflow:Saving dict for global step 1000: average_loss = 9.25865, global_step = 1000, loss = 453.674


INFO:tensorflow:Saving dict for global step 1000: average_loss = 9.25865, global_step = 1000, loss = 453.674



********************************************************************************

RMS error for the test set: 3043


In [43]:
# Run the model in prediction mode on two hand-written examples.
curb_weights = np.array([2000, 3000])
highway_mpgs = np.array([30, 40])
input_dict = {"curb-weight": curb_weights, "highway-mpg": highway_mpgs}

# Feed the examples in the order given (no shuffling).
predict_input_fn = tf.estimator.inputs.numpy_input_fn(x=input_dict, shuffle=False)

In [44]:
predict_results = model.predict(input_fn=predict_input_fn)

# Print the prediction results, one line per input example.
print("\nPrediction results:")
row_template = ("Curb weight: {: 4d}lbs, "
                "Highway: {: 0d}mpg, "
                "Prediction: ${: 9.2f}")
for i, prediction in enumerate(predict_results):
    curb_weight = input_dict["curb-weight"][i]
    highway_mpg = input_dict["highway-mpg"][i]
    # Undo the label scaling so the price prints in dollars.
    predicted_price = PRICE_NORM_FACTOR * prediction["predictions"][0]
    print("    " + row_template.format(curb_weight, highway_mpg, predicted_price))


Prediction results:
INFO:tensorflow:Restoring parameters from /tmp/tmpzuh_x2jr/model.ckpt-1000


INFO:tensorflow:Restoring parameters from /tmp/tmpzuh_x2jr/model.ckpt-1000


    Curb weight:  2000lbs, Highway:  30mpg, Prediction: $  8695.01
    Curb weight:  3000lbs, Highway:  40mpg, Prediction: $ 14073.10
