In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import os
import collections
import itertools

In [23]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas as pd

In [24]:
from six.moves import urllib

In [25]:
print(tf.__version__)

1.11.0


In [26]:
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"

In [27]:
FILENAME = "automobiles.csv"

In [28]:
def download_data():
    if not os.path.exists(FILENAME):
        filename, _ = urllib.request.urlretrieve(URL,FILENAME)
    print("Found and downloaded file from {}".format(URL))
    print("Downloaded Filename: {}".format(FILENAME))

In [29]:
download_data()

Found and downloaded file from https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
Downloaded Filename: automobiles.csv


In [46]:
COLUMN_TYPES = collections.OrderedDict([
    ("symboling", int),
    ("normalized-losses", float),
    ("make", str),
    ("fuel-type", str),
    ("aspiration", str),
    ("num-of-doors", str),
    ("body-style", str),
    ("drive-wheels", str),
    ("engine-location", str),
    ("wheel-base", float),
    ("length", float),
    ("width", float),
    ("height", float),
    ("curb-weight", float),
    ("engine-type", str),
    ("num-of-cylinders", str),
    ("engine-size", float),
    ("fuel-system", str),
    ("bore", float),
    ("stroke", float),
    ("compression-ratio", float),
    ("horsepower", float),
    ("peak-rpm", float),
    ("city-mpg", float),
    ("highway-mpg", float),
    ("price", float)
])

In [47]:
df = pd.read_csv(FILENAME,names=COLUMN_TYPES.keys(),dtype=COLUMN_TYPES,na_values='?')

In [48]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0


In [49]:
df.count()

symboling            205
normalized-losses    164
make                 205
fuel-type            205
aspiration           205
num-of-doors         203
body-style           205
drive-wheels         205
engine-location      205
wheel-base           205
length               205
width                205
height               205
curb-weight          205
engine-type          205
num-of-cylinders     205
engine-size          205
fuel-system          205
bore                 201
stroke               201
compression-ratio    205
horsepower           203
peak-rpm             203
city-mpg             205
highway-mpg          205
price                201
dtype: int64

In [54]:
df = df.dropna()

In [55]:
TRIMMED_CSV_COLUMNS = [
    "make", "fuel-type", "aspiration", "num-of-doors", "body-style",
    "drive-wheels", "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
    "fuel-system", "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price"
]

In [56]:
df = df[TRIMMED_CSV_COLUMNS]

In [57]:
df.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,alfa-romero,gas,std,two,convertible,rwd,2548.0,dohc,four,130.0,mpfi,111.0,5000.0,21.0,27.0,13495.0
1,alfa-romero,gas,std,two,convertible,rwd,2548.0,dohc,four,130.0,mpfi,111.0,5000.0,21.0,27.0,16500.0
2,alfa-romero,gas,std,two,hatchback,rwd,2823.0,ohcv,six,152.0,mpfi,154.0,5000.0,19.0,26.0,16500.0
3,audi,gas,std,four,sedan,fwd,2337.0,ohc,four,109.0,mpfi,102.0,5500.0,24.0,30.0,13950.0
4,audi,gas,std,four,sedan,4wd,2824.0,ohc,five,136.0,mpfi,115.0,5500.0,18.0,22.0,17450.0


In [80]:
Y_name = "price"

In [85]:
def get_training_test_prediction_data(df,y):
    #unique shuffle each time
    np.random.seed(None)
    #Split
    x_train = df.sample(frac=0.7,random_state=None)
    x_test = df.drop(x_train.index)
    x_predict = df.sample(frac=0.2,random_state=None)
    #Extract labels
    y_train = x_train.pop(y)
    y_test = x_test.pop(y)
    y_predict = x_predict.pop(y)
    return (x_train,y_train), (x_test,y_test), (x_predict,y_predict)

In [86]:
(x_train,y_train), (x_test,y_test), (x_predict,y_predict)= get_training_test_prediction_data(df,Y_name)

In [88]:
PRICE_SCALING_FACTOR = 10000

y_train /= PRICE_SCALING_FACTOR
y_test /= PRICE_SCALING_FACTOR
y_predict /= PRICE_SCALING_FACTOR

In [90]:
y_train.head()

70     3.1600
12     2.0970
158    0.7898
50     0.5195
167    0.8449
Name: price, dtype: float64

### Categorical Values

In [99]:
df['make'].unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
       'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'saab',
       'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [100]:
df['fuel-type'].unique()

array(['gas', 'diesel'], dtype=object)

In [101]:
df["aspiration"].unique()

array(['std', 'turbo'], dtype=object)

In [102]:
df['num-of-doors'].unique()

array(['two', 'four'], dtype=object)

In [103]:
df['body-style'].unique()

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

In [104]:
df['drive-wheels'].unique()

array(['rwd', 'fwd', '4wd'], dtype=object)

In [105]:
df['engine-type'].unique()

array(['dohc', 'ohcv', 'ohc', 'l', 'rotor', 'ohcf'], dtype=object)

In [106]:
df['num-of-cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [107]:
df['fuel-system'].unique()

array(['mpfi', '2bbl', 'mfi', '1bbl', 'spfi', '4bbl', 'idi', 'spdi'],
      dtype=object)

In [108]:
x_train.keys()

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg'],
      dtype='object')

In [110]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 70 to 120
Data columns (total 15 columns):
make                138 non-null object
fuel-type           138 non-null object
aspiration          138 non-null object
num-of-doors        138 non-null object
body-style          138 non-null object
drive-wheels        138 non-null object
curb-weight         138 non-null float64
engine-type         138 non-null object
num-of-cylinders    138 non-null object
engine-size         138 non-null float64
fuel-system         138 non-null object
horsepower          138 non-null float64
peak-rpm            138 non-null float64
city-mpg            138 non-null float64
highway-mpg         138 non-null float64
dtypes: float64(6), object(9)
memory usage: 17.2+ KB


In [117]:
curb_weight = tf.feature_column.numeric_column("curb-weight")

engine_size = tf.feature_column.numeric_column("engine-size")

horsepower = tf.feature_column.numeric_column("horsepower")

peak_rpm = tf.feature_column.numeric_column("peak-rpm")

city_mpg = tf.feature_column.numeric_column("city-mpg")

highway_mpg = tf.feature_column.numeric_column("highway-mpg")

In [118]:
body_style = tf.feature_column.categorical_column_with_vocabulary_list(
      key="body-style", vocabulary_list=df['body-style'].unique())

fuel_type = tf.feature_column.categorical_column_with_vocabulary_list(
      key="fuel-type", vocabulary_list=df['fuel-type'].unique())

aspiration = tf.feature_column.categorical_column_with_vocabulary_list(
      key="aspiration", vocabulary_list=df['aspiration'].unique())

num_of_doors = tf.feature_column.categorical_column_with_vocabulary_list(
      key="num-of-doors", vocabulary_list=df['num-of-doors'].unique())

drive_wheels = tf.feature_column.categorical_column_with_vocabulary_list(
      key="drive-wheels", vocabulary_list=df['drive-wheels'].unique())

engine_type = tf.feature_column.categorical_column_with_vocabulary_list(
      key="engine-type", vocabulary_list=df['engine-type'].unique())

num_of_cylinders = tf.feature_column.categorical_column_with_vocabulary_list(
      key="num-of-cylinders", vocabulary_list=df['num-of-cylinders'].unique())

fuel_system = tf.feature_column.categorical_column_with_vocabulary_list(
      key="fuel-system", vocabulary_list=df['fuel-system'].unique())

In [119]:
make = tf.feature_column.categorical_column_with_hash_bucket(key='make',hash_bucket_size=50)

In [120]:
feature_columns = [
    curb_weight, engine_size, horsepower, peak_rpm, city_mpg, highway_mpg,

    tf.feature_column.indicator_column(body_style),

    tf.feature_column.embedding_column(fuel_type, dimension=3),

    tf.feature_column.embedding_column(aspiration, dimension=3),
    tf.feature_column.embedding_column(num_of_doors, dimension=3),
    tf.feature_column.embedding_column(drive_wheels, dimension=3),
    tf.feature_column.embedding_column(engine_type, dimension=3),
    tf.feature_column.embedding_column(num_of_cylinders, dimension=3),
    tf.feature_column.embedding_column(fuel_system, dimension=3),

    tf.feature_column.embedding_column(make, dimension=4)    
]

In [121]:
def input_fn(x_data,y_data,num_epochs,shuffle):
    return tf.estimator.inputs.pandas_input_fn(x_data,y_data,batch_size=64,num_epochs=num_epochs,shuffle=shuffle)

In [122]:
model = tf.estimator.DNNRegressor(hidden_units=[20,20,20],feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_is_chief': True, '_save_summary_steps': 100, '_eval_distribute': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_task_type': 'worker', '_model_dir': 'C:\\Users\\Lolly\\AppData\\Local\\Temp\\tmpaf205_wz', '_evaluation_master': '', '_task_id': 0, '_experimental_distribute': None, '_log_step_count_steps': 100, '_save_checkpoints_steps': None, '_train_distribute': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_global_id_in_cluster': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000010A126ED1D0>, '_service': None, '_num_worker_replicas': 1, '_device_fn': None, '_protocol': None, '_num_ps_replicas': 0, '_master': '', '_save_checkpoints_secs': 600, '_tf_random_seed': None}


In [125]:
model.train(input_fn=input_fn(x_train,y_train,num_epochs=None,shuffle=True),steps=10000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Lolly\AppData\Local\Temp\tmpaf205_wz\model.ckpt-130
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 130 into C:\Users\Lolly\AppData\Local\Temp\tmpaf205_wz\model.ckpt.
INFO:tensorflow:loss = 387.28546, step = 131
INFO:tensorflow:global_step/sec: 77.565
INFO:tensorflow:loss = 420.0084, step = 231 (1.292 sec)
INFO:tensorflow:global_step/sec: 120.874
INFO:tensorflow:loss = 234.89034, step = 331 (0.838 sec)
INFO:tensorflow:global_step/sec: 143.031
INFO:tensorflow:loss = 279.70108, step = 431 (0.687 sec)
INFO:tensorflow:global_step/sec: 111.098
INFO:tensorflow:loss = 288.0074, step = 531 (0.909 sec)
INFO:tensorflow:global_step/sec: 128.793
INFO:tensorflow:loss = 172.07129, step = 631 (0.781 sec)
INFO:tensorflow:global

INFO:tensorflow:loss = 51.440464, step = 8031 (0.594 sec)
INFO:tensorflow:global_step/sec: 167.671
INFO:tensorflow:loss = 41.883087, step = 8131 (0.597 sec)
INFO:tensorflow:global_step/sec: 154.734
INFO:tensorflow:loss = 34.456898, step = 8231 (0.642 sec)
INFO:tensorflow:global_step/sec: 164.104
INFO:tensorflow:loss = 65.09903, step = 8331 (0.613 sec)
INFO:tensorflow:global_step/sec: 174.076
INFO:tensorflow:loss = 63.955036, step = 8431 (0.568 sec)
INFO:tensorflow:global_step/sec: 153.314
INFO:tensorflow:loss = 56.795, step = 8531 (0.659 sec)
INFO:tensorflow:global_step/sec: 186.024
INFO:tensorflow:loss = 41.635353, step = 8631 (0.534 sec)
INFO:tensorflow:global_step/sec: 156.668
INFO:tensorflow:loss = 39.996365, step = 8731 (0.640 sec)
INFO:tensorflow:global_step/sec: 179.049
INFO:tensorflow:loss = 57.44725, step = 8831 (0.556 sec)
INFO:tensorflow:global_step/sec: 158.401
INFO:tensorflow:loss = 44.490738, step = 8931 (0.634 sec)
INFO:tensorflow:global_step/sec: 182.637
INFO:tensorflow

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x10a126ed5c0>

In [127]:
results = model.evaluate(input_fn=input_fn(x_test,y_test,num_epochs=1,shuffle=True))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-10-11-14:06:14
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Lolly\AppData\Local\Temp\tmpaf205_wz\model.ckpt-10130
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-10-11-14:06:16
INFO:tensorflow:Saving dict for global step 10130: average_loss = 0.73777246, global_step = 10130, label/mean = 1.3333848, loss = 43.528576, prediction/mean = 0.98680574
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10130: C:\Users\Lolly\AppData\Local\Temp\tmpaf205_wz\model.ckpt-10130


In [129]:
results

{'average_loss': 0.73777246,
 'global_step': 10130,
 'label/mean': 1.3333848,
 'loss': 43.528576,
 'prediction/mean': 0.98680574}

In [131]:
for key in sorted(results):
    print("%s %s"%(key,results[key]))

average_loss 0.73777246
global_step 10130
label/mean 1.3333848
loss 43.528576
prediction/mean 0.98680574


In [132]:
average_loss = results["average_loss"]

In [134]:
print("\nRMS error for the test set: ${:.0f}"
        .format(PRICE_SCALING_FACTOR * average_loss**0.5))


RMS error for the test set: $8589


In [135]:
predict_results = model.predict(input_fn=input_fn(x_predict, y_predict, num_epochs=1, shuffle=False))

In [136]:
predictions = list(itertools.islice(predict_results, len(x_predict)))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\Lolly\AppData\Local\Temp\tmpaf205_wz\model.ckpt-10130
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [137]:
predictions

[{'predictions': array([0.21781568], dtype=float32)},
 {'predictions': array([0.58333236], dtype=float32)},
 {'predictions': array([0.32442123], dtype=float32)},
 {'predictions': array([1.6876776], dtype=float32)},
 {'predictions': array([1.5309374], dtype=float32)},
 {'predictions': array([0.39242202], dtype=float32)},
 {'predictions': array([0.44756728], dtype=float32)},
 {'predictions': array([0.5746234], dtype=float32)},
 {'predictions': array([0.9235881], dtype=float32)},
 {'predictions': array([0.5916198], dtype=float32)},
 {'predictions': array([0.40195495], dtype=float32)},
 {'predictions': array([-0.10278673], dtype=float32)},
 {'predictions': array([0.58698684], dtype=float32)},
 {'predictions': array([-0.15476198], dtype=float32)},
 {'predictions': array([0.48126632], dtype=float32)},
 {'predictions': array([1.2170336], dtype=float32)},
 {'predictions': array([-0.54859895], dtype=float32)},
 {'predictions': array([0.7744964], dtype=float32)},
 {'predictions': array([1.536846

In [144]:
predicted_prices = [obj['predictions'][0] * PRICE_SCALING_FACTOR for obj in predictions]

In [145]:
predicted_prices

[2178.1568229198456,
 5833.32359790802,
 3244.21226978302,
 16876.776218414307,
 15309.374332427979,
 3924.2202043533325,
 4475.6728410720825,
 5746.234059333801,
 9235.880970954895,
 5916.197896003723,
 4019.5494890213013,
 -1027.8673470020294,
 5869.8683977127075,
 -1547.6198494434357,
 4812.663197517395,
 12170.336246490479,
 -5485.989451408386,
 7744.963765144348,
 15368.46399307251,
 6355.422139167786,
 7663.76793384552,
 10475.428104400635,
 16756.021976470947,
 11453.821659088135,
 17434.00812149048,
 15430.5100440979,
 11000.826358795166,
 5955.966114997864,
 1527.9798209667206,
 3637.2023820877075,
 1986.6971671581268,
 -9461.399912834167,
 5906.451344490051,
 20890.12384414673,
 4042.6284074783325,
 13906.824588775635,
 -2471.425086259842,
 19207.537174224854,
 9559.710621833801]

In [146]:
compare_df = x_predict.copy()

In [147]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg
98,nissan,gas,std,two,hardtop,fwd,2008.0,ohc,four,97.0,2bbl,69.0,5200.0,31.0,37.0
141,subaru,gas,std,four,sedan,fwd,2145.0,ohcf,four,108.0,2bbl,82.0,4800.0,32.0,37.0
21,dodge,gas,std,two,hatchback,fwd,1876.0,ohc,four,90.0,2bbl,68.0,5500.0,37.0,41.0
72,mercedes-benz,gas,std,two,convertible,rwd,3685.0,ohcv,eight,234.0,mpfi,155.0,4750.0,16.0,18.0
84,mitsubishi,gas,turbo,two,hatchback,fwd,2926.0,ohc,four,156.0,spdi,145.0,5000.0,19.0,24.0
151,toyota,gas,std,two,hatchback,fwd,2040.0,ohc,four,92.0,2bbl,62.0,4800.0,31.0,38.0
99,nissan,gas,std,four,hatchback,fwd,2324.0,ohc,four,120.0,2bbl,97.0,5200.0,27.0,34.0
113,peugot,gas,std,four,wagon,rwd,3285.0,l,four,120.0,mpfi,95.0,5000.0,19.0,24.0
192,volkswagen,diesel,turbo,four,sedan,fwd,2579.0,ohc,four,97.0,idi,68.0,4500.0,33.0,38.0
139,subaru,gas,std,two,hatchback,fwd,2120.0,ohcf,four,108.0,2bbl,73.0,4400.0,26.0,31.0


In [148]:
compare_df['actual-price'] = y_predict
compare_df['predicted-price'] = predicted_prices

In [149]:
compare_df

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,horsepower,peak-rpm,city-mpg,highway-mpg,actual-price,predicted-price
98,nissan,gas,std,two,hardtop,fwd,2008.0,ohc,four,97.0,2bbl,69.0,5200.0,31.0,37.0,0.8249,2178.156823
141,subaru,gas,std,four,sedan,fwd,2145.0,ohcf,four,108.0,2bbl,82.0,4800.0,32.0,37.0,0.7126,5833.323598
21,dodge,gas,std,two,hatchback,fwd,1876.0,ohc,four,90.0,2bbl,68.0,5500.0,37.0,41.0,0.5572,3244.21227
72,mercedes-benz,gas,std,two,convertible,rwd,3685.0,ohcv,eight,234.0,mpfi,155.0,4750.0,16.0,18.0,3.5056,16876.776218
84,mitsubishi,gas,turbo,two,hatchback,fwd,2926.0,ohc,four,156.0,spdi,145.0,5000.0,19.0,24.0,1.4489,15309.374332
151,toyota,gas,std,two,hatchback,fwd,2040.0,ohc,four,92.0,2bbl,62.0,4800.0,31.0,38.0,0.6338,3924.220204
99,nissan,gas,std,four,hatchback,fwd,2324.0,ohc,four,120.0,2bbl,97.0,5200.0,27.0,34.0,0.8949,4475.672841
113,peugot,gas,std,four,wagon,rwd,3285.0,l,four,120.0,mpfi,95.0,5000.0,19.0,24.0,1.6695,5746.234059
192,volkswagen,diesel,turbo,four,sedan,fwd,2579.0,ohc,four,97.0,idi,68.0,4500.0,33.0,38.0,1.3845,9235.880971
139,subaru,gas,std,two,hatchback,fwd,2120.0,ohcf,four,108.0,2bbl,73.0,4400.0,26.0,31.0,0.7053,5916.197896
