In [1]:
import pandas as pd
import urllib

In [2]:
import urllib.request
import os
file_name = "car.csv"
file_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
if not os.path.isfile(file_name):
    f = urllib.request.urlretrieve(file_url, file_name)

In [3]:
dat = pd.read_csv(file_name, names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"])

In [4]:
from collections import Counter
Counter(dat.buying)

Counter({'high': 432, 'low': 432, 'med': 432, 'vhigh': 432})

In [5]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
pd.get_dummies(dat, prefix=dat.columns).head(3)

Unnamed: 0,buying_high,buying_low,buying_med,buying_vhigh,maint_high,maint_low,maint_med,maint_vhigh,doors_2,doors_3,...,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med,class_acc,class_good,class_unacc,class_vgood
0,0,0,0,1,0,0,0,1,1,0,...,0,0,1,0,1,0,0,0,1,0
1,0,0,0,1,0,0,0,1,1,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,0,1,0,0,0,1,1,0,...,0,0,1,1,0,0,0,0,1,0


In [7]:
dat = pd.get_dummies(dat, prefix = dat.columns)

In [8]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


dat = dat.values.astype(np.float32)
np.random.shuffle(dat)
sep = int(0.7 * len(dat))
train_data = dat[:sep]
test_data = dat[sep:]

  return f(*args, **kwds)


In [9]:
dat.shape # Last four columns are the output classes

(1728, 25)

In [10]:
dat.shape[1]

25

In [11]:
# https://morvanzhou.github.io/tutorials/machine-learning/ML-practice/build-car-classifier-from-scratch2/
# Inserts a placeholder for a tensor that will be always fed.
tf_input = tf.placeholder(tf.float32, [None, dat.shape[1]], "input")
tfx = tf_input[:, :21] # except taget variables.
tfy = tf_input[:, 21:]

# Functional interface for the densely-connected layer.
# Activation function: relu
l1 = tf.layers.dense(tfx, 128, tf.nn.relu, name="l1")
l2 = tf.layers.dense(l1, 128, tf.nn.relu, name="l2")
out = tf.layers.dense(l2, 4, name="l3") # 4 categories for the target variables.
# When one hot encoding is been used on the target variable, using softmax method for tf may be a good choice.
prediction = tf.nn.softmax(out, name="pred")

# Built-in function for caculating loss.
loss = tf.losses.softmax_cross_entropy(onehot_labels=tfy, logits=out)

# Built-in function for caculating accuracy.
# return (acc, update_op), and create 2 local variables.
accuracy = tf.metrics.accuracy(
    labels=tf.argmax(tfy, axis=1), predictions=tf.argmax(out, axis=1),)[1]

opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
train_op = opt.minimize(loss)

sess = tf.Session()
#local_variables_initializer for accuracy function.
sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))

In [12]:
for t in range(1,5001):
    # training
    batch_index = np.random.randint(len(train_data), size=32)
    sess.run(train_op, {tf_input: train_data[batch_index]})

    if t % 500 == 0:
        # testing
        acc_, pred_, loss_ = sess.run([accuracy, prediction, loss], {tf_input: test_data})
        print("Step: %i" % t,"| Accurate: %.2f" % acc_,"| Loss: %.2f" % loss_,)

Step: 500 | Accurate: 0.93 | Loss: 0.15
Step: 1000 | Accurate: 0.95 | Loss: 0.06
Step: 1500 | Accurate: 0.96 | Loss: 0.05
Step: 2000 | Accurate: 0.97 | Loss: 0.03
Step: 2500 | Accurate: 0.97 | Loss: 0.02
Step: 3000 | Accurate: 0.98 | Loss: 0.02
Step: 3500 | Accurate: 0.98 | Loss: 0.02
Step: 4000 | Accurate: 0.98 | Loss: 0.02
Step: 4500 | Accurate: 0.98 | Loss: 0.02
Step: 5000 | Accurate: 0.98 | Loss: 0.01


In [13]:
# Returns probability.
sess.run(prediction, {tf_input: 
                       np.array([[ 0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,
         0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.]])
                       })

array([[  3.82039062e-12,   2.63443999e-21,   1.00000000e+00,
          2.13099828e-25]], dtype=float32)

In [14]:
# Returns categories.
#https://github.com/tensorflow/tensorflow/issues/97
pred_output = tf.argmax(prediction,1)
print("predictions", pred_output.eval(feed_dict={tf_input:
                                                np.array([[ 0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,
         0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  0.],
                                                         [ 1.,  1.,  1,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
         1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.]])
                                                }, session=sess) )

predictions [2 2]


In [15]:
# Get prediction result.
pred_output.eval(feed_dict={tf_input:test_data}, session=sess)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 0, 2, 0, 0,
       2, 1, 1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       2, 2, 0, 1, 0, 2, 0, 2, 0, 2, 3, 0, 2, 2, 2, 2, 2, 0, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 0, 2, 0,
       0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 3, 0, 0, 2, 2, 1,
       0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3,
       0, 3, 0, 2, 0, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 3, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 3, 2, 2, 2, 0, 2, 2, 0, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 1, 0, 2, 2, 2, 2,
       2, 2, 1, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2,
       2, 2,

In [16]:
## https://stackoverflow.com/questions/26762100/reconstruct-a-categorical-variable-from-dummies-in-pandas
# Reconstruct dummy variables.
x = pd.DataFrame(test_data[:,21:25])
np.array(x.idxmax(axis=1))

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 0, 2, 0, 0,
       2, 1, 1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0,
       2, 2, 0, 1, 0, 2, 0, 2, 0, 2, 3, 0, 2, 2, 2, 2, 2, 0, 3, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 3, 2, 0, 2, 0,
       0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 3, 0, 0, 2, 2, 1,
       0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0, 0, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 3, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 3,
       0, 3, 0, 2, 3, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 3, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 2, 3, 2, 2, 2, 0, 2, 2, 0, 2, 2,
       0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 1, 0, 2, 2, 2, 2,
       2, 2, 1, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 2,
       2, 2,

In [17]:
Counter(np.array(x.idxmax(axis=1)) == pred_output.eval(feed_dict={tf_input:test_data}, session=sess))

Counter({False: 2, True: 517})