In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from shubov_algorithms import decision_trees, helpers
np.set_printoptions(precision=3)

## Dataset

In [2]:
#shell scripts for downloading the data and placing it in a corresponding directory
!mkdir CAR 
!curl -o CAR/data "http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
!curl -o CAR/description "http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.names"

mkdir: CAR: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 51867  100 51867    0     0  83253      0 --:--:-- --:--:-- --:--:-- 83120
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  3097  100  3097    0     0   9831      0 --:--:-- --:--:-- --:--:--  9800


In [3]:
# csv-file has no header, so we define it manually
col_names = ['price_buy', 'price_main', 'n_doors', 'n_persons', 'lug_boot', 'safety', 'recommendation']
df = pd.read_csv("./CAR/data", header=None, names=col_names)



# All attributes are categorical - a mix of strings and integers.
# We simply map the categorical values of each attribute to a set of distinct integers
ai2an_map = col_names
ai2aiv2aivn_map = []
enc_cols = []
for col in df.columns:
    df[col] = df[col].astype('category')
    a = np.array(df[col].cat.codes.values).reshape((-1,1))
    enc_cols.append(a)
    ai2aiv2aivn_map.append(list(df[col].cat.categories.values))

    
    
# Get the data as numpy 2d-matrix (n_samples, n_features)
dataset = np.hstack(enc_cols)
X, y = dataset[:,:6], dataset[:,6]
print(X.shape, y.shape)

(1728, 6) (1728,)


## Learn

In [4]:
tree = decision_trees.DecisionTreeID3(criterion=decision_trees.entropy, verbose=0)
tree.learn(X, y)

<shubov_algorithms.decision_trees.DecisionTreeID3 at 0x12e771ac0>

In [5]:
tree.print_tree(ai2an_map, ai2aiv2aivn_map)

 |--(): test attribute safety
  |--(safety=high): test attribute n_persons
   |--(n_persons=2): assign label unacc
   |--(n_persons=4): test attribute price_buy
    |--(price_buy=high): test attribute price_main
     |--(price_main=high): assign label acc
     |--(price_main=low): assign label acc
     |--(price_main=med): assign label acc
     |--(price_main=vhigh): assign label unacc
    |--(price_buy=low): test attribute price_main
     |--(price_main=high): test attribute lug_boot
      |--(lug_boot=big): assign label vgood
      |--(lug_boot=med): test attribute n_doors
       |--(n_doors=2): assign label acc
       |--(n_doors=3): assign label acc
       |--(n_doors=4): assign label vgood
       |--(n_doors=5more): assign label vgood
      |--(lug_boot=small): assign label acc
     |--(price_main=low): test attribute lug_boot
      |--(lug_boot=big): assign label vgood
      |--(lug_boot=med): test attribute n_doors
       |--(n_doors=2): assign label good
       |--(n_doors=3): 

## Infer

In [6]:
print ("According to the attributes %s"%(col_names[:-1]))
print ("Should i buy the car %s?"%(dataset[52,0:6]))
print ("The car is %s (in truth it is %s)"%(tree.infer(dataset[[52],0:6]),dataset[52,6]))

According to the attributes ['price_buy', 'price_main', 'n_doors', 'n_persons', 'lug_boot', 'safety']
Should i buy the car [3 3 1 2 0 2]?
The car is [2] (in truth it is 2)


### Evaluating classification accuracy

Of course we want to estimate the error. Therefore we write a function that, given a set of true class labels 'y' and predicted class labels 'y_p', returns the misclassification rate. 

In [7]:
def acc(y, y_p):
    correct = y == y_p
    acc = np.sum(correct) / float(len(y))
    return acc

def err_mis(y, y_p):
    return 1. - acc(y, y_p)

In [8]:
y_p = tree.infer(X)
err = err_mis(y, y_p)
print ("Unsurprisingly, the error on the training set is very low: %f" % err)

Unsurprisingly, the error on the training set is very low: 0.000000


### Cross-Validation

Let's run cross-validation to estimate the generalization performance of our decision tree.

In [9]:
impurity_measures = [decision_trees.gini, decision_trees.entropy, decision_trees.misclass]
k = 10

folds = helpers.get_k_folds(X.shape[0], k)
#folds = getBootstrapFolds(X.shape[0], k, train_fraction=0.9)

for imp in impurity_measures:
    err_tr = 0.
    err_te = 0.
    for i in range(k):
        idx_tr, idx_te = folds[i]
        
        _X_tr = X[idx_tr]
        _y_tr = y[idx_tr]
        _X_te = X[idx_te]
        _y_te = y[idx_te]
        
        decision_trees.DecisionTreeID3(criterion=imp, verbose=0)
        tree.learn(_X_tr, _y_tr)
        
        y_tr_p = tree.infer(_X_tr)
        y_te_p = tree.infer(_X_te)
        err_tr += err_mis(_y_tr, y_tr_p)
        err_te += err_mis(_y_te, y_te_p)
        
    print ("%s: Average training error %f;Average test error %f" % (imp, err_tr/k, err_te/k))

<function gini at 0x128a3d040>: Average training error 0.000000;Average test error 0.053488
<function entropy at 0x12e763c10>: Average training error 0.000000;Average test error 0.053488
<function misclass at 0x12e763ca0>: Average training error 0.000000;Average test error 0.053488


We can see that the generalization error is higher than the training error. We might find a simpler model (a smaller tree) that yields lower test error.