# Lens Classification using Decision Tree


About the dataset:

1.    Number of Instances: 24

2.    Number of Attributes: 4 (all nominal)

3.    Attribute Information: -- 3 Classes 1 : the patient should be fitted with hard contact lenses, 2 : the patient should be fitted with soft contact lenses, 3 : the patient should not be fitted with contact lenses.
        
        A. age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic
        
        B. spectacle prescription: (1) myope, (2) hypermetrope
        
        C. astigmatic: (1) no, (2) yes
        
        D. tear production rate: (1) reduced, (2) normal

4.    Number of Missing Attribute Values: 0

5.    Class Distribution:

        A. hard contact lenses: 4

        B. soft contact lenses: 5

        C. no contact lenses: 15
        


Essentially, we have a 2d list. Each row is a patient, each column is the following:

[Number, Age, Prescription, Astigmatic, Class Distribution]


In [245]:
import math
import pandas as pd
import numpy as np
from operator import itemgetter

# Lenses data https://archive.ics.uci.edu/ml/machine-learning-databases/lenses/lenses.data
# Directly add here for easiness

data = [[1, 1, 1, 1, 1, 3],
[2, 1, 1, 1, 2, 2],
[3, 1, 1, 2, 1, 3],
[4, 1, 1, 2, 2, 1],
[5, 1, 2, 1, 1, 3],
[6, 1, 2, 1, 2, 2],
[7, 1, 2, 2, 1, 3],
[8, 1, 2, 2, 2, 1],
[9, 2, 1, 1, 1, 3],
[10, 2, 1, 1, 2, 2],
[11, 2, 1, 2, 1, 3],
[12, 2, 1, 2, 2, 1],
[13, 2, 2, 1, 1, 3],
[14, 2, 2, 1, 2, 2],
[15, 2, 2, 2, 1, 3],
[16, 2, 2, 2, 2, 3],
[17, 3, 1, 1, 1, 3],
[18, 3, 1, 1, 2, 3],
[19, 3, 1, 2, 1, 3],
[20, 3, 1, 2, 2, 1],
[21, 3, 2, 1, 1, 3],
[22, 3, 2, 1, 2, 2],
[23, 3, 2, 2, 1, 3],
[24, 3, 2, 2, 2, 3]]

df = pd.DataFrame(data, columns=['num','age','prescription','astigmatic','tear_rate','lens'])
df.head()

Unnamed: 0,num,age,prescription,astigmatic,tear_rate,lens
0,1,1,1,1,1,3
1,2,1,1,1,2,2
2,3,1,1,2,1,3
3,4,1,1,2,2,1
4,5,1,2,1,1,3


#Dataset

Exploring the dataset

In [246]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   num           24 non-null     int64
 1   age           24 non-null     int64
 2   prescription  24 non-null     int64
 3   astigmatic    24 non-null     int64
 4   tear_rate     24 non-null     int64
 5   lens          24 non-null     int64
dtypes: int64(6)
memory usage: 1.2 KB


In [247]:
df.groupby(['lens'])[['lens']].count()

Unnamed: 0_level_0,lens
lens,Unnamed: 1_level_1
1,4
2,5
3,15


#Modeling

Functions are written with attribute INDEX in mind, not label.

In [303]:
class decision_tree:
    def __init__(self):
        self.split_value = 0
        self.split_attr = -1
        self.max_gain = 0.1
        self.left = None
        self.right = None
        self.depth = 0
        self.distribution = []

def entropy(data, target_attr):
    # Calculates the entropy of the given data set for the target attribute.
    # Write your code here

    data_np = np.array(data)
    frequencies = np.unique(data_np[:, target_attr], return_counts=True)[1]
    data_entropy = - np.sum(frequencies / len(data) * np.log2(frequencies / len(data)))

    return data_entropy

def gain(data, attr, target_attr):
    # Calculates the information gain (reduction in entropy) that would result by splitting the data on the chosen attribute (attr).
    # Write your code here

    data_np = np.array(data)
    subset = data_np[:, attr]
    unique_values, _ = np.unique(subset, return_counts=True)

    weighted_entropy = 0.0
    for value in unique_values:
      subset = data_np[data_np[:, attr] == value]
      subset_entropy = entropy(subset, target_attr)
      weighted_entropy += (len(subset) / len(data)) * subset_entropy

    gain = entropy(data, target_attr) - weighted_entropy
    return gain

def growTree(node, data,target_attr):
    # which attribute and value is best to split
    for i in range(len(data)): # go over each instance to split to left/right
        for j in range(len(data[0])): # in each instance, go over each attribute
            if gain(data,j,target_attr) > node.max_gain and j != target_attr:
                node.split_value = data[i][j]
                node.split_attr = j
                node.max_gain = gain(data,j,target_attr)

    if node.split_attr != -1:

        print("=" * node.depth + "v" + str(node.split_attr) + ">" + str(node.split_value))

        # now split data
        left_data, right_data = [], []
        for i in range(len(data)):
            if data[i][node.split_attr] > node.split_value:
                right_data.append(data[i])
            else:
                left_data.append(data[i])
        node.distribution = {i:[row[-1] for row in data].count(i) for i in set([row[-1] for row in data])}
    if node.split_attr == -1 or len(data) <2 or len(right_data) <1 or len(left_data) <1:
        # no more splitting
        node.leaf = True
        node.distribution = {i:[row[-1] for row in data].count(i) for i in set([row[-1] for row in data])}
        # max(dic, key=dic.get) returns the key that has max value in a dict namely dic
        print("=" * node.depth + "leaf: classify to class=" + str(max(node.distribution, key=node.distribution.get)) + " with class distribution " + "[" + str(node.distribution) + "]")
        return node

    node.left = decision_tree()
    node.left.depth = node.depth + 1
    node.right = decision_tree()
    node.right.depth = node.depth + 1
    growTree(node.left, left_data, target_attr)
    growTree(node.right, right_data, target_attr)

def single_classify(node, sample):
  if node.left == None and node.right == None:
    return node.distribution

  if sample[node.split_attr] < node.split_value:
    return single_classify(node.left, sample)
  else:
    return single_classify(node.right, sample)

def classify(node, X):
    # classify test instances in X given a tree
    # Write your code here
    pred = []
    for i in X:
      distribution = single_classify(node, i)
      predicted_class = max(distribution.items(), key=itemgetter(1))[0]
      pred.append(predicted_class)
    pred = np.array(pred)
    return pred

#Output

In [304]:
tree = decision_tree()
growTree(tree,[row[1:] for row in data], 4)

v3>1
=leaf: classify to class=3 with class distribution [{3: 12}]
=v2>1
==v0>1
===leaf: classify to class=2 with class distribution [{2: 2}]
===v0>2
====leaf: classify to class=2 with class distribution [{2: 2}]
====v1>1
=====leaf: classify to class=3 with class distribution [{3: 1}]
=====leaf: classify to class=2 with class distribution [{2: 1}]
==v1>1
===leaf: classify to class=1 with class distribution [{1: 3}]
===v0>1
====leaf: classify to class=1 with class distribution [{1: 1}]
====leaf: classify to class=3 with class distribution [{3: 2}]


In [305]:
pred = classify(tree, data)
y = [row[-1] for row in data]
print("Accuracy: %f\n" % (np.mean(y == pred) * 100))

Accuracy: 62.500000

