# Lab 4.10.2: Building a Decision Tree-based Classifier—Contact Lenses
The dataset you are going to use for this exercise is the contact lenses dataset (download link: "contact lenses"), which has three class labels:<p>
 - (1) the patient should be prescribed hard contact lenses.
 - (2) the patient should be prescribed soft contact lenses.
 - (3) the patient should not be fitted with contact lenses.

The attributes are the following:<p>
 - age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic
 - spectacle prescription: (1) myope, (2) hypermetrope
 - astigmatic: (1) no, (2) yes
 - tear production rate: (1) reduced, (2) normal<p>

Step 1: Build a decision tree-based classifier using about 80% of the data that would recommend the class label based on the other attributes from the dataset.<p>
Step 2: Use the remaining data to manually test how well your model will classify new data.<p>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [8]:
# Load the lenses workbook, then show the raw column labels and the first two
# rows to confirm the file was read as expected.
LENSES_XLSX = 'LAB 4.10.2 lenses.xlsx'
df = pd.read_excel(LENSES_XLSX)

print(df.columns)
display(df.head(n=2))

Index(['X1.1', 'X1.2', 'X1.3', 'X1.4', 'X3'], dtype='object')


Unnamed: 0,X1.1,X1.2,X1.3,X1.4,X3
0,1,1,1,1,3
1,1,1,1,2,2


In [9]:
# Swap the generic spreadsheet headers for descriptive attribute names.
# Pairing is positional: old header -> new name.
readable_names = dict(zip(
    ['X1.1', 'X1.2', 'X1.3', 'X1.4', 'X3'],
    ['age', 'glass_rx', 'astigmatic', 'tear', 'contact_rx'],
))
df = df.rename(columns=readable_names)

# Show the last two rows to confirm the new headers are in place.
display(df.tail(2))

Unnamed: 0,age,glass_rx,astigmatic,tear,contact_rx
22,3,2,2,1,3
23,3,2,2,2,3


In [17]:
# Summary statistics as a quick sanity check. The columns hold integer category
# codes (see the attribute list at the top), so count/min/max are the useful
# rows here; mean and std carry little meaning for coded categories.
df.describe()

Unnamed: 0,age,glass_rx,astigmatic,tear,contact_rx
count,24.0,24.0,24.0,24.0,24.0
mean,2.0,1.5,1.5,1.5,2.458333
std,0.834058,0.510754,0.510754,0.510754,0.779028
min,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,2.0
50%,2.0,1.5,1.5,1.5,3.0
75%,3.0,2.0,2.0,2.0,3.0
max,3.0,2.0,2.0,2.0,3.0


In [12]:
# Separate the four encoded attributes (predictors) from the class label (target).
feature_columns = ['age', 'glass_rx', 'astigmatic', 'tear']
X = df[feature_columns]
y = df['contact_rx']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the dataset is tiny, so the hold-out set is only a handful of
# rows and may not contain every class -- consider stratify=y or
# cross-validation before trusting the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the decision tree on the training rows (seeded for reproducible tie-breaks).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = clf.predict(X_test)

# Score the predictions against the true labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:", conf_matrix, "Classification Report:", report, sep="\n")

Accuracy: 1.00
Confusion Matrix:
[[1 0]
 [0 4]]
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         4

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [13]:
# Dump the held-out attributes followed by their true labels (plain-text form).
print(X_test, y_test, sep="\n")

    age  glass_rx  astigmatic  tear
8     2         1           1     1
16    3         1           1     1
0     1         1           1     1
18    3         1           2     1
11    2         1           2     2
8     3
16    3
0     3
18    3
11    1
Name: contact_rx, dtype: int64


In [18]:
def translate_result(result):
    """Turn one row of class probabilities into a Contact Rx label (1, 2 or 3).

    Position 0 maps to label 1, position 1 to label 2 and position 2 to
    label 3. The most probable position wins, so the function also works when
    the probabilities are not a pure one-hot vector (e.g. ``[0.2, 0.7, 0.1]``).
    Ties resolve to the lowest label because ``np.argmax`` returns the first
    maximum.

    NOTE(review): the position-to-label mapping assumes the classifier was
    trained on all three labels, i.e. ``clf.classes_`` is ``[1, 2, 3]`` --
    confirm against the fitted model.

    Parameters
    ----------
    result : array-like of 3 numbers
        Probabilities for labels 1, 2 and 3, in that order.

    Returns
    -------
    int or None
        The predicted label, or ``None`` when ``result`` is not a usable
        three-element probability row (wrong shape, non-numeric, NaN/inf,
        or no positive entry).
    """
    try:
        probs = np.asarray(result, dtype=float)
    except (TypeError, ValueError):
        return None  # Non-numeric or ragged input

    # Reject anything that is not a finite 3-vector with some positive mass.
    if probs.shape != (3,) or not np.isfinite(probs).all() or probs.max() <= 0:
        return None

    return int(np.argmax(probs)) + 1

In [23]:
# Manually check each held-out row: print its attributes next to the model's
# prediction and the true label.
#
# predict_proba returns one row per input and one column per class. This is a
# three-class problem, so the columns are the probabilities of Contact Rx being
# 1, 2 and 3 respectively (assuming all three labels were present in training).
# All test rows are scored in a single call instead of one call per row.
predicted_probas = clf.predict_proba(X_test)

# zip keeps attributes, probabilities and true labels aligned without a manual counter.
for (age, glass_rx, astigmatic, tear), proba, actual in zip(X_test.values, predicted_probas, y_test):
    print(f"age: {age}, glass_rx: {glass_rx}, astigmatic: {astigmatic}, tears: {tear}", end = ' | ')
    print(f'Predicted Contact Rx = {translate_result(proba)}; actual = {actual}')

age: 2, glass_rx: 1, astigmatic: 1, tears: 1 | Predicted Contact Rx = 3; actual = 3
age: 3, glass_rx: 1, astigmatic: 1, tears: 1 | Predicted Contact Rx = 3; actual = 3
age: 1, glass_rx: 1, astigmatic: 1, tears: 1 | Predicted Contact Rx = 3; actual = 3
age: 3, glass_rx: 1, astigmatic: 2, tears: 1 | Predicted Contact Rx = 3; actual = 3
age: 2, glass_rx: 1, astigmatic: 2, tears: 2 | Predicted Contact Rx = 1; actual = 1
