# A6 : Decision Tree

## 1. Calculate entropy and gain

In [1]:
import math

# function to calculate entropy
def entropy(p):
    """Return the Shannon entropy, in bits, of the probabilities in ``p``.

    ``p`` is any iterable of probabilities. Entries that are not strictly
    positive contribute nothing, so ``log2`` is never evaluated at zero.
    """
    positive_terms = (prob * math.log2(prob) for prob in p if prob > 0)
    return -sum(positive_terms)

# Class probabilities of the original set and of the two subsets
p_original = [5/8, 3/8]
p_sub1 = [4/5, 1/5]
p_sub2 = [1/3, 2/3]

# Share of the original set that lands in each subset
p1 = 5 / 8
p2 = 3 / 8

# Entropy of the original set and of each subset
entropy_original, entropy_sub1, entropy_sub2 = (
    entropy(distribution) for distribution in (p_original, p_sub1, p_sub2)
)

# Subset entropies weighted by subset share
average_after_split_entropy = p1 * entropy_sub1 + p2 * entropy_sub2

# Gain = entropy before the split minus the weighted entropy after it
gain_of_split = entropy_original - average_after_split_entropy

# Report the answers for 1.a, 1.b and 1.c

print("1.a\n")
print(f"Original entropy: {entropy_original:.4f}")
print(f"Entropy after split 1: {entropy_sub1:.4f}")
print(f"Entropy after split 2: {entropy_sub2:.4f}")

print("\n1.b\n")
print(f"Average entropy after the split: {average_after_split_entropy:.4f}")

print("\n1.c\n")
print(f"gain of the split: {gain_of_split:.4f}")


1.a

Original entropy: 0.9544
Entropy after split 1: 0.7219
Entropy after split 2: 0.9183

1.b

Average entropy after the split: 0.7956

1.c

gain of the split: 0.1589


## 2. Build a decision tree with the ID3 algorithm

In [2]:
import pandas as pd

# Data from the table, written one observation per row
column_names = ['Outlook', 'Temp', 'Humidity', 'Wind', 'Play']
weather_rows = [
    ('Sunny', 'Hot', 'High', 'Weak', 'No'),
    ('Sunny', 'Hot', 'High', 'Strong', 'No'),
    ('Overcast', 'Hot', 'High', 'Weak', 'Yes'),
    ('Rain', 'Mild', 'High', 'Weak', 'Yes'),
    ('Rain', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('Rain', 'Cool', 'Normal', 'Strong', 'No'),
    ('Overcast', 'Cool', 'Normal', 'Strong', 'No'),
    ('Sunny', 'Mild', 'High', 'Weak', 'No'),
    ('Sunny', 'Cool', 'Normal', 'Weak', 'Yes'),
    ('Rain', 'Mild', 'Normal', 'Weak', 'Yes'),
    ('Sunny', 'Mild', 'Normal', 'Strong', 'Yes'),
    ('Overcast', 'Mild', 'High', 'Strong', 'No'),
    ('Overcast', 'Hot', 'Normal', 'Weak', 'Yes'),
    ('Rain', 'Mild', 'High', 'Strong', 'No'),
]

# Transpose the rows into a column-name -> list-of-values mapping
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*weather_rows))
}
df = pd.DataFrame(data)

# print the data
print(df)


# Entropy of the target column ('Play') over the whole dataset
total = df['Play'].count()
play_counts = df['Play'].value_counts()
origin_entropty = entropy(play_counts.div(total))

# Calculate entropy for each attribute and their Information Gain
def entropy_of_attribute(dataframe, attribute, target):
    """Return the weighted average entropy of ``target`` after splitting on ``attribute``.

    The rows of ``dataframe`` are partitioned by each distinct value found in
    the ``attribute`` column. The entropy of the ``target`` column within each
    partition is weighted by that partition's share of the rows, and the
    weighted entropies are added together.
    """
    n_rows = len(dataframe)
    weighted_entropy = 0

    for level in dataframe[attribute].unique():
        # Target labels of the rows whose attribute equals this value
        labels = dataframe.loc[dataframe[attribute] == level, target]
        n_level = len(labels)
        class_probs = labels.value_counts() / n_level
        weighted_entropy += (n_level / n_rows) * entropy(class_probs)

    return weighted_entropy

# Gain of each candidate attribute = dataset entropy - weighted entropy after splitting on it
attributes = ['Outlook', 'Temp', 'Humidity', 'Wind']
gain_of_split = {
    attribute: origin_entropty - entropy_of_attribute(df, attribute, 'Play')
    for attribute in attributes
}

# Report the dataset entropy followed by every attribute's gain
print(f"Overall entropy: {origin_entropty:.4f}")
print("\n")
for attribute, gain in gain_of_split.items():
    print(f"Gain of split for {attribute}: {gain:.4f}")

# The root of the tree is the attribute with the largest gain (first one wins a tie)
best_attribute = max(gain_of_split.items(), key=lambda item: item[1])[0]
print(f"\nBest attribute to split on: {best_attribute}")

     Outlook  Temp Humidity    Wind Play
0      Sunny   Hot     High    Weak   No
1      Sunny   Hot     High  Strong   No
2   Overcast   Hot     High    Weak  Yes
3       Rain  Mild     High    Weak  Yes
4       Rain  Cool   Normal    Weak  Yes
5       Rain  Cool   Normal  Strong   No
6   Overcast  Cool   Normal  Strong   No
7      Sunny  Mild     High    Weak   No
8      Sunny  Cool   Normal    Weak  Yes
9       Rain  Mild   Normal    Weak  Yes
10     Sunny  Mild   Normal  Strong  Yes
11  Overcast  Mild     High  Strong   No
12  Overcast   Hot   Normal    Weak  Yes
13      Rain  Mild     High  Strong   No
Overall entropy: 1.0000


Gain of split for Outlook: 0.0207
Gain of split for Temp: 0.0000
Gain of split for Humidity: 0.1369
Gain of split for Wind: 0.2578

Best attribute to split on: Wind


In [3]:
# Partition the dataset into the two branches of the Wind split
weak = df.loc[df['Wind'].eq('Weak')]
strong = df.loc[df['Wind'].eq('Strong')]

# Show both branches; sep="\n" puts each heading and each frame on its own line
print("\nWeak wind data:", weak, "\nStrong wind data:", strong, sep="\n")


Weak wind data:
     Outlook  Temp Humidity  Wind Play
0      Sunny   Hot     High  Weak   No
2   Overcast   Hot     High  Weak  Yes
3       Rain  Mild     High  Weak  Yes
4       Rain  Cool   Normal  Weak  Yes
7      Sunny  Mild     High  Weak   No
8      Sunny  Cool   Normal  Weak  Yes
9       Rain  Mild   Normal  Weak  Yes
12  Overcast   Hot   Normal  Weak  Yes

Strong wind data:
     Outlook  Temp Humidity    Wind Play
1      Sunny   Hot     High  Strong   No
5       Rain  Cool   Normal  Strong   No
6   Overcast  Cool   Normal  Strong   No
10     Sunny  Mild   Normal  Strong  Yes
11  Overcast  Mild     High  Strong   No
13      Rain  Mild     High  Strong   No


In [4]:
# Repeat the entropy / gain analysis inside each Wind branch
for data in [weak, strong]:
    total = data['Play'].count()
    play_counts = data['Play'].value_counts()
    entropy_val = entropy(play_counts / total)
    print(f"\nEntropy for the data: Wind = {data['Wind'].unique()[0]}")
    print(f"Entropy: {entropy_val:.4f}\n")

    # Wind is already fixed on this branch, so only the other attributes compete
    attributes = ['Outlook', 'Temp', 'Humidity']
    gain_of_split = {
        attribute: entropy_val - entropy_of_attribute(data, attribute, 'Play')
        for attribute in attributes
    }
    for attribute, gain in gain_of_split.items():
        print(f"Gain of split for {attribute}: {gain:.4f}")

    # Largest gain wins; the first attribute listed wins a tie
    best_attribute = max(gain_of_split.items(), key=lambda item: item[1])[0]
    print(f"\nBest attribute to split on: {best_attribute}")
    print("=========================================")



Entropy for the data: Wind = Weak
Entropy: 0.8113

Gain of split for Outlook: 0.4669
Gain of split for Temp: 0.1226
Gain of split for Humidity: 0.3113

Best attribute to split on: Outlook

Entropy for the data: Wind = Strong
Entropy: 0.6500

Gain of split for Outlook: 0.3167
Gain of split for Temp: 0.1909
Gain of split for Humidity: 0.1909

Best attribute to split on: Outlook


In [5]:
# Second-level split: partition each Wind branch by Outlook
weak_sunny, weak_overcast, weak_rain = (
    weak[weak['Outlook'] == outlook] for outlook in ('Sunny', 'Overcast', 'Rain')
)
strong_sunny, strong_overcast, strong_rain = (
    strong[strong['Outlook'] == outlook] for outlook in ('Sunny', 'Overcast', 'Rain')
)

# Show every leaf under its heading, each followed by a blank-line separator
leaves = [
    ("Wind = Weak, Outlook = Sunny", weak_sunny),
    ("Wind = Weak, Outlook = Overcast", weak_overcast),
    ("Wind = Weak, Outlook = Rain", weak_rain),
    ("Wind = Strong, Outlook = Sunny", strong_sunny),
    ("Wind = Strong, Outlook = Overcast", strong_overcast),
    ("Wind = Strong, Outlook = Rain", strong_rain),
]
for heading, leaf in leaves:
    print(heading)
    print(leaf)
    print("\n")


Wind = Weak, Outlook = Sunny
  Outlook  Temp Humidity  Wind Play
0   Sunny   Hot     High  Weak   No
7   Sunny  Mild     High  Weak   No
8   Sunny  Cool   Normal  Weak  Yes


Wind = Weak, Outlook = Overcast
     Outlook Temp Humidity  Wind Play
2   Overcast  Hot     High  Weak  Yes
12  Overcast  Hot   Normal  Weak  Yes


Wind = Weak, Outlook = Rain
  Outlook  Temp Humidity  Wind Play
3    Rain  Mild     High  Weak  Yes
4    Rain  Cool   Normal  Weak  Yes
9    Rain  Mild   Normal  Weak  Yes


Wind = Strong, Outlook = Sunny
   Outlook  Temp Humidity    Wind Play
1    Sunny   Hot     High  Strong   No
10   Sunny  Mild   Normal  Strong  Yes


Wind = Strong, Outlook = Overcast
     Outlook  Temp Humidity    Wind Play
6   Overcast  Cool   Normal  Strong   No
11  Overcast  Mild     High  Strong   No


Wind = Strong, Outlook = Rain
   Outlook  Temp Humidity    Wind Play
5     Rain  Cool   Normal  Strong   No
13    Rain  Mild     High  Strong   No




In [6]:
# Repeat the analysis for weak_sunny and strong_sunny only
for data in [weak_sunny, strong_sunny]:
    total = data['Play'].count()
    play_counts = data['Play'].value_counts()
    entropy_val = entropy(play_counts / total)
    print(f"\nEntropy for the data: Wind = {data['Wind'].unique()[0]}")
    print(data)
    print(f"Entropy: {entropy_val:.4f}\n")

    # Wind and Outlook are already fixed on these paths
    attributes = ['Temp', 'Humidity']
    gain_of_split = {
        attribute: entropy_val - entropy_of_attribute(data, attribute, 'Play')
        for attribute in attributes
    }
    for attribute, gain in gain_of_split.items():
        print(f"Gain of split for {attribute}: {gain:.4f}")

# Both attributes tie on gain in each leaf, so the choice between them is arbitrary
print("\nRandomly choose the attribute to split : humidity")



Entropy for the data: Wind = Weak
  Outlook  Temp Humidity  Wind Play
0   Sunny   Hot     High  Weak   No
7   Sunny  Mild     High  Weak   No
8   Sunny  Cool   Normal  Weak  Yes
Entropy: 0.9183

Gain of split for Temp: 0.9183
Gain of split for Humidity: 0.9183

Entropy for the data: Wind = Strong
   Outlook  Temp Humidity    Wind Play
1    Sunny   Hot     High  Strong   No
10   Sunny  Mild   Normal  Strong  Yes
Entropy: 1.0000

Gain of split for Temp: 1.0000
Gain of split for Humidity: 1.0000

Randomly choose the attribute to split : humidity


![DT_Result](./DT_result.jpeg)