<a href="https://colab.research.google.com/github/sarveshrastogi1/ML_Lab_1bm22cs247/blob/main/tennis_weather_ID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Read the dataset from CSV into a DataFrame; every later step works on `df`.
# NOTE(review): the absolute /content path presumably targets a Colab runtime — confirm before running elsewhere.
df = pd.read_csv("/content/tennis.csv")

# Define the entropy calculation function
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Each distinct value contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency. An empty input yields ``0``.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    total = np.sum(frequencies)
    result = 0
    for frequency in frequencies:
        share = frequency / total
        result -= share * np.log2(share)
    return result

# Define function to calculate information gain for an attribute
def info_gain(data, split_attribute, target_attr):
    """Return the information gain of splitting ``data`` on ``split_attribute``.

    The gain is the entropy of the ``target_attr`` column minus the
    size-weighted entropy of that column inside each partition produced by
    the distinct values of ``split_attribute``.
    """
    column = data[split_attribute]
    levels, level_counts = np.unique(column, return_counts=True)
    n_rows = np.sum(level_counts)

    # Entropy left after the split: each partition weighted by its share of rows.
    remainder = sum(
        (size / n_rows) * entropy(data[column == level][target_attr])
        for level, size in zip(levels, level_counts)
    )

    return entropy(data[target_attr]) - remainder

# Implement the recursive ID3 algorithm
def id3(data, original_data, features, target_attr, parent_class=None):
    """Build a decision tree with the recursive ID3 algorithm.

    Args:
        data: DataFrame holding the examples that reach this node.
        original_data: The full training DataFrame; its most common class is
            the fallback label when ``data`` is empty.
        features: Sequence of column names still available for splitting.
        target_attr: Name of the class-label column.
        parent_class: Accepted for backward compatibility only. A node with no
            features left is labelled from its own examples, not its parent's.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # No examples at all: fall back to the most common class of the full dataset.
    if data.empty:
        labels, counts = np.unique(original_data[target_attr], return_counts=True)
        return labels[np.argmax(counts)]

    labels, counts = np.unique(data[target_attr], return_counts=True)

    # Pure node: every example carries the same class.
    if len(labels) == 1:
        return labels[0]

    # Most common class among the examples at this node; argmax picks the
    # first of np.unique's sorted labels on a tie.
    majority_class = labels[np.argmax(counts)]

    # No attribute left to split on: label the leaf with this node's own
    # majority. The previous code returned the parent's majority (or None at
    # the root), which can contradict the examples that actually reach here.
    if len(features) == 0:
        return majority_class

    # Split on the feature with the highest information gain.
    gains = [info_gain(data, feature, target_attr) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [feat for feat in features if feat != best_feature]

    # Grow one branch per value of the chosen feature observed in `data`.
    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, original_data, remaining_features, target_attr, majority_class
        )
    return tree

# Specify the target attribute and feature names.
# Change "play" if your CSV uses a different header for the target column.
target_attribute = "play"
if target_attribute not in df.columns:
    # Fail with an explicit message instead of list.remove's generic ValueError.
    raise ValueError(f"Target column '{target_attribute}' not found in the dataset. Please check the header names.")

# Every remaining column is a candidate split feature.
features = list(df.columns)
features.remove(target_attribute)

# Build the decision tree; the full dataset is both the working set and the fallback reference.
decision_tree = id3(df, df, features, target_attribute)

# Output the resulting decision tree
print("Learned Decision Tree using ID3:")
print(decision_tree)


Learned Decision Tree using ID3:
{'outlook': {'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}, 'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [None]:
pip install graphviz



In [None]:
import pandas as pd
import numpy as np
from graphviz import Digraph

# Load the training examples from the CSV file.
df = pd.read_csv("tennis.csv")

# Show the header names so a mismatched target column is easy to spot.
print("Columns in dataset:", df.columns.tolist())

# Name of the class-label column; edit it if the CSV header differs.
target_attribute = "play"
if target_attribute in df.columns:
    # Every column other than the target is a candidate split feature.
    features = [column for column in df.columns if column != target_attribute]
else:
    raise ValueError(f"Target column '{target_attribute}' not found in the dataset. Please check the header names.")

# Function to calculate entropy
def entropy(target_col):
    """Compute the base-2 Shannon entropy of the labels in ``target_col``.

    Returns ``0`` for an empty input, otherwise the sum of ``-p * log2(p)``
    over the relative frequency ``p`` of each distinct label.
    """
    label_counts = np.unique(target_col, return_counts=True)[1]
    n_samples = np.sum(label_counts)
    ent = 0
    for probability in (count / n_samples for count in label_counts):
        ent -= probability * np.log2(probability)
    return ent

# Function to calculate information gain for an attribute
def info_gain(data, split_attribute, target_attr):
    """Return how much splitting on ``split_attribute`` reduces target entropy.

    Computed as the entropy of ``data[target_attr]`` minus the entropy of the
    same column within each ``split_attribute`` partition, weighted by the
    partition's share of the rows.
    """
    split_values, split_counts = np.unique(data[split_attribute], return_counts=True)
    n_rows = np.sum(split_counts)

    weighted_entropy = 0
    for split_value, split_count in zip(split_values, split_counts):
        partition = data[data[split_attribute] == split_value]
        weighted_entropy += (split_count / n_rows) * entropy(partition[target_attr])

    return entropy(data[target_attr]) - weighted_entropy

# ID3 recursive algorithm
def id3(data, original_data, features, target_attr, parent_class=None):
    """Build a decision tree with the recursive ID3 algorithm.

    Args:
        data: DataFrame holding the examples that reach this node.
        original_data: The full training DataFrame; its most common class is
            the fallback label when ``data`` is empty.
        features: Sequence of column names still available for splitting.
        target_attr: Name of the class-label column.
        parent_class: Accepted for backward compatibility only. A node with no
            features left is labelled from its own examples, not its parent's.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # No examples at all: fall back to the most common class of the full dataset.
    if data.empty:
        labels, counts = np.unique(original_data[target_attr], return_counts=True)
        return labels[np.argmax(counts)]

    labels, counts = np.unique(data[target_attr], return_counts=True)

    # Pure node: every example carries the same class.
    if len(labels) == 1:
        return labels[0]

    # Most common class among the examples at this node; argmax picks the
    # first of np.unique's sorted labels on a tie.
    majority_class = labels[np.argmax(counts)]

    # No attribute left to split on: label the leaf with this node's own
    # majority. The previous code returned the parent's majority (or None at
    # the root), which can contradict the examples that actually reach here.
    if len(features) == 0:
        return majority_class

    # Split on the feature with the highest information gain.
    gains = [info_gain(data, feature, target_attr) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [feat for feat in features if feat != best_feature]

    # Grow one branch per value of the chosen feature observed in `data`.
    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, original_data, remaining_features, target_attr, majority_class
        )
    return tree

# Learn the tree from the full dataset, then print its nested-dict form under a heading.
decision_tree = id3(data=df, original_data=df, features=features, target_attr=target_attribute)
print("Learned Decision Tree using ID3:", decision_tree, sep="\n")

# -----------------------------------------------
# Visualization using Graphviz
# -----------------------------------------------

def add_nodes_edges(tree, dot=None, parent=None, edge_label=""):
    """
    Recursively add nodes and edges from the decision tree dictionary into the Graphviz Digraph.

    Args:
        tree: Either a leaf label or a dict ``{attribute: {branch_value: subtree}}``.
        dot: Digraph to draw into; a new one is created when omitted.
        parent: Node name of the parent inside ``dot``, or ``None`` for the root.
        edge_label: Text placed on the edge from ``parent`` to this node.

    Returns:
        The Digraph containing the drawn (sub)tree.
    """
    if dot is None:
        dot = Digraph()

    # Name nodes by insertion order instead of id(tree): equal leaf labels can
    # be the very same Python object, which would merge separate leaves into a
    # single graph node. Each node()/edge() call appends a line to dot.body,
    # so its length is different every time a new node is named.
    node_id = f"node{len(dot.body)}"

    is_leaf = not isinstance(tree, dict)
    if is_leaf:
        dot.node(node_id, label=str(tree), shape="box", style="filled", color="lightgray")
    else:
        # A tree dict has exactly one key: the attribute tested at this node.
        root_attr = next(iter(tree))
        # str() keeps non-string column names usable as labels.
        dot.node(node_id, label=str(root_attr), shape="ellipse", style="filled", color="lightblue")

    if parent is not None:
        dot.edge(parent, node_id, label=edge_label)

    if not is_leaf:
        # For each branch, add the subtree recursively.
        for branch_val, subtree in tree[root_attr].items():
            add_nodes_edges(subtree, dot, parent=node_id, edge_label=str(branch_val))

    return dot

# Create the Graphviz Digraph from the decision tree structure.
dot = add_nodes_edges(decision_tree)
# Request PNG as the output format for rendering.
dot.format = 'png'
# Save and render the tree visualization.
# view=True additionally asks graphviz to open the rendered file in the default viewer.
dot.render("decision_tree", view=True)

# Confirmation message; it names the file expected from the "decision_tree" base name plus the png format.
print("Decision tree visualization generated and saved as 'decision_tree.png'.")


Columns in dataset: ['outlook', 'temp', 'humidity', 'windy', 'play']
Learned Decision Tree using ID3:
{'outlook': {'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}, 'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
Decision tree visualization generated and saved as 'decision_tree.png'.


In [None]:
import pandas as pd
import numpy as np
from graphviz import Digraph

# Read the training data and report its header names.
df = pd.read_csv("tennis.csv")
print("Columns:", df.columns.tolist())

# Class-label column; stop early when the CSV does not contain it.
target = "play"
if target not in df.columns:
    raise ValueError(f"Target column '{target}' not found.")

# Candidate split attributes: every column except the target.
features = [column for column in df.columns if column != target]

# Calculate entropy
def entropy(col):
    """Return the base-2 Shannon entropy of the values in ``col``.

    The result is the negated sum of ``p * log2(p)`` over the relative
    frequency ``p`` of each distinct value; an empty input gives ``0``.
    """
    occurrences = np.unique(col, return_counts=True)[1]
    n_values = np.sum(occurrences)
    accumulated = 0
    for occurrence in occurrences:
        fraction = occurrence / n_values
        accumulated += fraction * np.log2(fraction)
    return -accumulated

# Calculate information gain
def info_gain(data, attr, target):
    """Return the information gain obtained by splitting ``data`` on ``attr``.

    This is the entropy of ``data[target]`` minus the row-share-weighted
    entropy of ``target`` inside each partition defined by a value of ``attr``.
    """
    attr_column = data[attr]
    levels, sizes = np.unique(attr_column, return_counts=True)
    n_rows = np.sum(sizes)

    baseline = entropy(data[target])
    after_split = sum(
        (size / n_rows) * entropy(data[attr_column == level][target])
        for level, size in zip(levels, sizes)
    )
    return baseline - after_split

# ID3 algorithm
def id3(data, orig, feats, target, parent_class=None):
    """Build a decision tree with the recursive ID3 algorithm.

    Args:
        data: DataFrame holding the examples that reach this node.
        orig: The full training DataFrame; its most common class is the
            fallback label when ``data`` is empty.
        feats: Sequence of column names still available for splitting.
        target: Name of the class-label column.
        parent_class: Accepted for backward compatibility only. A node with no
            features left is labelled from its own examples, not its parent's.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # No examples at all: fall back to the most common class of the full dataset.
    if data.empty:
        labels, counts = np.unique(orig[target], return_counts=True)
        return labels[np.argmax(counts)]

    labels, counts = np.unique(data[target], return_counts=True)

    # Pure node: every example carries the same class.
    if len(labels) == 1:
        return labels[0]

    # Most common class among the examples at this node; argmax picks the
    # first of np.unique's sorted labels on a tie.
    majority_class = labels[np.argmax(counts)]

    # No attribute left to split on: label the leaf with this node's own
    # majority. The previous code returned the parent's majority (or None at
    # the root), which can contradict the examples that actually reach here.
    if len(feats) == 0:
        return majority_class

    # Split on the feature with the highest information gain.
    best = feats[np.argmax([info_gain(data, f, target) for f in feats])]
    remaining = [f for f in feats if f != best]

    # Grow one branch per value of the chosen feature observed in `data`.
    tree = {best: {}}
    for val in np.unique(data[best]):
        tree[best][val] = id3(data[data[best] == val], orig, remaining, target, majority_class)
    return tree

# Learn the tree from the full dataset; `df` is passed as both the working set and the fallback reference.
tree = id3(df, df, features, target)
# Print the nested-dict representation of the learned tree.
print("Decision Tree:", tree)

# Visualization
def add_nodes_edges(tree, dot=None, parent=None, edge_label=""):
    """Recursively draw a decision-tree dict into a Graphviz Digraph.

    Args:
        tree: Either a leaf label or a dict ``{attribute: {branch_value: subtree}}``.
        dot: Digraph to draw into; a new one is created when omitted.
        parent: Node name of the parent inside ``dot``, or ``None`` for the root.
        edge_label: Text placed on the edge from ``parent`` to this node.

    Returns:
        The Digraph containing the drawn (sub)tree.
    """
    # Explicit None checks: only a missing argument should trigger the default.
    if dot is None:
        dot = Digraph()

    # Name nodes by insertion order instead of id(tree): equal leaf labels can
    # be the very same Python object, which would merge separate leaves into a
    # single graph node. Each node()/edge() call appends a line to dot.body,
    # so its length is different every time a new node is named.
    nid = f"node{len(dot.body)}"

    is_leaf = not isinstance(tree, dict)
    if is_leaf:
        dot.node(nid, label=str(tree), shape="box", style="filled", color="lightgray")
    else:
        # A tree dict has exactly one key: the attribute tested at this node.
        root = next(iter(tree))
        # str() keeps non-string column names usable as labels.
        dot.node(nid, label=str(root), shape="ellipse", style="filled", color="lightblue")

    if parent is not None:
        dot.edge(parent, nid, label=edge_label)

    if not is_leaf:
        for branch, sub in tree[root].items():
            add_nodes_edges(sub, dot, nid, str(branch))

    return dot

# Convert the learned tree dict into a Graphviz Digraph.
dot = add_nodes_edges(tree)
# Request PNG as the output format for rendering.
dot.format = 'png'
# Render to disk under the "decision_tree" base name; view=True also asks graphviz to open the result in the default viewer.
dot.render("decision_tree", view=True)
# Confirmation message; it names the file expected from the base name plus the png format.
print("Visualization saved as 'decision_tree.png'.")


Columns: ['outlook', 'temp', 'humidity', 'windy', 'play']
Decision Tree: {'outlook': {'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}, 'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
Visualization saved as 'decision_tree.png'.
