In [1]:
import pandas as pd
import math

# Step 1: Calculate Entropy for a dataset
def calculate_entropy(data):
    labels = data.iloc[:, -1].value_counts(normalize=True)
    entropy = -sum(labels * labels.apply(lambda x: math.log2(x) if x > 0 else 0))
    return entropy

# Step 2: Calculate Information Gain for a feature
def calculate_information_gain(data, feature):
    total_entropy = calculate_entropy(data)
    feature_values = data[feature].value_counts(normalize=True)
    weighted_entropy = sum(feature_values.apply(lambda val: calculate_entropy(data[data[feature] == val])) * feature_values)
    return total_entropy - weighted_entropy

# Step 3: TDIDT Algorithm to build a Decision Tree
class TDIDT:
    def __init__(self, approach='take_first'):
        self.tree = None
        self.approach = approach

    def fit(self, data):
        self.tree = self._build_tree(data)
        return self.tree

    def _build_tree(self, data):
        # If all instances have the same label, return the label
        if len(data.iloc[:, -1].unique()) == 1:
            return data.iloc[0, -1]
        
        # If there are no more features to split on, return the majority label
        if len(data.columns) == 1:
            return data.iloc[:, -1].mode()[0]
        
        best_feature = None
        max_info_gain = -1
        
        # Decide the order of features based on the approach
        feature_order = self._get_feature_order()

        # Find the best feature to split on
        for feature in feature_order:  # Iterate based on feature order
            info_gain = calculate_information_gain(data, feature)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                best_feature = feature
        
        tree = {best_feature: {}}
        feature_values = data[best_feature].unique()

        # Recursively build the tree based on the selected feature
        for value in feature_values:
            subset = data[data[best_feature] == value].drop(columns=[best_feature])
            tree[best_feature][value] = self._build_tree(subset)
        
        return tree

    def _get_feature_order(self):
        if self.approach == 'take_first':
            return ['Buying', 'Maintenance', 'Doors', 'Persons', 'Lug_boot', 'Safety']
        elif self.approach == 'take_last':
            return ['Safety', 'Lug_boot', 'Persons', 'Doors', 'Maintenance', 'Buying']

    def print_tree(self, tree=None, indent=""):
        """Print the decision tree in a human-readable format."""
        if tree is None:
            tree = self.tree
        
        if isinstance(tree, dict):
            for feature, branches in tree.items():
                for value, subtree in branches.items():
                    print(f"{indent}{feature} = {value}?")
                    self.print_tree(subtree, indent + "  ")
        else:
            print(f"{indent}Predict: {tree}")

# Load the dataset (replace with your file path)
file_path = '/Users/rahatrihan/Desktop/AIUB/data mining/Assignment/car_evaluation.csv'
data = pd.read_csv(file_path)

# Train TDIDT model using 'Take First' approach
model_take_first = TDIDT(approach='take_first')
model_take_first.fit(data)

# Print the generated decision tree for Take First approach
print("Decision Tree using 'Take First' approach:")
model_take_first.print_tree()

# Train TDIDT model using 'Take Last' approach
model_take_last = TDIDT(approach='take_last')
model_take_last.fit(data)

# Print the generated decision tree for Take Last approach
print("\nDecision Tree using 'Take Last' approach:")
model_take_last.print_tree()


KeyError: 'Lug_boot'

In [3]:
print(data.columns)


Index(['Buying', 'Price', 'Maintenance', 'Doors', 'Persons', 'Lug_Boot',
       'Safety Class'],
      dtype='object')


In [None]:
s