In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math
from sklearn.metrics import accuracy_score

In [2]:
trainSet = pd.read_csv('Santhosh/train.csv')

In [4]:
print(trainSet.head())

   41     services   married  secondary  no     0  yes no.1   unknown   5  \
0  48  blue-collar    single  secondary  no   312  yes  yes  cellular   3   
1  55   technician   married  secondary  no  1938   no  yes  cellular  18   
2  54       admin.   married   tertiary  no    59  yes   no  cellular  10   
3  34   management    single   tertiary  no  2646   no   no  cellular  14   
4  49       admin.  divorced  secondary  no  1709  yes   no   unknown  12   

   may  114  2   -1  0.1 unknown.1 no.2  
0  feb  369  2   -1    0   unknown   no  
1  aug  193  1  386    3   success  yes  
2  jul  268  1   -1    0   unknown   no  
3  apr  142  1   -1    0   unknown  yes  
4  jun  106  1   -1    0   unknown   no  


In [3]:
df=pd.DataFrame(trainSet)

In [5]:
print(df.describe())

                41              0            5          114            2  \
count  4999.000000    4999.000000  4999.000000  4999.000000  4999.000000   
mean     40.836767    1396.424685    15.811762   256.248850     2.775555   
std      10.695004    3333.735307     8.357794   251.960779     3.018423   
min      18.000000   -2604.000000     1.000000     4.000000     1.000000   
25%      33.000000      66.000000     8.000000   103.000000     1.000000   
50%      38.000000     453.000000    16.000000   180.000000     2.000000   
75%      48.000000    1421.000000    21.000000   318.000000     3.000000   
max      95.000000  102127.000000    31.000000  3253.000000    37.000000   

                -1          0.1  
count  4999.000000  4999.000000  
mean     37.825965     0.523505  
std      97.578028     1.719936  
min      -1.000000     0.000000  
25%      -1.000000     0.000000  
50%      -1.000000     0.000000  
75%      -1.000000     0.000000  
max     854.000000    41.000000  


In [41]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score

class TreeNode:
    def __init__(self, attribute, attributeName, is_leaf, label, depth, info_gain, entropy_parent_attr, parent_attr_val):
        self.attribute = attribute
        self.attributeName = attributeName
        self.children = {}
        self.is_leaf = is_leaf
        self.label = label
        self.depth = depth
        self.info_gain = info_gain
        self.entropy_parent_attr = entropy_parent_attr
        self.parent_attr_val = parent_attr_val

    def get_attribute(self):
        return self.attribute

    def add_child(self, child_node, attr_value):
        self.children[attr_value] = child_node
    
    def predict(self, x):
        if self.is_leaf:
            return self.label
        current_val = x[self.attribute]
        if current_val not in self.children.keys():
            return self.label
        return self.children[current_val].predict(x)

    def print_node(self, space=""):
        print(f"{space}Depth: {self.depth}")
        print(f"{space}Selected Feature: {self.attributeName}")
        print(f"{space}Information Gain for Parent Feature: {self.info_gain}")
        print(f"{space}Entropy for Parent Feature: {self.entropy_parent_attr}")
        print(f"{space}Parent Feature Value: {self.parent_attr_val}")
        print(f"{space}Label: {self.label}")
        for child in self.children.values():
            child.print_node(space + "\t")


class DecisionTreeClassifier:
    def __init__(self, max_depth=np.inf):
        self.root = None
        self.depth = 0
        if max_depth < 1:
            print("max_depth cannot be lower than 1! Setting it to 1.")
            max_depth = 1
        self.max_depth = max_depth
        self.longest_path_len = 0

    def build_tree(self, X, Y, attribute_names, attribute_list=[], current_depth=0,
                   parent_info={"max_info_gain": None, "attribute_list[max_attribute]": None, "value": None}):
        if current_depth > self.longest_path_len:
            self.longest_path_len = current_depth
        if current_depth >= self.max_depth or len(attribute_list) == 0 or len(np.unique(Y)) == 1:
            vals, counts = np.unique(Y, return_counts=True)
            return TreeNode(None, None, True, vals[np.argmax(counts)], current_depth,
                            parent_info["max_info_gain"], parent_info["attribute_list[max_attribute]"],
                            parent_info["value"])

        max_info_gain = -1
        max_attribute = None
        i = 0
        for attribute in attribute_list:
            info_gain, entropy_attribute, entropy_parent = self.calculate_information_gain(X, Y, attribute)
            if info_gain > max_info_gain:
                max_info_gain = info_gain
                max_attribute = i
                entropy = entropy_parent
            i += 1

        vals, counts = np.unique(Y, return_counts=True)
        root = TreeNode(attribute_list[max_attribute], attribute_names[attribute_list[max_attribute]],
                        False, vals[np.argmax(counts)], current_depth,
                        parent_info["max_info_gain"], parent_info["attribute_list[max_attribute]"],
                        parent_info["value"])

        attribute_values = np.unique(X[:, attribute_list[max_attribute]])
        new_attribute_list = np.delete(attribute_list, max_attribute)
        for value in attribute_values:
            indices = np.where(X[:, attribute_list[max_attribute]] == value)[0]
            if len(indices) == 0:
                root.add_child(TreeNode(None, None, True, vals[np.argmax(counts)], current_depth + 1,
                                        max_info_gain, attribute_list[max_attribute], value), current_depth)
            else:
                parent_info = {
                    "max_info_gain": max_info_gain,
                    "attribute_list[max_attribute]": entropy,
                    "value": value
                }
                root.add_child(self.build_tree(X[indices], Y[indices], attribute_names, new_attribute_list,
                                               current_depth + 1, parent_info), value)
        return root

    def calculate_entropy(self, counts):
        total = sum(counts)
        entropy_value = 0
        for element in counts:
            p = (element / total)
            if p != 0:
                entropy_value -= p * np.log2(p)
        return entropy_value

    def calculate_information_gain(self, X, Y, attribute):
        _, counts = np.unique(Y, return_counts=True)
        entropy_attribute = self.calculate_entropy(counts)
        entropy_parent = 0
        distinct_attr_values = list(set(X[:, attribute]))
        for val in distinct_attr_values:
            indices = np.where(X[:, attribute] == val)[0]
            _, counts = np.unique(Y[indices], return_counts=True)
            entr = self.calculate_entropy(counts)
            entropy_parent += (len(indices) / len(Y)) * entr
        info_gain = entropy_attribute - entropy_parent
        return info_gain, entropy_attribute, entropy_parent

    def fit(self, X, Y):
        attribute_names = list(range(X.shape[1]))  # Assume attributes are indexed
        attribute_list = np.arange(X.shape[1])
        self.root = self.build_tree(X, Y, attribute_names, attribute_list, 0)

    def predict(self, X):
        predictions = []
        for x in X:
            predictions.append(self.root.predict(x))
        return predictions

    def get_longest_path_len(self):
        return self.longest_path_len

    def get_root_attribute(self):
        if self.root:
            return self.root.get_attribute()
        return None

    def print_tree(self):
        self.root.print_node("")


# Load the corrected data
train_df = pd.read_csv("Santhosh/train.csv")
X_train = train_df.drop('no.2', axis=1).values  # Corrected target column name
y_train = train_df['no.2'].values  # Corrected target column name

test_df = pd.read_csv("Santhosh/test.csv")
X_test = test_df.drop('no.2', axis=1).values  # Corrected target column name
y_test = test_df['no.2'].values  # Corrected target column name

# Train and evaluate the Decision Tree Classifier
max_depth_range = list(range(1, 16))
criterion_list = ['information_gain', 'majority_error', 'gini']
results = {criterion: [] for criterion in criterion_list}

for criterion in criterion_list:
    train_errors = []
    test_errors = []
    for max_depth in max_depth_range:
        model = DecisionTreeClassifier(max_depth=max_depth)
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)

        train_errors.append(1 - train_acc)
        test_errors.append(1 - test_acc)

    results[criterion] = (train_errors, test_errors)

# Print results in a table

print('{:<12} {:<12} {:<12} {:<12}'.format('Max Depth', 'Test Error (IG)', 'Test Error (ME)', 'Test Error (Gini)'))
for i in range(len(max_depth_range)):
    print('{:<12} {:<12.3f} {:<12.3f} {:<12.3f}'.format(
        max_depth_range[i],
        results['information_gain'][1][i],
        results['majority_error'][1][i],
        results['gini'][1][i]
    ))


Max Depth    Test Error (IG) Test Error (ME) Test Error (Gini)
1            0.191        0.191        0.191       
2            0.195        0.195        0.195       
3            0.195        0.195        0.195       
4            0.195        0.195        0.195       
5            0.195        0.195        0.195       
6            0.195        0.195        0.195       
7            0.195        0.195        0.195       
8            0.195        0.195        0.195       
9            0.195        0.195        0.195       
10           0.195        0.195        0.195       
11           0.195        0.195        0.195       
12           0.195        0.195        0.195       
13           0.195        0.195        0.195       
14           0.195        0.195        0.195       
15           0.195        0.195        0.195       
