# In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
class Node:
    """A single node of a binary decision tree.

    An internal node stores a ``feature``/``threshold`` test plus its two
    children; a leaf node has no children and stores only a prediction.

    Args:
        feature: Index of the feature tested at this node.
        threshold: Split threshold compared against the feature value.
        left: Child node for the first branch of the test.
        right: Child node for the second branch of the test.
        label: Prediction stored in the node.
        value: Alias for ``label``. Both spellings are accepted so that
            callers using either ``label=`` or ``value=`` work; ``label``
            wins when both are given.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None,
                 label=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        # Compare against None (not truthiness) so that falsy predictions
        # such as 0 or "" are kept.
        self.label = label if label is not None else value

    @property
    def value(self):
        """Prediction stored in this node (same object as ``label``)."""
        return self.label

    @value.setter
    def value(self, new_value):
        self.label = new_value

    def is_leaf(self):
        """Return True when the node has no children."""
        return self.left is None and self.right is None

class CharacterDecisionTree:
    """Binary decision-tree classifier built from entropy-minimising splits.

    Every internal node tests ``x[feature] <= threshold``; every leaf stores
    the most frequent class among the training samples that reached it.
    Class labels may be any values NumPy can sort and compare (ints, strings).

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path, or
            ``None`` to keep splitting until a node is pure or cannot be
            split any further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.root = None  # replaced by the tree root in fit()

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like holding one class label per sample.

        Returns:
            The fitted instance, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its number of rows
                differs from ``len(y)``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.root = self._build_tree(X, y)
        return self

    def predict(self, X):
        """Return an array with the predicted label of every row in ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for samples ``X``/``y`` at ``depth``."""
        n_features = X.shape[1]

        # Base cases: depth limit reached, node already pure, or nothing to
        # split on. In all of them the node becomes a majority-class leaf.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        is_pure = len(np.unique(y)) == 1
        if depth_exhausted or is_pure or n_features == 0:
            return Node(label=self._majority_label(y))

        # Pick the feature/threshold pair with the lowest weighted entropy.
        best_feature, best_threshold = self._choose_split_feature(X, y)
        if best_feature is None:
            # No threshold separates the samples (e.g. identical rows with
            # different labels); recursing would never terminate.
            return Node(label=self._majority_label(y))

        # Partition the samples and grow both branches.
        left_idx, right_idx = self._split_data(X[:, best_feature], best_threshold)
        left_subtree = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right_subtree = self._build_tree(X[right_idx], y[right_idx], depth + 1)

        return Node(best_feature, best_threshold, left_subtree, right_subtree)

    def _majority_label(self, y):
        """Return the most frequent label in ``y`` as a plain Python value."""
        return Counter(np.asarray(y).tolist()).most_common(1)[0][0]

    def _choose_split_feature(self, X, y):
        """Find the split with the lowest weighted child entropy.

        Returns:
            ``(feature_index, threshold)``, or ``(None, None)`` when no
            threshold puts at least one sample on each side.
        """
        best_feature = None
        best_threshold = None
        best_entropy = float('inf')
        n_samples = len(y)

        for feature in range(X.shape[1]):
            values = X[:, feature]
            # np.unique sorts the candidates; the largest one is dropped
            # because ``<= max`` would send every sample to the left child.
            for threshold in np.unique(values)[:-1]:
                left_idx, right_idx = self._split_data(values, threshold)
                if len(left_idx) == 0 or len(right_idx) == 0:
                    continue

                left_entropy = self._calculate_entropy(y[left_idx])
                right_entropy = self._calculate_entropy(y[right_idx])
                total_entropy = (
                    (len(left_idx) / n_samples) * left_entropy
                    + (len(right_idx) / n_samples) * right_entropy
                )

                if total_entropy < best_entropy:
                    best_feature = feature
                    best_threshold = threshold
                    best_entropy = total_entropy

        return best_feature, best_threshold

    def _split_data(self, values, threshold):
        """Return index arrays for ``values <= threshold`` and ``values > threshold``."""
        left_idx = np.where(values <= threshold)[0]
        right_idx = np.where(values > threshold)[0]
        return left_idx, right_idx

    def _calculate_entropy(self, labels):
        """Return the Shannon entropy (in bits) of the label distribution."""
        n_labels = len(labels)
        if n_labels <= 1:
            return 0

        # np.unique counts arbitrary label types; np.bincount would only
        # accept non-negative integers.
        _, counts = np.unique(labels, return_counts=True)
        probs = counts / n_labels
        return -np.sum(probs * np.log2(probs))

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        # A node without children is a leaf.
        while node.left is not None or node.right is not None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.label

import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the character records from the JSON file.
# JSON text is UTF-8 encoded, so decode it explicitly instead of relying on
# the platform's default encoding.
with open('characters.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Separate features and labels: each sample is
# [height, weight, age, is_male] and its label is the character's name.
characters = data['characters']
X = [
    [character['height'], character['weight'], character['age'], character['gender'] == 'male']
    for character in characters
]
y = [character['name'] for character in characters]

# NOTE: a hold-out split (train_test_split with test_size=0.2, followed by
# conversion to NumPy arrays) was previously sketched here and is disabled.
# The model below is therefore trained and scored on the same data.

# Train the decision tree.
clf = DecisionTreeClassifier()
clf.fit(X, y)

# Measure accuracy on the training data itself (not a generalisation estimate).
y_pred = clf.predict(X)
print(accuracy_score(y, y_pred))

# Optional visualisation (disabled): tree.plot_tree(clf)
