# 决策树

- ID3（基于信息增益）
- C4.5（基于信息增益比）
- CART（gini指数）

#### entropy：$H(Y) = -\sum_{i=1}^{n}p_i\log{p_i}$

#### conditional entropy: $H(Y|X)=\sum_{i=1}^{n}P(X=x_i)H(Y|X=x_i)$

#### information gain : $g(Y, X)=H(Y)-H(Y|X)$

#### information gain ratio: $g_R(Y, X) = \frac{g(Y,X)}{H(X)}$

#### gini index: $Gini(Y)=\sum_{k=1}^{K}p_k(1-p_k)=1-\sum_{k=1}^{K}p_k^2$

In [None]:
import numpy as np
import pandas as pd
np.set_printoptions(precision=3) 

### 先用CAR数据集理解决策树的实现


In [None]:
# Each split is loaded from its matching file: the training frame from
# car-train.csv and the held-out test frame from car-test.csv.
car_train = pd.read_csv('../input/d/harveydeng/classification-cars/car-train.csv')
car_test = pd.read_csv('../input/d/harveydeng/classification-cars/car-test.csv')

In [None]:
# Preview the first rows of car_train
car_train.head()

In [None]:
# Column names of car_test, taken as an array
labels = car_test.columns.values
labels

In [None]:
# Entropy
def calc_ent(y):
    """Return the empirical Shannon entropy, in bits, of the values in ``y``.

    The relative frequency of each distinct value is used as its
    probability ``p``; the result is ``-sum(p * log2(p))``.
    """
    counts = pd.Series.value_counts(y)
    probs = counts / counts.sum()
    return -(probs * np.log2(probs)).sum()

# Test data: entropy of the 'CAR' column
y = car_train['CAR']
calc_ent(y)

In [None]:
# Empirical conditional entropy
def cond_ent(d, X='buying', Y='CAR'):
    """Return the empirical conditional entropy H(Y|X) over the rows of ``d``.

    The entropy of column ``Y`` is computed separately inside each group of
    rows sharing the same value of column ``X``; the group entropies are then
    averaged, weighted by the share of rows in each group.
    """
    # Share of rows that fall into each distinct value of X.
    weights = d[X].value_counts(sort=False, normalize=True)
    # Entropy of Y within each X group; lines up with `weights` by group label.
    group_ent = d.groupby(X)[Y].agg(calc_ent)
    return (group_ent * weights).sum()

# Test data: conditional entropy of 'CAR' given 'buying'
d = car_train
X = 'buying'
Y = 'CAR'
cond_ent(d, X, Y)

In [None]:
# Best split variable
def best_x(d, Y='CAR'):
    """Find the column of ``d`` with the largest information gain about ``Y``.

    Every column other than ``Y`` is scored with ``H(Y) - H(Y|column)``. Each
    score is printed, followed by the name of the winning column.

    Returns a tuple ``(position, gain)``. ``position`` is the winner's integer
    position among the scored columns (the columns of ``d`` in order, with
    ``Y`` left out), not its name; ``gain`` is its information gain.
    """
    base_ent = calc_ent(d[Y])

    gains = {}
    for col in (c for c in d.columns if c != Y):
        # Information gain of this feature with respect to Y.
        gains[col] = base_ent - cond_ent(d, col, Y)
        print(f'变量{col}的信息增益为 = {gains[col]}')

    # A Series offers both a positional argmax and lookup by column name.
    gain_by_col = pd.Series(gains)
    best_pos = gain_by_col.argmax()
    best_col = gain_by_col.index[best_pos]
    print(f'最大信息增益变量为：{best_col}')
    return best_pos, gain_by_col[best_col]

# Test data: pick the best split variable for target 'CAR'
d = car_train
Y = 'CAR'
best_x(d, Y)

---

# Sklearn!

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
import sklearn.metrics as skm

# type help(DTC) in console for help

In [None]:
# Data preparation: separate the target column from the feature columns
Y = 'CAR'
x_train = car_train.drop(columns=Y)
y_train = car_train[Y]
x_test = car_test.drop(columns=Y)
y_test = car_test[Y]

# Model training
tree = DTC()  # extra hyperparameters can be passed here
tree.fit(x_train, y_train)

# Prediction
y_pred = tree.predict(x_test)

In [None]:
# Compare actual labels with predictions side by side
y2 = pd.DataFrame({'实际': y_test, '预测': y_pred})
y2

In [None]:
# Evaluation: overall accuracy first, then the confusion matrix
accuracy = skm.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"预测准确度为：\n{accuracy}")

conf = skm.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"混淆矩阵为：\n{conf}")