In [2]:
import numpy as np
import pandas as pd


# 导入数据集并预览
stu_grade = pd.read_csv(
    "./course-13-student.csv"
)
stu_grade.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,traveltime,studytime,schoolsup,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,U,A,4.0,course,mother,2,2,yes,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,U,T,1.0,course,father,1,2,no,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,U,T,1.0,other,mother,1,2,yes,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,U,T,3.0,home,mother,1,3,no,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,U,T,3.0,home,father,1,2,no,...,4,3,2,1,2,5,4,6,10,10


In [3]:
# 特征过多，选择一部分特征进行计算
new_data = stu_grade.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 14, 15, 24, 25, 26]]
new_data.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,studytime,schoolsup,famsup,paid,higher,internet,G1,G2,G3
0,GP,F,U,A,4.0,course,mother,2,yes,no,no,yes,no,5,6,6
1,GP,F,U,T,1.0,course,father,2,no,yes,no,yes,yes,5,5,6
2,GP,F,U,T,1.0,other,mother,2,yes,no,yes,yes,yes,7,8,10
3,GP,F,U,T,3.0,home,mother,3,no,yes,yes,yes,yes,15,14,15
4,GP,F,U,T,3.0,home,father,2,no,yes,yes,yes,no,6,10,10


In [4]:
# 对G1、G2、G3 进行等级划分
def choice_2(x):
    x = int(x)
    if x < 5:
        return "bad"
    elif x >= 5 and x < 10:
        return "medium"
    elif x >= 10 and x < 15:
        return "good"
    else:
        return "excellent"


stu_data = new_data.copy()
stu_data["G1"] = pd.Series(map(lambda x: choice_2(x), stu_data["G1"]))
stu_data["G2"] = pd.Series(map(lambda x: choice_2(x), stu_data["G2"]))
stu_data["G3"] = pd.Series(map(lambda x: choice_2(x), stu_data["G3"]))
stu_data.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,studytime,schoolsup,famsup,paid,higher,internet,G1,G2,G3
0,GP,F,U,A,4.0,course,mother,2,yes,no,no,yes,no,medium,medium,medium
1,GP,F,U,T,1.0,course,father,2,no,yes,no,yes,yes,medium,medium,medium
2,GP,F,U,T,1.0,other,mother,2,yes,no,yes,yes,yes,medium,medium,good
3,GP,F,U,T,3.0,home,mother,3,no,yes,yes,yes,yes,excellent,good,excellent
4,GP,F,U,T,3.0,home,father,2,no,yes,yes,yes,no,medium,good,good


In [5]:
# 对 Pedu（父母教育程度）也进行划分
def choice_3(x):
    x = int(x)
    if x > 3:
        return "high"
    elif x > 1.5:
        return "medium"
    else:
        return "low"


stu_data["Pedu"] = pd.Series(map(lambda x: choice_3(x), stu_data["Pedu"]))
stu_data.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,studytime,schoolsup,famsup,paid,higher,internet,G1,G2,G3
0,GP,F,U,A,high,course,mother,2,yes,no,no,yes,no,medium,medium,medium
1,GP,F,U,T,low,course,father,2,no,yes,no,yes,yes,medium,medium,medium
2,GP,F,U,T,low,other,mother,2,yes,no,yes,yes,yes,medium,medium,good
3,GP,F,U,T,medium,home,mother,3,no,yes,yes,yes,yes,excellent,good,excellent
4,GP,F,U,T,medium,home,father,2,no,yes,yes,yes,no,medium,good,good


In [12]:
# 将数据特征进行替换
# 根据特征总类，替换为 0......n
def replace_feature(data):
    """
    参数:
    data -- 数据集

    返回:
    data -- 将特征值替换后的数据集
    """
    # 特征值替换
    for each in data.columns:  # 遍历每一个特征名称
        feature_list = data[each]
        unique_value = set(feature_list)
        i = 0
        for fea_value in unique_value:
            data[each] = data[each].replace(fea_value, i)
            i += 1
    return data

In [13]:
stu_data = replace_feature(stu_data)
stu_data.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,studytime,schoolsup,famsup,paid,higher,internet,G1,G2,G3
0,0,0,1,0,0,0,2,1,1,0,0,1,0,3,3,3
1,0,0,1,1,1,0,1,1,0,1,0,1,1,3,3,3
2,0,0,1,1,1,2,2,1,1,0,1,1,1,3,3,2
3,0,0,1,1,2,1,2,2,0,1,1,1,1,1,2,1
4,0,0,1,1,2,1,1,1,0,1,1,1,0,3,2,2


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    stu_data.iloc[:, :-1], stu_data["G3"], test_size=0.3, random_state=5
)

X_test.head()

Unnamed: 0,school,sex,address,Pstatus,Pedu,reason,guardian,studytime,schoolsup,famsup,paid,higher,internet,G1,G2
306,0,1,1,0,2,0,0,0,0,0,0,1,0,1,1
343,0,0,1,0,2,1,1,1,0,1,0,1,1,3,3
117,0,1,1,1,2,1,1,0,0,0,0,1,1,2,2
50,0,0,1,1,2,0,2,1,0,1,1,1,1,2,2
316,0,0,1,1,1,0,2,1,0,1,1,1,1,3,3


In [None]:
# DecisionTreeClassifier(criterion='gini', random_state=None) 常用参数如下：

# criterion 表示特征划分方法选择，默认为 gini (在后面会讲到)，可选择为 entropy (信息增益)。

# ramdom_state 表示随机数种子，当特征特别多时 scikit-learn 为了提高效率，随机选取部分特征来进行特征选择，即找到所有特征中较优的特征。

In [15]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(criterion="entropy", random_state=34)
dt_model.fit(X_train, y_train)  # 使用训练集训练模型

In [16]:
from sklearn.tree import export_graphviz
import graphviz

img = export_graphviz(
    dt_model,
    out_file=None,
    feature_names=stu_data.columns[:-1].values.tolist(),  # 传入特征名称
    class_names=np.array(["bad", "medium", "good", "excellent"]),  # 传入类别值
    filled=True,
    node_ids=True,
    rounded=True,
)

graphviz.Source(img)  # 展示决策树

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x1d1ecc0de10>

In [None]:
y_pred = dt_model.predict(X_test)  # 使用模型对测试集进行预测
y_pred

In [None]:
from sklearn.metrics import accuracy_score

# 计算分类准确度
accuracy_score(y_test, y_pred)