# 导入我们需要的包

其中 DictVectorizer 是 sklearn 提供的一个类（不是包），用于把由字典组成的列表转换为 one-hot 编码的特征矩阵。

In [1]:
import pandas as pd
import numpy as np
import sklearn

from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# 处理数据

1、labelList 里面是最后一列数据，也就是买或者不买的结果（yes 或 no）。
2、headerList 里面是特征列的列名。原始表头为 (RID, age, income, student, credit_rating, class_buys_computer)，去掉第一列 RID 和最后一列 class_buys_computer 之后，只剩 (age, income, student, credit_rating)。
3、valueList 里面是每个样本的取值，不包含 RID 字段和 class_buys_computer 字段。
4、featureList 是一个由字典组成的列表，每个字典对应一个样本，形如 {'age': 'youth', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}，记录该样本每个属性的取值。之所以整理成这种形式，是因为 DictVectorizer 可以把字典型数据转化为 one-hot 型数据。

In [2]:
# Load the raw table. The slicing below assumes the first column is a row id
# and the last column is the class label -- TODO confirm against the CSV.
data = pd.read_csv('./AllElectronics.csv')

# Feature column names: every column except the first (id) and the last (label).
headerList = list(data.columns[1:-1])

# Class labels: the values of the last column, one entry per sample.
labelList = data.iloc[:, -1].tolist()

# Per-sample feature values as a list of 1-D arrays. Slicing by position
# (1:-1) keeps this in step with headerList and avoids both a hard-coded
# column count and using index labels as iloc positions.
valueList = list(data.iloc[:, 1:-1].values)

# Pair each row's values with the column names via zip, giving one dict per
# sample -- the list-of-dicts format that DictVectorizer consumes later.
featureList = [dict(zip(headerList, row)) for row in valueList]

print(labelList)
print('\n')
print(featureList)
print('\n')
print(valueList)

['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']


[{'age': 'youth', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'high', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'youth', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'medium', 'student': 'yes', 'credit_rating': 'excellent'},

# 处理数据

将字典型数据转化为onehot数据

In [264]:
# Learn the feature vocabulary from the list of per-sample dicts, then encode
# the same samples and densify the result into a plain array.
vec = DictVectorizer().fit(featureList)
dummyX = vec.transform(featureList).toarray()
dummyX

array([[0., 0., 1., 0., 1., 1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 0., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 1., 0., 1.],
       [0., 0., 1., 1., 0., 0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0.]])

# 处理数据

用 LabelBinarizer 将 labelList 中的类别标签（class_buys_computer 的 yes/no）转换为数值标签。由于这里只有两个类别，输出是一列 0/1（no → 0，yes → 1），而不是多列的 one-hot 编码。

In [265]:
# Fit the binarizer on the class labels, then encode those same labels as a
# numeric column for training.
lb = preprocessing.LabelBinarizer().fit(labelList)
dummyY = lb.transform(labelList)
dummyY

array([[0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0]])

# 训练数据

以信息熵（entropy）作为划分标准，用 dummyX（特征）和 dummyY（标签）训练一个决策树分类器。

In [266]:
# Build a decision tree using the entropy criterion and train it on the encoded
# features and labels; fit() hands back the estimator, so the steps are chained.
# NOTE(review): random_state is left unset here -- consider pinning it if
# run-to-run reproducibility matters.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(dummyX, dummyY)
print(clf)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


# 测试数据

我们在第一条样本的基础上构造一条测试数据，只修改其中的年龄：把第 0 列置为 1、第 2 列置为 0（即把年龄从 youth 改为 middle_aged）。因为 predict 要求输入是二维数据（样本数 × 特征数），所以我们在外面加一层 [] 使它成为二维数据。

In [267]:
# Build one test sample from the first training row. Basic slicing returns a
# view into dummyX, so take a copy before editing; otherwise the assignments
# below would silently overwrite row 0 of the training matrix.
oneRowX = dummyX[0, :]
newRowX = oneRowX.copy()
# Flip two one-hot entries: switch column 0 on and column 2 off.
newRowX[0] = 1
newRowX[2] = 0
# Wrap the single row in a list so the model receives a 2-D input
# (one sample x n features).
newRowX = [newRowX]

# 预测结果

最后的结果

In [268]:
# Run the trained classifier on the hand-built sample and print the predicted
# label array (one entry per input row).
predictedY = clf.predict(newRowX)
print(predictedY)

[1]
