C4.5

In [1]:
import collections
import math
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle


class Node:
    def __init__(self, x, y, attribute_list, node_type):
        self.data = x
        self.labels = y
        self.attributes_list = attribute_list
        self.best_attribute = None
        self.split_criterion = None
        self.split_up_down = None
        self.node_type = node_type
        self.leaf_label = None
        self.depth = 0
        self.children = []
        self.parent = None

    def __lt__(self, other):
        return self.depth < other.depth

    def predict_leaf_class(self):
        """
            Computes the frequency of classes in partition D, output the leaf node label predicted class
        :return: pred_class
        """
        # takes frequency of classes in D to determine the majority class to set as output leaf label
        freq_classes = collections.Counter(self.labels)  # [4]
        pred_class = max(freq_classes, key=freq_classes.get)
        self.leaf_label = pred_class
        return pred_class

    def print_node(self):
        """
            Print node values
        """
        print('best att-', self.best_attribute, 'split_crit-', self.split_up_down, self.split_criterion, 'type-',
              self.node_type, 'depth-',
              self.depth, 'class label-', self.leaf_label)

    def copy(self):
        pass


class C45Tree:
    def __init__(self, attributes, data):
        self.tree_nodes = []
        self.depth = 0
        self.num_leaves = 0
        self.root_node = None
        self.attributes = attributes[:-1]
        self.dataset = data

    def train(self, x_train, y_train):
        """
            Helper function to grow tree recursively, creates root node for the tree and initializes the recursion for
            training the tree.
        :param x_train:
        :param y_train:
        """
        # create root node, put data partition in node
        self.root_node = Node(x_train, y_train, self.attributes, 'root')
        self.tree_nodes.append(self.root_node)
        # call grow_tree with root node as base
        self.grow_tree(self.root_node, self.attributes, (x_train, y_train))

    def grow_tree(self, prev_node, attribute_list, D):
        """
            Uses C4.5 decision tree algorithm to grow a tree during training, based on pseudocode from [1].
        :param attribute_list:
        :param D:
        :param prev_node:
        :return: N, the new node
        """
        if prev_node is not None and prev_node.parent is not None:
            if prev_node not in prev_node.parent.children:
                prev_node.parent.children.append(prev_node)

        # check for termination cases
        # check if all tuples in D are in the same class
        if self.check_same_class_labels(D[1]):
            N = Node(D[0], D[1], attribute_list, 'leaf')
            N.depth = prev_node.depth + 1
            N.predict_leaf_class()  # determine the class of the leaf
            N.best_attribute = str(prev_node.best_attribute)
            N.split_up_down = prev_node.split_up_down
            N.split_criterion = prev_node.split_criterion
            self.tree_nodes.append(N)
            prev_node.children.append(N)
            N.parent = prev_node
            return N

        # check if attribute list is empty, do majority voting on class
        if not attribute_list:
            N = Node(D[0], D[1], attribute_list, 'leaf')
            N.depth = prev_node.depth + 1
            N.predict_leaf_class()  # determine the class of the leaf
            N.best_attribute = str(prev_node.best_attribute)
            N.split_criterion = prev_node.split_criterion
            N.split_up_down = prev_node.split_up_down
            self.tree_nodes.append(N)
            prev_node.children.append(N)
            N.parent = prev_node
            return N

        # create new node
        N = Node(D[0], D[1], attribute_list, 'node')
        N.depth = prev_node.depth + 1
        N.parent = prev_node
        # conduct attribute selection method, label node with the criterion
        best_attribute, crit_split_val = self.attribute_selection_method(D, attribute_list)

        N.best_attribute = best_attribute  # label node with best attribute
        N.split_criterion = crit_split_val  # for discrete
        if best_attribute == '':
            # early stop
            N.best_attribute = str(best_attribute)
            N.split_up_down = None
            N.node_type = 'leaf'
            N.data = prev_node.data
            N.labels = prev_node.labels
            N.predict_leaf_class()
            self.tree_nodes.append(N)
            prev_node.children.append(N)
            return N

        # remove split attribute from attribute list
        if best_attribute in attribute_list:
            attribute_list.remove(best_attribute)

        # check if attribute is discrete NOTE THIS LINE NEEDS TO BE MODIFIED FOR DIFFERENT DATASET
        if len(self.dataset[
                   best_attribute].unique()) > 5:  # max 5 discrete categories in attributes from Thyroid set
            # continuous, divide up data at mid point of the values ai + ai1/2
            l_part, r_part, split_val = self.continuous_attribute_data_partition(D, best_attribute)
            N.split_criterion = split_val
            N.split_up_down = 'UP'
            l_child = self.grow_tree(N, attribute_list, l_part)  # upper -> att_val > split_val
            N_V = Node(D[0], D[1], attribute_list, 'node')
            N_V.depth = N.depth
            N_V.best_attribute = best_attribute
            N_V.split_criterion = split_val
            N_V.parent = prev_node
            N_V.split_up_down = 'DOWN'
            r_child = self.grow_tree(N_V, attribute_list, r_part)  # lower -> att_val <= split_val
            N.children.append(l_child)
            N_V.children.append(r_child)
            N.parent = prev_node
            self.tree_nodes.append(N)
            self.tree_nodes.append(N_V)
            prev_node.children.append(N)
            prev_node.children.append(N_V)
            return N
        else:
            # discrete, partition based on unique values of attribute to create nodes for recursion
            vals = self.dataset[best_attribute].unique()  # D[0][best_attribute].unique()
            for v in list(vals):
                data_part = self.partition_data(D, best_attribute, v)

                if not data_part:  # TOGGLED TO EMPTY CAUSES 2 LEAVES ONLY TO BE MADE ** check this
                    # majority class leaf node computed of D
                    L = Node(D[0], D[1], attribute_list, 'leaf')
                    L.depth = N.depth + 1
                    L.best_attribute = best_attribute
                    L.split_criterion = v
                    L.predict_leaf_class()  # determine the class of the leaf
                    self.tree_nodes.append(L)
                    N.children.append(L)
                    L.parent = N
                else:
                    # recursion
                    N_V = Node(D[0], D[1], attribute_list, 'node')
                    N_V.depth = N.depth
                    N_V.best_attribute = best_attribute
                    N_V.split_criterion = v
                    N_V.parent = prev_node
                    N_V.parent.children.append(N_V)
                    child = self.grow_tree(N_V, attribute_list, data_part)

        if N not in self.tree_nodes:
            self.tree_nodes.append(N)
            prev_node.children.append(N)
        return N

    def continuous_attribute_data_partition(self, D, attribute):
        """
            Creates data partitions (left and right) for continuous attributes, computing the mid point that
            enables the best information gain ratio to be calculated from the partition.
        :param D:
        :param attribute:
        :return: l_part, r_part, split_val
        """
        # sort the data, find the value that will gain the max info gain ratio
        data = D[0].sort_values(by=[attribute])
        split_val = 0
        best_igr = 0
        l_part = []
        r_part = []

        for i in range(0, len(data) - 1):
            mid_point = (float(data.iloc[i][attribute]) + float(data.iloc[i + 1][attribute])) / 2
            left_d = D[0].loc[pd.to_numeric(D[0][attribute]) > mid_point]
            left_idx = D[0].index[pd.to_numeric(D[0][attribute]) > mid_point]
            left_y = D[1].loc[left_idx]
            right_d = D[0].loc[pd.to_numeric(D[0][attribute]) <= mid_point]
            right_idx = D[0].index[pd.to_numeric(D[0][attribute]) <= mid_point]
            right_y = D[1].loc[right_idx]
            igr = self.compute_info_gain_ratio_continuous(D, left_y, right_y)

            if igr >= best_igr:
                best_igr = igr
                split_val = mid_point
                l_part = (left_d, left_y)
                r_part = (right_d, right_y)

        return l_part, r_part, split_val

    def compute_info_gain_ratio_continuous(self, D, left_y, right_y):
        """
            Computes the information gain ratio for a continuous attribute partition
        :return info_gain_ratio
        """
        l_y = left_y
        r_y = right_y

        dataset_entropy = self.data_entropy(D[1])
        l_part_entropy = self.data_entropy(l_y)
        l_p_j = float(len(l_y) / len(D))
        l_ent = l_p_j * l_part_entropy
        r_part_entropy = self.data_entropy(r_y)
        r_p_j = float(len(r_y) / len(D))
        r_ent = r_p_j * r_part_entropy

        split_info = - self.split_info(l_p_j) - self.split_info(r_p_j)
        att_ent = l_ent + r_ent

        if split_info == 0:  # prevent division by zero for ratio
            return 0
        else:
            info_gain = self.information_gain(dataset_entropy, att_ent)
            info_gain_ratio = self.information_gain_ratio(info_gain,
                                                          split_info)
        return info_gain_ratio

    @staticmethod
    def check_same_class_labels(labels):
        """
            Checks set of labels to ensure they are of the same class type
        :param labels:
        :return: bool
        """
        if len(set(labels)) == 1:
            return True
        else:
            return False

    def attribute_selection_method(self, D, attribute_list):
        """
            Attribute Selection Method for decision tree as discussed in [1] (Figure 8.3), selects attribute that
            provides the best information gain ratio as a result.
        :param D:
        :param attribute_list:
        :return: best_attribute
        """
        best_attribute = ''
        dataset_entropy = self.data_entropy(D[1])
        best_info_gain_ratio = 0.0
        split_val = ''

        for attribute in attribute_list:
            # a_idx = self.attributes.get(attribute) MIGHT NEED THIS
            v = D[0][attribute].unique()  # find v distinct values of attribute
            att_ent = 0.0
            split_info = 0.0
            curr_val = ''
            val_ent = 0.0
            for val in v:
                data_partition = self.partition_data(D, attribute, val)
                partition_labels = data_partition[1]
                part_entropy = self.data_entropy(partition_labels)
                p_j = float(len(data_partition[1]) / len(D[1]))
                att_ent = att_ent + (p_j * part_entropy)
                split_info = split_info - self.split_info(p_j)

                if part_entropy > val_ent:
                    val_ent = part_entropy
                    curr_val = val

            # Best Attribute checks
            if split_info == 0:  # prevent division by zero for ratio
                continue
            else:
                info_gain = self.information_gain(dataset_entropy, att_ent)
                info_gain_ratio = self.information_gain_ratio(info_gain,
                                                              split_info)  # calculate info gain ratio to select

            # compare the top performing attribute info gain ratio
            if info_gain_ratio > best_info_gain_ratio:
                best_info_gain_ratio = info_gain_ratio
                best_attribute = attribute
                split_val = curr_val
        return best_attribute, split_val

    def class_prob(self, feature_label, labels):
        """
            Computes class probabilities from labels
        :param feature_label:
        :param labels:
        :return: p
        """
        c = collections.Counter(labels)  # [4]
        p = c[feature_label] / len(labels)
        return float(p)

    def data_entropy(self, labels):
        """
            Computes the Entropy, or Info(D) [1]
        :param labels:
        :return: entropy
        """
        entropy = 0.0
        class_freq = collections.Counter(labels)  # [4]
        for l in class_freq.keys():
            p = float(class_freq[l] / len(labels))
            entropy = entropy - math.log(p, 2)
        return entropy

    def information_gain(self, dataset_entropy, attribute_entropy):
        """
            Computes information gain based on the data entropy and attribute entropy [1]
        :param dataset_entropy:
        :param attribute_entropy:
        :return: gain
        """
        gain = dataset_entropy - attribute_entropy
        return gain

    def split_info(self, p_j):
        """
            Computes the information split, used in gain ratio [1]
        :param p_j:
        :return: info_split
        """
        # error protection for zero case
        if p_j == 0:
            return 0

        info_split = (p_j * math.log(p_j, 2))
        return info_split

    def information_gain_ratio(self, gain, split_info):
        """
            Computes information gain ratio [1]
        :param gain:
        :param split_info:
        :return:
        """
        gain_ratio = float(gain / split_info)
        return gain_ratio

    def partition_data(self, D, attribute, val):
        """
            Partitions a dataset D based on the value of a specific attribute
        :param D:
        :param attribute:
        :param val:
        :return: part, part_y
        """
        part = D[0].loc[D[0][attribute] == val]
        part_idx = D[0].index[D[0][attribute] == val]
        part_y = D[1].loc[part_idx]
        return part, part_y

    def test_tree(self, test_sample, node):
        """
            Using recursion, we go through each node (from the root through to the children) to find a leaf label
            to classify the test sample as a prediction.
        :param test_sample:
        :param node:
        :return: node.leaf_label, or recursion
        """

        if node.node_type == 'leaf':
            return node.leaf_label
        else:
            for child in node.children:
                if (child.best_attribute is None or child.best_attribute == '') and child.node_type == 'leaf':
                    return self.test_tree(test_sample, child)

                if (child.best_attribute is None or child.best_attribute == '') and child.node_type == 'node':
                    pass
                else:
                    if child.split_criterion == test_sample[child.best_attribute]:
                        return self.test_tree(test_sample, child)
                    else:
                        if child.split_up_down == 'UP':
                            # check if att_val > split_criterion
                            if pd.to_numeric(test_sample[child.best_attribute]) > float(child.split_criterion):
                                return self.test_tree(test_sample, child)
                            else:
                                pass
                        elif child.split_up_down == 'DOWN':
                            if pd.to_numeric(test_sample[child.best_attribute]) <= float(child.split_criterion):
                                return self.test_tree(test_sample, child)
                            else:
                                pass

    def predict(self, test_x, test_y):  # TODO Add this functionality from the code in main routine
        # uses test set to predict class labels from the constructed tree
        preds = []
        true_pred = 0
        for i in range(len(test_x)):
            tester_instance = test_x.iloc[i]
            pred = self.test_tree(tester_instance, self.root_node)
            # print(str(i), 'pred', pred, 'label', y.iloc[i])
            if pred == test_y.iloc[i]:
                true_pred += 1
            preds.append(pred)

        return true_pred, preds

    def print_tree(self):
        nodes_created = sorted(self.tree_nodes)
        for n in nodes_created:
            n.print_node()
            for d in n.children:
                d.print_node()
            print()
        return

In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing 

In [3]:
# Pre-process
train_data = pd.read_csv('./adult/adult.data', header= None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class'])
test_data = pd.read_csv('./adult/adult.test', header= None, skiprows=1, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class'])
originTest = test_data

print("Original: Train", train_data.shape, ", Test", test_data.shape)

Original: Train (32561, 15) , Test (16281, 15)


In [4]:
#刪除重複的值

train_data.drop_duplicates(inplace=True)
test_data.drop_duplicates(inplace=True)

print("After Dropping: Train", train_data.shape, ", Test", test_data.shape)

After Dropping: Train (32537, 15) , Test (16276, 15)


In [5]:
# education and education-num have same meaning
train_data.drop(['education'], axis = 1, inplace = True)
test_data.drop(['education'], axis = 1, inplace = True)

# fnlwgt is not important feature
train_data.drop(['fnlwgt'], axis = 1, inplace = True)
test_data.drop(['fnlwgt'], axis = 1, inplace = True)

# remove the space
train_data = train_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
test_data = test_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# replace the "?" into NAN
train_data.replace("?", pd.NaT, inplace = True)
test_data.replace("?", pd.NaT, inplace = True)

# replace the NAN into mode value
train_data['workclass'] = train_data['workclass'].replace(float('nan'), train_data['workclass'].mode()[0])
train_data['occupation'] = train_data['occupation'].replace(float('nan'),train_data['occupation'].mode()[0])
train_data['native-country'] = train_data['native-country'].replace(float('nan'),train_data['native-country'].mode()[0])


test_data['workclass'] = test_data['workclass'].replace(float('nan'), test_data['workclass'].mode()[0])
test_data['occupation'] = test_data['occupation'].replace(float('nan'),test_data['occupation'].mode()[0])
test_data['native-country'] = test_data['native-country'].replace(float('nan'),test_data['native-country'].mode()[0])

train_data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# check if there is Nan or not
train_data.isnull().sum(axis=0)

age               0
workclass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [7]:
#One Hot Encoding (Dummies)
trainData_dum = pd.get_dummies(train_data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], dtype=int)
testData_dum = pd.get_dummies(test_data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'], dtype=int)
pd.DataFrame(trainData_dum)

## 數值屬性做Normalization(z-score)
numerical_columns = ['age','education-num','capital-gain','capital-loss','hours-per-week']
scaler = preprocessing.StandardScaler()
trainData_dum[numerical_columns] = scaler.fit_transform(trainData_dum[numerical_columns])
testData_dum[numerical_columns] = scaler.fit_transform(testData_dum[numerical_columns])

# tranfer the value of class(income) into int(1 or 0)
# >50K is 1, <=50K is 0
trainData_dum['class'] = trainData_dum['class'].apply(lambda x: 1 if x == ">50K" else 0)
testData_dum['class'] = testData_dum['class'].apply(lambda x: 1 if x == ">50K" else 0)

In [8]:
from sklearn.preprocessing import MinMaxScaler

#在讀熱編碼後會依照有名目之欄位產生資料，train_data比test_data多出了該欄位，故將test_data新增該欄位，讓兩個資料集欄位相同。
testData_dum['native-country_Holand-Netherlands'] = 0

#刪除重複列
trainData_dum.drop_duplicates(inplace=True)
testData_dum.drop_duplicates(inplace=True)

In [9]:
trainData_dum.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,class,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.03039,1.134777,0.148292,-0.216743,-0.035664,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.836973,1.134777,-0.145975,-0.216743,-2.222483,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.042936,-0.420679,-0.145975,-0.216743,-0.035664,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1.05695,-1.198407,-0.145975,-0.216743,-0.035664,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,-0.776193,1.134777,-0.145975,-0.216743,-0.035664,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
testData_dum.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,class,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_Holand-Netherlands
0,-0.994356,-1.196669,-0.142684,-0.218097,-0.031615,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,-0.055664,-0.417699,-0.142684,-0.218097,0.769762,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,-0.777734,0.750757,-0.142684,-0.218097,-0.031615,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.377579,-0.028214,0.870916,-0.218097,-0.031615,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,-1.499805,-0.028214,-0.142684,-0.218097,-0.832992,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [11]:
# put "class" to the last column
td = trainData_dum
column = td.pop("class")
td.insert(td.shape[1], "class", column)

columns = td.columns.tolist()

X_train = td.drop('class', axis=1)
y_train = td['class']

ted = testData_dum
t_column = ted.pop("class")
ted.insert(ted.shape[1], "class", t_column)

X_test = ted.drop('class', axis=1)
y_test = ted['class']


#C4.5
system_test = C45Tree(columns, td)
system_test.train(X_train, y_train)
true_pred, preds = system_test.predict(X_test, y_test)

print('Full set test accuracy:', true_pred / len(X_test))

# print(preds)

Full set test accuracy: 0.9243243243243243


In [30]:
#輸出混亂矩陣，顯示準確率
from sklearn.metrics import confusion_matrix,classification_report
print("輸出混亂矩陣，顯示準確率：使用驗證資料")
print(confusion_matrix(y_test, preds))
## 一般情況zero_division會設1(true)，除非確定每個類別都有被預測到才會設0
print(classification_report(y_test, preds, zero_division=1))

輸出混亂矩陣，顯示準確率：使用驗證資料
[[13851  1134]
 [    0     0]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96     14985
           1       0.00      1.00      0.00         0

    accuracy                           0.92     14985
   macro avg       0.50      0.96      0.48     14985
weighted avg       1.00      0.92      0.96     14985



## EXCEL 輸出

In [None]:
#將 R 的預測結果轉換為 Python 陣列
#rpy2
from openpyxl import Workbook

#產出Excel(Test data)
wb = Workbook()
ws = wb.active
ws.append(['age', 'workclass', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class', 'Predict result'])

for i in range(len(preds)):
    if preds[i] == 0:
        result = '<=50K.'
    else:
        result = '>50K.'
    #將現在loop到原始資料的列轉為list
    li = originTest.iloc[i,:].tolist()
    #
    li.append(result)
    ws.append(li)
wb.save('C45.xlsx')
# predictions = np.array(predictions)

In [None]:
print("Total Node：", len(system_test.tree_nodes))
leaf_count = 0
for n in system_test.tree_nodes:
    # print(n.print_node())
    if n.node_type == 'leaf':
        leaf_count += 1

print("Leaf Node：", leaf_count)

Total Node： 113
Leaf Node： 54


In [None]:
# test_data = pd.read_csv("adult/adult.test", header= None, names=['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'])
# test_data.drop(0,axis=0,inplace=True)
# #刪除重複的值
# test_data_new = test_data.drop_duplicates()

# preds_label = []
# for i in range(len(preds)):
#     if(preds[i]==0):
#         preds_label.append("<=50K")
#     else:
#         preds_label.append(">50K")
# preds_label = pd.DataFrame(preds_label)
# result = pd.concat([test_data_new, preds_label], axis=1)
# result.to_csv("adult_C45.csv",header=['age', 'workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income','income_predict'])




In [None]:
# # linear regression feature importance
# from sklearn.datasets import make_regression
# from sklearn.linear_model import LinearRegression
# from matplotlib import pyplot

In [None]:
# # get importance
# importance = system_test.feature_importances_
# # summarize feature importance
# for i,v in enumerate(importance):
# 	print('Feature: %0d, Score: %.5f' % (i,v))
# # plot feature importance
# pyplot.bar([x for x in range(len(importance))], importance)
# pyplot.show()

In [None]:
# import matplotlib.pyplot as plt

# #以圖表事每個特徵變數的重要程度(0最小，1最大)
# def plot_feature_importances_cancer(model):
#     n_features = X_train.shape[1]
#     plt.figure(figsize=(15,30))
#     plt.barh(np.arange(n_features), model.feature_importances_, align='center')
#     plt.yticks(np.arange(n_features), train_x.columns)
#     plt.xlabel("Feature importance")
#     plt.ylabel("Feature")
#     plt.ylim(-1, n_features)

# plot_feature_importances_cancer(system_test)