In [1]:
from collections import namedtuple,defaultdict
import numpy as np
import pandas as pd

In [2]:

TreeNode = namedtuple("TreeNode", 'feature iv branch label')

class DecisionTreeClassifier(object):
    def __init__(self):
        pass

    @staticmethod
    def devide_X(X_, feature):
        """切分集合
        :param X_: 被切分的集合, shape=[ni_samples, ni_features + 1]
        :param feature: 切分变量
        :param val: 切分变量的值
        :return: 数据集合list
        """
        X_list=[]
        feature_unique_val=np.unique(X_[:, feature])
        for _ in feature_unique_val:
            X_list.append(X_[X_[:, feature] == _])
        return X_list,feature_unique_val
    @staticmethod
    def iv_index(cluster, attr_index):
        """
        :param cluster: 训练集的一个子集
        :param attr_index:  特征编号（第N个特征）
        :return: 第N个特征的的IV
        """
        yi_dict = defaultdict(int)  # 每个特征，每种值中label =1的数量
        ni_dict = defaultdict(int)  # 每个特征，每种值 label =0的数量
        label_t_dict = defaultdict(int)  # label=1或0的数量
        for line in cluster:
            yi_dict[line[attr_index]] += line[-1]
            ni_dict[line[attr_index]] += (1 - line[-1])
            label_t_dict[line[-1]] += 1
        yi_arr = np.array(list(yi_dict.values()))
        ni_arr = np.array(list(ni_dict.values()))

        pyi = yi_arr / label_t_dict[1]
        pni = ni_arr / label_t_dict[0]
        pyi[pyi == 0] = 1
        pni[pni == 0] = 1  # Laplace平滑，令概率不为0

        pyi_div_pni = pyi / pni
        iv_i_arr = ((pyi) - (pni)) * np.log(pyi_div_pni)
        #         iv_i_arr[iv_i_arr > 0.8] = 0

        return sum(iv_i_arr)

    @staticmethod
    def get_best_iv_index(cluster, attr_indexs):
        '''
        :param cluster: 给定数据集
        :param attr_indexs: 给定的可供切分的特征编号的集合
        :return: 最佳切分特征，该特征的iv得分
        '''
        p = {}
        for attr_index in attr_indexs:
            p[attr_index] = DecisionTreeClassifier.iv_index(cluster, attr_index)
        attr_index = max(p, key=lambda x: p.get(x))#这里返回的是value最大的key值
        attr = p[attr_index]
        return attr_index, attr

    def build(self, X_, features):
        """建树
        :param X_: 候选集 shape=[ni_samples, n_features + 1]
        :param features: 候选特征集
        :param depth: 当前深度
        :return: 结点
        """

        if np.unique(X_[:, -1]).shape[0] == 1:
            return TreeNode(None, None, None, X_[0, -1])
        if features.shape[0] == 0 :
            classes, classes_count = np.unique(X_[:, -1], return_counts=True)
            return TreeNode(None, None, None, classes[np.argmax(classes_count)])
        feature_index, iv = DecisionTreeClassifier.get_best_iv_index(X_, features)
        new_features = features[features != feature_index]
        del features

        X_list,feature_unique_val = DecisionTreeClassifier.devide_X(X_, feature_index)
        branch_dict={}
        for fea_val,_ in zip(feature_unique_val,X_list):
            branch_temp = self.build(_, new_features)
            branch_dict[fea_val]=(branch_temp)

        return TreeNode(feature_index, iv, branch_dict, None)

    def fit(self, X, y):
        """
        :param X_: shape = [n_samples, n_features]
        :param y: shape = [n_samples]
        :return: self
        """
        features = np.arange(X.shape[1])
        X_ = np.c_[X, y]
        self.root = self.build(X_, features)
        return self

    def predict_one(self, x):
        p = self.root
        while p.label is None:
            print(p.feature)
            p = p.branch[x[p.feature]]
        return p.label

    def predict(self, X):
        """
        :param X: shape = [n_samples, n_features]
        :return: shape = [n_samples]
        """
        return np.array([self.predict_one(x) for x in X])



In [None]:
def count_ratio_every_col_obj(df_raw:pd.DataFrame):
    new_col=[]
    total=df_raw.shape[0]
    for _ in df_raw.columns:
        a=pd.value_counts(df_raw[_])
        df_=a/total
        print(_,max(df_))
        if (max(df_)<0.95):
            new_col.append(_)
    return new_col

In [None]:
df_equip_history=pd.read_csv("F:\\YIELD\\YoudaOptronics\\Archive(1)\\equip_history.csv",engine="python",sep=',',encoding='GBK')

df_equip_history['SHEET_ID']=df_equip_history['锘縎HEET_ID'] # modify unidentifiable columns 
df_equip_history.drop(columns=['锘縎HEET_ID'],inplace=True)
df_equip_history.fillna('-1',inplace=True)

col_list=[] #Found all productive station & machine under it .
for col in list(df_equip_history.columns):
    if 'R' not in col:
        col_list.append(col)
print(col_list)

df_measure_labels=pd.read_csv("F:\\YIELD\\YoudaOptronics\\Archive(1)\\measure_labels.csv",engine="python",sep=',',encoding='GBK')
df_measure_labels.dropna(inplace=True)

df_temp=pd.merge(df_equip_history[col_list],df_measure_labels[['SHEET_ID','Y']],how='inner',on='SHEET_ID')
df_temp['label']=0
df_temp['label'][(df_temp['Y']>=1)|(df_temp['Y']<=-1)]=1
col_list.remove('SHEET_ID')


df_label1=df_temp[df_temp['label']==1].copy()#sheetId有大量重复，这里选择只要有出现过不良的就作为响应标签
df_label1.drop_duplicates(inplace=True)
df_label0=df_temp[df_temp['label']==0].copy()
df_label0.drop_duplicates(inplace=True)
df_temp=pd.concat([df_label1,df_label0],axis=0)
df_temp.drop_duplicates('SHEET_ID','first',inplace=True)

print(df_temp.shape)
columns_list=count_ratio_every_col_obj(df_temp)
print(columns_list)
columns_list.remove('SHEET_ID')
columns_list.remove('label')
columns_list.remove('Y')
feature_data=df_temp[columns_list]
label_array=df_temp['label']

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve,auc, accuracy_score,recall_score,precision_score
from sklearn import tree

In [8]:
X_train, X_test, y_train, y_test =train_test_split(feature_data,label_array,test_size=0.05, random_state=0)

In [9]:
iv_clf=DecisionTreeClassifier()
iv_clf=iv_clf.fit(X_train.values,y_train.values)

In [10]:
y_pred=iv_clf.predict(X_test.values)
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred,labels=[0,1]))

12
6
11
1
5
12
7
10
6
12
7
10
6
12
7
10
1
6
8
12
7
10
1
11
12
7
10
1
6
5
0
2
3


KeyError: 0

In [100]:
for _ in range(X_test.shape[0]):
    print('------------',_)
    iv_clf.predict_one(X_test.values[_])

------------ 0
12
6
11
1
5
------------ 1
12
7
6
10
------------ 2
12
7
6
10
------------ 3
12
7
6
11
------------ 4
12
7
6
------------ 5
12
7
6
10
11
1
0
2
3
------------ 6
12
6
11
3
0
------------ 7
12
7
6
11
------------ 8
12
7
6
------------ 9
12
11
10
------------ 10
12
6
11
8
------------ 11
12
7
6
11
9
8
10
------------ 12
12
7
6
11
------------ 13
12
7
6
11
9
------------ 14
12
7
6
11
9
8
------------ 15
12
7
6
11
1
8
------------ 16
12
7
6
------------ 17
12
7
6
8
------------ 18
12
7
6
------------ 19
12
7
6
10
11
------------ 20
12
7
6
11
------------ 21
12
7
6
10
11
0
1


KeyError: 0

In [106]:
print(iv_clf.root)

TreeNode(feature=12, iv=5.037030197406397, branch={1: TreeNode(feature=6, iv=6.637767416735521, branch={0: TreeNode(feature=11, iv=5.2281596608182, branch={1: TreeNode(feature=None, iv=None, branch=None, label=1), 2: TreeNode(feature=3, iv=2.1972245773362196, branch={0: TreeNode(feature=0, iv=0.0, branch={0: TreeNode(feature=None, iv=None, branch=None, label=1), 1: TreeNode(feature=None, iv=None, branch=None, label=0)}, label=None), 1: TreeNode(feature=None, iv=None, branch=None, label=1), 2: TreeNode(feature=None, iv=None, branch=None, label=1)}, label=None), 3: TreeNode(feature=1, iv=3.289196099636184, branch={0: TreeNode(feature=5, iv=1.9362939684855176, branch={0: TreeNode(feature=None, iv=None, branch=None, label=0), 1: TreeNode(feature=9, iv=2.0442680632985475, branch={1: TreeNode(feature=None, iv=None, branch=None, label=0), 2: TreeNode(feature=0, iv=0.6931471805599453, branch={0: TreeNode(feature=None, iv=None, branch=None, label=1), 1: TreeNode(feature=2, iv=0.0, branch={0: Tr

In [104]:
X_test.values[20:22]

array([[1, 0, 0, 1, 0, 2, 0, 2, 2, 1, 3, 5, 2],
       [1, 0, 0, 1, 0, 1, 0, 3, 1, 3, 2, 3, 2]], dtype=int64)

In [3]:
df_heart_disease=pd.read_csv("F:\\YIELD\\heart disease predicate\\heart.csv")
df_heart_disease['age_bin']=5
df_heart_disease['oldpeak_bin']=5

df_heart_disease['age_bin'][df_heart_disease['age']<=40]=1
df_heart_disease['age_bin'][(df_heart_disease['age']>40) & (df_heart_disease['age']<=50)]=2
df_heart_disease['age_bin'][(df_heart_disease['age']>50) & (df_heart_disease['age']<=55)]=3
df_heart_disease['age_bin'][(df_heart_disease['age']>55) & (df_heart_disease['age']<=60)]=4
df_heart_disease['age_bin'][(df_heart_disease['age']>60)]=5

for col in ['trestbps','thalach','chol']:
    df_heart_disease[col+'_bin']=5
    min_=int(df_heart_disease[col].min())
    max_=int(df_heart_disease[col].max())
    range_=int(((max_-min_)/5))
    low_bound=-1
    print(col,range_,max_,min_)
    for bin_index,_ in zip(range(1,5),range(min_+range_,max_+1,range_)):
        print(low_bound,_,bin_index)
        df_heart_disease[col+'_bin'][(df_heart_disease[col]>low_bound) & (df_heart_disease[col]<=_)]=bin_index
        low_bound=_
    print(low_bound)
    df_heart_disease[col+'_bin'][(df_heart_disease[col]>low_bound)]=5
    
df_heart_disease['oldpeak_bin'][df_heart_disease['oldpeak']<=0.1]=1
df_heart_disease['oldpeak_bin'][(df_heart_disease['oldpeak']>0.1) & (df_heart_disease['oldpeak']<=1)]=2
df_heart_disease['oldpeak_bin'][(df_heart_disease['oldpeak']>1) & (df_heart_disease['oldpeak']<=2)]=3
df_heart_disease['oldpeak_bin'][(df_heart_disease['oldpeak']>2)]=4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A

trestbps 21 200 94
-1 115 1
115 136 2
136 157 3
157 178 4
178
thalach 26 202 71
-1 97 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


97 123 2
123 149 3
149 175 4
175
chol 87 564 126
-1 213 1
213 300 2
300 387 3
387 474 4
474


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
df_heart_disease.drop(['age','trestbps','thalach','chol','oldpeak'],axis=1,inplace=True)

In [5]:
label_array=df_heart_disease['target']
columns_list=list(df_heart_disease.columns)
columns_list.remove('target')
feature_data=df_heart_disease[columns_list]

In [6]:
feature_data.info()
label_array.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 13 columns):
sex             303 non-null int64
cp              303 non-null int64
fbs             303 non-null int64
restecg         303 non-null int64
exang           303 non-null int64
slope           303 non-null int64
ca              303 non-null int64
thal            303 non-null int64
age_bin         303 non-null int64
oldpeak_bin     303 non-null int64
trestbps_bin    303 non-null int64
thalach_bin     303 non-null int64
chol_bin        303 non-null int64
dtypes: int64(13)
memory usage: 30.9 KB


(303,)

In [None]:
import matplotlib.pyplot as plt

In [None]:
decisionNode=dict(boxstyle='sawtooth',fc='0.8')
leafNode=dict(boxstyle='round4',fc='0.8')
arrow_args=dict(arrowstyle="<-")

In [None]:
def plotNode(nodeTxt,centerPt,parentPt,nodeType):
    createPlot.ax1.annotate(nodeTxt,xy=parentPt,
                             xycoords='axes fraction',xytext=centerPt,textcoords='axes fraction',
                             va='center',ha='center',bbox=nodeType,arrowprops=arrow_args)
def createPlot():
    fig=plt.figure(1,facecolor='white')
    fig.clf()
    createPlot.ax1=plt.subplot(111,frameon=False)
    plotNode('non-leaf node',(-0.5,0.1),(0.1,0.5),decisionNode)
    plotNode('leaf node',(0.8,0.5),(0.3,0.8),leafNode)
    plt.show()

    
fig=plt.figure(1,facecolor='white')
fig.clf()
createPlot.ax1=plt.subplot(111,frameon=False)
def traverse(node_head,x1,y1,x2,y2,level_node_num):
    if node_head.label==None:
        for index,machine in zip(range(1,len(node_head.branch)+1),node_head.branch):
            
            traverse(node_head.branch[machine],x1+(0.2*index),y1+0.2,x2+(0.2*index),y2+0.2,level_node_num=len(node_head.branch))
            plotNode('non-leaf node',(-0.5,0.1),(0.1,0.5),decisionNode)
    else:
        plotNode('leaf node',(x1,y1),(x2,y2+0.2),leafNode,0)
plt.show()
    
        
# def plotMidText(parent_postion_x2,parent_postion_y2,branch_dict,txtString):
#     xMid=(parentPt[0])

In [None]:
createPlot()

In [None]:
p=iv_clf.root
fig=plt.figure(1,facecolor='white')
fig.clf()
createPlot.ax1=plt.subplot(111,frameon=False)

root_postion_x1=0
root_postion_y1=1
root_postion_x2=0
root_postion_y2=1
plotNode(columns_list[p.feature],(root_postion_x1,root_postion_y1),(root_postion_x2,root_postion_y2),decisionNode)
while p.label is None:
    plotNode(columns_list[p.feature],(0,1),(0,1),decisionNode)    
    p = p.branch[x[p.feature]]
plt.show()

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

import jgraph
from jgraph import *
jgraph.__version__