In [1]:
#2017/6/18 兩兩類別比較並找出特徵值
#2017/5/8 根據「自由中國SVM作者預測」檔案內容包裝
#2017/5/10 完成部分包裝
#2017/5/12 完成全部包裝，加入路徑規則尋找
#2017/5/19 各函式模塊化

#建立基準特徵詞組向量 -> 將清理後的資料建成文本向量 -> 資料平衡 -> 預測模型建立 -> 評估報表產生
#同主題不同作者、同作者不同主題
#作者歸屬常用語言特徵：高頻詞、2-gram、3-gram、標點符號
#本研究提出語言特徵：詞性組合、否定程度組合、情態詞組合

#函式庫引入
import os
import time
import codecs
import itertools
import numpy as np
from sklearn import svm
from collections import OrderedDict,defaultdict,Counter
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
from IPython.display import clear_output,Image
from sklearn.externals import joblib
import pydotplus 

In [2]:
#輸入資料

# Candidate classes: author names and article topics.
classification_name = ['雷震','殷海光','夏道平','傅正','龍平甫','蔣勻田','朱伴耘','胡適','羅鴻詔']
classification_topic = ['社論','文章','日記']


# Author name -> position in classification_name (insertion order is preserved).
# The original note says the list is ordered by number of texts, high to low.
author_index = OrderedDict(
    (author, position) for position, author in enumerate(classification_name)
)

# Topic name -> position in classification_topic (insertion order is preserved).
topic_index = OrderedDict(
    (topic, position) for position, topic in enumerate(classification_topic)
)

# Input / output directories. Every path ends with a separator because the
# functions below build file names by plain string concatenation.
condicate_author_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\author\\" # candidate author texts
condicate_topic_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\topic\\" # candidate topic texts
SC_feature_path = "D:\\課業相關\\論文資料\\論文程式\\language_feature\\最終版\\平衡語料庫\\" # balanced-corpus language features
FC_feature_path = "D:\\課業相關\\論文資料\\論文程式\\language_feature\\最終版\\自由中國\\" # Free China language features
lei_feature_path = "D:\\課業相關\\論文資料\\論文程式\\language_feature\\最終版\\雷震文本\\" # Lei Chen texts language features
condicate_tree_path = "D:\\課業相關\\論文資料\\論文程式\\預測資料\\condicate_path\\" # per-class decision-path rule files

In [3]:
#輸入介面
def find_input(find):
    """Parse a query of the form '<corpus> <feature keyword> <domain>'.

    corpus is 'SC', 'FC' or 'lei'; the feature keyword is matched as a
    substring against the file names in that corpus' feature directory (the
    first match in os.listdir order is used); domain is 'name' or 'topic'.

    Returns the tuple (feature_file_path, feature_file_name, condicate_path,
    condicate_label, condicate_index) on success. On any invalid input a
    message is printed and False is returned.
    """
    tokens = find.split()

    if len(tokens) != 3:
        print ('請輸入正確值')
        return False

    corpus_key, feature_key, domain_key = tokens

    # Resolve the feature directory of the requested corpus.
    if corpus_key == 'SC':
        feature_file_path = SC_feature_path
    elif corpus_key == 'FC':
        feature_file_path = FC_feature_path
    elif corpus_key == 'lei':
        feature_file_path = lei_feature_path
    else:
        print ('母體選項不符合')
        return False

    # Pick the first feature file whose name contains the keyword.
    matches = [file for file in os.listdir(feature_file_path) if feature_key in file]
    if len(matches) == 0:
        print ('輸入的語言特徵不在範圍內')
        return False
    feature_file_name = matches[0]

    # Resolve the candidate set (authors or topics).
    if domain_key == 'name':
        return (feature_file_path, feature_file_name,
                condicate_author_path, classification_name, author_index)
    if domain_key == 'topic':
        return (feature_file_path, feature_file_name,
                condicate_topic_path, classification_topic, topic_index)

    print ('領域選項不符合')
    return False

In [4]:
#抓取候選文本，並根據特徵轉換成文本向量

#抓取候選文本，回傳文章序列及各文章類別代號
def article_get(condicate_path,condicate_label,condicate_index):
    """Load the candidate texts found in condicate_path.

    A directory entry is a candidate when the part of its name before the
    first '_' is contained in condicate_label. For each candidate the first
    line (the title) is skipped and the stripped second line is taken as the
    article content.

    Args:
        condicate_path: directory path; it is concatenated directly with the
            file name, so it must end with a path separator.
        condicate_label: collection of accepted class names.
        condicate_index: mapping from class name to class id.

    Returns:
        (content_list, article_label): the article contents and, in the same
        order, the class id of each article.
    """
    content_list = [] # raw content of every accepted article
    article_label = [] # class id of every accepted article

    for file in os.listdir(condicate_path):
        class_name = file.split('_')[0]

        # Filter BEFORE opening: non-candidate entries (other classes,
        # sub-directories, unreadable files) must not be touched at all.
        if class_name not in condicate_label:
            continue

        with codecs.open(condicate_path+file,'rb','utf8') as f:
            f.readline() # title line, not used
            content = f.readline().strip()

        content_list.append(content)
        article_label.append(condicate_index[class_name])

    return content_list,article_label

#選擇語言特徵，回傳文本向量詞組
def feature_select(feature_file_path):
    """Read the baseline feature list from a UTF-8 file.

    Every non-blank line contributes the text before its first comma; a
    byte-order mark anywhere in a line is removed first. The features are
    returned in file order.
    """
    with codecs.open(feature_file_path,'rb','utf8') as f:
        # Drop the BOM and surrounding whitespace, then skip blank lines.
        cleaned = (raw.replace('\ufeff','').strip() for raw in f.readlines())
        return [entry.split(',')[0] for entry in cleaned if entry != '']

#建立文本向量
def article_vector(X_raw,feature,feature_file_name):
    """Turn tokenised articles into relative-frequency feature vectors.

    Each article is a whitespace-separated token string. How the tokens are
    combined into countable units depends on feature_file_name:

    * name contains one of 'N+N','N+V','VH+N','D+V','情態詞': adjacent token
      pairs, tokens concatenated as-is;
    * name contains '否定' or '程度': adjacent pairs plus adjacent triples;
    * otherwise the text before the first '(' of each token is used, as
      bigrams ('bigram' in name), trigrams ('trigram' in name) or single units.

    Args:
        X_raw: sequence of article strings.
        feature: list of feature strings; column i corresponds to feature[i].
        feature_file_name: name of the feature file, selects the mode above.

    Returns:
        numpy float64 array of shape (len(X_raw), len(feature)) holding
        round(count * 1000000 / number_of_tokens). An article without any
        token yields an all-zero row (previously a ZeroDivisionError).
    """
    bi_pos_combine = ['N+N','N+V','VH+N','D+V','情態詞']
    more_pos_combine = ['否定','程度']

    # The mode only depends on the file name, so decide it once.
    use_pairs = any(word in feature_file_name for word in bi_pos_combine)
    use_pairs_and_triples = any(word in feature_file_name for word in more_pos_combine)

    # Set membership keeps the per-token lookup O(1) for long feature lists.
    known_features = set(feature)

    def to_units(tokens):
        """Combine the tokens of one article into the units that are counted."""
        if use_pairs:
            return [tokens[i]+tokens[i+1] for i in range(len(tokens)-1)]
        if use_pairs_and_triples:
            pairs = [tokens[i]+tokens[i+1] for i in range(len(tokens)-1)]
            triples = [tokens[i]+tokens[i+1]+tokens[i+2] for i in range(len(tokens)-2)]
            return pairs + triples

        words = [token.split('(')[0] for token in tokens]
        if 'bigram' in feature_file_name:
            return [words[i]+words[i+1] for i in range(len(words)-1)]
        if 'trigram' in feature_file_name:
            return [words[i]+words[i+1]+words[i+2] for i in range(len(words)-2)]
        return words

    vector_space = np.zeros((len(X_raw),len(feature)),np.float64)

    for row,element in enumerate(X_raw):
        tokens = element.strip().split()
        if not tokens:
            # Nothing to count and nothing to divide by: leave the row at zero.
            continue

        counts = Counter(unit for unit in to_units(tokens) if unit in known_features)

        # Relative frequency per million tokens; the denominator is the token
        # count of the article, also for the n-gram modes.
        for column,name in enumerate(feature):
            vector_space[row, column] = round(counts[name] * 1000000 / len(tokens))

    return vector_space

def random_balance(X,y):
    """Oversample (X, y) with a RandomOverSampler seeded with 0 and return its result.

    NOTE(review): RandomOverSampler is not imported in the visible cells, so
    calling this raises NameError unless the name is provided elsewhere.
    """
    sampler = RandomOverSampler(random_state=0)
    return sampler.fit_sample(X,y)

In [5]:
#預測模型建立
def predict_model(X,y,test_size,model,find):
    """Fit the requested classifier on the whole data set and return it.

    Args:
        X, y: feature matrix and class ids; all rows are used for training
            (no train/test split is performed).
        test_size: accepted for interface compatibility, not used.
        model: 'RF' (random forest), 'SVM' (linear SVC) or 'DT' (decision tree).
        find: accepted for interface compatibility, not used.

    Returns:
        The fitted estimator, or None after printing 'model error' when
        model is not one of the supported keys.
    """
    # Factories are lazy so that only the requested estimator is constructed.
    # Earlier experiments wrapped the estimator in a GridSearchCV over
    # n_estimators / max_features; the estimator is now fitted directly.
    factories = {
        'RF': lambda: RandomForestClassifier(n_jobs=-1, oob_score=True,
                                             class_weight = 'balanced',n_estimators=256,
                                             random_state=0,min_samples_leaf=2),
        'SVM': lambda: svm.LinearSVC(class_weight='balanced',random_state=0),
        'DT': lambda: DecisionTreeClassifier(class_weight='balanced',random_state=0),
    }

    build = factories.get(model) if isinstance(model, str) else None
    if build is None:
        print ('model error')
        return

    clf = build()
    clf.fit(X, y)
    return clf

In [6]:
#結果評估輸出    
def predict_report(y_true, y_pred, condicate_index):
    """Print a class legend, the classification report and the accuracy.

    condicate_index maps class name -> class id; the legend is printed as
    '<id>:<name> ' entries on one line, followed by a blank line.
    """
    legend = ''.join(str(index)+':'+name+' ' for name,index in condicate_index.items())
    print (legend)
    print ()

    print (classification_report(y_true, y_pred))
    print ('預測準確率：',accuracy_score(y_true, y_pred))
    print ()

#建立混淆矩陣
def predict_confusion_matrix(y_true, y_pred, normal, condicate_label, find):
    """Plot the confusion matrix of y_true vs. y_pred with matplotlib.

    Args:
        y_true, y_pred: true and predicted class ids.
        normal: when truthy, every row is divided by its sum (per-class
            recall); otherwise raw counts are shown.
        condicate_label: class names used as tick labels.
        find: accepted for interface compatibility, not used.
    """
    # Raw string: the path contains backslashes that must stay literal
    # ('\W', '\F' and '\k' are invalid escape sequences in a normal literal).
    ZF1 = FontProperties(fname=r'C:\Windows\Fonts\kaiu.ttf', size=14)

    def plot_confusion_matrix(cm, classes,
                              normalize=False,
                              title='Confusion matrix',
                              cmap=plt.cm.Blues):
        """Draw cm as a coloured grid with the cell values written on top."""
        if normalize:
            # Normalise BEFORE drawing so that cell colours, cell texts and the
            # text-colour threshold all refer to the same values. Rows without
            # any true sample stay at 0 instead of becoming NaN.
            row_totals = cm.sum(axis=1)[:, np.newaxis]
            cm = np.divide(cm.astype('float'), row_totals,
                           out=np.zeros(cm.shape, dtype=float),
                           where=row_totals != 0) #recall

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45, fontproperties=ZF1)
        plt.yticks(tick_marks, classes, fontproperties=ZF1)

        # White text on dark cells, black text on light cells.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, round(cm[i, j],2),
                     horizontalalignment="center", verticalalignment="center",
                     color="white" if cm[i, j] > thresh else "black",
                     size=24 if len(condicate_label) < 5 else 14)

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_true, y_pred)
    np.set_printoptions(precision=2)

    plt.figure(figsize=(8, 6))

    if normal:
        plot_confusion_matrix(cnf_matrix, classes=condicate_label, normalize=True,
                              title='Normalized confusion matrix')
    else:
        plot_confusion_matrix(cnf_matrix, classes=condicate_label,
                              title='Confusion matrix, without normalization')

    plt.show()

In [14]:
#輸出各作者決策樹規則
def set_tree_path(model,condicate_label,feature_names=None):
    """Write the decision paths of every tree to one rule file per class.

    For every leaf of every tree, the chain of split conditions from the root
    is joined with ' & ' and stored under each class that has a non-zero entry
    in the leaf's value vector, grouped by int(value). The rules of a class
    are written to condicate_tree_path + <class name> + '.txt', groups sorted
    by value in descending order, each group introduced by a '#<value>' line.

    Args:
        model: a fitted ensemble exposing estimators_, a search object
            exposing best_estimator_ (as in earlier versions of this
            notebook), or a single fitted tree.
        condicate_label: class names, indexed by class position in the leaf
            value vector.
        feature_names: feature names indexed by column. When omitted, the
            notebook-level name `feature` is used, as the original code did.

    NOTE(review): int(value) is treated as a number of examples; whether
    tree_.value holds counts depends on the installed scikit-learn version and
    on class weighting -- confirm before interpreting the numbers.
    """
    if feature_names is None:
        feature_names = feature # legacy fallback: relies on a global `feature`

    # Accept both a bare estimator and a search wrapper around it.
    fitted = getattr(model, 'best_estimator_', model)
    trees = getattr(fitted, 'estimators_', None)
    if trees is None:
        trees = [fitted]

    # class name -> {value: [rule, ...]}; every class gets a file, even if empty.
    label_path = defaultdict(dict)
    for class_name in condicate_label:
        label_path[class_name] = {}

    def collect_rules(tree):
        """Walk one tree depth-first and record the rule of every leaf."""
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        threshold = tree.tree_.threshold
        split_feature = tree.tree_.feature
        value = tree.tree_.value
        path = [] # split conditions from the root to the current node

        def recurse(node):
            if threshold[node] != -2: # -2 marks a leaf in the threshold array
                name = feature_names[split_feature[node]]

                path.append(name+' <= '+str(threshold[node]))
                if left[node] != -1:
                    recurse(left[node])
                path.pop()

                path.append(name+' > '+str(threshold[node]))
                if right[node] != -1:
                    recurse(right[node])
                path.pop()
                return

            if len(path) == 0: # the root itself is a leaf: no rule to record
                return

            target = value[node]
            nonzero = np.nonzero(target)
            for class_pos, amount in zip(nonzero[1], target[nonzero]):
                target_name = condicate_label[class_pos]
                target_count = int(amount)
                rule = ' & '.join(path)+':'+str(target_name)+" (" + str(target_count) + " examples)"
                label_path[target_name].setdefault(target_count, []).append(rule)

        recurse(0)

    for tree in trees:
        collect_rules(tree)

    for name in label_path:
        counts = sorted(label_path[name], reverse=True)
        with codecs.open(condicate_tree_path+name+'.txt','wb','utf8') as g:
            for count in counts:
                g.write('#'+str(count)+'\r\n')
                for rule in label_path[name][count]:
                    g.write(rule+'\r\n')
                g.write('\r\n')

#觀察類別路徑規則
def get_tree_path(fit_class,condicate_label):
    """Summarise the stored decision-path rules of the selected classes.

    Args:
        fit_class: iterable of 2-item entries whose first item is a class
            position; the second item is not used here.
        condicate_label: class names indexed by class position.

    For every selected class the file condicate_tree_path + <name> + '.txt' is
    read. Rules listed under a '#<count>' header with count < 3 are skipped.
    Within a rule, repeated conditions on the same (feature, operator) are
    merged (smallest bound for '<=', largest for '>'). Each pair of conditions
    of a rule is then weighted by the rule's count, and the 19 heaviest
    entries per class are printed.
    """
    min_examples = 3
    shown_per_class = 19
    rules_by_class = defaultdict(list)

    # Parse the rule files.
    for class_pos, _unused in fit_class:
        name = condicate_label[class_pos]
        with codecs.open(condicate_tree_path+name+'.txt','rb','utf8') as f:
            count = ''
            for raw in f.readlines():
                text = raw.strip()

                if text == '':
                    continue
                if text[0] == '#': # group header: example count of the rules below
                    count = text[1:]
                    continue
                if count != '' and int(count) < min_examples:
                    continue

                # Merge repeated conditions on the same feature and operator.
                bounds = {}
                for condition in text.split(':')[0].split(' & '):
                    parts = condition.split()
                    key = (parts[0],parts[1])
                    if key not in bounds:
                        bounds[key] = parts[2]
                    elif parts[1] == '<=':
                        bounds[key] = str(min(float(parts[2]),float(bounds[key])))
                    elif parts[1] == '>':
                        bounds[key] = str(max(float(parts[2]),float(bounds[key])))
                    else:
                        print ('ERROR')

                rule = [count] + [k[0]+' '+k[1]+' '+v for k,v in bounds.items()]
                rules_by_class[name].append(rule)

    # Weight every pair of conditions by the example count of its rule.
    pair_weights = defaultdict(dict)
    for name in rules_by_class:
        weights = pair_weights[name]
        for rule in rules_by_class[name]:
            pairs = list(itertools.combinations(rule[1:],2))
            for pair in pairs:
                if pair[0] > pair[1]: # canonical order, so (a,b) and (b,a) share a key
                    pair = (pair[1],pair[0])
                if pair in weights:
                    weights[pair] += int(rule[0])
                else:
                    weights[pair] = int(rule[0])

            if len(pairs) == 0: # rule with a single condition
                weights[(rule[1],)] = int(rule[0])

    # Print the heaviest entries of every class.
    for name in pair_weights:
        print (name)
        ranked = sorted(pair_weights[name].items(), key=lambda d:d[1], reverse = True)
        for conditions,weight in ranked[:shown_per_class]:
            print (conditions,weight)
        print ()
        
def out_tree_image(model,feature,condicate_label,
                   out_path="C:\\Users\\user\\Desktop\\tree2.png",tree_index=0):
    """Render one tree of a fitted ensemble to a PNG file.

    Args:
        model: fitted ensemble exposing estimators_.
        feature: feature names, passed to export_graphviz as feature_names.
        condicate_label: class names, passed to export_graphviz as class_names.
        out_path: destination PNG file (defaults to the original location).
        tree_index: position of the tree in model.estimators_ (default: first).
    """
    dot_data = export_graphviz(model.estimators_[tree_index], out_file=None,
                                filled=True,feature_names=feature,class_names=condicate_label,
                                 proportion=True,rounded=True,special_characters=False)
    # Swap the font named in the dot source for 'kaiu'.
    dot_data = dot_data.replace('helvetica','kaiu')
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_png(out_path)

In [49]:
#輸出介面
def _feature_column_stats(rows, column, column_means, column_stds):
    """Return (max, min, mean, std, zero ratio, sum) of one feature column of one class."""
    values = rows[:, column]
    zero_ratio = np.count_nonzero(values == 0) / rows.shape[0]
    return (max(values), min(values), column_means[column], column_stds[column],
            zero_ratio, sum(values))


def main(find,label,out_dir='C:\\Users\\user\\Desktop\\temp\\特徵值\\'):
    """Train a random forest on the classes in `label` and log its feature importances.

    Args:
        find: query '<corpus> <feature keyword> <domain>', e.g. 'FC N+N name'
            (corpus SC/FC/lei, domain name/topic); see find_input.
        label: class names to compare (the driver passes two names).
        out_dir: directory (ending with a separator) of the report file
            <find>.txt, which is opened in append mode.

    The report holds a '#<labels>' header, a placeholder accuracy line, the
    out-of-bag error rate and one '<feature> <importance>' line for every
    feature with non-zero importance. A zero-importance feature is printed and
    logged as well when at most half of its values are zero in BOTH classes.

    The class split for that check is positional: rows before the first change
    of class id form the first class, all remaining rows the second one, so it
    relies on article_get returning the articles grouped by class.
    """
    try:
        parsed = find_input(find)
    except OSError as err:
        # e.g. missing feature directory: report it instead of hiding it.
        print ('find_input failed:',err)
        return
    if not parsed: # find_input already printed the reason
        return
    feature_file_path,feature_file_name,condicate_path,_,condicate_index = parsed

    condicate_label = label # only the requested classes, not the full candidate list
    test_size = 0.2
    kernel = 'RF' #DT/RF/SVM

    X_raw,y_raw = article_get(condicate_path,condicate_label,condicate_index)
    feature = feature_select(feature_file_path+feature_file_name)
    X = article_vector(X_raw,feature,feature_file_name)
    y = np.array(y_raw)

    model = predict_model(X,y,test_size,kernel,find)

    oob_error = 1-model.oob_score_
    print ('oob error rate:',oob_error)

    # Features by descending importance; the sort is stable, so ties keep file order.
    ranked = sorted(zip(feature,model.feature_importances_,range(len(feature))),
                    key=lambda t: t[1],reverse=True)

    # Position of the first class change; None when only one class is present.
    boundary = next((j+1 for j in range(len(y)-1) if y[j] != y[j+1]), None)
    if boundary is not None:
        index_to_name = {index: name for name,index in condicate_index.items()}
        pair_ids = (int(y[boundary-1]),int(y[boundary]))
        pair_names = [index_to_name.get(class_id,class_id) for class_id in pair_ids]
        class_rows = (X[:boundary],X[boundary:])
        class_means = [np.mean(rows, axis=0) for rows in class_rows]
        class_stds = [np.std(rows, axis=0) for rows in class_rows]

    with codecs.open(out_dir+find+'.txt','ab','utf8') as a:
        a.write('#'+' '.join(label)+'\r\n')
        a.write('預測準確率：-0.0\r\n') # placeholder: no held-out accuracy is computed
        a.write('oob error rate:'+str(oob_error)+'\r\n')

        for name,importance,column in ranked:
            if importance != 0.0:
                a.write(name+' '+str(importance)+'\r\n')
                continue

            if boundary is None: # single class: nothing to compare
                continue

            first,second = [_feature_column_stats(rows,column,means,stds)
                            for rows,means,stds in zip(class_rows,class_means,class_stds)]

            # Keep unused features that are nevertheless present in both classes.
            if first[4] <= 0.5 and second[4] <= 0.5:
                print (name)
                print (pair_names[0],first)
                print (pair_names[1],second)
                print ()
                a.write(name+' '+str(importance)+'\r\n')

        a.write('\r\n')

In [50]:
# Batch driver: run main() for every pair of classes in every valid
# corpus / feature / domain combination.
vector_base = ['FC','lei']
feature_condicate = ['高頻','bigram','trigram','標點','N+N','N+V','VH+N','D+V','否定','程度','情態']
classification_select = ['name','topic']
#classification_select = ['topic']

for corpus in vector_base:
    for feature_key in feature_condicate:
        for domain in classification_select:
            # 'name' runs on every corpus except 'lei', 'topic' on every corpus except 'FC'.
            wanted = (corpus != 'lei' and domain == 'name') or (corpus != 'FC' and domain == 'topic')
            if not wanted:
                continue

            temp_label = classification_name if domain == 'name' else classification_topic
            query = corpus+' '+feature_key+' '+domain
            print (query)

            # All unordered pairs, in list order.
            for first,second in itertools.combinations(temp_label,2):
                print (first,second)
                main(query,[first,second])
            print ()

FC 高頻 name
雷震 殷海光
oob error rate: 0.0535714285714
雷震 夏道平
oob error rate: 0.134328358209
雷震 傅正
oob error rate: 0.102040816327
雷震 龍平甫
oob error rate: 0.0142857142857
一
雷震 (17379.0, 4038.0, 9868.9310344827591, 3880.4483915369483, 0.0, 286199.0)
龍平甫 (11439.0, 2752.0, 6888.707317073171, 2188.4509862491213, 0.0, 282437.0)

種
雷震 (11102.0, 0.0, 2855.6896551724139, 2901.5340901150694, 0.10344827586206896, 82815.0)
龍平甫 (3836.0, 378.0, 1808.9756097560976, 892.35540573071194, 0.0, 74168.0)

說
雷震 (7825.0, 612.0, 3153.2068965517242, 1921.4877386045453, 0.0, 91443.0)
龍平甫 (8023.0, 0.0, 3081.268292682927, 2004.2378275749743, 0.024390243902439025, 126332.0)

政府
雷震 (11503.0, 0.0, 4171.7241379310344, 3101.8369073441286, 0.06896551724137931, 120980.0)
龍平甫 (11702.0, 632.0, 3697.2439024390242, 2251.8204427452156, 0.0, 151587.0)

他們
雷震 (9413.0, 0.0, 1436.2068965517242, 1812.7612083705694, 0.20689655172413793, 41650.0)
龍平甫 (6051.0, 0.0, 1209.4878048780488, 1253.3512758133636, 0.12195121951219512, 49589.0)

國家


In [None]:
#main('FC N+N name','RF')