In [None]:
import codecs
import os
from collections import OrderedDict,defaultdict,Counter
import time
import numpy as np
from pandas import Series, DataFrame
from itertools import permutations

input_path = 'C:\\Users\\user\\Desktop\\RF result\\顯著特徵尋找\\特徵值\\'
vector_path = 'C:\\Users\\user\\Desktop\\RF result\\vector\\'
FC_feature_path = "D:\\課業相關\\論文資料\\論文程式\\language_feature\\最終版\\自由中國\\" #自由中國語言特徵
lei_feature_path = "D:\\課業相關\\論文資料\\論文程式\\language_feature\\最終版\\雷震文本\\" #雷震文本語言特徵
condicate_author_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\author\\" #候選作者文本
condicate_topic_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\topic\\" #候選主題文本

classification_name = ['雷震','殷海光','夏道平','傅正','龍平甫','蔣勻田','朱伴耘','胡適','羅鴻詔']
classification_topic = ['社論','文章','日記']
feature_condicate = ['高頻','bigram','trigram','標點','N+N','N+V','VH+N','D+V','否定','程度','情態']


# Map every candidate label to its position in the lists above.
# (Original note: the lists are ordered by number of texts, high to low --
# that ordering is not checked here.)
author_index = OrderedDict(
    (label, position) for position, label in enumerate(classification_name))

topic_index = OrderedDict(
    (label, position) for position, label in enumerate(classification_topic))

In [None]:
# Input interface
def find_input(find):
    """Resolve a "<population> <feature> <domain>" query into data locations.

    ``find`` is split on whitespace and must yield exactly three fields:

    * population -- 'SC', 'FC' or 'lei'; selects the feature directory.
    * feature    -- substring searched for in the file names of that
      directory; the first matching entry of ``os.listdir`` is used.
    * domain     -- 'name' (candidate authors) or 'topic' (candidate topics).

    Returns the tuple ``(feature file path, candidate directory, candidate
    labels, label -> index mapping)``.  When a field is not recognised a
    message is printed and ``False`` is returned instead.
    """
    fields = find.split()
    if len(fields) != 3:
        print ('請輸入正確值')
        return False
    population, feature_key, domain = fields

    # The module-level path names are only looked up in the branch that is
    # actually taken.
    if population == 'SC':
        feature_file_path = SC_feature_path
    elif population == 'FC':
        feature_file_path = FC_feature_path
    elif population == 'lei':
        feature_file_path = lei_feature_path
    else:
        print ('母體選項不符合')
        return False

    matches = [entry for entry in os.listdir(feature_file_path) if feature_key in entry]
    if not matches:
        print ('輸入的語言特徵不在範圍內')
        return False

    if domain == 'name':
        candidates = (condicate_author_path, classification_name, author_index)
    elif domain == 'topic':
        candidates = (condicate_topic_path, classification_topic, topic_index)
    else:
        print ('領域選項不符合')
        return False

    return (feature_file_path + matches[0],) + candidates

# Fetch the candidate texts and turn them into text vectors based on the features.

# Fetch the candidate texts, grouped by the label found in each file name.
def article_get(condicate_path,condicate_label):
    """Collect the content line of every candidate file, grouped by label.

    A file belongs to a label when the label occurs in the part of the file
    name before the first '_'.  Only the second line of a file is kept
    (stripped); the first line is read and discarded.

    Args:
        condicate_path: directory holding the candidate files.  It is joined
            to the file names by plain concatenation, so it must end with a
            path separator.
        condicate_label: sequence of label strings.

    Returns:
        defaultdict(list) mapping label -> list of content strings.  Labels
        are visited in the order given and files in ``os.listdir`` order;
        a label without any matching file gets no key.
    """
    content_list = defaultdict(list)

    # The directory listing does not depend on the label: read it once.
    file_names = os.listdir(condicate_path)

    for label in condicate_label:
        for file_name in file_names:
            # Decide from the name alone, so entries that belong to another
            # label (or are not readable files at all) are never opened.
            if label not in file_name.split('_')[0]:
                continue

            with codecs.open(condicate_path+file_name,'rb','utf8') as f:
                f.readline()  # first line is not used
                content_list[label].append(f.readline().strip())

    return content_list

# Load the selected language features; returns the feature list and its index map.
def feature_select(feature_file_path):
    """Read the baseline feature file.

    Every non-blank line contributes the text before its first ',' as one
    feature.  A byte-order mark anywhere in a line is removed first.

    Returns:
        (feature, feature_dict): the features in file order, and a dict that
        maps each feature to its position in that list.
    """
    feature = []
    feature_dict = {}

    with codecs.open(feature_file_path,'rb','utf8') as ff:
        for raw_line in ff.readlines():
            entry = raw_line.replace('\ufeff','').strip()
            if entry == '':
                continue
            key = entry.split(',')[0]
            # The position equals the number of features collected so far.
            feature_dict[key] = len(feature)
            feature.append(key)

    return feature,feature_dict

# Build the text vector
def article_vector(X_raw,feature,feature_file_name):
    """Return the relative frequency of every feature over a group of texts.

    Each text is split on whitespace into tokens.  ``feature_file_name``
    decides which units are built from the tokens and matched against
    ``feature``:

    * name contains 'N+N', 'N+V', 'VH+N', 'D+V' or '情態': adjacent token pairs;
    * name contains '否定' or '程度': adjacent token pairs and triples;
    * name contains 'bigram' / 'trigram': pairs / triples of tokens with
      everything from the first '(' of each token removed;
    * otherwise: single tokens with everything from the first '(' removed.

    Args:
        X_raw: iterable of text strings.
        feature: list of feature strings; the result follows its order.
        feature_file_name: name used only to pick the unit type above.

    Returns:
        list of ints, one per feature: occurrences summed over all texts,
        times 1,000,000, divided by the total number of tokens, rounded.
        When the texts contain no tokens at all the result is all zeros
        (previously this case raised ValueError on a NaN).
    """
    bi_pos_combine = ['N+N','N+V','VH+N','D+V','情態']
    more_pos_combine = ['否定','程度']

    # The unit type depends only on the file name, so it is decided once.
    if any(word in feature_file_name for word in bi_pos_combine):
        mode = 'pair'
    elif any(word in feature_file_name for word in more_pos_combine):
        mode = 'pair+triple'
    elif 'bigram' in feature_file_name:
        mode = 'bigram'
    elif 'trigram' in feature_file_name:
        mode = 'trigram'
    else:
        mode = 'word'

    def line_units(tokens):
        """Turn one tokenised text into the units matched against ``feature``."""
        if mode == 'pair':
            return [a+b for a,b in zip(tokens,tokens[1:])]
        if mode == 'pair+triple':
            return ([a+b for a,b in zip(tokens,tokens[1:])]
                    + [a+b+c for a,b,c in zip(tokens,tokens[1:],tokens[2:])])
        words = [token.split('(')[0] for token in tokens]
        if mode == 'bigram':
            return [a+b for a,b in zip(words,words[1:])]
        if mode == 'trigram':
            return [a+b+c for a,b,c in zip(words,words[1:],words[2:])]
        return words

    wanted = set(feature)  # constant-time membership instead of scanning the list
    totals = Counter()
    all_line = 0  # total number of tokens over all texts

    for element in X_raw:
        tokens = element.strip().split()
        totals.update(unit for unit in line_units(tokens) if unit in wanted)
        all_line += len(tokens)

    if all_line == 0:
        # No tokens: a relative frequency is undefined, report zeros.
        return [0 for _ in feature]

    vector_space = np.array([totals[key] for key in feature],np.float64) * 1000000 / all_line
    return [int(round(i)) for i in vector_space]

In [6]:
# New version: inspect every class directly.
# Find the key word groups (features) of each class.
# Looks at uniqueness, proximity, grouping and special relations (especially high Gini).

# Running totals over all input files:
#   one   - features with exactly one "special" label
#   more  - features with more than one "special" label
#   error - printed special labels for which some labels have a higher and
#           some a lower frequency (the special label is not an extreme)
one = 0
more = 0
error = 0

for file in os.listdir(input_path):
    with codecs.open(input_path+file,'rb','utf8') as f:
            
        print (file.split('.')[0])
        
        # The file name without its extension is the find_input query.
        feature_path,condicate_path,condicate_label,condicate_index = find_input(file.split('.')[0])
        feature,feature_dict = feature_select(feature_path)
        
        # Replace each label's list of texts by the value article_vector
        # returns for it (one entry per feature).
        content_list = article_get(condicate_path,condicate_label)
        for k,v in content_list.items():
            content_list[k] = article_vector(v,feature,file.split('.')[0])
        
        content = f.readlines()
        
        # Parse the importance file.  As read below: a line starting with '#'
        # names a section, the next line carries the OOB value after a ':',
        # then come "<feature> <importance>" lines, and a blank line ends
        # the section.
        feature_importance = defaultdict(list) # section name -> [(feature, importance)]
        feature_appear = defaultdict(list) # feature -> [(section name, importance, oob)]
        name = ''
        count = 1 # number of header lines still expected after the '#' line
        oob = 0
        for line in content:
            if line[0] == '#':
                name = line[1:].strip()
            #elif name != '' and '雷震' not in name:
            #    continue
            elif count > 0:
                oob = float(line.strip().split(':')[1])
                count = count - 1
            elif line.strip() == '':
                count = 1
            else:
                temp_line = line.strip().split()
                if len(temp_line) == 2:
                    feature_importance[name].append((temp_line[0],float(temp_line[1])))
                    feature_appear[temp_line[0]].append((name,float(temp_line[1]),oob))
        
        feature_gini_pair = {} # feature -> label x label importance matrix
        special_label = defaultdict(list) # feature -> [(label, mean, max)]
        
        for i in feature:
            # Cell [a, b] holds the importance of this feature in the section
            # whose name contains both label a and label b (0 when none does).
            vector_space = np.zeros((len(condicate_label),len(condicate_label)),np.float64)
            for x in permutations(condicate_label,2):
                for z in feature_appear[i]:
                    if x[0] in z[0] and x[1] in z[0]:
                        vector_space[condicate_index[x[0]],condicate_index[x[1]]] = z[1]
                                    
            feature_gini_pair[i] = vector_space
            
            # A label is "special" for this feature when the importance
            # exceeds 0.01 against every other label.  mean/max are expressed
            # in multiples of 0.01; std_value is computed but not used later.
            for x in range(len(condicate_label)):
                if len([i for i in vector_space[x] if i > 0.01]) == len(condicate_label)-1:
                    mean_value = round(sum(vector_space[x])/(0.01*(len(condicate_label)-1)),3)
                    max_value = round(max(vector_space[x])/0.01,3)
                    std_value = round(np.std(np.array([i/0.01 for i in vector_space[x] if i > 0.01])),3)
                    special_label[i].append((condicate_label[x],mean_value,max_value))
                    
            if len(special_label[i]) == 1:
                one += 1
            elif len(special_label[i]) > 1:
                more += 1
            
            # Only features with exactly one special label are reported.
            #if len(special_label[i]) != 0:
            if len(special_label[i]) == 1:
                print (i)
                for j in special_label[i]:
                    for k in j:
                        print (k,end=' ')
                print ()
                # Value of this feature in every label's vector, in label order.
                label_feature_value = [content_list[j][feature_dict[i]] for j in condicate_label]
                for j in special_label[i]:
                    # '+', '-' or '0': whether each label's value is above,
                    # below or equal to the special label's own value.
                    temp_mark = []
                    for k in label_feature_value:
                        check_value = k-label_feature_value[condicate_index[j[0]]]
                        if check_value > 0:
                            temp_mark.append('+')
                        elif check_value < 0:
                            temp_mark.append('-')
                        elif check_value == 0:
                            temp_mark.append('0')
                    print (j[0],temp_mark)
                    if '+' in temp_mark and '-' in temp_mark:
                        error += 1
                print ('各類別在該語言特徵下的總相對頻率:',label_feature_value)
                print ()
            
            #time.sleep(0.5)
        print ('--------------------------------')
        
print ('one:',one)
print ('more:',more)
print ('error:',error)
        
print ('END')

FC bigram name
這種
殷海光 7.451 9.939 
殷海光 ['-', '0', '-', '-', '-', '-', '-', '-', '-']
各類別在該語言特徵下的總相對頻率: [959, 2660, 1055, 968, 597, 823, 1558, 1071, 464]

這一
龍平甫 7.856 15.421 
龍平甫 ['+', '+', '+', '+', '0', '+', '+', '+', '+']
各類別在該語言特徵下的總相對頻率: [818, 3558, 2958, 1892, 68, 424, 1439, 575, 240]

這是
胡適 3.04 5.816 
胡適 ['-', '-', '-', '-', '-', '-', '-', '0', '-']
各類別在該語言特徵下的總相對頻率: [928, 942, 1194, 682, 665, 507, 1175, 1594, 545]

的一
胡適 6.424 12.571 
胡適 ['-', '-', '-', '-', '-', '-', '-', '0', '-']
各類別在該語言特徵下的總相對頻率: [629, 632, 889, 858, 416, 283, 490, 1633, 192]

他的
胡適 6.542 11.943 
胡適 ['-', '-', '-', '-', '-', '-', '-', '0', '-']
各類別在該語言特徵下的總相對頻率: [157, 288, 444, 44, 733, 507, 320, 1933, 488]

美國的
朱伴耘 3.295 6.273 
朱伴耘 ['-', '-', '-', '-', '-', '-', '0', '-', '-']
各類別在該語言特徵下的總相對頻率: [110, 111, 83, 0, 317, 599, 1426, 235, 104]

的事
殷海光 6.622 10.572 
殷海光 ['-', '0', '-', '-', '-', '-', '-', '-', '-']
各類別在該語言特徵下的總相對頻率: [142, 1031, 319, 264, 95, 91, 157, 418, 96]

事實上
傅正 8.988 15.787 
傅正 ['-', '-', 

In [None]:
# Find the key word groups (features) of each class.
# Looks at uniqueness, proximity, grouping and special relations (especially high Gini).
a_threshold = 0.0109 # importance at or above this separates a label/group from the others
b_threshold = 0.00263 # importance in [0, b_threshold) lets two labels share a group

all_gini_value = [] # every importance value read, across all files
all_class_feature = defaultdict(list) # "label label ..." -> features that isolate that group

for file in os.listdir(input_path):
    # Only the feature families listed in feature_condicate are analysed;
    # the name is checked first so skipped files are never opened.
    if file.split('.')[0].split()[1] not in feature_condicate:
        continue

    with codecs.open(input_path+file,'rb','utf8') as f:
        content = f.readlines()

    # find_input returns (feature file, candidate directory, labels, label index);
    # the candidate directory is not needed in this cell.
    feature_path,unused_path,condicate_label,condicate_index = find_input(file.split('.')[0])
    feature,feature_dict = feature_select(feature_path)

    # Text vectors per class, read from rows of the form
    # "<label index>,<value>,<value>,...".  Nothing below reads them; they are
    # loaded as before so class_vector stays available after the cell ran.
    class_vector = defaultdict(list)
    with codecs.open(vector_path+file.split('.')[0]+'.csv','rb','utf8') as ff:
        for i in ff.readlines():
            i = i.replace('\ufeff','').strip() # drop a leading BOM
            if i != '':
                i = i.split(',')
                class_vector[condicate_label[int(i[0])]].append([float(j) for j in i[1:]])

    # Parse the importance file: a '#' line names a section, the next line
    # carries the OOB value after a ':', then "<feature> <importance>" lines
    # follow until a blank line.  This is the same layout the other cells
    # reading input_path expect (one header line after the '#' line).
    feature_importance = defaultdict(list) # section name -> [(feature, importance)]
    feature_appear = defaultdict(list) # feature -> [(section name, importance, oob)]
    name = ''
    count = 1
    oob = 0
    for line in content:
        if line[0] == '#':
            name = line[1:].strip()
        #elif name != '' and '雷震' not in name:
        #    continue
        elif count > 0:
            oob = float(line.strip().split(':')[1])
            count = count - 1
        elif line.strip() == '':
            count = 1
        else:
            temp_line = line.strip().split()
            if len(temp_line) == 2:# and float(temp_line[1]) >= 0.01:
                feature_importance[name].append((temp_line[0],float(temp_line[1])))
                feature_appear[temp_line[0]].append((name,float(temp_line[1]),oob))
                all_gini_value.append(float(temp_line[1]))

    # feature -> label x label matrix of importances.  -1.0 marks a pair for
    # which no section reported the feature; the diagonal stays 0.
    feature_gini_pair = {}
    for i in feature:
        if i not in feature_appear:
            continue
        vector_space = np.zeros((len(condicate_label),len(condicate_label)),np.float64)
        for x,y in permutations(condicate_label,2):
            value = -1.0
            for z in feature_appear[i]:
                if x in z[0] and y in z[0]:
                    value = z[1]#*(1-z[2])
            vector_space[condicate_index[x],condicate_index[y]] = value
        feature_gini_pair[i] = vector_space

    label_count = len(condicate_label)

    for i in feature:
        if i not in feature_gini_pair:
            continue
        pair = feature_gini_pair[i]

        # Grouping: walk the labels in order and attach each one to the first
        # existing singleton or group it is close to (importance in
        # [0, b_threshold) against every member); otherwise it starts as a
        # singleton (plain int).  Groups are lists of label indices.
        temp_group = []
        for a in range(label_count):
            for position,b in enumerate(temp_group):
                if isinstance(b,list):
                    if not any(pair[a,c] >= b_threshold or pair[a,c] < 0.0 for c in b):
                        b.append(a)
                        break
                elif 0.0 <= pair[a,b] < b_threshold:
                    # The singleton becomes a two-member group at the end.
                    del temp_group[position]
                    temp_group.append([b,a])
                    break
            else:
                temp_group.append(a)

        # Unique labels: a singleton that reaches a_threshold against every
        # other label becomes a one-member group.
        temper = []
        for e in temp_group:
            if not isinstance(e,list) and not any(
                    pair[e,a] < a_threshold for a in range(label_count) if a != e):
                temper.append([e])
            else:
                temper.append(e)
        temp_group = temper

        # Keep a group only if every member reaches a_threshold against every
        # label outside the group; otherwise dissolve it into singletons.
        temper = []
        for e in temp_group:
            if isinstance(e,list) and any(
                    pair[t,a] < a_threshold
                    for a in range(label_count) if a not in e
                    for t in e):
                temper.extend(e)
            else:
                temper.append(e)
        temp_group = temper

        class_group = [[condicate_label[j] for j in e] for e in temp_group if isinstance(e,list)]

        # Only files whose name contains 'lei' are reported and recorded.
        if len(class_group) != 0 and 'lei' in file:
            print (i,class_group)
            for j in class_group:
                all_class_feature[' '.join(j)].append(i)

    # NOTE: the string-quoted (never executed) ranking-score experiment and
    # the disabled all_gini.txt dump that used to follow were removed as dead code.

for k in all_class_feature:
    print (k)
print ('END')

In [None]:
# Compute the OOB error between every pair of classes
for file in os.listdir(input_path):
    with codecs.open(input_path+file,'rb','utf8') as f:
        content = f.readlines()

        print (file)

        # The file name tells whether the pairs are authors or topics.
        if 'name' in file:
            temp_label,temp_index = classification_name,author_index
        elif 'topic' in file:
            temp_label,temp_index = classification_topic,topic_index
        else:
            temp_label,temp_index = '',''

        # A '#' line names a pair; the OOB error follows the ':' on the next line.
        oob_error = [(header[1:],float(following.split(':')[1]))
                     for header,following in zip(content,content[1:])
                     if header[0] == '#']

        # One extra row and column hold the averages.
        size = len(temp_index)
        vector_space = np.zeros((size+1,size+1),np.float64)

        for pair_name,value in oob_error:
            parts = pair_name.split()
            row = temp_index[parts[0]]
            col = temp_index[parts[1]]
            # The matrix is symmetric.
            vector_space[row,col] = vector_space[col,row] = round(value,3)

        # Averages are taken over the other size-1 labels (the diagonal is 0).
        for row in range(size):
            vector_space[row,-1] = round(sum(vector_space[row])/(size-1),3)
        for col in range(size):
            vector_space[-1,col] = round(sum(vector_space[:,col])/(size-1),3)
        vector_space[-1,-1] = round(sum(vector_space[-1])/(size),3)

        print (' '.join(temp_label+['平均']))
        print (vector_space)
        #df = DataFrame(vector_space,index=temp_label,columns=temp_label)
        print ()

In [None]:
muti_class_path = 'C:\\Users\\user\\Desktop\\RF result\\顯著特徵尋找\\特徵值全\\'

# Print every multi-class result file with the OOB score from its second line.
for file in os.listdir(muti_class_path):
    with codecs.open(muti_class_path+file,'rb','utf8') as f:
        name = f.readline() # first line is read but not used
        score_line = f.readline()
        # The score is the text after the first ':' of the second line.
        oob_score = float(score_line.strip().split(':')[1])
        print (file)
        print (round(oob_score,3))

In [None]:
condicate_author_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\author\\" # candidate author texts
condicate_topic_path = "D:\\課業相關\\論文資料\\論文程式\\condicate\\topic\\" # candidate topic texts

# Show every occurrence of one token pair together with up to five tokens of
# context on the left and up to four after the pair.
for file in os.listdir(condicate_author_path):
    with codecs.open(condicate_author_path+file,'rb','utf8') as f:
        head = f.readline()
        content = f.readline().strip().split()
        print (file)
        # Other example pairs: 反共(VH)戰爭(Na) 自由(VH)中國(Nc)
        for i,(current,following) in enumerate(zip(content,content[1:])):
            if current == '中國(Nc)' and following == '大陸(Nc)':
                # Slicing clamps at both ends, so windows near the start or
                # the end of the text are simply shorter.
                print (''.join(content[max(i-5,0):i+6]))
    print ()
    #time.sleep(0.5)

In [None]:
from sklearn.externals import joblib
from sklearn.tree import export_graphviz
import pydotplus

model_path = 'C:\\Users\\user\\Desktop\\RF result\\顯著特徵尋找\\model\\'

def out_tree_image(model,feature,condicate_label,name):
    """Write the last estimator of ``model`` as a PNG image.

    Args:
        model: fitted ensemble exposing ``estimators_``; only its last
            estimator is exported.
        feature: feature names shown in the nodes.
        condicate_label: class names shown in the nodes.
        name: file name (without extension) of the PNG written to the
            hard-coded ``tree2`` directory.
    """
    last_tree = model.estimators_[-1]
    dot_source = export_graphviz(
        last_tree,
        out_file=None,
        filled=True,
        feature_names=feature,
        class_names=condicate_label,
        proportion=True,
        rounded=True,
        special_characters=False,
    )
    # Font swap (original note); 'kaiu' is presumably a CJK-capable font -- confirm.
    graph = pydotplus.graph_from_dot_data(dot_source.replace('helvetica','kaiu'))
    graph.write_png("C:\\Users\\user\\Desktop\\tree2\\"+name+".png")

# Print the ten most important features (importance >= 0.01) of every stored model.
for model in os.listdir(model_path):
    # These feature families are not reported; check the name before paying
    # for the model load.
    if any(i in model for i in ['高頻','bigram','trigram','標點']):
        continue

    # NOTE(review): joblib.load unpickles the file -- only load models you trust.
    clf = joblib.load(model_path+model)

    model_name = model.split('.')[0]
    print (model_name)
    #print (clf.oob_decision_function_)
    #print (clf.oob_score_)

    # The first three words of the model name form the find_input query.
    # find_input returns (feature file, candidate directory, labels, label index);
    # the candidate directory is not needed here.
    feature_path,unused_path,condicate_label,condicate_index = find_input(' '.join(model_name.split()[:3]))

    # Labels that appear as separate words in the model name.
    temp_label = [i for i in condicate_label if i in model_name.split()]

    # Baseline features, in the column order the importances are zipped with.
    feature = feature_select(feature_path)[0]

    #print (clf.feature_importances_)
    so = [pair for pair in sorted(zip(feature,clf.feature_importances_),key=lambda t: t[1],reverse=True)
          if pair[1] >= 0.01]
    for index,e in enumerate(so[:10]):
        print (index+1,e)
    #out_tree_image(clf,feature,temp_label,model_name)

    print ()
    #time.sleep(0.5)

In [37]:
# Find the top-ranked word groups to use as predictors
# (original note says top 10; the loop below keeps the first 20).
# NOTE: this assigns input_path, a name that other cells also read.
input_path = 'C:\\Users\\user\\Desktop\\RF result\\顯著特徵尋找\\'

# Output directories: class_path1 receives results for '特徵值\\',
# class_path2 those for '特徵值全\\' (see temp_path below).
class_path1 = 'C:\\Users\\user\\Desktop\\雙類別\\'
class_path2 = 'C:\\Users\\user\\Desktop\\多類別\\'

for path in ['特徵值\\','特徵值全\\']:
    for file in os.listdir(input_path+path):
        with codecs.open(input_path+path+file,'rb','utf8') as f:
            
            temp_path = ''
            if path == '特徵值\\':
                temp_path = class_path1
            elif path == '特徵值全\\':
                temp_path = class_path2

            # These feature families are not ranked.
            if any(i in file.split('.')[0] for i in ['高頻','bigram','trigram','標點']):
                continue

            print (file.split('.')[0])

            content = f.readlines()

            # feature -> sum of its zero-based positions over all sections.
            # NOTE(review): a section that does not list a feature adds
            # nothing to its sum -- confirm this is intended.
            rank_feature = defaultdict(int)

            # The first two lines of every section are skipped; a blank line
            # starts a new section and restarts the position counter.
            count = 2
            index = 0
            for line in content:
                if count > 0:
                    count -= 1
                elif line.strip() == '':
                    count = 2
                    index = 0
                else:
                    rank_feature[line.strip().split()[0]] += index
                    index += 1

            # Smallest rank sum first.
            so = sorted(rank_feature.items(),key=lambda t: t[1],reverse=False)

            # Results are appended ('ab') to the file named after the first
            # word of the input name, so re-running the cell adds the same
            # sections again.
            with codecs.open(temp_path+file.split('.')[0].split()[0]+'.txt','ab','utf8') as a:
                a.write('#'+file.split('.')[0]+'\r\n')
                for i,(x,y) in enumerate(so):
                    if i == 20:
                        break
                    print (i+1,x,y)
                    a.write(x+' '+str(y)+'\r\n')
                a.write('\r\n')

            print ()
print ('END')

FC D+V name
1 最(Dfa)大(VH) 330
2 可以(D)說(VE) 336
3 不(D)願(VK) 341
4 不(D)敢(VL) 349
5 所(D)說(VE) 447
6 不(D)知(VK) 490
7 可(D)說(VE) 498
8 最(Dfa)高(VH) 757
9 又(D)說(VE) 789
10 可(D)言(VE) 810
11 不能(D)說(VE) 891
12 很(Dfa)大(VH) 933
13 不(D)知道(VK) 940
14 最(Dfa)重要(VH) 947
15 可(D)知(VK) 986
16 來(D)看(VC) 1106
17 不(D)出(VC) 1118
18 不(D)受(VJ) 1153
19 不(D)難(VH) 1155
20 並(D)無(VJ) 1159

FC N+N name
1 民主(Na)國家(Na) 261
2 民主(Na)政治(Na) 327
3 主義(Na)者(Na) 402
4 自由(Na)中國(Nc) 440
5 帝國(Na)主義(Na) 551
6 自由(Na)世界(Nc) 570
7 政府(Na)當局(Na) 795
8 西方(Ncd)國家(Na) 846
9 世界(Nc)大戰(Na) 864
10 中國(Nc)人民(Na) 960
11 英(Nc)法(Na) 973
12 美國(Nc)政府(Na) 985
13 中國(Nc)大陸(Nc) 1019
14 極權(Na)政治(Na) 1048
15 生活(Na)水準(Na) 1054
16 民主(Na)制度(Na) 1122
17 政治(Na)制度(Na) 1124
18 外交(Na)政策(Na) 1126
19 極權(Na)國家(Na) 1147
20 一段(Nc)話(Na) 1163

FC N+V name
1 民主(Na)自由(VH) 223
2 人(Na)說(VE) 260
3 言論(Na)自由(VH) 306
4 自由(Na)民主(VH) 705
5 人(Na)認為(VE) 806
6 近年(Nd)來(VA) 849
7 自由(Na)為(VG) 853
8 人民(Na)自由(VH) 873
9 黨(Na)專政(VA) 932
10 者(Na)為(VG) 945
11 極權(Na)統治(VC) 1040
12 世界(Nc)革命(VA

In [42]:
# Collect the key language features and write combined text vectors.
# NOTE: this assigns feature_condicate, a name that other cells also read.
feature_condicate = ['N+N','N+V','VH+N','D+V','否定','程度','情態']

class_path1 = 'C:\\Users\\user\\Desktop\\雙類別\\'
class_path2 = 'C:\\Users\\user\\Desktop\\多類別\\'

for model_path in [class_path1,class_path2]:
    for base in ['FC','lei']:
        
        class_path = model_path
        # search names the "<search>.txt" input and the "<search> combine.csv"
        # output; search2 is assigned but not read afterwards.
        search = ''
        search2 = ''
        
        if base == 'FC':
            search = 'FC'
            search2 = 'lei'
        elif base == 'lei':
            search = 'lei'
            search2 = 'topic'
            
        print (base,class_path.split('\\')[-2])
    
        # 'FC' uses the candidate authors, anything else the candidate topics.
        condicate_path = ''
        condicate_label = ''

        if base == 'FC':
            condicate_path = condicate_author_path
            condicate_label = classification_name
        else:
            condicate_path = condicate_topic_path
            condicate_label = classification_topic


        # feature family -> features, read from "<search>.txt": a '#' line
        # gives the family as its second word, the following non-blank lines
        # give one feature each (first word of the line).
        new_feature = defaultdict(list)

        with codecs.open(class_path+search+'.txt','rb','utf8') as f:
            content = f.readlines()

            name = ''
            for line in content:
                if line[0] == '#':
                    name = line.strip().split()[1]
                elif line.strip() == '':
                    continue
                else:
                    new_feature[name].append(line.strip().split()[0])
                    
        # Drop from 'D+V' every feature that is also listed under
        # '否定', '程度' or '情態'.
        temp_remove = []
        for i in new_feature['D+V']:
            if i in new_feature['否定'] or i in new_feature['程度'] or i in new_feature['情態']:
                temp_remove.append(i)
        for i in temp_remove:
            new_feature['D+V'].remove(i)

        # Build the text vectors
        def new_article_vector(X_raw,feature,condicate_label):
            """Return label -> matrix of per-text relative feature frequencies.

            ``X_raw`` maps each label to its list of texts.  Each row of a
            matrix is one text; its columns are the features of every family
            in the module-level ``feature_condicate``, in that order, each
            count scaled by 1,000,000 / number of tokens and rounded.

            The ``feature`` argument is not read: the families come from the
            enclosing ``new_feature``.
            """

            bi_pos_combine = ['N+N','N+V','VH+N','D+V','情態']
            more_pos_combine = ['否定','程度']

            def line_vec(line,feature,feature_file_name): # convert one text into a feature vector
                """Count, in ``feature`` order, how often each feature occurs in ``line``."""
                temp_feature = defaultdict(int)

                if any(word in feature_file_name for word in bi_pos_combine): # adjacent token pairs
                    line = [line[i]+line[i+1] for i in range(len(line)-1)] 
                elif any(word in feature_file_name for word in more_pos_combine): # pairs and triples
                    line = [line[i]+line[i+1] for i in range(len(line)-1)] + \
                    [line[i]+line[i+1]+line[i+2] for i in range(len(line)-2)]
                else: # other common language features
                    if 'bigram' in feature_file_name:
                        line = [line[i].split('(')[0]+line[i+1].split('(')[0] for i in range(len(line)-1)]
                    elif 'trigram' in feature_file_name:
                        line = [line[i].split('(')[0]+line[i+1].split('(')[0]+\
                                line[i+2].split('(')[0] for i in range(len(line)-2)]
                    else:
                        line = [line[i].split('(')[0] for i in range(len(line))]

                for i in line:
                    if i in feature:
                        temp_feature[i] += 1

                temp_feature = [temp_feature[i] for i in feature]

                return temp_feature 

            # Number of columns: all features of all families in new_feature.
            feature_length = 0
            for k,v in new_feature.items():
                feature_length += len(v)

            vector_label = defaultdict(list)

            for i in condicate_label:

                vector_space = np.zeros((len(X_raw[i]),feature_length),np.float64)

                for index,element in enumerate(X_raw[i]): # convert every text in turn
                    temp_feature = []
                    line = element.strip().split()
                    # Concatenate the counts of every family, in feature_condicate order.
                    for j in feature_condicate:
                        temp_feature += line_vec(line,new_feature[j],j)

                    # Relative frequency per million tokens.
                    # NOTE(review): a text without tokens divides by zero here.
                    for index2,j in enumerate(temp_feature):
                        vector_space[index,index2] = round(j * 1000000 / len(line))

                vector_label[i] = vector_space

            return vector_label


        content_list = article_get(condicate_path,condicate_label)
        vector_space = new_article_vector(content_list,new_feature,condicate_label)

        # One CSV row per text: "<label index>,<value>,<value>,...".
        with codecs.open(class_path+search+' combine.csv','wb','utf8') as g:
            for index,e in enumerate(condicate_label):
                for x in vector_space[e]:
                    g.write(str(index)+','+','.join(list(map(str,x.tolist())))+'\r\n')

print ('END')

FC 雙類別
lei 雙類別
FC 多類別
lei 多類別
END
