In [1]:
#函式庫引入
import os
import time
import codecs
import itertools
import numpy as np
from sklearn import svm
from collections import OrderedDict,defaultdict,Counter
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from imblearn.over_sampling import SMOTE,RandomOverSampler
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
from IPython.display import clear_output,Image
from sklearn.externals import joblib
import pydotplus 
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [2]:
# Input data: class labels and dataset locations.

classification_name = ['雷震','殷海光','夏道平','傅正','龍平甫','蔣勻田','朱伴耘','胡適','羅鴻詔']
classification_topic = ['社論','文章','日記']


# Map every author name to its position in classification_name.
# (Per the original author's note, the list is ordered by number of texts,
# highest first.)
author_index = OrderedDict(
    (author, position) for position, author in enumerate(classification_name)
)

# Map every topic name to its position in classification_topic.
topic_index = OrderedDict(
    (topic, position) for position, topic in enumerate(classification_topic)
)

# Dataset directories: find_input uses class_path1 for selectors containing
# '雙' and class_path2 for selectors containing '多'.
class_path1 = 'C:\\Users\\user\\Desktop\\雙類別\\'
class_path2 = 'C:\\Users\\user\\Desktop\\多類別\\'

In [3]:
def find_input(find, base_dir=None):
    """Load the feature vocabulary and the document matrix for one dataset.

    Parameters
    ----------
    find : str
        Selector such as 'FC 多'. It must contain 'FC' or 'lei' (checked in
        that order) to choose the file family. When ``base_dir`` is omitted it
        must also contain '多' (reads from ``class_path2``) or '雙' (reads from
        ``class_path1``), checked in that order.
    base_dir : str, optional
        Directory to read from instead of ``class_path1`` / ``class_path2``.

    Returns
    -------
    feature : list of str
        First whitespace-separated token of every line of the matching
        ``.txt`` file that is neither blank nor a '#' comment.
    X : numpy.ndarray
        One row per non-blank CSV line; columns 1.. are parsed as float and
        then truncated to int.
    y : numpy.ndarray
        Column 0 of every non-blank CSV line, parsed as int.

    Raises
    ------
    ValueError
        If ``find`` selects no dataset directory or no file family.
    FileNotFoundError
        If the directory has no matching ``.txt`` or ``.csv`` file.
    """
    if base_dir is not None:
        data_dir = base_dir
    elif '多' in find:
        data_dir = class_path2
    elif '雙' in find:
        data_dir = class_path1
    else:
        raise ValueError(
            "find must contain '多' or '雙' to select a dataset directory: {!r}".format(find))

    if 'FC' in find:
        type_base = 'FC'
    elif 'lei' in find:
        type_base = 'lei'
    else:
        # An empty family name would match every file in the directory.
        raise ValueError(
            "find must contain 'FC' or 'lei' to select a file family: {!r}".format(find))

    feature_name = ''
    document_name = ''
    # Sorted so that the choice is deterministic when several files match;
    # as before, the last match wins.
    for file_name in sorted(os.listdir(data_dir)):
        if type_base not in file_name:
            continue
        # Test the extension, not a substring, so 'FC_txt.csv' is a CSV.
        if file_name.endswith('.txt'):
            feature_name = file_name
        elif file_name.endswith('.csv'):
            document_name = file_name

    if not feature_name or not document_name:
        raise FileNotFoundError(
            "no {!r} .txt/.csv pair found in {!r}".format(type_base, data_dir))

    # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM if present.
    feature = []
    with open(os.path.join(data_dir, feature_name), encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            feature.append(line.split()[0])

    X = []
    y = []
    with open(os.path.join(data_dir, document_name), encoding='utf-8-sig') as f:
        for line in f:
            fields = line.strip().split(',')
            if fields == ['']:
                # Blank line (e.g. trailing newline at end of file).
                continue
            y.append(int(fields[0]))
            X.append([int(float(value)) for value in fields[1:]])

    return feature, np.array(X), np.array(y)

In [4]:
# Build and fit the requested prediction model.
def predict_model(X,y,model):
    """Fit the estimator selected by ``model`` on ``(X, y)`` and return it.

    ``model`` is one of 'RF', 'SVM', 'DT', 'GNB', 'MNB' or 'BNB'. Any other
    value prints 'model error' and returns None. The time spent in ``fit`` is
    printed before the fitted estimator is returned.
    """
    # Each constructor is wrapped in a lambda so that only the selected
    # estimator is ever instantiated.
    factories = (
        ('RF', lambda: RandomForestClassifier(n_jobs=-1, oob_score=True,
                                              class_weight='balanced', n_estimators=256,
                                              random_state=0, min_samples_leaf=2)),
        ('SVM', lambda: svm.LinearSVC(class_weight='balanced', random_state=0)),
        ('DT', lambda: DecisionTreeClassifier(class_weight='balanced', random_state=0)),
        ('GNB', lambda: GaussianNB()),
        ('MNB', lambda: MultinomialNB()),
        ('BNB', lambda: BernoulliNB()),
    )

    for label, build in factories:
        if model == label:
            clf = build()
            break
    else:
        print ('model error')
        return

    started = time.time()
    clf.fit(X, y)
    elapsed = time.time() - started

    print ('訓練耗費時間：',elapsed,'秒')

    return clf

In [5]:
# Entry point: load one dataset, train a classifier and print diagnostics.
def main(find, kernel='RF'):
    """Train one classifier on the dataset selected by ``find``.

    Parameters
    ----------
    find : str
        Dataset selector forwarded to ``find_input`` (e.g. 'FC 多').
    kernel : str, optional
        Model code forwarded to ``predict_model``; defaults to 'RF'.

    Prints the sparsity ratio of the input matrix and, when the fitted model
    exposes ``oob_score_``, the out-of-bag error rate. Returns None.
    """
    feature,X,y = find_input(find)

    def sparsity_ratio(X):
        # Fraction of zero-valued entries in a 2-D array.
        return 1.0 - np.count_nonzero(X) / float(X.shape[0] * X.shape[1])
    print("輸入稀疏比:", sparsity_ratio(X))

    model = predict_model(X,y,kernel)
    if model is None:
        # predict_model has already printed 'model error' for this kernel.
        return

    # Of the estimators predict_model builds, only the random forest is
    # created with oob_score=True, so the attribute is checked before use.
    if hasattr(model, 'oob_score_'):
        print ('oob error rate:',1-model.oob_score_)

    # Optional follow-ups kept from the original notebook (disabled):
    #for i in sorted([i for i in zip(feature,model.feature_importances_)],key=lambda t: t[1],reverse=True):
    #    print (i[0],i[1])

    #joblib.dump(model, 'C:\\Users\\user\\Desktop\\RF result\\model\\'+find+'.pkl')

In [7]:
model_type = ['多','雙']
base_type = ['FC','lei']

# Run main() once per (dataset class, file family) pair; the class varies
# slowest, matching the original nested loops.
for class_kind, base_kind in itertools.product(model_type, base_type):
    selector = base_kind+' '+class_kind
    print (selector)
    main(selector)
    print ()

FC 多
輸入稀疏比: 0.8552102481617647
訓練耗費時間： 0.5382041931152344 秒
oob error rate: 0.297794117647

lei 多
輸入稀疏比: 0.7468098958333333
訓練耗費時間： 0.537996768951416 秒
oob error rate: 0.141666666667

FC 雙
輸入稀疏比: 0.8478529411764706
訓練耗費時間： 0.5504236221313477 秒
oob error rate: 0.419117647059

lei 雙
輸入稀疏比: 0.7388227513227513
訓練耗費時間： 0.5236775875091553 秒
oob error rate: 0.15

