用樸素貝葉斯的方法進行分類

In [3]:
#coding:utf-8
import jieba,xlrd,re
import numpy as np 
import pandas as pd

In [4]:
#讀取資料
data = pd.read_excel('data/news_data_2019.xlsx',encoding='utf-8')
data.dropna() 

#按照主題儲存新聞
chinaAndtw_news = data[ data.name == '兩岸新聞' ]
international_news = data[ data.name == '國際新聞' ]
political_news = data[ data.name == '政治新聞' ]
social_news = data[ data.name == '社會新聞' ]
financial_news = data[ data.name == '財經新聞' ]

print('兩岸新聞:{}篇, 國際新聞:{}篇, 政治新聞:{}篇, 社會新聞:{}篇, 財經新聞:{}篇'.format(len(chinaAndtw_news),len(international_news),len(political_news),len(social_news),len(financial_news)))

兩岸新聞:7343篇, 國際新聞:22976篇, 政治新聞:16153篇, 社會新聞:8366篇, 財經新聞:20665篇


In [5]:
#對每個新聞隨即抽樣7343確保樣本平均
#這邊可能數字太大會跑很慢，可以適當減少
chinaAndtw_news = chinaAndtw_news.sample(n=7343,replace=False)
international_news = international_news.sample(n=7343,replace=False)
political_news = political_news.sample(n=7343,replace=False)
social_news = social_news.sample(n=7343,replace=False)
financial_news = financial_news.sample(n=7343,replace=False)

print('兩岸新聞:{}篇, 國際新聞:{}篇, 政治新聞:{}篇, 社會新聞:{}篇, 財經新聞:{}篇'.format(len(chinaAndtw_news),len(international_news),len(political_news),len(social_news),len(financial_news)))

兩岸新聞:7343篇, 國際新聞:7343篇, 政治新聞:7343篇, 社會新聞:7343篇, 財經新聞:7343篇


In [6]:
#讀取停用詞
stop_list = []
with open('data/stopwords.txt','r',encoding='utf-8') as f:
    for line in f.readlines():
        stop_list.append(line.strip())

In [7]:
#斷詞、去停用詞
def preprocess(data,all_list,category):
    for line in data:
        line = re.sub(r'[^\w]','',line)
        line = re.sub(r'[A-Za-z0-9]','',line)
        line = re.sub(u'[\uFF01-\uFF5A]','',line)
        segment_list = jieba.lcut(line)
        segment_list = filter(lambda x: len(x)>1,segment_list)
        segment_list = filter(lambda x: x not in stop_list,segment_list)
        all_data.append((' '.join(segment_list),category))

all_data = []
preprocess((chinaAndtw_news['title']+chinaAndtw_news['content']).values.tolist(),all_data,'兩岸新聞')
preprocess((international_news['title']+international_news['content']).values.tolist(),all_data,'國際新聞')
preprocess((chinaAndtw_news['title']+chinaAndtw_news['content']).values.tolist(),all_data,'政治新聞')
preprocess((chinaAndtw_news['title']+chinaAndtw_news['content']).values.tolist(),all_data,'社會新聞')
preprocess((chinaAndtw_news['title']+chinaAndtw_news['content']).values.tolist(),all_data,'財經新聞')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/vk/4wfw6yvn67b0gpkhvj8wy0cw0000gn/T/jieba.cache
Loading model cost 1.507 seconds.
Prefix dict has been built succesfully.


接下來是將原始數據進行切割分成訓練資料與測試資料來做訓練與驗證

In [8]:
# 將取得的資料全部重洗打亂順序
import random
random.shuffle(all_data)

In [9]:
# 導入scikit-learn來處理
from sklearn.model_selection import train_test_split #切分訓練與測試資料
from sklearn.feature_extraction.text import CountVectorizer #抽取詞語特徵
from sklearn.naive_bayes import MultinomialNB #導入模型

In [10]:
# 切割成訓練資料與測試資料
x, y = zip(*all_data)
x_train, x_test, y_train, y_test  = train_test_split(x,y,test_size=0.3,random_state=666) # 設定相同的random_state可以得到相同的切割結果

In [22]:
# 將訓練資料中的詞建立詞袋模型提取特徵(轉成向量)
vec = CountVectorizer(
    analyzer='word', # 特徵由單詞構成
    max_features=5000 # 選最常出現5000個單詞構成詞袋
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
# 改善前
# 讀取模型訓練做訓練並在測試集上面測試效果
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train),y_train) # 訓練
classifier.score(vec.transform(x_test),y_test) # 測試

0.2658193372673627

In [30]:
# 改進提取特徵的方法(取更多的ngram以及詞構成詞袋)
vec = CountVectorizer(
    analyzer='word', # 特徵由單詞構成
    ngram_range=(1,4), # ngram取1gram到4gram
    max_features=4000 # 選最常出現400個單詞構成詞袋
)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=4000, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [31]:
# 改善後
# 讀取模型訓練做訓練並在測試集上面測試效果
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train),y_train) # 訓練
classifier.score(vec.transform(x_test),y_test) # 測試

0.28370403994552884

In [57]:
# 交叉驗證
from sklearn.model_selection import StratifiedKFold # K-Fold Validation
from sklearn.metrics import accuracy_score, recall_score, precision_score #評估準確度

stratifiedkfold = StratifiedKFold(n_splits=5,shuffle=True)
x_tem = vec.transform(x)
y_tem = np.array(y)
y_ref = y_tem[:]
for train_index,test_index in stratifiedkfold.split(x_tem,y_tem):
    x_train,x_test = x_tem[train_index], x_tem[test_index]
    y_train = y_tem[train_index]
    classifier = MultinomialNB()
    classifier.fit(x_train,y_train)
    y_ref[test_index] = classifier.predict(x_test)

print('Accuracy: {}'.format( accuracy_score(y,y_ref) ))
print('Precision: {}'.format( precision_score(y,y_ref,average='macro') ))
print('Recall: {}'.format( recall_score(y,y_ref,average='macro') ))

Accuracy: 0.3582731853465886
Precision: 0.34007712174807403
Recall: 0.35827318534658864


In [64]:
stop_list=[]
with open('data/stopwords.txt','r',encoding='utf-8') as f:
    for line in f.readlines():
        stop_list.append(line.strip())

In [67]:
#讀取分類的檔案
car_news = pd.read_csv('class_data/car_news.csv',encoding='utf-8')
car_news = car_news.dropna()

technology_news = pd.read_csv('class_data/technology_news.csv',encoding='utf-8')
technology_news = technology_news.dropna()

society_news = pd.read_csv('class_data/society_news.csv',encoding='utf-8')
society_news = society_news.dropna()

sports_news = pd.read_csv('class_data/sports_news.csv',encoding='utf-8')
sports_news = sports_news.dropna()

finance_news = pd.read_csv('class_data/finance_news.csv',encoding='utf-8')
finance_news = finance_news.dropna()

print('Car News:{}\nTechnology News:{}\nSociety News:{}\nSports News:{}\nFinance News:{}\n'.format(len(car_news),len(technology_news),len(society_news),len(sports_news),len(finance_news)))

#每個新聞取出8000筆
car_news = car_news[:8000]
technology_news = technology_news[:8000]
society_news = society_news[:8000]
sports_news = sports_news[:8000]
finance_news = finance_news[:8000]

Car News:11740
Technology News:25057
Society News:268829
Sports News:32728
Finance News:143141



In [73]:
def preprocess(data,all_data,category):
    for line in data:
        line = re.sub(r'[^\w]','',line)
        line = re.sub(r'[A-Za-z0-9]','',line)
        line = re.sub(u'[\uFF01-\uFF5A]','',line)
        segement_list = jieba.lcut(line)
        segement_list = filter(lambda x: len(x)>1,segement_list)
        segement_list = filter(lambda x: x not in stop_list,segement_list)
        all_data.append( (' '.join(segement_list),category) )

all_data = []
preprocess(car_news.content.values.tolist(),all_data,'Car News')
preprocess(technology_news.content.values.tolist(),all_data,'Technology News')
preprocess(society_news.content.values.tolist(),all_data,'Society News')
preprocess(sports_news.content.values.tolist(),all_data,'Sports News')
preprocess(finance_news.content.values.tolist(),all_data,'Finance News')

In [79]:
#切分資料集
x,y = zip(*all_data)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=666)

#創建貝氏分類器
class Bayes_Classification():
    def __init__(self, classifier=MultinomialNB()): # 初始化選擇分類模型與詞袋模型
        self.classifier = classifier
        self.vectorizer = CountVectorizer( analyzer='word', ngram_range=(1,4), max_features=4000 )

    def features(self,x_train): # 將詞語用詞袋模型轉成向量
        return self.vectorizer.transform(x_train)

    def fit(self,x_train,y_train): 
        self.vectorizer.fit(x_train)
        self.classifier.fit(self.features(x_train),y_train)

    def predict(self,x_test):
        return self.classifier.predict(self.features([x_test]))

    def score(self,x_test,y_test):
        return self.classifier.score(self.features(x_test),y_test)

In [82]:
bayes_classifier = Bayes_Classification()
bayes_classifier.fit(x_train,y_train)
print(bayes_classifier.predict('汽車 很新 好看'))
print(bayes_classifier.score(x_test,y_test))

['Car News']
0.75125
