用Bagging進行新聞分類

In [1]:
import re,jieba,random
import numpy as np 
import pandas as pd 
from sklearn.ensemble import BaggingClassifier #BaggingClassifier分類模型
from sklearn.model_selection import train_test_split #切割訓練與測試資料
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer #提取詞的特徵

In [2]:
#讀取分類的檔案
car_news = pd.read_csv('class_data/car_news.csv',encoding='utf-8')
car_news = car_news.dropna()

technology_news = pd.read_csv('class_data/technology_news.csv',encoding='utf-8')
technology_news = technology_news.dropna()

society_news = pd.read_csv('class_data/society_news.csv',encoding='utf-8')
society_news = society_news.dropna()

sports_news = pd.read_csv('class_data/sports_news.csv',encoding='utf-8')
sports_news = sports_news.dropna()

finance_news = pd.read_csv('class_data/finance_news.csv',encoding='utf-8')
finance_news = finance_news.dropna()

print('Car News:{}\nTechnology News:{}\nSociety News:{}\nSports News:{}\nFinance News:{}\n'.format(len(car_news),len(technology_news),len(society_news),len(sports_news),len(finance_news)))

#每個新聞取出8000筆
car_news = car_news[:8000]
technology_news = technology_news[:8000]
society_news = society_news[:8000]
sports_news = sports_news[:8000]
finance_news = finance_news[:8000]

Car News:11740
Technology News:25057
Society News:268829
Sports News:32728
Finance News:143141



In [3]:
stop_list=[]
with open('data/stopwords.txt','r',encoding='utf-8') as f:
    for line in f.readlines():
        stop_list.append(line.strip())

In [4]:
def preprocess(data,all_data,category):
    for line in data:
        line = re.sub(r'[^\w]','',line)
        line = re.sub(r'[A-Za-z0-9]','',line)
        line = re.sub(u'[\uFF01-\uFF5A]','',line)
        segment_list = jieba.lcut(line)
        segment_list = filter(lambda x: len(x)>1,segment_list)
        segment_list = filter(lambda x: x not in stop_list,segment_list)
        all_data.append( (' '.join(segment_list),category) )

all_data = []
preprocess(car_news.content.values.tolist(),all_data,'Car News')
preprocess(technology_news.content.values.tolist(),all_data,'Technology News')
preprocess(society_news.content.values.tolist(),all_data,'Society News')
preprocess(sports_news.content.values.tolist(),all_data,'Sports News')
preprocess(finance_news.content.values.tolist(),all_data,'Finance News')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/vk/4wfw6yvn67b0gpkhvj8wy0cw0000gn/T/jieba.cache
Loading model cost 1.588 seconds.
Prefix dict has been built successfully.


In [5]:
random.shuffle(all_data) #將所有資料打亂順序
x,y = zip(*all_data)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=666)

In [6]:
# 詞袋模型提取特徵
vec = CountVectorizer(
    analyzer='word', # 特徵由單詞構成
    ngram_range=(1,4), # ngram取1gram到4gram
    max_features=8000 # 選最常出現400個單詞構成詞袋
)
vec.fit(x_train)

# TFIDF模型提取特徵
tvec = TfidfVectorizer(
    analyzer='word', # 特徵由單詞構成
    ngram_range=(1,4), # ngram取1gram到4gram
    max_features=8000 # 選最常出現400個單詞構成詞袋
)
tvec.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=8000,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [13]:
# 交叉驗證
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,recall_score,precision_score

def K_Fold_Validation(x_d,y_d,k,classifier):
    stratifiedkfold = StratifiedKFold(n_splits=k,shuffle=True)
    x_tem = tvec.transform(x_d)
    y_tem = np.array(y_d)
    y_ref = y_tem[:]
    for train_index, test_index in stratifiedkfold.split(x_tem,y_tem):
        x_tra, x_tes = x_tem[train_index], x_tem[test_index]
        y_tra = y_tem[train_index]
        cl = classifier
        cl.fit(x_tra,y_tra)
        y_ref[test_index] = cl.predict(x_tes)

    print('Accuracy: {}'.format( accuracy_score(y_d,y_ref) ))
    print('Precision: {}'.format( precision_score(y_d,y_ref,average='macro') ))
    print('Recall: {}'.format( recall_score(y_d,y_ref,average='macro') ))

In [17]:
# 訓練並測試模型(提取特徵用TFIDF)
bagging_model = BaggingClassifier()
bagging_model.fit(tvec.transform(x_train),y_train)
print(bagging_model.score(tvec.transform(x_test),y_test))

K_Fold_Validation(x,y,5,BaggingClassifier())

0.6903333333333334
Accuracy: 0.689475
Precision: 0.6944319659918803
Recall: 0.689475


用Adaboost進行新聞分類

In [15]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_model = AdaBoostClassifier()
adaboost_model.fit(tvec.transform(x_train),y_train)
print(adaboost_model.score(tvec.transform(x_test),y_test))

K_Fold_Validation(x,y,5,adaboost_model)

0.5661666666666667
Accuracy: 0.549425
Precision: 0.6755483396748644
Recall: 0.549425


用GBDT進行新聞分類

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

GBDT_model = GradientBoostingClassifier()
GBDT_model.fit(tvec.transform(x_train),y_train)
print(GBDT_model.score(tvec.transform(x_test),y_test))

K_Fold_Validation(x,y,5,GBDT_model)

0.6805
Accuracy: 0.63445
Precision: 0.7328213691865656
Recall: 0.63445
