## 基于doc2vec的文本分类

本文主要介绍[`gensim`](https://radimrehurek.com/gensim/index.html)中`doc2vec`的用法：先用它训练文档向量模型，再将得到的文档向量应用于文本分类模型。  
本文的代码和所用数据可以在[GitHub](https://github.com/snowhyzhang/yukino/tree/master/nlp)上找到。

### 语料处理

- 处理语料库路径与语料类别

In [1]:
import glob
import os

import pandas as pd

def category_from_path(file_name):
    """Return the category encoded in a corpus file's parent directory name.

    Corpus files are matched by ``data/*/*.txt`` and live in directories named
    ``<code>-<category>`` (for example
    ``data/C32-Agriculture/C32-Agriculture0002.txt``), so the category is the
    second ``-``-separated field of the parent directory name.

    ``os.path`` is used instead of splitting on ``'/'`` so the lookup does not
    depend on the platform's path separator.

    Raises:
        IndexError: if the parent directory name contains no ``-``.
    """
    parent_dir = os.path.basename(os.path.dirname(file_name))
    return parent_dir.split('-')[1]


# Collect every corpus file. glob does not guarantee any ordering, so sort the
# paths to make the row order of the table below reproducible across machines.
doc_files = sorted(glob.glob('data/*/*.txt'))

# Pair each file path with the category derived from its directory name.
file_cat = [(file_name, category_from_path(file_name)) for file_name in doc_files]

file_pd = pd.DataFrame(file_cat, columns=['file', 'category'])
file_pd['category'] = pd.Categorical(file_pd['category'])
file_pd.iloc[:5]

Unnamed: 0,file,category
0,data/C32-Agriculture/C32-Agriculture0002.txt,Agriculture
1,data/C32-Agriculture/C32-Agriculture0003.txt,Agriculture
2,data/C32-Agriculture/C32-Agriculture0005.txt,Agriculture
3,data/C32-Agriculture/C32-Agriculture0007.txt,Agriculture
4,data/C32-Agriculture/C32-Agriculture0009.txt,Agriculture


- 划分为训练语料与测试语料

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the files for testing. The split is stratified on the
# category column and uses a fixed seed so it can be reproduced.
files_train, files_test, y_train, y_test = train_test_split(
    file_pd['file'],
    file_pd['category'],
    test_size=0.3,
    stratify=file_pd['category'],
    random_state=1024,
)

- 处理语料分词，封装成TaggedDocument，用于`doc2vec`训练

In [3]:
import jieba
import re
from gensim.models.doc2vec import TaggedDocument

class TaggedDocumentReader:
    """Iterable that turns corpus files into ``TaggedDocument`` objects.

    Each file is read in full, segmented with ``jieba.lcut``, cleaned with a
    regular expression, and yielded as a ``TaggedDocument`` whose single tag
    is the file path.

    Args:
        corpus_files: Iterable of paths to the text files to read.
        rexp: Compiled pattern whose matches are removed from every token.
            When ``None``, a default pattern is used that keeps only ASCII
            letters, digits and characters in the range U+4E00-U+9FA5.
        encoding: Text encoding used to open the files (keyword-only).
            Defaults to ``'GB18030'``.
    """

    # Default filter: matches every character that is NOT an ASCII letter,
    # a digit, or in the range U+4E00-U+9FA5.
    _DEFAULT_REXP = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5]')

    def __init__(self, corpus_files, rexp=None, *, encoding='GB18030'):
        self.corpus_files = corpus_files
        self.rexp = self._DEFAULT_REXP if rexp is None else rexp
        self.encoding = encoding

    def __iter__(self):
        """Yield one ``TaggedDocument`` per file in ``corpus_files``."""
        for file in self.corpus_files:
            # Close the file before tokenising and yielding, so no handle is
            # left open while the generator is suspended.
            with open(file, 'r', encoding=self.encoding) as f:
                content = f.read()
            cleaned = (self.rexp.sub('', word).strip() for word in jieba.lcut(content))
            # Drop tokens that became empty after cleaning (e.g. punctuation).
            words = [word for word in cleaned if word]
            yield TaggedDocument(words=words, tags=[file])

### doc2vec模型

- 读入训练集

In [4]:
# Materialise every training document up front so the list can be reused
# by the later cells.
train_reader = TaggedDocumentReader(files_train.tolist())
tagged_documents_train = list(train_reader)
len(tagged_documents_train)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/24/5z49fxqn0zx_9ymk936f_9t00000gn/T/jieba.cache
Loading model cost 0.902 seconds.
Prefix dict has been built succesfully.


1892

- 配置doc2vec模型

In [5]:
from gensim.models.doc2vec import Doc2Vec
import multiprocessing

# Number of CPUs, used as the worker count for parallel training.
cores = multiprocessing.cpu_count()
# Dimensionality of each document vector.
vct_size = 256

d2v_model = Doc2Vec(
    dm=1,
    vector_size=vct_size,
    workers=cores,
    min_count=5,
    sample=1e-5,
    negative=5,
)
# Build the vocabulary from the training documents before training.
d2v_model.build_vocab(tagged_documents_train)

- 训练doc2vec模型

In [6]:
# Train on the tagged training documents, taking the example count and the
# number of epochs from the model's own corpus_count and epochs attributes.
d2v_model.train(
    tagged_documents_train,
    total_examples=d2v_model.corpus_count,
    epochs=d2v_model.epochs,
)

### 文本分类模型

#### 模型训练

- 将训练语料转化为向量表示

In [7]:
import numpy as np

# Each training document is tagged with its file path; use that tag to look
# up the document's vector in the model, as a (1, vct_size) row.
train_tags = [doc.tags[0] for doc in tagged_documents_train]
vect = [d2v_model.docvecs[tag].reshape((1, vct_size)) for tag in train_tags]
# Stack the rows into a single (n_documents, vct_size) feature matrix.
X_train = np.concatenate(vect, axis=0)
X_train.shape

(1892, 256)

- 训练逻辑斯蒂回归模型

In [8]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Grid-search the logistic regression's C over 1, 10, 100, 1000 and 10000
# using shuffled 10-fold cross-validation with a fixed seed.
lr = LogisticRegression()
lr_params = [{'C': [10 ** exponent for exponent in range(5)]}]
cv = KFold(n_splits=10, shuffle=True, random_state=1024)

lr_clf = GridSearchCV(estimator=lr, param_grid=lr_params, cv=cv)
lr_clf.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=1024, shuffle=True),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

#### 模型测试

- 处理测试集，将测试集转为向量表示

In [9]:
# Tokenise the held-out files with the same reader used for the training set.
test_reader = TaggedDocumentReader(files_test.tolist())
tagged_documents_test = list(test_reader)

# Infer a vector for each test document from its tokens and keep it as a
# (1, vct_size) row.
vect = []
for doc in tagged_documents_test:
    inferred = d2v_model.infer_vector(doc.words)
    vect.append(inferred.reshape((1, vct_size)))
X_test = np.concatenate(vect, axis=0)
X_test.shape

(812, 256)

- 预测

In [10]:
# Classify the test vectors with the fitted grid search and report the
# accuracy rounded to four decimal places.
y_predict = lr_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
round(accuracy, 4)

0.968

- 查看混淆矩阵

In [11]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
# Both Series get a fresh default index so they align position by position.
truth = pd.Series(y_test.values, name='truth')
predicted = pd.Series(y_predict, name='predict')
pd.crosstab(truth, predicted)

predict,Agriculture,History,Sports
truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Agriculture,300,3,2
History,7,129,0
Sports,3,11,357
