# task4 基于深度学习的文本分类（1）

In [38]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import fasttext
from sklearn.model_selection import StratifiedKFold

## 1.单次模型训练

In [2]:
# Convert the data to the layout fastText is trained on here:
# one sample per line, "<text>\t__label__<class id>".
train_df = pd.read_csv('train_set.csv', sep='\t', nrows=15000)
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)

# The last 5000 rows are held out for validation; everything before them
# is written to disk as the training file.
fit_part = train_df.iloc[:-5000]
holdout_part = train_df.iloc[-5000:]
fit_part[['text', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')

# Train on the file written above
model = fasttext.train_supervised(
    'train.csv', lr=1.0, wordNgrams=2,
    verbose=2, minCount=1, epoch=25, loss="hs",
)

# Keep only the top-1 predicted label per document and strip the
# '__label__' prefix so it can be compared with the raw class id.
val_pred = []
for doc in holdout_part['text']:
    top_label = model.predict(doc)[0][0]
    val_pred.append(top_label.split('__')[-1])
print(f1_score(holdout_part['label'].values.astype(str), val_pred, average='macro'))
# 0.82

0.8276528375115089


## 2.在验证集上进行模型调参

查阅 fastText 官方文档，训练模型可用的参数如下：

![title](fasttext_api.PNG)

In [43]:
# Reload a smaller sample (first 10000 rows) for the cross-validated tuning below.
train_df = pd.read_csv('train_set.csv', sep='\t', nrows=10000)
# fastText-style label column: '__label__' followed by the class id.
train_df = train_df.assign(label_ft='__label__' + train_df['label'].astype(str))
data = train_df.loc[:, ['text', 'label_ft']]
# 5 stratified folds; no shuffling, so the fold assignment is deterministic.
cv = StratifiedKFold(n_splits=5)

In [39]:
# 5-fold stratified cross-validation of one fixed fastText configuration.
# `score` collects the macro-F1 of every fold.
score = []
for train_index, test_index in cv.split(train_df.text, train_df.label_ft):
    # cv.split yields positional indices, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while the frame keeps its
    # default RangeIndex.
    fold_train = train_df.iloc[train_index]
    fold_val = train_df.iloc[test_index]
    # The model is trained from a file, so the training fold is written to
    # train.csv (overwritten on every iteration).
    fold_train[['text', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')
    model = fasttext.train_supervised('train.csv', lr=5, wordNgrams=2,
                                      verbose=2, minCount=1, epoch=25, loss="hs")
    # Top-1 label per document, with the '__label__' prefix stripped.
    val_pred = [model.predict(x)[0][0].split('__')[-1] for x in fold_val['text']]
    score.append(f1_score(fold_val['label'].astype(str), val_pred, average='macro'))


In [42]:
# Average macro-F1 over the folds collected in `score`.
print(np.mean(score))

0.8380843831075996


In [49]:
# Sweep fastText's minCount while keeping every other hyper-parameter fixed
# (lr=5, wordNgrams=2, epoch=25, hierarchical softmax). For each value the
# mean macro-F1 over the 5 stratified folds is printed.
for mincount in [1, 3, 5, 10]:
    score = []
    for train_index, test_index in cv.split(train_df.text, train_df.label_ft):
        # cv.split yields positional indices, so rows are selected with .iloc.
        # Label-based .loc only gives the same rows while the frame keeps its
        # default RangeIndex.
        fold_train = train_df.iloc[train_index]
        fold_val = train_df.iloc[test_index]
        # The training fold is written to train.csv (overwritten each iteration).
        fold_train[['text', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')
        model = fasttext.train_supervised('train.csv', lr=5, wordNgrams=2,
                                          verbose=2, minCount=mincount, epoch=25, loss="hs")
        # Top-1 label per document, with the '__label__' prefix stripped.
        val_pred = [model.predict(x)[0][0].split('__')[-1] for x in fold_val['text']]
        score.append(f1_score(fold_val['label'].astype(str), val_pred, average='macro'))
    print("mincount = {0}, score = {1} ".format(mincount, np.array(score).mean()))

mincount = 1, score = 0.8349351979652864 
mincount = 3, score = 0.8307314954910353 
mincount = 5, score = 0.8276919970983496 
mincount = 10, score = 0.828450628740544 


In [51]:
# Sweep fastText's wordNgrams while keeping every other hyper-parameter fixed
# (lr=1, minCount=1, epoch=25, hierarchical softmax). For each value the
# mean macro-F1 over the 5 stratified folds is printed.
for n in [1, 2, 3]:
    score = []
    for train_index, test_index in cv.split(train_df.text, train_df.label_ft):
        # cv.split yields positional indices, so rows are selected with .iloc.
        # Label-based .loc only gives the same rows while the frame keeps its
        # default RangeIndex.
        fold_train = train_df.iloc[train_index]
        fold_val = train_df.iloc[test_index]
        # The training fold is written to train.csv (overwritten each iteration).
        fold_train[['text', 'label_ft']].to_csv('train.csv', index=None, header=None, sep='\t')
        model = fasttext.train_supervised('train.csv', lr=1, wordNgrams=n,
                                          verbose=2, minCount=1, epoch=25, loss="hs")
        # Top-1 label per document, with the '__label__' prefix stripped.
        val_pred = [model.predict(x)[0][0].split('__')[-1] for x in fold_val['text']]
        score.append(f1_score(fold_val['label'].astype(str), val_pred, average='macro'))
    print("wordNgrams = {0}, score = {1} ".format(n, np.array(score).mean()))

wordNgrams = 1, score = 0.7819347049788359 
wordNgrams = 2, score = 0.8246107160490512 
wordNgrams = 3, score = 0.8221268604321488 


## 结论
- 尝试调整了几个超参数，并采用5折分层交叉验证进行评估：minCount 等于1时效果最好，wordNgrams 等于2时效果最好。注意 minCount 实验使用 lr=5，而 wordNgrams 实验使用 lr=1，两组实验的得分不宜直接横向比较。