# task4 基于DL的文本分类

## 1. FastText

In [2]:
from __future__ import unicode_literals

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras.layers import Dense

VOCAB_SIZE = 2000
EMBEDDING_DIM = 100
MAX_WORDS = 500
CLASS_NUM = 5

def build_fastText():
    model = Sequential()
    model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length = MAX_WORDS))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(CLASS_NUM, activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer = 'SGD', metrics = ['accuracy'])
    return model

if __name__ == '__main__':
    model = build_fastText()
    print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          200000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 200,505
Trainable params: 200,505
Non-trainable params: 0
_________________________________________________________________
None


## 2. 分类模型

In [1]:
import pandas as pd
from sklearn.metrics import f1_score

In [9]:
train_df = pd.read_csv('../../data/train_set.csv', sep='\t')
train_df['label_ft'] = '__label__' + train_df['label'].astype(str)
train_df[['text','label_ft']].iloc[:-5000].to_csv('train.csv', index=None, header=None, sep='\t')

In [3]:
import fasttext

In [10]:
model = fasttext.train_supervised('train.csv', lr=1.0, wordNgrams=2, 
                                  verbose=2, minCount=1, epoch=25, loss="hs")

In [11]:
val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[-5000:]['text']]
print(f1_score(train_df['label'].values[-5000:].astype(str), val_pred, average='macro'))

0.9094308653334665


## 验证集调参

In [7]:
label2id = {}
for i in range(total):
    label = str(all_labels[i])
    if label not in label2id:
        label2id[label] = [i]
    else:
        label2id[label].append(i)

NameError: name 'total' is not defined

## 作业

### 1. 调整faxttext参数

`train_supervised`函数一些可调参数：  
    lr # 学习率 [0.1]  
    dim # 单词向量维度 [100]  
    ws # 窗口大小 [5]  
    epoch # epoch个数 [5]  
    minCount # 单词出现最小次数 [1]  
    minCountLabel # 标签出现最小次数 [1]  
    minn # char ngram最小长度 [0]  
    maxn # char ngram最大长度 [0]  
    neg # 负样本数 [5]  
    wordNgrams # word ngram最大长度 [1]  
    loss # loss function {ns, hs, softmax, ova} [softmax]  
    bucket # bucket个数 [2000000]  
    thread # threads个数 [number of cpus]  
    lrUpdateRate # 更新学习率的概率 [100]  
    t # 采样阈值 [0.0001]  
    label # label prefix ['__label__']  
    verbose # 日志显示 [2]  
    pretrainedVectors # 用于监督学习的预训练词向量（.vec文件）[]  

In [7]:
model = fasttext.train_supervised('train.csv', lr=0.8, wordNgrams=3, 
                                  verbose=1, minCount=0, epoch=50)

In [8]:
val_pred = [model.predict(x)[0][0].split('__')[-1] for x in train_df.iloc[-5000:]['text']]
print(f1_score(train_df['label'].values[-5000:].astype(str), val_pred, average='macro'))

0.8844365032711271
