# 情感分析

## 第1步：导入IMDb电影评论数据集

- TorchText中的一个重要概念是`Field`。`Field`决定了你的数据会被怎样处理。在我们的情感分类任务中，我们所需要接触到的数据有文本字符串和两种情感，"pos"或者"neg"。
- `Field`的参数指定了数据会被怎样处理。
- 我们使用`TEXT` field来定义如何处理电影评论，使用`LABEL` field来处理两个情感类别。
- 我们的`TEXT` field带有`tokenize='spacy'`，这表示我们会用[spaCy](https://spacy.io) tokenizer来tokenize英文句子。如果我们不特别声明`tokenize`这个参数，那么默认的分词方法是使用空格。
- 安装spaCy
```
pip install -U torchtext
pip install -U spacy==3.0.6
pip install en_core_web_sm.3.0.0.tar.gz
```
- `LABEL`由`LabelField`定义。这是一种特别的用来处理label的`Field`。我们后面会解释dtype。
- 更多关于`Fields`，参见https://github.com/pytorch/text/blob/master/torchtext/data/field.py
- 和之前一样，我们会设定random seeds使实验可以复现。

- TorchText支持很多常见的自然语言处理数据集。
- 下面的代码会自动下载IMDb数据集，然后分成train/test两个`torchtext.datasets`类别。数据被前面的`Fields`处理。IMDb数据集一共有50000条电影评论，每条评论都被标注为正面的或负面的。

<b>先了解下Spacy库：[spaCy介绍和使用教程](https://juejin.im/post/5971a4b9f265da6c42353332?utm_source=gold_browser_extension%5D)</b>  
<b>再了解下torchtext库：[torchtext介绍和使用教程](https://blog.csdn.net/u012436149/article/details/79310176)：这个新手必看，不看下面代码听不懂</b>

In [4]:
import torch
from torchtext.legacy import data,datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

# Pick the compute device first: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fix the random seeds so that repeated runs are reproducible.
SEED = 1234
torch.manual_seed(SEED)       # seed PyTorch's default random number generator
torch.cuda.manual_seed(SEED)  # seed the random number generator of the current CUDA device
# Ask cuDNN to use deterministic algorithms. This is a reproducibility setting,
# not a speed-up (the speed-oriented switch is torch.backends.cudnn.benchmark).
torch.backends.cudnn.deterministic = True

ModuleNotFoundError: No module named 'torch'

In [None]:
class CNN(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions -> max-pool -> linear.

    One ``nn.Conv2d`` is created per entry of ``filter_sizes``; each kernel spans
    ``filter_size`` consecutive tokens and the full embedding width. The feature
    maps are max-pooled over the sequence axis, concatenated, passed through
    dropout and projected to ``output_size`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: width of each embedding vector.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of kernel heights (tokens covered per filter).
        output_size: number of output logits.
        dropout: dropout probability applied to the pooled features.
        pad_idx: index of the padding token, forwarded to ``nn.Embedding`` as
            ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_size, n_filters, filter_sizes,
                 output_size, dropout, pad_idx):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_size))
            for fs in filter_sizes
        ])
        self.linear = nn.Linear(len(filter_sizes) * n_filters, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute the output logits for a batch of token-index sequences.

        Args:
            text: integer tensor laid out as [seq_len, batch_size]. ``seq_len``
                must be at least the largest filter size, otherwise the
                convolution cannot be applied.

        Returns:
            Tensor of shape [batch_size, output_size] holding raw logits
            (no sigmoid is applied here).
        """
        # [seq_len, batch_size] => [batch_size, seq_len]
        text = text.permute(1, 0)
        # [batch_size, seq_len] => [batch_size, seq_len, embedding_size]
        embedded = self.embed(text)
        # Add a channel axis for Conv2d:
        # [batch_size, seq_len, embedding_size] => [batch_size, 1, seq_len, embedding_size]
        embedded = embedded.unsqueeze(1)
        # Each convolution yields [batch_size, n_filters, seq_len-filter_size+1, 1];
        # squeeze the trailing axis => [batch_size, n_filters, seq_len-filter_size+1]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over the whole sequence axis:
        # [batch_size, n_filters, seq_len-filter_size+1] => [batch_size, n_filters]
        pooled = [F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
                  for feature_map in conved]
        # [batch_size, n_filters] * len(filter_sizes) => [batch_size, len(filter_sizes)*n_filters]
        concat = self.dropout(torch.cat(pooled, dim=1))
        # [batch_size, len(filter_sizes)*n_filters] => [batch_size, output_size]
        return self.linear(concat)

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of correct binary predictions in a batch.

    ``preds`` holds raw logits: they are passed through a sigmoid and rounded
    to 0 or 1 before being compared with the targets ``y``. The result is a
    ratio (8 correct out of 10 gives 0.8, not 8), returned as a tensor.
    """
    # Sigmoid maps the logits into (0, 1); rounding turns them into hard 0/1 labels.
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)
def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Each batch must expose ``text`` and ``label`` attributes. Per-batch loss and
    accuracy are weighted by the batch size, so the returned values are
    per-example averages over the whole pass.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats.
    """
    loss_sum, acc_sum, seen = 0., 0., 0.
    # Training mode must be switched on explicitly so that layers such as
    # dropout behave as they should during training.
    model.train()
    for batch in iterator:
        labels = batch.label
        batch_len = len(labels)

        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, labels)
        acc = binary_accuracy(logits, labels)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()        # back-propagate
        optimizer.step()       # apply the parameter update

        loss_sum += loss.item() * batch_len
        acc_sum += acc.item() * batch_len
        seen += batch_len
    return loss_sum / seen, acc_sum / seen
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy of ``model`` over ``iterator`` without updating it.

    Each batch must expose ``text`` and ``label`` attributes. Per-batch metrics
    are weighted by batch size, so the returned values are per-example averages.
    The model is put back into training mode before returning.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` of Python floats.
    """
    loss_sum, acc_sum, seen = 0., 0., 0.
    # Evaluation mode: dropout (and similar layers) stop behaving stochastically.
    model.eval()
    with torch.no_grad():  # gradients are not needed for evaluation
        for batch in iterator:
            labels = batch.label
            batch_len = len(labels)

            logits = model(batch.text).squeeze(1)
            loss = criterion(logits, labels)
            acc = binary_accuracy(logits, labels)

            loss_sum += loss.item() * batch_len
            acc_sum += acc.item() * batch_len
            seen += batch_len
    model.train()  # switch back to training mode for the caller
    return loss_sum / seen, acc_sum / seen

In [None]:
# Mini-batch size used by the iterators created at the end of this cell.
# It has to be defined before BucketIterator.splits is called.
BATCH_SIZE = 64

# TEXT tokenizes the reviews with spaCy; the pipeline is named explicitly because
# spaCy 3 no longer accepts the short 'en' alias. LABEL handles the sentiment
# classes and stores them as floats.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
LABEL = data.LabelField(dtype=torch.float)

# Build the train / validation / test splits.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
# random.seed() returns None, so seed the global RNG first and then pass its
# state explicitly to make the split reproducible. Default split_ratio is 0.7.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

# The vocabulary is built from the training split only, capped at 25000 tokens,
# with GloVe 100-d vectors attached; tokens without a pretrained vector are
# initialised by torch.Tensor.normal_.
TEXT.build_vocab(train_data,
                 max_size=25000,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

# One batch iterator per split, yielding batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
                                        (train_data, valid_data, test_data),
                                        batch_size=BATCH_SIZE,
                                        device=device)

In [None]:
# Model hyper-parameters.
INPUT_SIZE = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_SIZE = 100           # must match the width of the pretrained vectors copied below
NUM_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_SIZE = 1                # a single logit for binary classification
DROPOUT = 0.5
pretrained_embeddings = TEXT.vocab.vectors

# Vocabulary indices of the special padding / unknown tokens.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Training hyper-parameters.
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
N_EPOCHS = 10

model = CNN(INPUT_SIZE,
            EMBEDDING_SIZE,
            NUM_FILTERS,
            FILTER_SIZES,
            OUTPUT_SIZE,
            DROPOUT,
            PAD_IDX)

# Initialise the embedding layer (attribute `embed` on CNN) with the pretrained
# vectors, then zero the rows of the unknown and padding tokens.
model.embed.weight.data.copy_(pretrained_embeddings)
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
model = model.to(device)

optimizer = optim.Adam(model.parameters(),lr = LEARNING_RATE)  # optimizer
# Binary classification loss that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Train for N_EPOCHS, checkpointing the weights whenever the validation loss improves.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    (train_loss, train_acc), (valid_loss, valid_acc) = (
        train(model, train_iterator, optimizer, criterion),
        evaluate(model, valid_iterator, criterion),
    )
    if best_valid_loss > valid_loss:
        # New best validation loss: remember it and save the weights.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'CNN-model.pt')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Reload the best checkpoint and report its metrics on the test split.
model.load_state_dict(torch.load('CNN-model.pt'))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
import spacy  # tokenization library, comparable to NLTK
# spaCy 3 removed shortcut names such as 'en'; load the installed pipeline by
# its full package name instead.
nlp = spacy.load('en_core_web_sm')
def predict_sentiment(sentence, min_len=None):
    """Score a single sentence with the trained model.

    Args:
        sentence: raw text to classify.
        min_len: minimum number of tokens fed to the model; shorter inputs are
            padded with the vocabulary's pad token. Defaults to the largest
            convolution kernel height of ``model``, because the convolutions
            cannot run on a sequence shorter than their kernel.

    Returns:
        The sigmoid of the model's output logit, as a Python float between 0 and 1.
    """
    if min_len is None:
        min_len = max(conv.kernel_size[0] for conv in model.convs)

    # Tokenize, then pad short inputs so every convolution has enough tokens.
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokens) < min_len:
        tokens += [TEXT.pad_token] * (min_len - len(tokens))

    # Map tokens to vocabulary indices and shape them as [seq_len, batch_size=1].
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)

    # Run in evaluation mode (dropout disabled) without tracking gradients,
    # then restore whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prediction = torch.sigmoid(model(tensor))
    finally:
        model.train(was_training)
    return prediction.item()

# Try the predictor on two short example sentences. The return values are not
# stored or printed here.
predict_sentiment("I love This film bad ")
predict_sentiment("This film is great")