### 目标：情感分类
数据集 Sentiment140, Twitter上的内容 
包含160万条记录，0 ： 负面， 2 ： 中性， 4 ： 正面

In [1]:
# 导入数据
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# 读取数据 ， engine默认是'c'
dataset = pd.read_csv("training.1600000.processed.noemoticon.csv", engine='python', header=None)

In [2]:
dataset.shape

(1600000, 6)

In [3]:
dataset.info() # 数据表信息

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   0       1600000 non-null  int64 
 1   1       1600000 non-null  int64 
 2   2       1600000 non-null  object
 3   3       1600000 non-null  object
 4   4       1600000 non-null  object
 5   5       1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [4]:
dataset.describe() # 数据表描述

Unnamed: 0,0,1
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


In [5]:
dataset.columns # 列名

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [6]:
dataset.head() # 默认显示前5行

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
dataset[0].value_counts() # 统计各个类别数据占比

4    800000
0    800000
Name: 0, dtype: int64

In [8]:
dataset['sentiment_category'] = dataset[0].astype('category') # 类型转换 --> 分类变量

In [9]:
dataset['sentiment_category'].value_counts() # 统计各个类别数量

4    800000
0    800000
Name: sentiment_category, dtype: int64

In [10]:
dataset['sentiment'] = dataset['sentiment_category'].cat.codes # 分类变量值转换为 0 和 1 两个类别

In [11]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,sentiment_category,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0,0


In [12]:
dataset['sentiment'].value_counts() # 统计类别占比

1    800000
0    800000
Name: sentiment, dtype: int64

In [13]:
dataset.to_csv('training-processed.csv', header=None, index=None) # 保存文件

In [14]:
# 随机选择10000个样本当作测试集
dataset.sample(10000).to_csv("test_sample.csv", header=None, index=None)

In [15]:
# 设置标签和文本
from torchtext import data

LABEL = data.LabelField() # 标签
TWEET = data.Field(lower=True) # 内容/文本

In [16]:
# 设置表头
fields = [('score', None), ('id',None), ('date',None), ('query',None),
          ('name',None),('tweet',TWEET), ('category',None), ('label',LABEL)]

# 读取数据
twitterDataset = data.TabularDataset(
    path = 'training-processed.csv',
    format = 'CSV',
    fields = fields,
    skip_header = False
)

# 分离 train, test, val
train, test, val = twitterDataset.split(split_ratio=[0.8, 0.1, 0.1], stratified=True, strata_field='label')

In [17]:
len(train)

1280000

In [18]:
len(test)

160000

In [19]:
len(val)

160000

In [20]:
# 显示一个样本
vars(train.examples[11])

{'tweet': ['@monica2112',
  'oh',
  "don't",
  'worry,',
  'i',
  "don't",
  'mind',
  'if',
  'you',
  'are.',
  "i'm",
  'just',
  'happy',
  'u',
  'want',
  'to',
  'meet',
  'me!'],
 'label': '1'}

In [21]:
# 构建词汇表
vocab_size = 20000
TWEET.build_vocab(train, max_size=vocab_size)
LABEL.build_vocab(train)

In [22]:
# 词汇表大小
len(TWEET.vocab) # unk --> 未知单词，pad --> 填充

20002

In [23]:
# 词汇表中最常见的单词
TWEET.vocab.freqs.most_common(10)

[('i', 597446),
 ('to', 447324),
 ('the', 415058),
 ('a', 300964),
 ('my', 250409),
 ('and', 236538),
 ('you', 190004),
 ('is', 184795),
 ('for', 171218),
 ('in', 167840)]

In [24]:
TWEET.vocab.itos[:10] # 索引 --> 单词

['<unk>', '<pad>', 'i', 'to', 'the', 'a', 'my', 'and', 'you', 'is']

In [25]:
TWEET.vocab.stoi # 单词 -->  索引

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f1cbe4a2520>>,
            {'<unk>': 0,
             '<pad>': 1,
             'i': 2,
             'to': 3,
             'the': 4,
             'a': 5,
             'my': 6,
             'and': 7,
             'you': 8,
             'is': 9,
             'for': 10,
             'in': 11,
             'it': 12,
             'of': 13,
             'on': 14,
             'so': 15,
             'have': 16,
             'me': 17,
             'that': 18,
             "i'm": 19,
             'just': 20,
             'but': 21,
             'with': 22,
             'be': 23,
             'at': 24,
             'was': 25,
             'not': 26,
             'this': 27,
             'get': 28,
             'good': 29,
             'are': 30,
             'like': 31,
             'all': 32,
             'up': 33,
             'out': 34,
             '-': 35,
             'go': 36,
             'your': 37,


In [26]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [27]:
# 文本批处理，即一批一批地读取数据
train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), 
                                                             batch_size=32, 
                                                             device = device,
                                                             sort_within_batch = True,
                                                             sort_key = lambda x : len(x.tweet))
"""
sort_within_batch = True，一个batch内的数据就会按sort_key的排列规则降序排列，
sort_key是排列的规则，这里使用tweet的长度，即每条用户评论所包含的单词数量。
"""

'\nsort_within_batch = True，一个batch内的数据就会按sort_key的排列规则降序排列，\nsort_key是排列的规则，这里使用tweet的长度，即每条用户评论所包含的单词数量。\n'

### 模型构建

In [28]:
import torch.nn as nn

class simple_LSTM(nn.Module):
    def __init__(self, hidden_size, embedding_dim, vocab_size):
        super(simple_LSTM, self).__init__() # 调用父类的构造方法
        self.embedding = nn.Embedding(vocab_size, embedding_dim) # vocab_size词汇表大小， embedding_dim词嵌入维度
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, 2) # 全连接层
        
    def forward(self, seq):
        output, (hidden, cell) = self.encoder(self.embedding(seq))
        # output :  torch.Size([24, 32, 100])
        # hidden :  torch.Size([1, 32, 100])
        # cell :  torch.Size([1, 32, 100])
        preds = self.predictor(hidden.squeeze(0))
        return preds

In [29]:
lstm_model = simple_LSTM(hidden_size=100, embedding_dim=300, vocab_size=20002)

lstm_model.to(device)

simple_LSTM(
  (embedding): Embedding(20002, 300)
  (encoder): LSTM(300, 100)
  (predictor): Linear(in_features=100, out_features=2, bias=True)
)

### 模型训练

In [30]:
from torch import optim

# 优化器
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# 损失函数
criterion = nn.CrossEntropyLoss() # 多分类 （负面、正面、中性）

In [31]:
def train_val_test(model, optimizer, criterion, train_iter, val_iter, test_iter, epochs):
    for epoch in range(1, epochs+1):
        train_loss = 0.0 # 训练损失
        val_loss = 0.0 # 验证损失
        model.train() # 声明开始训练
        for indices, batch in enumerate(train_iter):
            optimizer.zero_grad() # 梯度置0
            outputs = model(batch.tweet) # 预测后输出 outputs shape :  torch.Size([32, 2])
            # batch.label shape :  torch.Size([32])
            loss = criterion(outputs, batch.label) # 计算损失
            loss.backward() # 反向传播
            optimizer.step() # 更新参数
            # batch.tweet shape :  torch.Size([26, 32]) --> 26:序列长度， 32:一个batch_size的大小
            train_loss += loss.data.item() * batch.tweet.size(0) # 累计每一批的损失值
        train_loss /= len(train_iter) # 计算平均损失 len(train_iter) :  40000
        print("Epoch : {}, Train Loss : {:.2f}".format(epoch, train_loss))
        
        model.eval() # 声明模型验证
        for indices, batch in enumerate(val_iter):
            context = batch.tweet.to(device) # 部署到device上
            target = batch.label.to(device)
            pred = model(context) # 模型预测
            loss = criterion(pred, target) # 计算损失 len(val_iter) :  5000
            val_loss += loss.item() * context.size(0) # 累计每一批的损失值
        val_loss /= len(val_iter) # 计算平均损失 
        print("Epoch : {}, Val Loss : {:.2f}".format(epoch, val_loss))
        
        model.eval() # 声明
        correct = 0.0 # 计算正确率
        test_loss = 0.0 # 测试损失
        with torch.no_grad(): # 不进行梯度计算
            for idx, batch in enumerate(test_iter):
                context = batch.tweet.to(device) # 部署到device上
                target = batch.label.to(device)
                outputs = model(context) # 输出
                loss = criterion(outputs, target) # 计算损失
                test_loss += loss.item() * context.size(0) # 累计每一批的损失值
                # 获取最大预测值索引
                preds = outputs.argmax(1)
                # 累计正确数
                correct += preds.eq(target.view_as(preds)).sum().item()
            test_loss /= len(test_iter) # 平均损失 len(test_iter) :  5000
            print("Epoch : {}, Test Loss : {:.2f}".format(epoch, test_loss))
            print("Accuracy : {}".format(100 * correct / (len(test_iter) * batch.tweet.size(1))))

In [32]:
# 开始训练和验证
train_val_test(lstm_model,  optimizer, criterion, train_iter, val_iter, test_iter, epochs=5)

Epoch : 1, Train Loss : 5.95
Epoch : 1, Val Loss : 5.58
Epoch : 1, Test Loss : 5.57
Accuracy : 81.628125
Epoch : 2, Train Loss : 5.36
Epoch : 2, Val Loss : 5.47
Epoch : 2, Test Loss : 5.48
Accuracy : 82.045
Epoch : 3, Train Loss : 5.11
Epoch : 3, Val Loss : 5.47
Epoch : 3, Test Loss : 5.48
Accuracy : 82.185625
Epoch : 4, Train Loss : 4.92
Epoch : 4, Val Loss : 5.51
Epoch : 4, Test Loss : 5.51
Accuracy : 82.220625
Epoch : 5, Train Loss : 4.77
Epoch : 5, Val Loss : 5.51
Epoch : 5, Test Loss : 5.53
Accuracy : 82.275


### 知识点：text 数据增强 data argumentation

1. random insertion 随机插入
2. random deletion 随机删除
3. random swap 随机交换

参考论文： EDA : Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks

4. Back Translation

举例： 英语 -->  中文 --> 英语

In [33]:
# 需要安装 : pip install google_trans_new

from google_trans_new import google_translator

translator = google_translator()

In [34]:
sentence = ['stay hungry, stay foolish. -- spoken / said by Steve Jobs']

In [35]:
# 英 --> 中
translation_cn = translator.translate(sentence, lang_tgt='zh-cn')

In [36]:
translation_cn

"['保持饥饿，保持愚蠢。 -史蒂夫·乔布斯（Steve Jobs）说的话/ "

In [37]:
# 中 --> 英
translation_en = translator.translate(translation_cn, lang_tgt='en')

In [38]:
translation_en

"['stay Hungry Stay Foolish. -What Steve Jobs said/ "

### 随机选择一种语言翻译

In [39]:
import random
import google_trans_new

languages = list(google_trans_new.LANGUAGES.keys())

In [40]:
len(languages) # 可翻译的语言种类 108 种

108

In [41]:
object_lang = random.choice(languages)

object_lang

'hu'

In [42]:
# 正向翻译

translations = translator.translate(sentence, lang_tgt=object_lang)

translations

"['maradj éhes, maradj őrült. - Steve Jobs mondta / mondta "

In [43]:
# 反向翻译

back_trans = translator.translate(translations, lang_tgt='en')

back_trans

"['stay hungry, stay crazy. - Steve Jobs said "