In [None]:
# Reference: https://github.com/AnthonyK97/Text-Classification-on-IMDB
# https://github.com/AnthonyK97/Text-Classification-on-IMDB/blob/main/2%20CNN%2BGlove.ipynb

import os
import sys
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re, string
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Only affects interpreters started after this point; the hash seed of the
    # already-running process cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run, so turn it off
    # as well when reproducibility is wanted.
    torch.backends.cudnn.benchmark = False
    
# Create the checkpoint directory. exist_ok=True keeps the cell re-runnable
# (a plain `mkdir` errors once the directory already exists) and avoids
# depending on a shell.
os.makedirs('./model_bakup/', exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class CFG:
    """Run configuration shared by the data loaders and the training loop."""

    # --- reproducibility / output ---
    seed = 2032
    model_output_dir = './model_bakup/'

    # --- optimisation ---
    batch_size = 20
    lr = 0.02

    # --- evaluation schedule ---
    mid_eval = False        # evaluate on the validation set during an epoch
    eval_step_num = 50      # step interval between mid-epoch evaluations
    best_eval_acc = 0.0

    # --- optional training tricks ---
    use_ema = False
    use_adversial_training = False
    
# When True, the split cell below subsamples the train/valid/test frames so
# the notebook runs quickly.
DEBUG_RUN = True

# Wall-clock start of the whole notebook; read by the final cell to report
# total run time.
global_start_t = time.time()
print('ok')

In [None]:
# Fix the RNGs first so the two sample() calls below are repeatable
# (no random_state is passed to them).
seed_everything(seed=42)

imdb_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
# Encode the text label as 1 (positive) / 0 (negative).
# NOTE(review): any other label value would silently become NaN here.
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)
imdb_data = imdb_data.drop_duplicates()
print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)
# Work on a 30000-row random subset of the de-duplicated reviews.
imdb_data = imdb_data.sample(30000)
print('after sample, imdb_data.shape: ', imdb_data.shape)
# Shuffle again and reset to a clean 0..n-1 index; later cells split the
# frame by position.
imdb_data = imdb_data.sample(len(imdb_data)).reset_index(drop=True)  # shuffle

imdb_data.head(5)

In [None]:
MAX_WORDS = 10000   # vocabulary size: only the most frequent words are kept (ids 0 and 1 are reserved for <unknown>/<padding>)
MAX_LEN = 200       # every review is padded/truncated to this many tokens
word_count_dict = {}  # word -> number of occurrences over all reviews

def clean_text(text):
    """Normalise a raw review into lowercase, punctuation-free, single-spaced text.

    Steps: lowercase, turn HTML line breaks into spaces, drop every character
    in ``string.punctuation``, then collapse all whitespace runs (spaces, tabs,
    newlines) into one space and trim the ends.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Words are separated by exactly one space, so
        ``clean_text(t).split(' ')`` contains no empty tokens for non-empty
        cleaned text.
    """
    lowered = text.lower()
    # Reviews carry line breaks as literal <br /> tags; treat them as spaces.
    without_breaks = re.sub(r'<br\s*/?>', ' ', lowered)
    without_punctuation = re.sub('[%s]' % re.escape(string.punctuation), '', without_breaks)
    # Without this step, "<br /><br />" and removed punctuation leave double
    # spaces behind, and split(' ') then produces '' tokens that pollute the
    # vocabulary.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Count how often each word occurs across all cleaned reviews.
for review in imdb_data['review'].values:
    cleaned_text = clean_text(review)
    for word in cleaned_text.split(' '):
        word_count_dict[word] = word_count_dict.get(word, 0) + 1

# One row per word, most frequent first.
df_word_dict = pd.DataFrame(pd.Series(word_count_dict, name='count'))
df_word_dict = df_word_dict.sort_values(by='count', ascending=False)

# Keep at most MAX_WORDS-2 words (two ids are reserved below). The .copy()
# makes this an independent frame, so adding a column does not write into a
# slice of the sorted frame.
df_word_dict = df_word_dict[:MAX_WORDS-2].copy()
# Size the id range from the actual row count so a corpus with fewer than
# MAX_WORDS-2 distinct words does not raise a length-mismatch error.
df_word_dict['word_id'] = range(2, 2 + len(df_word_dict))

# word -> id lookup; ids 0 and 1 are the special tokens.
word_id_dict = df_word_dict['word_id'].to_dict()
word_id_dict['<unknown>'] = 0
word_id_dict['<padding>'] = 1

df_word_dict.head(15)

In [None]:
def pad(data_list, pad_length, pad_value=1):
    """Return a new list of exactly ``pad_length`` items built from ``data_list``.

    Longer inputs keep only their last ``pad_length`` items; shorter inputs
    are left-padded with ``pad_value``. The input is never modified.

    Args:
        data_list: Sequence of token ids.
        pad_length: Target length, must be >= 0.
        pad_value: Id used for padding (defaults to 1, the ``<padding>`` id).

    Returns:
        A new list with ``pad_length`` elements.

    Raises:
        ValueError: If ``pad_length`` is negative.
    """
    if pad_length < 0:
        raise ValueError(f'pad_length must be non-negative, got {pad_length}')

    overflow = len(data_list) - pad_length
    if overflow > 0:
        # Slice from the front offset rather than with a negative index:
        # data_list[-0:] would return the whole list when pad_length == 0.
        return list(data_list[overflow:])

    return [pad_value] * (-overflow) + list(data_list)

def text_to_token(text):
    """Convert a raw review into a space-separated string of MAX_LEN token ids.

    The text is cleaned, split on single spaces, mapped through
    ``word_id_dict`` (words not in the dictionary become id 0) and then
    padded/truncated with ``pad``.
    """
    words = clean_text(text).split(' ')
    ids = pad([word_id_dict.get(w, 0) for w in words], MAX_LEN)
    return ' '.join(map(str, ids))
            
# Tokenise every review once and store the result as a string column;
# the elapsed time is printed for reference.
process_start_t = time.time()
print('start processing...')
imdb_data['review_tokens'] = imdb_data['review'].map(text_to_token)
print('ok, cost time: ', time.time()-process_start_t)
imdb_data.head(5)

In [None]:
# Show the raw text of the first review as a sanity check.
print(imdb_data['review'].values[0])

In [None]:
TRAIN_NUM = 15000
# Positional, non-overlapping slices of the shuffled frame:
# rows [0, 5000) -> test, [5000, 10000) -> valid, the next TRAIN_NUM -> train.
imdb_data_test = imdb_data.iloc[:5000]
imdb_data_valid = imdb_data.iloc[5000:10000]
imdb_data_train = imdb_data.iloc[10000:TRAIN_NUM+10000]

if DEBUG_RUN:
    SAMPLE_NUM = 3000
    # Cap each request at the split size: DataFrame.sample raises ValueError
    # when asked for more rows than exist (e.g. after lowering TRAIN_NUM).
    imdb_data_test = imdb_data_test.sample(min(SAMPLE_NUM, len(imdb_data_test)))
    imdb_data_valid = imdb_data_valid.sample(min(SAMPLE_NUM, len(imdb_data_valid)))
    imdb_data_train = imdb_data_train.sample(min(2*SAMPLE_NUM, len(imdb_data_train)))

print(f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
      f'imdb_data_test.shape: {imdb_data_test.shape}')

In [None]:
# Instantiate the run configuration and re-seed all RNGs with its seed, so
# everything from here on (loader shuffling, weight init) starts from
# cfg.seed.
cfg = CFG()
seed_everything(seed=cfg.seed)

print('ok')

In [None]:
class imdbDataset(Dataset):
    """Dataset yielding ``(feature, label)`` tensors for one review.

    Expects ``data_df`` to have a ``sentiment`` column (numeric label) and a
    ``review_tokens`` column (space-separated token ids).

    Labels and token ids are parsed once at construction time, instead of
    building row Series with ``iloc`` and re-splitting the token string on
    every ``__getitem__`` call in every epoch.
    """

    def __init__(self, data_df):
        self.data_df = data_df
        self._labels = [float(v) for v in data_df['sentiment']]
        self._features = [[int(x) for x in tokens.split(' ')]
                          for tokens in data_df['review_tokens']]

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, index):
        """Return ``(feature, label)``: a long tensor of token ids and a
        float tensor of shape ``(1,)``, both created on the global ``device``.
        """
        # NOTE: tensors are created directly on `device`; this relies on the
        # loaders using num_workers=0.
        label = torch.tensor([self._labels[index]], dtype=torch.float, device=device)
        feature = torch.tensor(self._features[index], dtype=torch.long, device=device)
        return feature, label
    
def generate_data_iter(cfg):
    """Build the train/valid/test DataLoaders from the module-level split frames.

    Args:
        cfg: Configuration object; only ``cfg.batch_size`` is read.

    Returns:
        ``(dl_train, dl_valid, dl_test)``. Only the training loader shuffles.
    """
    ds_train, ds_valid, ds_test = (
        imdbDataset(frame)
        for frame in (imdb_data_train, imdb_data_valid, imdb_data_test)
    )
    print('len of ds_train: ', len(ds_train), 'len of ds_valid: ', len(ds_valid),
          'len of ds_test: ', len(ds_test))

    def _loader(dataset, shuffle):
        # All three loaders share batch size and single-process loading.
        return DataLoader(dataset, batch_size=cfg.batch_size, shuffle=shuffle, num_workers=0)

    return _loader(ds_train, True), _loader(ds_valid, False), _loader(ds_test, False)

# Build the three loaders used by the training and evaluation cells.
dl_train, dl_valid, dl_test = generate_data_iter(cfg)
print('ok')

In [None]:
class model_EMA:
    '''
    Exponential moving average (EMA) of a model's trainable parameters.
    Reference: https://zhuanlan.zhihu.com/p/68748778

    Example
    # initialise
    ema = model_EMA(model, 0.999)
    ema.register()

    # training: after each parameter update
    optimizer.step()
    ema.update()

    # evaluation / checkpointing: temporarily swap in the averaged weights
    ema.apply_shadow()
    ...  # evaluate or save the model here
    ema.restore()
    '''
    def __init__(self, model, decay=0.99):
        self.model = model
        self.decay = decay
        self.registered = False
        self.shadow = {}   # parameter name -> running average
        self.backup = {}   # parameter name -> live weights saved by apply_shadow()

    def is_registered(self):
        """Return True once register() has taken the initial snapshot."""
        return self.registered

    def register(self):
        """Initialise the running averages with the current parameter values."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()
        self.registered = True

    def update(self):
        """Blend the current parameters into the running averages:
        shadow = decay * shadow + (1 - decay) * param.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                # The arithmetic already allocates a fresh tensor, so no
                # extra clone is needed.
                self.shadow[name] = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]

    def apply_shadow(self):
        """Overwrite the live parameters with the averages, saving the live
        values so restore() can bring them back.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                self.backup[name] = param.data.clone()
                # Copy values instead of rebinding param.data to the shadow
                # tensor: with shared storage, any in-place change to the
                # parameter would silently corrupt the running average.
                param.data.copy_(self.shadow[name])

    def restore(self):
        """Put back the live parameters saved by apply_shadow()."""
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.backup
                param.data.copy_(self.backup[name])
        self.backup = {}
        
class FGM():
    '''
    Fast Gradient Method: adversarial perturbation of embedding parameters.

    Example
    # initialise
    fgm = FGM(model,epsilon=1,emb_name='word_embeddings.')
    for batch_input, batch_label in data:
        # normal training step
        loss = model(batch_input, batch_label)
        loss.backward() # backward pass, produces the normal gradients
        # adversarial training
        fgm.attack() # add the adversarial perturbation to the embedding
        #model.zero_grad()  # uncomment to use only the gradients from the perturbed pass instead of accumulating both
        loss_adv = model(batch_input, batch_label)
        loss_adv.backward() # backward pass, accumulates the adversarial gradients on top of the normal ones
        fgm.restore() # restore the embedding parameters
        # gradient descent, update the parameters
        optimizer.step()
        model.zero_grad()
    '''
    def __init__(self, model, emb_name, epsilon=1.0, adv_random=False):
        # emb_name must be (a substring of) the embedding parameter's name in your model
        self.model = model
        self.epsilon = epsilon
        self.emb_name = emb_name
        self.adv_random = adv_random
        self.backup = {}

    def attack(self):
        """Perturb every matching parameter by ``epsilon * grad / ||grad||``.

        The original values are saved so restore() can undo the change.
        Parameters without a gradient, or whose gradient norm is zero or NaN,
        are backed up but left unperturbed. With ``adv_random`` set, epsilon
        is scaled by a random factor in [0.5, 1.5] per parameter.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                grad = param.grad
                if grad is None:
                    # No backward pass has populated this parameter yet;
                    # torch.norm(None) would raise.
                    continue
                norm = torch.norm(grad)
                if norm != 0 and not torch.isnan(norm):
                    epsilon = self.epsilon
                    if self.adv_random:
                        epsilon *= random.uniform(0.5, 1.5)
                    r_at = epsilon * grad / norm
                    param.data.add_(r_at)

    def restore(self):
        """Put back the parameter values saved by the last attack()."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
        
# Progress marker: the helper classes above were defined without error.
print('ok')

In [None]:
# Size of each word-embedding vector used by MLP_Net.
EMBEDDING_DIM = 50

class MLP_Net(nn.Module):
    """Embedding followed by a two-layer MLP that outputs a probability.

    Token ids are embedded, the per-token vectors are flattened into one
    vector per sample, and that vector goes through
    Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        hidden_size: Width of the hidden linear layer.
    """

    def __init__(self, hidden_size=150):
        super().__init__()

        # Id 1 is the padding token.
        self.embedding = nn.Embedding(MAX_WORDS, EMBEDDING_DIM, padding_idx=1)

        flat_dim = EMBEDDING_DIM * MAX_LEN
        # Layer names are kept so saved state_dict keys stay the same.
        self.fc = nn.Sequential(OrderedDict([
            ('fc_1', nn.Linear(flat_dim, hidden_size)),
            ('relu_1', nn.ReLU()),
            ('fc_2', nn.Linear(hidden_size, 1)),
            ('sigmoid_1', nn.Sigmoid()),
        ]))

    def forward(self, x):
        batch_size = x.size(0)
        embedded = self.embedding(x)
        flattened = embedded.view(batch_size, -1)
        return self.fc(flattened)
    
# Build the network, show its layers and move it to the selected device.
model = MLP_Net()
print(model)
model.to(device)     

# Parameter counts: all parameters vs. only those that receive gradients.
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ', 
      model_trainable_param_num)

print('ok')

# Recorded output of a previous run:
# model_param_num:  2000301 model_trainable_param_num:  2000301

In [None]:
def accuracy(y_pred, y_true):
    """Fraction of predictions that match the labels after thresholding at 0.5.

    Both inputs may be any array-like (lists of scalars, lists of 1-element
    arrays, NumPy arrays). They are flattened before comparison, so an
    ``(N, 1)`` prediction array compared with ``(N,)`` labels gives a
    per-sample match rather than a silently broadcast ``N x N`` matrix.

    Args:
        y_pred: Predicted probabilities.
        y_true: Ground-truth labels (0/1).

    Returns:
        Accuracy as a float in [0, 1]; 0.0 when there are no samples.

    Raises:
        ValueError: If the two inputs hold a different number of elements.
    """
    y_pred = np.asarray(y_pred).reshape(-1)
    y_true = np.asarray(y_true).reshape(-1)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f'y_pred and y_true must have the same number of elements, '
            f'got {y_pred.size} and {y_true.size}'
        )
    if y_pred.size == 0:
        # Avoid the NaN (and RuntimeWarning) of taking the mean of nothing.
        return 0.0
    return float(((y_pred > 0.5) == y_true).mean())

def evaluate(model, dl_test, device):
    """Compute the accuracy of ``model`` over every batch of ``dl_test``.

    The model is put in eval mode and run without gradient tracking. On exit
    it is returned to whichever mode it was in when the function was called,
    rather than being forced into train mode.

    Args:
        model: Network producing one probability per sample.
        dl_test: Iterable of ``(feature, label)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy over all samples, as computed by ``accuracy``.
    """
    was_training = model.training
    model.eval()

    y_true_lst, y_pred_lst = [], []
    try:
        with torch.no_grad():
            for feature, label in dl_test:
                feature, label = feature.to(device), label.to(device)
                y_pred = model(feature)
                y_pred_lst += list(y_pred.detach().cpu().numpy())
                y_true_lst += list(label.detach().cpu().numpy())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    return accuracy(y_pred_lst, y_true_lst)
    
def train(model, dl_train, optimizer, loss_func, device):
    """Train ``model`` for one pass over ``dl_train``.

    Reads and updates the module-level ``global_step_num`` and
    ``global_best_valid_acc``. The optional tricks selected in ``cfg`` are
    applied here:

    * ``cfg.use_adversial_training`` with a non-None global ``fgm``: after the
      normal backward pass the embedding is perturbed, a second backward pass
      accumulates the adversarial gradients, and the embedding is restored
      before the optimizer step.
    * ``cfg.use_ema`` with a registered global ``model_ema``: the averages are
      updated after every optimizer step, and mid-epoch evaluation and
      checkpointing use the averaged weights.

    With both flags off (the defaults) this is plain training.

    Args:
        model: Network to train.
        dl_train: Iterable of ``(feature, label)`` batches.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_func: Callable ``loss_func(y_pred, label)`` returning a scalar loss.
        device: Device the batches are moved to.

    Returns:
        Training accuracy over the epoch, from the predictions made before
        each parameter update.
    """
    global cfg, global_step_num, global_best_valid_acc, dl_valid,  model_ema, fgm
    model.train()  # put the model in training mode

    # The cfg flag is tested first so the helper globals are only read when
    # the corresponding feature is actually switched on.
    use_fgm = cfg.use_adversial_training and fgm is not None
    use_ema = cfg.use_ema and model_ema is not None and model_ema.is_registered()

    y_true_lst, y_pred_lst = [], []
    for step, batch in enumerate(dl_train):
        global_step_num += 1
        feature, label = batch
        feature, label = feature.to(device), label.to(device)
        y_pred = model(feature)
        train_loss = loss_func(y_pred, label)
        y_pred_lst += list(y_pred.detach().cpu().numpy())
        y_true_lst += list(label.detach().cpu().numpy())
        train_loss.backward()

        if use_fgm:
            # Second forward/backward on the perturbed embedding; its
            # gradients add to the ones already computed.
            fgm.attack()
            loss_func(model(feature), label).backward()
            fgm.restore()

        optimizer.step()
        model.zero_grad()

        if use_ema:
            model_ema.update()

        if cfg.mid_eval and (global_step_num % cfg.eval_step_num == 0):
            if use_ema:
                model_ema.apply_shadow()
            try:
                valid_acc = evaluate(model, dl_valid, device)
                print(f'step_num: {global_step_num}, valid_acc: {valid_acc:.5f}')
                if valid_acc > global_best_valid_acc:
                    global_best_valid_acc = valid_acc
                    print(f'step_num: {global_step_num}, get new best val_acc: {valid_acc:.5f}, save the model now!')                
                    torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model.pth'))
            finally:
                # Always put the live weights back before training continues.
                if use_ema:
                    model_ema.restore()

    acc = accuracy(y_pred_lst, y_true_lst)
    return acc

# Progress marker: the metric/evaluate/train functions were defined.
print('ok')

In [None]:
# Running bests and the step counter; train() reads and updates
# global_step_num and global_best_valid_acc through `global`.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
global_train_acc = 0.0   # train accuracy of the epoch with the best validation accuracy
global_step_num = 0

epochs = 10
# Alternative optimizers, kept for reference:
# optimizer=torch.optim.Adagrad(model.parameters(), lr=0.06)
# optimizer=torch.optim.Adadelta(model.parameters(), lr=10.0)
# NOTE(review): the learning rate is set here directly; cfg.lr is not used.
optimizer=torch.optim.Adam(model.parameters(), lr=0.007, weight_decay=1e-5)
# optimizer=torch.optim.AdamW(model.parameters(), lr=0.007, weight_decay=0.01)
# Binary cross-entropy on probabilities (the model ends in a Sigmoid).
loss_func = nn.BCELoss()

# Optional helpers; they stay None unless enabled in cfg.
model_ema = None
fgm = None
if cfg.use_ema:
    model_ema = model_EMA(model, decay=0.999)
    model_ema.register()
    
if cfg.use_adversial_training:
    # 'embedding' selects the parameters whose name contains that substring.
    fgm = FGM(model, 'embedding', epsilon=1.0, adv_random=True)
    
# Main loop: one training pass per epoch, then validation and test accuracy.
# The checkpoint is overwritten whenever validation accuracy sets a new best.
for epoch in range(epochs):
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')

    global_best_train_acc = max(global_best_train_acc, train_acc)

    # Nothing more to do unless validation accuracy strictly improved.
    if not (valid_acc > global_best_valid_acc):
        continue

    global_best_valid_acc = valid_acc
    global_train_acc = train_acc
    print(f'at the end of epoch, global_step_num: {global_step_num} get new best_valid_acc: {valid_acc:.5f}, save the model now!')
    best_model_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')
    torch.save(model.state_dict(), best_model_path)

In [None]:
# Rebuild a fresh network and load the best checkpoint written during
# training, then report its accuracy on the test split.
model = MLP_Net()
model.to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads when this cell runs on a CPU-only
# machine (and vice versa).
model.load_state_dict(torch.load(os.path.join(cfg.model_output_dir, 'best_step_model.pth'),
                                 map_location=device))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

print('total finished, cost time: ', time.time() - global_start_t)

# Adam lr=0.007, weight_decay=1e-5
# final test_acc: 0.74667, best_val_acc: 0.74333, train_acc: 0.97450, best_train_acc: 0.97683
# total finished, cost time:  111.0548083782196