In [None]:
# Reference: https://www.topbots.com/fine-tune-transformers-in-pytorch/

import io
import os
import sys
import random
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import OrderedDict
import re, string
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from transformers import (AutoConfig, 
                          AutoModelForSequenceClassification,
                          AutoTokenizer, AdamW,
                          get_linear_schedule_with_warmup,
                          set_seed,
                         )

def seed_everything(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and configures
    cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # With benchmark mode on, cuDNN may auto-tune to a different convolution
    # algorithm on each run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
# Create the checkpoint directory (the same path as CFG.model_output_dir).
# os.makedirs with exist_ok=True is safe to re-run and does not depend on a shell.
os.makedirs('./model_bakup/', exist_ok=True)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device is', device)

class CFG:
    """Run configuration for the fine-tuning notebook, stored as class attributes."""

    # --- model -------------------------------------------------------------
    # Hugging Face checkpoint passed to AutoConfig / AutoTokenizer / AutoModel.
    model_name_or_path = 'bert-base-cased'
    #model_name_or_path = 'bert-large-cased'
    # Directory where the best checkpoint is written.
    model_output_dir = './model_bakup/'

    # --- reproducibility ---------------------------------------------------
    seed = 2032

    # --- optimisation ------------------------------------------------------
    # Training batch size; the validation/test loaders use twice this value.
    batch_size = 8
    # NOTE(review): not read by the visible training cell, which hard-codes
    # lr=2e-5 when it builds the optimizer.
    lr = 0.01
    # Enables the linear warmup/decay schedule in the training cell.
    use_lr_scheduler = True

    # --- evaluation --------------------------------------------------------
    # When mid_eval is True, train() validates every eval_step_num steps.
    mid_eval = False
    eval_step_num = 20
    # NOTE(review): not referenced in the visible cells; the training code
    # tracks the best score in the global `global_best_val_acc` instead.
    best_eval_acc = 0.0

    # --- feature flags -----------------------------------------------------
    # NOTE(review): neither flag is read anywhere in the visible notebook.
    use_ema = False
    use_adversial_training = True
    
# When True, the split cell subsamples the validation and test frames to a
# few hundred rows so the notebook runs quickly end to end.
DEBUG_RUN = False

# Wall-clock start; the last cell subtracts it from time.time() to report the
# total run time.
global_start_t = time.time()
print('ok')

In [None]:
# Seed first: the .sample() calls below pass no random_state, so the rows they
# pick depend on the global RNG state set here.
seed_everything(seed=42)

# Load the reviews and encode the sentiment column as integers.
sentiment_to_label = {'positive': 1, 'negative': 0}
imdb_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
imdb_data['sentiment'] = imdb_data['sentiment'].map(sentiment_to_label)
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Drop fully duplicated rows.
imdb_data = imdb_data.drop_duplicates()
print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)

# Keep a random subset of 30000 reviews.
imdb_data = imdb_data.sample(n=30000)
print('after sample, imdb_data.shape: ', imdb_data.shape)

# Shuffle once more and renumber the rows 0..N-1 so the positional split in
# the next cell lines up with the index.
row_count = len(imdb_data)
imdb_data = imdb_data.sample(n=row_count).reset_index(drop=True)

imdb_data.head(5)

In [None]:
# Number of training reviews (15000 is the larger alternative setting).
TRAIN_NUM = 2500

# Positional split of the shuffled frame:
#   rows [0, 5000)                      -> test
#   rows [5000, 10000)                  -> validation
#   rows [10000, 10000 + TRAIN_NUM)     -> train
test_end, valid_end = 5000, 10000
train_end = TRAIN_NUM + valid_end
imdb_data_test = imdb_data.iloc[:test_end]
imdb_data_valid = imdb_data.iloc[test_end:valid_end]
imdb_data_train = imdb_data.iloc[valid_end:train_end]

if DEBUG_RUN:
    # Smoke-test mode: shrink only the evaluation frames; the training frame
    # is left at its full TRAIN_NUM rows.
    SAMPLE_NUM = 300
    imdb_data_test = imdb_data_test.sample(SAMPLE_NUM)
    imdb_data_valid = imdb_data_valid.sample(SAMPLE_NUM)

split_summary = (f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
                 f'imdb_data_test.shape: {imdb_data_test.shape}')
print(split_summary)

imdb_data_test.head(5)

In [None]:
# Display the last five rows of the test frame.
imdb_data_test.tail(5)

In [None]:
# Instantiate the run configuration and re-seed all RNGs with its seed, so
# everything from here on (model init, batch shuffling) uses cfg.seed rather
# than the seed used for data sampling above.
cfg = CFG()
seed_everything(seed=cfg.seed)

print('ok')

In [None]:
class MovieReviewsDataset(Dataset):
    """Torch dataset that tokenizes a whole review frame up front.

    Every text in ``data_df['review']`` is tokenized in one call when the
    dataset is constructed, and the labels from ``data_df['sentiment']`` are
    stored alongside the tokenizer output under the ``'labels'`` key.

    Args:
        data_df: Frame with a ``review`` (text) and a ``sentiment`` (label) column.
        use_tokenizer: Callable tokenizer returning a mapping that contains at
            least ``'input_ids'`` (tensor) when called with
            ``return_tensors='pt'``.
        max_sequence_len: Truncation length. When ``None`` the tokenizer's own
            limit is used.

    Attributes:
        n_examples: Number of rows in ``data_df``.
        sequence_len: Size of the last dimension of the tokenized ``input_ids``.
        inputs: Tokenizer output plus the ``'labels'`` tensor.
    """

    def __init__(self, data_df, use_tokenizer, max_sequence_len=None):
        self.data_df = data_df
        if max_sequence_len is None:
            # Current Transformers tokenizers expose the limit as
            # `model_max_length`; `max_len` is the legacy attribute name and
            # is kept only as a fallback for old tokenizer objects.
            max_sequence_len = getattr(use_tokenizer, 'model_max_length', None)
            if max_sequence_len is None:
                max_sequence_len = use_tokenizer.max_len
        texts = list(data_df['review'].values)
        labels = list(data_df['sentiment'].values)
        self.n_examples = len(labels)
        self.inputs = use_tokenizer(texts, add_special_tokens=True, truncation=True, padding=True,
                                    return_tensors='pt', max_length=max_sequence_len)
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        self.inputs.update({'labels': torch.tensor(labels)})

    def __len__(self):
        """Return the number of examples."""
        return self.n_examples

    def __getitem__(self, item):
        """Return a dict holding the ``item``-th row of every stored tensor."""
        return {key: self.inputs[key][item] for key in self.inputs.keys()}
    
def train(dataloader, optimizer, scheduler, device):
    """Train the global ``model`` for one pass over ``dataloader``.

    Relies on the notebook globals ``model``, ``cfg``, ``global_step_num`` and
    ``global_best_val_acc`` (the last two are updated in place), and on the
    module-level ``dataloader_valid`` when ``cfg.mid_eval`` is enabled.

    Args:
        dataloader: Iterable of batches; each batch is a dict of tensors that
            includes a ``'labels'`` entry.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch; may be ``None``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(true_labels, predicted_labels, mean_loss)`` where the two label
        lists cover every example seen in this pass and ``mean_loss`` is the
        summed batch loss divided by ``len(dataloader)``.
    """
    global model, global_step_num, global_best_val_acc, cfg
    epoch_targets, epoch_preds = [], []
    running_loss = 0

    model.train()
    for batch in dataloader:
        global_step_num += 1

        # Record the gold labels from the incoming batch before it is moved
        # to the target device.
        epoch_targets.extend(batch['labels'].numpy().flatten().tolist())
        batch = {name: tensor.type(torch.long).to(device) for name, tensor in batch.items()}

        # Forward / backward / update, with gradient-norm clipping at 1.0.
        model.zero_grad()
        outputs = model(**batch)
        loss, logits = outputs[:2]
        running_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        batch_preds = logits.detach().cpu().numpy().argmax(axis=-1)
        epoch_preds.extend(batch_preds.flatten().tolist())

        # Optional mid-epoch validation every cfg.eval_step_num steps.
        due_for_eval = cfg.mid_eval and (global_step_num % cfg.eval_step_num == 0)
        if not due_for_eval:
            continue

        valid_labels, valid_predict, val_loss = validation(dataloader_valid, device)
        val_acc = accuracy_score(valid_labels, valid_predict)
        print(f'step_num: {global_step_num}, val_acc: {val_acc:.5f}')
        if val_acc > global_best_val_acc:
            # New best validation accuracy: overwrite the stored checkpoint.
            global_best_val_acc = val_acc
            print(f'step_num: {global_step_num}, get new best val_acc: {val_acc:.5f}, save the model now!')
            torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model.pth'))

    return epoch_targets, epoch_preds, running_loss / len(dataloader)

def validation(dataloader, device):
    """Evaluate the global ``model`` on ``dataloader`` without gradient tracking.

    The model is switched to eval mode for the pass and put back into
    training mode before returning.

    Args:
        dataloader: Iterable of batches; each batch is a dict of tensors that
            includes a ``'labels'`` entry.
        device: Device the batch tensors are moved to.

    Returns:
        ``(true_labels, predicted_labels, mean_loss)`` where ``mean_loss`` is
        the summed batch loss divided by ``len(dataloader)``.
    """
    global model
    gold_labels, predicted_labels = [], []
    loss_sum = 0

    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            gold_labels.extend(batch['labels'].numpy().flatten().tolist())
            batch = {name: tensor.type(torch.long).to(device) for name, tensor in batch.items()}

            outputs = model(**batch)
            loss, logits = outputs[:2]
            loss_sum += loss.item()

            batch_preds = logits.detach().cpu().numpy().argmax(axis=-1)
            predicted_labels.extend(batch_preds.flatten().tolist())

    # Put the model back into training mode for the caller.
    model.train()
    return gold_labels, predicted_labels, loss_sum / len(dataloader)

print('ok')

In [None]:
print('Loading configuration...')
# Truncation length passed to the tokenizer for every split.
max_length = 500

# Config, tokenizer and classifier all come from the same checkpoint; the
# classification head is sized for two labels.
pretrained_name = cfg.model_name_or_path
model_config = AutoConfig.from_pretrained(pretrained_model_name_or_path=pretrained_name,
                                          num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_name)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=pretrained_name,
                                                           config=model_config)
model.to(device)

# Parameter counts: all parameters vs. those with requires_grad set.
model_param_num = sum(param.numel() for param in model.parameters())
model_trainable_param_num = sum(param.numel() for param in model.parameters() if param.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ',
      model_trainable_param_num)

# Training split: shuffled, cfg.batch_size examples per batch.
dataset_train = MovieReviewsDataset(imdb_data_train, use_tokenizer=tokenizer, max_sequence_len=max_length)
dataloader_train = DataLoader(dataset_train, batch_size=cfg.batch_size, shuffle=True)

# Validation and test splits: fixed order, twice the training batch size.
dataset_valid = MovieReviewsDataset(imdb_data_valid, use_tokenizer=tokenizer, max_sequence_len=max_length)
dataloader_valid = DataLoader(dataset_valid, batch_size=2*cfg.batch_size, shuffle=False)

dataset_test = MovieReviewsDataset(imdb_data_test, use_tokenizer=tokenizer, max_sequence_len=max_length)
dataloader_test = DataLoader(dataset_test, batch_size=2*cfg.batch_size, shuffle=False)

print('len of dataloader_train: ', len(dataloader_train),
      'len of dataloader_valid: ', len(dataloader_valid),
      'len of dataloader_test: ', len(dataloader_test))

print('ok')

# bert-base-cased model 
# model_param_num:  108311810 model_trainable_param_num:  108311810
# len of dataloader_train:  188 len of dataloader_valid:  47 len of dataloader_test:  47

# bert-large-cased model
# model_param_num:  333581314 model_trainable_param_num:  333581314
# len of dataloader_train:  188 len of dataloader_valid:  47 len of dataloader_test:  47

In [None]:
epochs = 3
# NOTE(review): the learning rate is fixed at 2e-5 here; cfg.lr is not consulted.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=1e-5)
# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(dataloader_train) * epochs
lr_scheduler = None
if cfg.use_lr_scheduler:
    # Warm up over the first 10% of the steps.
    lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                        num_warmup_steps=int(0.1 * total_steps),
                        num_training_steps=total_steps)

# Shared state read and updated by train() through `global`.
global_step_num = 0
global_best_val_acc = 0.0
# Per-epoch history, one entry appended per epoch below.
all_loss = {'train_loss': [], 'val_loss': []}
all_acc = {'train_acc': [], 'val_acc': []}

for epoch in range(epochs):
    train_labels, train_predict, train_loss = train(dataloader_train, optimizer, lr_scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)
    print(f'epoch: {epoch}, train_acc: {train_acc:.5f}')

    valid_labels, valid_predict, val_loss = validation(dataloader_valid, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    # Test accuracy is only reported; checkpoint selection uses val_acc.
    test_labels, test_predict, test_loss = validation(dataloader_test, device)
    test_acc = accuracy_score(test_labels, test_predict)

    # Record this epoch's metrics (the history dicts were previously created
    # but never filled).
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

    print(f'epoch: {epoch}, train_acc: {train_acc:.5f}, val_acc: {val_acc:.5f}, test_acc: {test_acc:.5f}')
    if val_acc > global_best_val_acc:
        # New best validation accuracy: overwrite the stored checkpoint.
        global_best_val_acc = val_acc
        print(f'at the end of epoch, step_num: {global_step_num}, get new best val_acc: {val_acc:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model.pth'))

print('ok')

# epoch: 0, train_acc: 0.46000
# epoch: 1, train_acc: 0.46000
# epoch: 2, train_acc: 0.64000

###########################################################

# epoch: 0, train_acc: 0.46000
# epoch: 0, train_acc: 0.46000, val_acc: 0.53140, test_acc: 0.51260
# at the end of epoch, step_num: 7, get new best val_acc: 0.53140, save the model now!
# epoch: 1, train_acc: 0.60000
# epoch: 1, train_acc: 0.60000, val_acc: 0.54500, test_acc: 0.53620
# at the end of epoch, step_num: 14, get new best val_acc: 0.54500, save the model now!
# epoch: 2, train_acc: 0.74000
# epoch: 2, train_acc: 0.74000, val_acc: 0.53600, test_acc: 0.53240

###########################################################

In [None]:
del model  # drop the trained model to reduce GPU memory use
# Rebuild the classifier and restore the best checkpoint written during training.
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=cfg.model_name_or_path,
                                                           config=model_config)
model.to(device)
# map_location places the stored tensors on the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
best_checkpoint_path = os.path.join(cfg.model_output_dir, 'best_step_model.pth')
model.load_state_dict(torch.load(best_checkpoint_path, map_location=device))

# Score the restored checkpoint on the test and validation splits.
test_labels, test_predict, test_loss = validation(dataloader_test, device)
test_acc = accuracy_score(test_labels, test_predict)
valid_labels, valid_predict, valid_loss = validation(dataloader_valid, device)
valid_acc = accuracy_score(valid_labels, valid_predict)
print(f'TRAIN_NUM: {TRAIN_NUM} final test_acc: {test_acc:.5f}, valid_acc: {valid_acc:.5f}, global_best_val_acc: {global_best_val_acc:.5f}')

print('total finished, cost time: ', time.time() - global_start_t)

# final test_acc: 0.52200, valid_acc: 0.51940

#######################################################
# TRAIN_NUM: 50 final test_acc: 0.53620, valid_acc: 0.54500, global_best_val_acc: 0.54500
# total finished, cost time:  301.0138831138611

# TRAIN_NUM: 200 final test_acc: 0.70560, valid_acc: 0.70340, global_best_val_acc: 0.70340  no_mid_eval
# TRAIN_NUM: 200 final test_acc: 0.71860, valid_acc: 0.71540, global_best_val_acc: 0.71540  mid_eval

# TRAIN_NUM: 500 final test_acc: 0.85720, valid_acc: 0.84540, global_best_val_acc: 0.84540  no_mid_eval