In [None]:
# 参考 https://github.com/AnthonyK97/Text-Classification-on-IMDB
# https://github.com/AnthonyK97/Text-Classification-on-IMDB/blob/main/2%20CNN%2BGlove.ipynb

import os
import sys
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re, string
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN in
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, so it
    # has to be switched off as well for repeatable results.
    torch.backends.cudnn.benchmark = False
    
# Directory that receives the saved checkpoints. os.makedirs(exist_ok=True)
# replaces the `!mkdir` shell escape so that re-running the cell is silent and
# the cell does not depend on a shell being available.
os.makedirs('./model_bakup/', exist_ok=True)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class CFG:
    """Run configuration shared by the cells of this notebook."""
    batch_size = 20                 # mini-batch size handed to every DataLoader
    lr = 0.02                       # not read by the cells shown here; optimizers use literal learning rates
    eval_step_num = 50              # with mid_eval on, train() validates every this many steps
    mid_eval = False                # enable validation/checkpointing inside an epoch
    best_eval_acc = 0.0             # not read by the cells shown here
    model_output_dir = './model_bakup/'   # where checkpoints are written and loaded from
    seed = 2032                     # base seed; later runs use seed+2 and seed+3
    use_ema = False                 # evaluate/save with exponential-moving-average weights
    use_adversial_training = False  # add an FGM adversarial pass to every training step


# When True the train/valid/test frames are subsampled to shorten a run.
DEBUG_RUN = True

# Wall-clock start, used to report the total run time at the end.
global_start_t = time.time()
print('ok')

In [None]:
# Fixed seed (42, independent of cfg.seed) so the subsample and shuffle below
# are repeatable across runs.
seed_everything(seed=42)

# Load the reviews and encode the sentiment label as 1 (positive) / 0 (negative).
imdb_data = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
print('before drop_duplicates, imdb_data.shape: ', imdb_data.shape)
# Drop rows that are exact duplicates across all columns.
imdb_data = imdb_data.drop_duplicates()
print('after drop_duplicates, imdb_data.shape: ', imdb_data.shape)
# Keep a random subset of 30000 rows.
# NOTE(review): presumably this raises if fewer than 30000 rows remain — confirm.
imdb_data = imdb_data.sample(30000)
print('after sample, imdb_data.shape: ', imdb_data.shape)
imdb_data = imdb_data.sample(len(imdb_data)).reset_index(drop=True)  # shuffle

imdb_data.head(5)

In [None]:
MAX_WORDS = 10000   # vocabulary size: only the 10000 most frequent words are considered
MAX_LEN = 200       # length every token sequence is padded/truncated to
word_count_dict = {}  # word -> number of occurrences, filled by the loop below


def clean_text(text):
    """Normalise a raw review string before it is split into words.

    Lower-cases the text, turns newlines and ``<br />`` tags into single
    spaces and then deletes every character in ``string.punctuation``.

    Args:
        text: Raw review text.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    punctuation_class = '[%s]'%re.escape(string.punctuation)
    flattened = text.lower().replace('\n', ' ')
    # The tag has to go before punctuation is stripped, otherwise 'br' is left behind.
    without_breaks = re.sub('<br />', ' ', flattened)
    return re.sub(punctuation_class, '', without_breaks)

# Count how often every word occurs across all reviews.
for review in imdb_data['review'].values:
    # split() without an argument skips the empty strings that split(' ')
    # produces for consecutive spaces (clean_text leaves a double space where
    # '<br /><br />' used to be), so '' no longer takes up a vocabulary slot.
    for word in clean_text(review).split():
        word_count_dict[word] = word_count_dict.get(word, 0) + 1

# One row per word, most frequent first.
df_word_dict = pd.DataFrame(pd.Series(word_count_dict, name='count'))
df_word_dict = df_word_dict.sort_values(by='count', ascending=False)

# Keep at most MAX_WORDS-2 words; ids 0 and 1 are reserved below.
# .copy() so that adding a column does not write into a slice of the sorted frame.
df_word_dict = df_word_dict[:MAX_WORDS-2].copy()
# Size the id range from the rows actually kept, so a corpus with fewer than
# MAX_WORDS-2 distinct words does not fail with a length mismatch.
df_word_dict['word_id'] = range(2, 2 + len(df_word_dict))

# word -> id lookup, plus the two reserved ids.
word_id_dict = df_word_dict['word_id'].to_dict()
word_id_dict['<unknown>'] = 0
word_id_dict['<padding>'] = 1

df_word_dict.head(15)

In [None]:
def pad(data_list, pad_length, pad_value=1):
    """Return a new list holding exactly ``pad_length`` items.

    Sequences that are too long keep their last ``pad_length`` items;
    sequences that are too short are padded on the left with ``pad_value``.

    Args:
        data_list: Sequence of token ids.
        pad_length: Required output length, must be >= 0.
        pad_value: Id used for padding. Defaults to 1, the id this notebook
            assigns to '<padding>'.

    Returns:
        A new list of length ``pad_length``; ``data_list`` is not modified.

    Raises:
        ValueError: If ``pad_length`` is negative.
    """
    if pad_length < 0:
        raise ValueError(f'pad_length must be >= 0, got {pad_length}')

    overflow = len(data_list) - pad_length
    if overflow > 0:
        # Slice from an explicit start index: data_list[-pad_length:] would
        # return the whole list when pad_length is 0.
        return list(data_list[overflow:])

    return [pad_value] * (-overflow) + list(data_list)

def text_to_token(text):
    """Convert a raw review into a space-separated string of MAX_LEN token ids.

    The text is cleaned with ``clean_text``, each word is looked up in
    ``word_id_dict`` (words not in the vocabulary map to id 0), and the id
    list is padded or truncated to ``MAX_LEN`` with ``pad``.

    Args:
        text: Raw review text.

    Returns:
        The token ids joined by single spaces.
    """
    cleaned_text = clean_text(text)
    # split() without an argument drops the empty strings that split(' ')
    # yields for consecutive spaces, so they no longer use up positions in
    # the MAX_LEN window.
    word_token_list = [word_id_dict.get(word, 0) for word in cleaned_text.split()]
    pad_list = pad(word_token_list, MAX_LEN)
    return ' '.join(str(token_id) for token_id in pad_list)
            
# Tokenise every review once up front and time how long it takes.
process_start_t = time.time()
print('start processing...')
# New column: each review as a space-separated string of token ids.
imdb_data['review_tokens'] = imdb_data['review'].map(text_to_token)
print('ok, cost time: ', time.time()-process_start_t)
imdb_data.head(5)

In [None]:
# Positional split of the already-shuffled frame:
# rows 0-4999 -> test, 5000-9999 -> validation, the next TRAIN_NUM rows -> train.
TRAIN_NUM = 15000
imdb_data_test = imdb_data.iloc[:5000]
imdb_data_valid = imdb_data.iloc[5000:10000]
imdb_data_train = imdb_data.iloc[10000:TRAIN_NUM+10000]

if DEBUG_RUN:
    # Shrink every split for a quicker run: SAMPLE_NUM rows for test and
    # validation, twice that for train.
    SAMPLE_NUM = 3000
    imdb_data_test = imdb_data_test.sample(SAMPLE_NUM)
    imdb_data_valid = imdb_data_valid.sample(SAMPLE_NUM)
    imdb_data_train = imdb_data_train.sample(2*SAMPLE_NUM)

print(f'imdb_data_train.shape: {imdb_data_train.shape}, imdb_data_valid.shape: {imdb_data_valid.shape}, '
      f'imdb_data_test.shape: {imdb_data_test.shape}')

In [None]:
# Instantiate the run configuration and re-seed all generators with cfg.seed
# before the data loaders and the model are created.
cfg = CFG()
seed_everything(seed=cfg.seed)

print('ok')

In [None]:
class imdbDataset(Dataset):
    """Dataset over a DataFrame with 'sentiment' and 'review_tokens' columns.

    Each item is a ``(feature, label)`` pair created on the module-level
    ``device``: ``feature`` is a long tensor of the token ids parsed from
    'review_tokens', ``label`` is a float tensor holding the single
    'sentiment' value.
    """

    def __init__(self, data_df):
        self.data_df = data_df

    def __len__(self):
        return len(self.data_df)

    def __getitem__(self, index):
        # Look the row up once and read both columns from it.
        row = self.data_df.iloc[index]

        token_ids = [int(piece) for piece in row['review_tokens'].split(' ')]
        feature = torch.tensor(token_ids, dtype=torch.long, device=device)

        label = torch.tensor([float(row['sentiment'])], dtype=torch.float, device=device)

        return feature, label
    
def generate_data_iter(cfg):
    """Build the train/validation/test DataLoaders from the module-level frames.

    Args:
        cfg: Configuration object; only ``cfg.batch_size`` is read.

    Returns:
        ``(dl_train, dl_valid, dl_test)``. Only the training loader shuffles.
    """
    frames = (imdb_data_train, imdb_data_valid, imdb_data_test)
    ds_train, ds_valid, ds_test = [imdbDataset(frame) for frame in frames]
    print('len of ds_train: ', len(ds_train), 'len of ds_valid: ', len(ds_valid),
          'len of ds_test: ', len(ds_test))

    def make_loader(dataset, shuffle):
        # All loaders share the batch size and load in the main process.
        return DataLoader(dataset, batch_size=cfg.batch_size, shuffle=shuffle, num_workers=0)

    dl_train = make_loader(ds_train, True)
    dl_valid = make_loader(ds_valid, False)
    dl_test = make_loader(ds_test, False)
    return dl_train, dl_valid, dl_test

# Module-level loaders shared by the training, evaluation and ensemble cells.
dl_train, dl_valid, dl_test = generate_data_iter(cfg)
print('ok')

In [None]:
class model_EMA:
    '''
    Exponential moving average (EMA) of a model's trainable parameters.
    # https://zhuanlan.zhihu.com/p/68748778

    Example
    # initialise
    ema = model_EMA(model, 0.999)
    ema.register()

    # training: after every parameter update
    optimizer.step()
    ema.update()

    # evaluation: swap the averaged weights in, evaluate/save, swap back
    ema.apply_shadow()
    ...
    ema.restore()
    '''
    def __init__(self, model, decay=0.99):
        self.model = model
        self.decay = decay
        self.registered = False
        self.shadow = {}   # parameter name -> running average
        self.backup = {}   # parameter name -> live weights saved by apply_shadow()

    def is_registered(self):
        '''Return True once register() has been called.'''
        return self.registered

    def register(self):
        '''Initialise the running averages with the current parameter values.'''
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()
        self.registered = True

    def update(self):
        '''Fold the current parameter values into the running averages.'''
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                # The arithmetic already yields a fresh tensor, so no extra clone is needed.
                self.shadow[name] = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]

    def apply_shadow(self):
        '''Save the live weights and load the averaged weights into the model.'''
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.shadow
                # Copy values instead of rebinding ``param.data``: rebinding
                # would make the parameter share storage with the shadow
                # tensor, so any in-place write to the parameter while the
                # averaged weights are applied would corrupt the average.
                self.backup[name] = param.data.clone()
                param.data.copy_(self.shadow[name])

    def restore(self):
        '''Put the weights saved by apply_shadow() back and clear the backup.'''
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                assert name in self.backup
                param.data.copy_(self.backup[name])
        self.backup = {}
        
class FGM():
    '''
    Fast Gradient Method (FGM) adversarial perturbation of embedding weights.

    Example
    # initialise
    fgm = FGM(model,epsilon=1,emb_name='word_embeddings.')
    for batch_input, batch_label in data:
        # regular training pass
        loss = model(batch_input, batch_label)
        loss.backward() # back-propagate to obtain the regular gradients
        # adversarial pass
        fgm.attack() # add the adversarial perturbation to the embedding
        #model.zero_grad()  # uncomment to keep only the gradients of the perturbed pass instead of accumulating both
        loss_adv = model(batch_input, batch_label)
        loss_adv.backward() # accumulate the adversarial gradients on top of the regular ones
        fgm.restore() # put the original embedding weights back
        # gradient descent step
        optimizer.step()
        model.zero_grad()
    '''
    def __init__(self, model, emb_name, epsilon=1.0, adv_random=False):
        # emb_name must be (a substring of) the name of the embedding parameter in your model
        self.model = model
        self.epsilon = epsilon        # L2 norm of the perturbation
        self.emb_name = emb_name
        self.adv_random = adv_random  # scale epsilon by a random factor in [0.5, 1.5] on every attack
        self.backup = {}              # parameter name -> weights saved by attack()

    def attack(self):
        '''Perturb the matching parameters along their current gradient.

        Each matching parameter is backed up and then moved by
        ``epsilon * grad / ||grad||``. Parameters without a gradient, or with
        a zero or NaN gradient norm, are backed up but left unchanged.
        '''
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                self.backup[name] = param.data.clone()
                if param.grad is None:
                    # backward() has not produced a gradient for this parameter.
                    continue
                norm = torch.norm(param.grad)
                if norm!=0 and not torch.isnan(norm):
                    epsilon = self.epsilon
                    if self.adv_random:
                        epsilon *= random.uniform(0.5, 1.5)
                    # Normalised step, so that epsilon (and adv_random) decide
                    # the perturbation size. The previous expression
                    # `0.1 * random.uniform(0.5, 1.5) * param.grad` ignored both.
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self):
        '''Put back the weights saved by attack() and clear the backup.'''
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

print('ok')

In [None]:
EMBEDDING_DIM = 50   # size of each word embedding vector


class MLP_Net(nn.Module):
    """Embedding lookup followed by a one-hidden-layer perceptron.

    The embeddings of all MAX_LEN tokens are concatenated and passed through
    Linear -> ReLU -> Dropout(0.5) -> Linear -> Sigmoid, giving one value in
    (0, 1) per sample.

    Args:
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, hidden_size=1000):
        super().__init__()

        # Index 1 is the padding id.
        self.embedding = nn.Embedding(num_embeddings=MAX_WORDS, embedding_dim=EMBEDDING_DIM, padding_idx=1)

        # Same layer names and order as add_module() calls would give, so
        # state_dict keys such as 'fc.fc_1.weight' are unchanged.
        self.fc = nn.Sequential(OrderedDict([
            ('fc_1', nn.Linear(EMBEDDING_DIM*MAX_LEN, hidden_size)),
            ('relu_1', nn.ReLU()),
            ('dropout_1', nn.Dropout(p=0.5)),
            ('fc_2', nn.Linear(hidden_size, 1)),
            ('sigmoid_final', nn.Sigmoid()),
        ]))

    def forward(self, x):
        # Flatten the per-token embeddings into one vector per sample.
        batch_size = x.size(0)
        flat_embeddings = self.embedding(x).view(batch_size, -1)
        return self.fc(flat_embeddings)

# Build the network, show its layers and move it to the selected device.
model = MLP_Net()
print(model)
model.to(device)
# Total number of parameters and how many of them are trainable.
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('model_param_num: ', model_param_num, 'model_trainable_param_num: ', 
      model_trainable_param_num)

print('ok')
# Recorded output:
# model_param_num:  10502001 model_trainable_param_num:  10502001

In [None]:
def accuracy(y_pred, y_true):
    """Fraction of predictions that match the labels after thresholding at 0.5.

    Both inputs are converted to NumPy arrays and flattened before they are
    compared. Without the flattening, (N, 1) predictions against (N,) labels
    would broadcast to an N x N comparison and silently give a wrong value.

    Args:
        y_pred: Predicted probabilities (list, tuple or array-like).
        y_true: Ground-truth labels (0/1), holding as many values as ``y_pred``.

    Returns:
        The mean of the element-wise matches.

    Raises:
        ValueError: If the inputs do not hold the same number of values.
    """
    predictions = np.asarray(y_pred).reshape(-1)
    labels = np.asarray(y_true).reshape(-1)
    if predictions.shape != labels.shape:
        raise ValueError(
            f'y_pred holds {predictions.size} values but y_true holds {labels.size}')
    return ((predictions > 0.5) == labels).mean()

def evaluate(model, dl_test, device):
    """Compute the accuracy of ``model`` over every batch of ``dl_test``.

    The model runs in eval mode with gradients disabled. Afterwards it is put
    back into the mode it was in when the function was called, instead of
    being forced into training mode.

    Args:
        model: Network returning one probability per sample.
        dl_test: Iterable of ``(feature, label)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as computed by ``accuracy``.
    """
    was_training = model.training
    model.eval()

    y_true_lst, y_pred_lst = [], []
    try:
        with torch.no_grad():
            for feature, label in dl_test:
                feature, label = feature.to(device), label.to(device)
                y_pred = model(feature)
                y_pred_lst += list(y_pred.detach().cpu().numpy())
                y_true_lst += list(label.detach().cpu().numpy())
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)

    return accuracy(y_pred_lst, y_true_lst)
    
def train(model, dl_train, optimizer, loss_func, device):
    """Train ``model`` for one pass over ``dl_train`` and return the training accuracy.

    The loss of every step is 0.5 * BCE + 0.5 * MSE between predictions and
    labels. Depending on ``cfg`` an FGM adversarial pass is added before the
    optimizer step, an EMA of the weights is updated after it, and the model
    is validated (and checkpointed on improvement) every
    ``cfg.eval_step_num`` global steps.

    Reads and updates module-level state: ``global_step_num`` and
    ``global_best_valid_acc`` are modified; ``cfg``, ``dl_valid``,
    ``model_ema`` and ``fgm`` are read.

    Args:
        model: Network to train; switched to training mode.
        dl_train: Iterable of ``(feature, label)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        loss_func: NOTE(review): currently unused, the loss is hard-coded
            below (the call that used it is commented out).
        device: Device the batches are moved to.

    Returns:
        Accuracy over the predictions made during this pass, i.e. in training
        mode and while the weights were still being updated.
    """
    global cfg, global_step_num, global_best_valid_acc, dl_valid,  model_ema, fgm
    model.train()  # put the model into training mode
    
    loss_func1 = nn.BCELoss()
    loss_func2 = nn.MSELoss()

    y_true_lst, y_pred_lst = [], []
    for step, batch in enumerate(dl_train):
        global_step_num += 1
        feature, label = batch
        feature, label = feature.to(device), label.to(device)
        y_pred = model(feature)
        #train_loss = loss_func(y_pred, label)
        train_loss = 0.5*loss_func1(y_pred, label) + 0.5*loss_func2(y_pred, label)
        y_pred_lst += list(y_pred.detach().cpu().numpy())
        y_true_lst += list(label.detach().cpu().numpy())
        train_loss.backward()
        if cfg.use_adversial_training:
            fgm.attack() # add the adversarial perturbation to the embedding
            y_pred = model(feature)
            #loss_adv = loss_func(y_pred, label)
            loss_adv = 0.5*loss_func1(y_pred, label) + 0.5*loss_func2(y_pred, label)
            loss_adv.backward() # back-propagate, accumulating the adversarial gradients on top of the regular ones
            fgm.restore() # restore the embedding parameters
            
        optimizer.step()
        model.zero_grad()
        if cfg.use_ema:
            model_ema.update()
        
        # Optional validation inside the epoch, every cfg.eval_step_num global steps.
        if cfg.mid_eval and (global_step_num % cfg.eval_step_num == 0):
            if cfg.use_ema:
                # Validate (and save) with the averaged weights.
                model_ema.apply_shadow()
        
            valid_acc = evaluate(model, dl_valid, device)
            print(f'step_num: {global_step_num}, valid_acc: {valid_acc:.5f}')
            if valid_acc > global_best_valid_acc:
                global_best_valid_acc = valid_acc
                print(f'step_num: {global_step_num}, get new best val_acc: {valid_acc:.5f}, save the model now!')
                torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model.pth'))
                
            if cfg.use_ema:
                # Switch back to the live weights before training continues.
                model_ema.restore()
        
    acc = accuracy(y_pred_lst, y_true_lst)
    return acc

# Signals that the cell ran to completion.
print('ok')

In [None]:
# Training run 1: trains the model created above and writes the checkpoint with
# the best validation accuracy to 'best_step_model_1.pth'.

# Best-so-far trackers. train() also updates global_best_valid_acc and
# global_step_num through its `global` statement.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
global_train_acc = 0.0   # training accuracy of the epoch that gave the best validation accuracy
global_step_num = 0

epochs = 150
#epochs = 3
# optimizer=torch.optim.Adagrad(model.parameters(), lr=0.06)
# optimizer=torch.optim.Adadelta(model.parameters(), lr=10.0)
optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)
# optimizer=torch.optim.AdamW(model.parameters(), lr=0.006, weight_decay=0.01)
# loss_func = nn.BCELoss()
# NOTE(review): train() computes its own BCE+MSE blend and does not use this object.
loss_func = nn.MSELoss()

# Optional helpers, created only when enabled in cfg; train() reads them as globals.
model_ema = None
fgm = None
if cfg.use_ema:
    model_ema = model_EMA(model, decay=0.95)
    model_ema.register()
    
if cfg.use_adversial_training:
    fgm = FGM(model, 'embedding', epsilon=1.0, adv_random=True)
    
for epoch in range(epochs):
    #lr_val = random.uniform(0.001, 0.006)
    #optimizer=torch.optim.Adam(model.parameters(), lr=lr_val, weight_decay=1e-5)
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    if cfg.use_ema:
        # Evaluate (and save) with the averaged weights.
        model_ema.apply_shadow()
        
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')
    if train_acc > global_best_train_acc:
        global_best_train_acc = train_acc
    # Model selection uses the validation accuracy only; test accuracy is just reported.
    if valid_acc > global_best_valid_acc:
        global_best_valid_acc = valid_acc
        global_train_acc = train_acc
        print(f'at the end of epoch, global_step_num: {global_step_num} get new best_valid_acc: {valid_acc:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model_1.pth'))
        
    if cfg.use_ema:
        # Back to the live weights before the next epoch.
        model_ema.restore()
        
# Reload the best checkpoint into a fresh network and report its test accuracy.
model = MLP_Net()
model.to(device)

model.load_state_dict(torch.load(os.path.join(cfg.model_output_dir, 'best_step_model_1.pth')))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

#assert False

# final test_acc: 0.80500, best_val_acc: 0.81967, train_acc: 0.96117, best_train_acc: 0.98283
# in epoch: 0, train_acc: 0.52233, valid_acc: 0.54767, test_acc: 0.54633
# at the end of epoch, global_step_num: 300 get new best_valid_acc: 0.54767, save the model now!
# in epoch: 1, train_acc: 0.68633, valid_acc: 0.58667, test_acc: 0.59533
# at the end of epoch, global_step_num: 600 get new best_valid_acc: 0.58667, save the model now!
# in epoch: 2, train_acc: 0.77650, valid_acc: 0.66967, test_acc: 0.68333
# at the end of epoch, global_step_num: 900 get new best_valid_acc: 0.66967, save the model now!
# in epoch: 3, train_acc: 0.83233, valid_acc: 0.74067, test_acc: 0.75100
# at the end of epoch, global_step_num: 1200 get new best_valid_acc: 0.74067, save the model now!
# in epoch: 4, train_acc: 0.86467, valid_acc: 0.75800, test_acc: 0.77267
# at the end of epoch, global_step_num: 1500 get new best_valid_acc: 0.75800, save the model now!
# in epoch: 5, train_acc: 0.89417, valid_acc: 0.76833, test_acc: 0.77333
# at the end of epoch, global_step_num: 1800 get new best_valid_acc: 0.76833, save the model now!
# in epoch: 6, train_acc: 0.90850, valid_acc: 0.78033, test_acc: 0.78700
# at the end of epoch, global_step_num: 2100 get new best_valid_acc: 0.78033, save the model now!
# in epoch: 7, train_acc: 0.92850, valid_acc: 0.77200, test_acc: 0.77567
# in epoch: 8, train_acc: 0.93567, valid_acc: 0.76900, test_acc: 0.78400
# in epoch: 9, train_acc: 0.93367, valid_acc: 0.76267, test_acc: 0.78600
# in epoch: 10, train_acc: 0.94700, valid_acc: 0.78233, test_acc: 0.79033
# at the end of epoch, global_step_num: 3300 get new best_valid_acc: 0.78233, save the model now!
# in epoch: 11, train_acc: 0.95367, valid_acc: 0.77200, test_acc: 0.78000
# in epoch: 12, train_acc: 0.93933, valid_acc: 0.78367, test_acc: 0.79067
# at the end of epoch, global_step_num: 3900 get new best_valid_acc: 0.78367, save the model now!
# in epoch: 13, train_acc: 0.95050, valid_acc: 0.77233, test_acc: 0.79167
# in epoch: 14, train_acc: 0.96167, valid_acc: 0.76600, test_acc: 0.78133
# in epoch: 15, train_acc: 0.94517, valid_acc: 0.77500, test_acc: 0.79333
# in epoch: 16, train_acc: 0.95367, valid_acc: 0.78467, test_acc: 0.79500
# at the end of epoch, global_step_num: 5100 get new best_valid_acc: 0.78467, save the model now!
# in epoch: 17, train_acc: 0.96367, valid_acc: 0.79133, test_acc: 0.79167
# at the end of epoch, global_step_num: 5400 get new best_valid_acc: 0.79133, save the model now!
# in epoch: 18, train_acc: 0.96150, valid_acc: 0.78433, test_acc: 0.79233
# in epoch: 19, train_acc: 0.94867, valid_acc: 0.78700, test_acc: 0.79867
# in epoch: 20, train_acc: 0.96750, valid_acc: 0.79567, test_acc: 0.79967
# at the end of epoch, global_step_num: 6300 get new best_valid_acc: 0.79567, save the model now!
# in epoch: 21, train_acc: 0.97100, valid_acc: 0.78300, test_acc: 0.79500
# in epoch: 22, train_acc: 0.95767, valid_acc: 0.78800, test_acc: 0.79200
# in epoch: 23, train_acc: 0.94933, valid_acc: 0.78533, test_acc: 0.78667
# in epoch: 24, train_acc: 0.96950, valid_acc: 0.79867, test_acc: 0.79700
# at the end of epoch, global_step_num: 7500 get new best_valid_acc: 0.79867, save the model now!
# in epoch: 25, train_acc: 0.96833, valid_acc: 0.79167, test_acc: 0.79367
# in epoch: 26, train_acc: 0.95533, valid_acc: 0.78167, test_acc: 0.79333
# in epoch: 27, train_acc: 0.95950, valid_acc: 0.79000, test_acc: 0.79000
# in epoch: 28, train_acc: 0.97000, valid_acc: 0.79300, test_acc: 0.79467
# in epoch: 29, train_acc: 0.96283, valid_acc: 0.79167, test_acc: 0.80133
# in epoch: 30, train_acc: 0.96533, valid_acc: 0.79233, test_acc: 0.79300
# in epoch: 31, train_acc: 0.95783, valid_acc: 0.79667, test_acc: 0.79267
# in epoch: 32, train_acc: 0.96017, valid_acc: 0.79600, test_acc: 0.79633
# in epoch: 33, train_acc: 0.96950, valid_acc: 0.80633, test_acc: 0.80167
# at the end of epoch, global_step_num: 10200 get new best_valid_acc: 0.80633, save the model now!
# in epoch: 34, train_acc: 0.97033, valid_acc: 0.78900, test_acc: 0.80233
# in epoch: 35, train_acc: 0.96250, valid_acc: 0.77100, test_acc: 0.77833
# in epoch: 36, train_acc: 0.95467, valid_acc: 0.79533, test_acc: 0.79600
# in epoch: 37, train_acc: 0.97167, valid_acc: 0.81133, test_acc: 0.80100
# at the end of epoch, global_step_num: 11400 get new best_valid_acc: 0.81133, save the model now!
# in epoch: 38, train_acc: 0.97683, valid_acc: 0.79267, test_acc: 0.80333
# in epoch: 39, train_acc: 0.96617, valid_acc: 0.77900, test_acc: 0.78767
# in epoch: 40, train_acc: 0.95450, valid_acc: 0.79767, test_acc: 0.79867
# in epoch: 41, train_acc: 0.95683, valid_acc: 0.80800, test_acc: 0.80800
# in epoch: 42, train_acc: 0.97583, valid_acc: 0.80333, test_acc: 0.80433
# in epoch: 43, train_acc: 0.97183, valid_acc: 0.80567, test_acc: 0.79467
# in epoch: 44, train_acc: 0.96233, valid_acc: 0.79500, test_acc: 0.78967
# in epoch: 45, train_acc: 0.96400, valid_acc: 0.79300, test_acc: 0.78367
# in epoch: 46, train_acc: 0.96317, valid_acc: 0.78833, test_acc: 0.79267
# in epoch: 47, train_acc: 0.96633, valid_acc: 0.79333, test_acc: 0.80267
# in epoch: 48, train_acc: 0.97200, valid_acc: 0.79667, test_acc: 0.80533
# in epoch: 49, train_acc: 0.97050, valid_acc: 0.79800, test_acc: 0.79533
# in epoch: 50, train_acc: 0.96283, valid_acc: 0.80233, test_acc: 0.79267
# in epoch: 51, train_acc: 0.95967, valid_acc: 0.79100, test_acc: 0.79067
# in epoch: 52, train_acc: 0.97033, valid_acc: 0.80200, test_acc: 0.80233
# in epoch: 53, train_acc: 0.97283, valid_acc: 0.79800, test_acc: 0.80567
# in epoch: 54, train_acc: 0.96933, valid_acc: 0.79300, test_acc: 0.80133
# in epoch: 55, train_acc: 0.95533, valid_acc: 0.79100, test_acc: 0.79833
# in epoch: 56, train_acc: 0.96367, valid_acc: 0.79867, test_acc: 0.80567
# in epoch: 57, train_acc: 0.97617, valid_acc: 0.80167, test_acc: 0.80267
# in epoch: 58, train_acc: 0.97467, valid_acc: 0.79433, test_acc: 0.80900
# in epoch: 59, train_acc: 0.95967, valid_acc: 0.79567, test_acc: 0.80167
# in epoch: 60, train_acc: 0.95883, valid_acc: 0.78600, test_acc: 0.80100
# in epoch: 61, train_acc: 0.97900, valid_acc: 0.79567, test_acc: 0.79633
# in epoch: 62, train_acc: 0.97400, valid_acc: 0.78733, test_acc: 0.79333
# in epoch: 63, train_acc: 0.96150, valid_acc: 0.78233, test_acc: 0.78467
# in epoch: 64, train_acc: 0.96300, valid_acc: 0.80100, test_acc: 0.80567
# in epoch: 65, train_acc: 0.96900, valid_acc: 0.80333, test_acc: 0.80367
# in epoch: 66, train_acc: 0.97550, valid_acc: 0.79667, test_acc: 0.79967
# in epoch: 67, train_acc: 0.97033, valid_acc: 0.78367, test_acc: 0.79600
# in epoch: 68, train_acc: 0.96500, valid_acc: 0.79967, test_acc: 0.81200
# in epoch: 69, train_acc: 0.95667, valid_acc: 0.80733, test_acc: 0.79933
# in epoch: 70, train_acc: 0.97033, valid_acc: 0.79733, test_acc: 0.80067
# in epoch: 71, train_acc: 0.97600, valid_acc: 0.79967, test_acc: 0.80033
# in epoch: 72, train_acc: 0.97033, valid_acc: 0.78533, test_acc: 0.78200
# in epoch: 73, train_acc: 0.95633, valid_acc: 0.79500, test_acc: 0.79533
# in epoch: 74, train_acc: 0.96117, valid_acc: 0.81967, test_acc: 0.80500
# at the end of epoch, global_step_num: 22500 get new best_valid_acc: 0.81967, save the model now!
# in epoch: 75, train_acc: 0.97667, valid_acc: 0.79333, test_acc: 0.80233
# in epoch: 76, train_acc: 0.98267, valid_acc: 0.79300, test_acc: 0.80133
# in epoch: 77, train_acc: 0.97367, valid_acc: 0.78700, test_acc: 0.78233
# in epoch: 78, train_acc: 0.94567, valid_acc: 0.79233, test_acc: 0.79300
# in epoch: 79, train_acc: 0.96300, valid_acc: 0.79567, test_acc: 0.79833
# in epoch: 80, train_acc: 0.97550, valid_acc: 0.80433, test_acc: 0.80767
# in epoch: 81, train_acc: 0.97667, valid_acc: 0.79033, test_acc: 0.80033
# in epoch: 82, train_acc: 0.96150, valid_acc: 0.79333, test_acc: 0.78667
# in epoch: 83, train_acc: 0.95717, valid_acc: 0.79900, test_acc: 0.79667
# in epoch: 84, train_acc: 0.97517, valid_acc: 0.80300, test_acc: 0.79900
# in epoch: 85, train_acc: 0.97483, valid_acc: 0.79500, test_acc: 0.79800
# in epoch: 86, train_acc: 0.96833, valid_acc: 0.79600, test_acc: 0.80600
# in epoch: 87, train_acc: 0.95450, valid_acc: 0.79233, test_acc: 0.80300
# in epoch: 88, train_acc: 0.97267, valid_acc: 0.80867, test_acc: 0.81200
# in epoch: 89, train_acc: 0.97767, valid_acc: 0.80333, test_acc: 0.80400
# in epoch: 90, train_acc: 0.97450, valid_acc: 0.79733, test_acc: 0.79433
# in epoch: 91, train_acc: 0.94333, valid_acc: 0.80167, test_acc: 0.79067
# in epoch: 92, train_acc: 0.96883, valid_acc: 0.80400, test_acc: 0.79933
# in epoch: 93, train_acc: 0.97850, valid_acc: 0.79900, test_acc: 0.81000
# in epoch: 94, train_acc: 0.97633, valid_acc: 0.79667, test_acc: 0.79933
# in epoch: 95, train_acc: 0.96167, valid_acc: 0.79733, test_acc: 0.79600
# in epoch: 96, train_acc: 0.95900, valid_acc: 0.80100, test_acc: 0.79933
# in epoch: 97, train_acc: 0.97633, valid_acc: 0.80833, test_acc: 0.81667
# in epoch: 98, train_acc: 0.97467, valid_acc: 0.80033, test_acc: 0.80700
# in epoch: 99, train_acc: 0.96400, valid_acc: 0.79600, test_acc: 0.79533
# in epoch: 100, train_acc: 0.96417, valid_acc: 0.79300, test_acc: 0.81000
# in epoch: 101, train_acc: 0.97033, valid_acc: 0.80133, test_acc: 0.80567
# in epoch: 102, train_acc: 0.96767, valid_acc: 0.80233, test_acc: 0.80267
# in epoch: 103, train_acc: 0.97017, valid_acc: 0.79467, test_acc: 0.79867
# in epoch: 104, train_acc: 0.97567, valid_acc: 0.80933, test_acc: 0.80200
# in epoch: 105, train_acc: 0.96400, valid_acc: 0.79633, test_acc: 0.79500
# in epoch: 106, train_acc: 0.95267, valid_acc: 0.80500, test_acc: 0.79900
# in epoch: 107, train_acc: 0.97167, valid_acc: 0.81033, test_acc: 0.80300
# in epoch: 108, train_acc: 0.98033, valid_acc: 0.80833, test_acc: 0.80067
# in epoch: 109, train_acc: 0.96667, valid_acc: 0.79467, test_acc: 0.78867
# in epoch: 110, train_acc: 0.95833, valid_acc: 0.79400, test_acc: 0.79900
# in epoch: 111, train_acc: 0.96767, valid_acc: 0.79233, test_acc: 0.79800
# in epoch: 112, train_acc: 0.97533, valid_acc: 0.79233, test_acc: 0.79867
# in epoch: 113, train_acc: 0.96900, valid_acc: 0.79433, test_acc: 0.79700
# in epoch: 114, train_acc: 0.96567, valid_acc: 0.78733, test_acc: 0.79867
# in epoch: 115, train_acc: 0.96483, valid_acc: 0.80967, test_acc: 0.79333
# in epoch: 116, train_acc: 0.97750, valid_acc: 0.79833, test_acc: 0.79300
# in epoch: 117, train_acc: 0.97350, valid_acc: 0.80467, test_acc: 0.79567
# in epoch: 118, train_acc: 0.96967, valid_acc: 0.80733, test_acc: 0.80067
# in epoch: 119, train_acc: 0.97283, valid_acc: 0.80400, test_acc: 0.80133
# in epoch: 120, train_acc: 0.96950, valid_acc: 0.80200, test_acc: 0.80300
# in epoch: 121, train_acc: 0.95900, valid_acc: 0.80400, test_acc: 0.79267
# in epoch: 122, train_acc: 0.96383, valid_acc: 0.81467, test_acc: 0.79867
# in epoch: 123, train_acc: 0.97967, valid_acc: 0.81300, test_acc: 0.79933
# in epoch: 124, train_acc: 0.97967, valid_acc: 0.79567, test_acc: 0.79867
# in epoch: 125, train_acc: 0.96200, valid_acc: 0.79100, test_acc: 0.78767
# in epoch: 126, train_acc: 0.94850, valid_acc: 0.80167, test_acc: 0.79567
# in epoch: 127, train_acc: 0.98283, valid_acc: 0.80167, test_acc: 0.81133
# in epoch: 128, train_acc: 0.98000, valid_acc: 0.80300, test_acc: 0.80100
# in epoch: 129, train_acc: 0.96617, valid_acc: 0.80600, test_acc: 0.80100
# in epoch: 130, train_acc: 0.96183, valid_acc: 0.80433, test_acc: 0.79967
# in epoch: 131, train_acc: 0.96833, valid_acc: 0.77967, test_acc: 0.78733
# in epoch: 132, train_acc: 0.96833, valid_acc: 0.79567, test_acc: 0.80233
# in epoch: 133, train_acc: 0.97100, valid_acc: 0.80400, test_acc: 0.79400
# in epoch: 134, train_acc: 0.96717, valid_acc: 0.80267, test_acc: 0.79367
# in epoch: 135, train_acc: 0.96750, valid_acc: 0.79567, test_acc: 0.79633
# in epoch: 136, train_acc: 0.97683, valid_acc: 0.80867, test_acc: 0.80633
# in epoch: 137, train_acc: 0.95883, valid_acc: 0.78600, test_acc: 0.79700
# in epoch: 138, train_acc: 0.96133, valid_acc: 0.80033, test_acc: 0.80700
# in epoch: 139, train_acc: 0.97833, valid_acc: 0.80333, test_acc: 0.80400
# in epoch: 140, train_acc: 0.97717, valid_acc: 0.79800, test_acc: 0.81900
# in epoch: 141, train_acc: 0.96967, valid_acc: 0.79233, test_acc: 0.79933
# in epoch: 142, train_acc: 0.95300, valid_acc: 0.79267, test_acc: 0.80067
# in epoch: 143, train_acc: 0.96467, valid_acc: 0.80467, test_acc: 0.79633
# in epoch: 144, train_acc: 0.97667, valid_acc: 0.79900, test_acc: 0.80600
# in epoch: 145, train_acc: 0.98017, valid_acc: 0.80633, test_acc: 0.79600
# in epoch: 146, train_acc: 0.96400, valid_acc: 0.80033, test_acc: 0.78767
# in epoch: 147, train_acc: 0.95433, valid_acc: 0.80033, test_acc: 0.79700
# in epoch: 148, train_acc: 0.96983, valid_acc: 0.80267, test_acc: 0.80933
# in epoch: 149, train_acc: 0.98167, valid_acc: 0.78400, test_acc: 0.78900

# seed = 2032
# final test_acc: 0.80500, best_val_acc: 0.81967, train_acc: 0.96117, best_train_acc: 0.98283

# seed = 2033
# final test_acc: 0.81633, best_val_acc: 0.81933, train_acc: 0.97733, best_train_acc: 0.98467

# seed = 2034
# final test_acc: 0.81700, best_val_acc: 0.81733, train_acc: 0.98350, best_train_acc: 0.98350

In [None]:
# optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-4)  epoch=150
# final test_acc: 0.78567, best_val_acc: 0.78700, train_acc: 1.00000, best_train_acc: 1.00000
# total finished, cost time:  1460.5737087726593

# optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)  epoch=150
# final test_acc: 0.81000, best_val_acc: 0.81200, train_acc: 0.99933, best_train_acc: 0.99983
# total finished, cost time:  1039.3225367069244

# optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)  epoch=150   LeakyRELU
# final test_acc: 0.80100, best_val_acc: 0.80633, train_acc: 0.99850, best_train_acc: 0.99983
# total finished, cost time:  1046.8245861530304

# optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-2)  不收敛！！！

# optimizer=torch.optim.Adam(model.parameters(), lr=0.004, weight_decay=1e-3)  epoch=150
# final test_acc: 0.80067, best_val_acc: 0.80667, train_acc: 0.99033, best_train_acc: 0.99733
# total finished, cost time:  1028.5406827926636

# optimizer=torch.optim.AdamW(model.parameters(), lr=0.005, weight_decay=0.01)
# final test_acc: 0.77700, best_val_acc: 0.78867, train_acc: 0.97367, best_train_acc: 0.99933
# total finished, cost time:  1047.6061577796936

# optimizer=torch.optim.AdamW(model.parameters(), lr=0.005, weight_decay=0.001)
# final test_acc: 0.78433, best_val_acc: 0.77900, train_acc: 0.96367, best_train_acc: 0.99383
# total finished, cost time:  1044.7504827976227

# optimizer=torch.optim.AdamW(model.parameters(), lr=0.006, weight_decay=0.001)
# final test_acc: 0.77867, best_val_acc: 0.77267, train_acc: 0.95467, best_train_acc: 0.98800
# total finished, cost time:  1052.4147984981537

In [None]:
# Training run 2: same procedure as run 1 with seed cfg.seed+2; the best
# checkpoint goes to 'best_step_model_2.pth'.
# NOTE(review): relies on `epochs` defined in the run-1 cell, and repeats that
# cell's code almost verbatim; a shared helper would remove the duplication.
seed_everything(seed=cfg.seed+2)
model = MLP_Net()
model.to(device)
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Reset the trackers that train() updates through `global`.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
global_train_acc = 0.0
global_step_num = 0

optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)
# optimizer=torch.optim.AdamW(model.parameters(), lr=0.006, weight_decay=0.01)
# loss_func = nn.BCELoss()
# NOTE(review): train() does not use this object.
loss_func = nn.MSELoss()

# Optional helpers, created only when enabled in cfg.
model_ema = None
fgm = None
if cfg.use_ema:
    model_ema = model_EMA(model, decay=0.95)
    model_ema.register()
    
if cfg.use_adversial_training:
    fgm = FGM(model, 'embedding', epsilon=1.0, adv_random=True)
    
for epoch in range(epochs):
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    if cfg.use_ema:
        # Evaluate (and save) with the averaged weights.
        model_ema.apply_shadow()
        
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')
    if train_acc > global_best_train_acc:
        global_best_train_acc = train_acc
    # Checkpoint whenever the validation accuracy improves.
    if valid_acc > global_best_valid_acc:
        global_best_valid_acc = valid_acc
        global_train_acc = train_acc
        print(f'at the end of epoch, global_step_num: {global_step_num} get new best_valid_acc: {valid_acc:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model_2.pth'))
        
    if cfg.use_ema:
        model_ema.restore()
        
# Reload the best checkpoint into a fresh network and report its test accuracy.
model = MLP_Net()
model.to(device)

model.load_state_dict(torch.load(os.path.join(cfg.model_output_dir, 'best_step_model_2.pth')))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

In [None]:
# Training run 3: same procedure as runs 1 and 2 with seed cfg.seed+3; the best
# checkpoint goes to 'best_step_model_3.pth'.
# NOTE(review): relies on `epochs` defined in the run-1 cell.
seed_everything(seed=cfg.seed+3)
model = MLP_Net()
model.to(device)
model_param_num = sum(p.numel() for p in model.parameters())
model_trainable_param_num = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Reset the trackers that train() updates through `global`.
global_best_train_acc, global_best_valid_acc = 0.0, 0.0
global_train_acc = 0.0
global_step_num = 0

optimizer=torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)
# NOTE(review): train() does not use this object.
loss_func = nn.MSELoss()

# Optional helpers, created only when enabled in cfg.
model_ema = None
fgm = None
if cfg.use_ema:
    model_ema = model_EMA(model, decay=0.95)
    model_ema.register()
    
if cfg.use_adversial_training:
    fgm = FGM(model, 'embedding', epsilon=1.0, adv_random=True)
    
for epoch in range(epochs):
    train_acc = train(model, dl_train, optimizer, loss_func, device)
    if cfg.use_ema:
        # Evaluate (and save) with the averaged weights.
        model_ema.apply_shadow()
        
    valid_acc = evaluate(model, dl_valid, device)
    test_acc = evaluate(model, dl_test, device)
    print(f'in epoch: {epoch}, train_acc: {train_acc:.5f}, valid_acc: {valid_acc:.5f}, test_acc: {test_acc:.5f}')
    if train_acc > global_best_train_acc:
        global_best_train_acc = train_acc
    # Checkpoint whenever the validation accuracy improves.
    if valid_acc > global_best_valid_acc:
        global_best_valid_acc = valid_acc
        global_train_acc = train_acc
        print(f'at the end of epoch, global_step_num: {global_step_num} get new best_valid_acc: {valid_acc:.5f}, save the model now!')
        torch.save(model.state_dict(), os.path.join(cfg.model_output_dir, 'best_step_model_3.pth'))
        
    if cfg.use_ema:
        model_ema.restore()
        
# Reload the best checkpoint into a fresh network and report its test accuracy.
model = MLP_Net()
model.to(device)
model.load_state_dict(torch.load(os.path.join(cfg.model_output_dir, 'best_step_model_3.pth')))
test_acc = evaluate(model, dl_test, device)
print(f'final test_acc: {test_acc:.5f}, best_val_acc: {global_best_valid_acc:.5f}, '
      f'train_acc: {global_train_acc:.5f}, best_train_acc: {global_best_train_acc:.5f}')

In [None]:
model_names_lst = ['best_step_model_1.pth', 'best_step_model_2.pth', 'best_step_model_3.pth']

def evaluate_ensemble(model_names_lst, dl_test, device):
    """Score a prediction-averaging ensemble of saved MLP_Net checkpoints.

    Every checkpoint is loaded into a fresh MLP_Net, run over ``dl_test`` in
    eval mode, and its per-sample outputs are summed element-wise with those of
    the other checkpoints. The sum is divided by the number of checkpoints and
    scored with ``accuracy``.

    Args:
        model_names_lst: checkpoint file names, resolved against
            ``cfg.model_output_dir``.
        dl_test: iterable yielding ``(feature, label)`` batches.
        device: torch device the models and batches are moved to.

    Returns:
        The value returned by ``accuracy`` for the averaged predictions and the
        collected labels.

    Raises:
        ValueError: if ``model_names_lst`` is empty.
    """
    if not model_names_lst:
        raise ValueError('model_names_lst must contain at least one checkpoint name')

    y_pred_sum = None
    y_true_lst = []
    for model_name in model_names_lst:
        model = MLP_Net()
        model.to(device)
        checkpoint_path = os.path.join(cfg.model_output_dir, model_name)
        # map_location keeps loading working when the checkpoint was saved on a different device.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.eval()

        # Labels are re-collected on every pass; the last pass's labels are the ones scored.
        y_true_lst, y_pred_lst = [], []
        with torch.no_grad():
            for feature, label in dl_test:
                feature, label = feature.to(device), label.to(device)
                y_pred = model(feature)
                y_pred_lst += list(y_pred.detach().cpu().numpy())
                y_true_lst += list(label.detach().cpu().numpy())

        # NOTE(review): the element-wise sum assumes dl_test yields samples in the
        # same order on every pass (i.e. no shuffling) — confirm against the loader.
        if y_pred_sum is None:
            y_pred_sum = y_pred_lst
        else:
            y_pred_sum = [v1 + v2 for v1, v2 in zip(y_pred_sum, y_pred_lst)]
        del model

    # Average over however many checkpoints were supplied, not a fixed 3.
    n_models = float(len(model_names_lst))
    y_pred_avg = [v / n_models for v in y_pred_sum]
    acc = accuracy(y_pred_avg, y_true_lst)

    return acc

# Score the prediction-averaging ensemble on the validation loader first, then the test loader.
valid_acc_ensemble, test_acc_ensemble = (
    evaluate_ensemble(model_names_lst, split_loader, device)
    for split_loader in (dl_valid, dl_test)
)
print(f'final test_acc_ensemble: {test_acc_ensemble:.5f}, '
      f'valid_acc_ensemble: {valid_acc_ensemble:.5f}')

# Seconds elapsed since global_start_t was recorded.
print('ok, finished, total cost time: ', time.time() - global_start_t)

# final test_acc: 0.79067, best_val_acc: 0.78567, train_acc: 0.93000, best_train_acc: 0.93100  # model_1  epoch_10
# final test_acc: 0.79767, best_val_acc: 0.77733, train_acc: 0.93717, best_train_acc: 0.93717  # model_2  epoch_10
# final test_acc: 0.78867, best_val_acc: 0.78067, train_acc: 0.93250, best_train_acc: 0.93250  # model_3  epoch_10
# final test_acc_ensemble: 0.82233, valid_acc_ensemble: 0.80767

In [None]:
# Checkpoint file names written by the three training runs (suffixes 1 to 3).
model_names_lst = [f'best_step_model_{run_idx}.pth' for run_idx in range(1, 4)]

def zero_model_params(model):
    """Overwrite every parameter tensor of ``model`` with zeros, in place.

    Only tensors reported by ``model.parameters()`` are touched; nothing is returned.
    """
    for weight in model.parameters():
        weight.data.zero_()
        
def add_two_model(model, model_to_add):
    """Add every state-dict entry of ``model_to_add`` onto ``model``.

    The element-wise sums are loaded back into ``model``, which is also the
    return value; ``model_to_add`` is left unchanged.
    """
    other_state = model_to_add.state_dict()
    merged_state = model.state_dict()
    # Re-assign values on the original state-dict object, key by key.
    for key, own_tensor in merged_state.items():
        merged_state[key] = own_tensor + other_state[key]
    model.load_state_dict(merged_state)
    return model

def evaluate_ensemble_sum(model_names_lst, dl_test, device):
    """Score a single MLP_Net whose parameters are the mean of several checkpoints.

    A fresh MLP_Net has its parameters zeroed, each checkpoint is added onto it
    with ``add_two_model``, and the parameters are then divided by the number
    of checkpoints. The resulting model is run over ``dl_test`` in eval mode
    and scored with ``accuracy``.

    Args:
        model_names_lst: checkpoint file names, resolved against
            ``cfg.model_output_dir``.
        dl_test: iterable yielding ``(feature, label)`` batches.
        device: torch device the models and batches are moved to.

    Returns:
        The value returned by ``accuracy`` for the averaged model's predictions
        and the collected labels.

    Raises:
        ValueError: if ``model_names_lst`` is empty.
    """
    if not model_names_lst:
        raise ValueError('model_names_lst must contain at least one checkpoint name')

    model = MLP_Net()
    model.to(device)
    zero_model_params(model)
    for model_name in model_names_lst:
        model_to_add = MLP_Net()
        model_to_add.to(device)
        checkpoint_path = os.path.join(cfg.model_output_dir, model_name)
        # map_location keeps loading working when the checkpoint was saved on a different device.
        model_to_add.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model = add_two_model(model, model_to_add)

    # Turn the accumulated sum into a mean over however many checkpoints were
    # supplied, not a fixed 3.
    # NOTE(review): only parameters are zeroed and rescaled, while add_two_model
    # sums every state_dict entry; if MLP_Net registers buffers they end up
    # summed rather than averaged — confirm against the model definition.
    n_models = float(len(model_names_lst))
    for param in model.parameters():
        param.data.div_(n_models)
    model.eval()

    y_true_lst, y_pred_lst = [], []
    with torch.no_grad():
        for feature, label in dl_test:
            feature, label = feature.to(device), label.to(device)
            y_pred = model(feature)
            y_pred_lst += list(y_pred.detach().cpu().numpy())
            y_true_lst += list(label.detach().cpu().numpy())

    acc = accuracy(y_pred_lst, y_true_lst)
    return acc

# Score the weight-averaged model on the validation loader first, then the test loader.
valid_acc_ensemble, test_acc_ensemble = (
    evaluate_ensemble_sum(model_names_lst, split_loader, device)
    for split_loader in (dl_valid, dl_test)
)
print(f'final test_acc_ensemble: {test_acc_ensemble:.5f}, '
      f'valid_acc_ensemble: {valid_acc_ensemble:.5f}')

# Seconds elapsed since global_start_t was recorded.
print('ok, finished, total cost time: ', time.time() - global_start_t)