In [21]:
!nvidia-smi

Sat Apr  9 06:49:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    73W / 149W |   5602MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [22]:
from google.colab import drive
import sys
drive.mount('/content/drive')
#设置路径
sys.path.append('/content/drive/MyDrive/Colab Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
! pip install transformers==4.0.1



In [24]:
! pip install torch==1.4.0



In [25]:
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

config = {
    'train_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/train.csv',
    'test_file_path':'/content/drive/MyDrive/Colab Notebooks/dataset/test.csv',
    'train_val_ratio':0.1,
    'model_path':'/content/drive/MyDrive/Colab Notebooks/dataset/NeZha_model',
    'batch_size':16,
    'head': 'CNN',
    'num_epochs':1,
    'warmup_ratio':0.1, # warm up
    'learning_rate':2e-5,
    'logging_step':500,
    'seed':2022
}

config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

def seed_everything(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  return seed

seed_everything(config['seed'])

2022

In [26]:
from collections import defaultdict
def read_data(config, tokenizer, mode = 'train'):
  data_df = pd.read_csv(config[f'{mode}_file_path'], sep=',')
  if mode == 'train':
    X_train, y_train = defaultdict(list),[]
    X_val, y_val = defaultdict(list),[]
    num_val = int(len(data_df) * config['train_val_ratio'])
  else:
    X_test, y_test = defaultdict(list),[]

  for i, row in tqdm(data_df.iterrows(), desc=f'preprocess {mode} data', colour = 'blue', total = len(data_df)):
    label = row[1] if mode == 'train' else 0
    sentence = row[-1]

    inputs = tokenizer.encode_plus(sentence, add_special_tokens = True, return_token_type_ids = True, return_attention_mask = True)

    if mode == 'train':
      if i < num_val:
        X_val['inputs_ids'].append(inputs['input_ids'])
        y_val.append(label)
        X_val['token_type_ids'].append(inputs['token_type_ids'])
        X_val['attention_mask'].append(inputs['attention_mask'])
      else:
        X_train['inputs_ids'].append(inputs['input_ids'])
        y_train.append(label)
        X_train['token_type_ids'].append(inputs['token_type_ids'])
        X_train['attention_mask'].append(inputs['attention_mask'])

    else:
        X_test['inputs_ids'].append(inputs['input_ids'])
        y_test.append(label)
        X_test['token_type_ids'].append(inputs['token_type_ids'])
        X_test['attention_mask'].append(inputs['attention_mask'])

  if mode == 'train':
    label2id = {label: i for i, label in enumerate(np.unique(y_train))}
    id2label = {i: label for label, i in label2id.items()}

    y_train = torch.tensor([label2id[i] for i in y_train], dtype =torch.long)

    y_val = torch.tensor([label2id[i] for i in y_val], dtype =torch.long)
    return X_train, y_train, X_val, y_val, label2id, id2label

  else:
    y_test = torch.tensor(y_test, dtype = torch.long)
    return X_test, y_test


In [27]:
from torch.utils.data import Dataset
class TNEWSData(Dataset):
  def __init__(self, X, y):
    self.x = X
    self.y = y
  
 
  def __getitem__(self, idx):
    return{
        'inputs_ids': self.x['inputs_ids'][idx],
        'label':self.y[idx],
        'token_type_ids':self.x['token_type_ids'][idx],
        'attention_mask':self.x['attention_mask'][idx]

    }

 
  def __len__(self):
    return self.y.size(0)

In [28]:
# 从 TNEWSDataset 返回的多个example，合并为Tensor
def collate_fn(example):
  input_ids_list = []
  labels = []
  token_type_ids_list = []
  attention_mask_list = []

  for ex in example:
    input_ids_list.append(ex['inputs_ids'])
    labels.append(ex['label'])
    token_type_ids_list.append(ex['token_type_ids'])
    attention_mask_list.append(ex['attention_mask'])

  max_len = max(len(input_ids) for input_ids in input_ids_list)
  input_ids_tensor = torch.zeros((len(labels), max_len),dtype=torch.long)
  token_type_ids_tensor = torch.zeros_like(input_ids_tensor)
  attention_mask_tensor = torch.zeros_like(input_ids_tensor)

  for i, input_ids in enumerate(input_ids_list):
    input_ids_tensor[i, :len(input_ids)] = torch.tensor(input_ids, dtype = torch.long)
    token_type_ids_tensor[i, :len(input_ids)] = torch.tensor(token_type_ids_list[i], dtype = torch.long)
    attention_mask_tensor[i, :len(input_ids)] = torch.tensor(attention_mask_list[i], dtype = torch.long)

  return {
      'input_ids': input_ids_tensor,
      'labels': torch.tensor(labels ,dtype= torch.long),
      'token_type_ids':token_type_ids_tensor,
      'attention_mask':attention_mask_tensor
  }  

In [29]:
# dataloader 可以并行
from transformers import BertTokenizer
from torch.utils.data import DataLoader
def build_dataloader(config):
  tokenizer = BertTokenizer.from_pretrained(config['model_path'])
  X_train, y_train, X_val, y_val, label2id, id2label = read_data(config, tokenizer, mode='train')
  X_test, y_test = read_data(config, tokenizer, mode='test')

  train_dataset = TNEWSData(X_train, y_train)
  val_dataset = TNEWSData(X_val, y_val)
  test_dataset = TNEWSData(X_test, y_test)

  train_dataloader = DataLoader(train_dataset, batch_size=config['batch_size'], num_workers = 4, shuffle = True, collate_fn=collate_fn)
  val_dataloader = DataLoader(val_dataset, batch_size=config['batch_size'], num_workers = 4, shuffle = False, collate_fn=collate_fn)
  test_dataloader = DataLoader(test_dataset, batch_size=config['batch_size'], num_workers = 4, shuffle = False, collate_fn=collate_fn)

  return train_dataloader, val_dataloader, test_dataloader, id2label


In [30]:
train_dataloader, val_dataloader, test_dataloader, id2label = build_dataloader(config)

preprocess train data: 100%|[34m██████████[0m| 53360/53360 [00:44<00:00, 1189.14it/s]
preprocess test data: 100%|[34m██████████[0m| 10000/10000 [00:05<00:00, 1784.76it/s]


In [31]:
import torch.nn.functional as F
import torch.nn as nn
from NeZha import *
class NeZhaForTNEWS(NeZhaPreTrainedModel):
  def __init__(self, config, model_path, classifier):
    super(NeZhaForTNEWS, self).__init__(config)

    self.nezha = NeZhaModel.from_pretrained(model_path, config=config)
    self.classifier = classifier  # head
    self.config = config

  def forward(self, input_ids, token_type_ids, attention_mask, labels):
    outputs = self.nezha(input_ids = input_ids,
                token_type_ids = token_type_ids,
                attention_mask = attention_mask)
    hidden_states = outputs[2]

    logits = self.classifier(hidden_states, input_ids)

    outputs = (logits, )

    if labels is not None:
      loss_fct = FocalLoss(num_classes=self.config.num_labels)
      loss = loss_fct(logits, labels.view(-1))
      outputs = (loss, )+ outputs

    return outputs

## typing模块的作用
- 类型检查，防止运行时出现参数和返回值类型不符合。
- 作为开发文档附加说明，方便使用者调用时传入和返回参数类型。
- 该模块加入后并不会影响程序的运行，不会报正式的错误，只有提醒。

### 说明：

- 在传入参数时通过“参数名:类型”的形式声明参数的类型；
- 返回结果通过"-> 结果类型"的形式声明结果的类型。
- 在调用的时候如果参数的类型不正确pycharm会有提醒，但不会影响程序的运行。
- 对于如list列表等，还可以规定得更加具体一些，如：“-> List[str]”,规定返回的是列表，并且元素是字符串。

*举例*
```
from typing import List, Tuple, Dict
def add(a:int, string:str, f:float, b:bool) -> Tuple[List, Tuple, Dict, bool]:
list1 = list(range(a))
    tup = (string, string, string)
    d = {"a":f}
    bl = b
    return list1, tup, d,bl
print(add(5,"hhhh", 2.3, False))
```
结果：([0, 1, 2, 3, 4], ('hhhh', 'hhhh', 'hhhh'), {'a': 2.3}, False)

In [32]:
from typing import List
class ConvClassifier(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.conv = nn.Conv1d(in_channels = config.hidden_size, out_channels = config.hidden_size, kernel_size = 3, padding=(3 - 1) // 2)
    self.global_max_pool = nn.AdaptiveMaxPool1d(1)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.fc = nn.Linear(config.hidden_size, config.num_labels)

  def forward(self, hidden_states: List[torch.Tensor], input_ids: torch.Tensor):
    hidden_states = self.dropout(hidden_states[-1])
    hidden_states = hidden_states.permute(0 ,2, 1)

    out = F.relu(self.conv(hidden_states))
    out = self.global_max_pool(out).squeeze(dim=2)
    out = self.fc(out)
    
    return out

In [33]:
def build_model(model_path, config, head):
  heads = {
      'CNN': ConvClassifier
  }
  assert head in heads ,"head must have been implemented"
  print(f'>>> You are using {head} head, please wait...')
  model = NeZhaForTNEWS(config, model_path, heads[head](config))
  return model

In [34]:
from sklearn.metrics import f1_score
def evaluation(config, model, val_dataloader):
  model.eval()
  preds = []
  labels = []
  val_loss = 0.
  val_iterator = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

  with torch.no_grad():
    for batch in val_iterator:
      labels.append(batch['labels'])
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      loss, logits = model(**batch)[:2]
      val_loss += loss.item()
      preds.append(logits.argmax(dim = -1).detach().cpu())

  avg_val_loss = val_loss / len(val_dataloader)
  labels = torch.cat(labels, dim = 0).numpy()
  preds = torch.cat(preds, dim = 0).numpy()
  f1 = f1_score(labels, preds, average='macro')
  return avg_val_loss, f1


In [35]:
from extra_loss import *
from extra_optim import *
from transformers import AdamW
from tqdm import trange
def train(config, id2label, train_dataloader, val_dataloader):
  nezha_config = NeZhaConfig.from_pretrained(config['model_path'])
  nezha_config.output_hidden_states = True
  nezha_config.num_labels = len(id2label)

  model = build_model(config['model_path'], nezha_config, config['head'])

  # 得到模型的参数
  optimizer_grouped_parameters = model.parameters()
  # 定义一个基优化器
  optimizer = AdamW(model.parameters(), lr= config['learning_rate'])
  # Lookahead要有一个基优化器， k=5, alpha=1
  optimizer = Lookahead(optimizer, 5, 1)
  total_steps = config['num_epochs'] * len(train_dataloader)
  # 每调用warmup_steps次， 对应的学习率就会调整一次
  lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps = int(config['learning_rate'] * total_steps), t_total = total_steps)
                                                                   

  model.to(config['device'])
  epoch_iterator = trange(config['num_epochs'])
  global_steps = 0
  train_loss = 0.
  logging_loss = 0.

  for epoch in epoch_iterator:
    train_iterator = tqdm(train_dataloader, desc='Training', total=len(train_dataloader))
    model.train()
    for batch in train_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      loss = model(**batch)[0]
      model.zero_grad()
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      global_steps += 1

      if global_steps % config['logging_step'] == 0:
        print_train_loss = (train_loss - logging_loss) / config['logging_step']
        logging_loss = train_loss
        avg_val_loss, f1 = evaluation(config, model, val_dataloader)

        print_log = f'>>>traing loss:{print_train_loss: .5f}, valid loss:{avg_val_loss: .5f}, valid f1 score:{f1: .5f}'
        print(print_log)
        model.train()

  return model    

In [36]:
model = train(config, id2label, train_dataloader, val_dataloader)

>>> You are using CNN head, please wait...


Some weights of NeZhaModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/dataset/NeZha_model and are newly initialized: ['bert.encoder.layer.0.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.1.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.2.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.3.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.4.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.5.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.6.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.7.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.8.attention.self.relative_positions_encoding.positions_encoding', 'bert.encoder.layer.9.attention.self.relative_positions_encodi

>>>traing loss: 0.96119, valid loss: 0.72347, valid f1 score: 0.47554



Training:  17%|█▋        | 501/3002 [02:59<4:42:21,  6.77s/it][A
Training:  17%|█▋        | 502/3002 [03:00<3:20:53,  4.82s/it][A
Training:  17%|█▋        | 503/3002 [03:00<2:24:05,  3.46s/it][A
Training:  17%|█▋        | 504/3002 [03:00<1:44:09,  2.50s/it][A
Training:  17%|█▋        | 505/3002 [03:00<1:16:29,  1.84s/it][A
Training:  17%|█▋        | 506/3002 [03:01<57:34,  1.38s/it]  [A
Training:  17%|█▋        | 507/3002 [03:01<44:18,  1.07s/it][A
Training:  17%|█▋        | 508/3002 [03:01<34:42,  1.20it/s][A
Training:  17%|█▋        | 509/3002 [03:02<27:51,  1.49it/s][A
Training:  17%|█▋        | 510/3002 [03:02<23:01,  1.80it/s][A
Training:  17%|█▋        | 511/3002 [03:02<19:58,  2.08it/s][A
Training:  17%|█▋        | 512/3002 [03:03<17:30,  2.37it/s][A
Training:  17%|█▋        | 513/3002 [03:03<15:31,  2.67it/s][A
Training:  17%|█▋        | 514/3002 [03:03<14:06,  2.94it/s][A
Training:  17%|█▋        | 515/3002 [03:03<13:08,  3.16it/s][A
Training:  17%|█▋        | 

>>>traing loss: 0.72028, valid loss: 0.66758, valid f1 score: 0.49586



Training:  33%|███▎      | 1001/3002 [05:56<3:41:32,  6.64s/it][A
Training:  33%|███▎      | 1002/3002 [05:56<2:37:51,  4.74s/it][A
Training:  33%|███▎      | 1003/3002 [05:56<1:53:20,  3.40s/it][A
Training:  33%|███▎      | 1004/3002 [05:57<1:22:16,  2.47s/it][A
Training:  33%|███▎      | 1005/3002 [05:57<1:00:57,  1.83s/it][A
Training:  34%|███▎      | 1006/3002 [05:57<45:34,  1.37s/it]  [A
Training:  34%|███▎      | 1007/3002 [05:57<34:42,  1.04s/it][A
Training:  34%|███▎      | 1008/3002 [05:58<27:10,  1.22it/s][A
Training:  34%|███▎      | 1009/3002 [05:58<21:57,  1.51it/s][A
Training:  34%|███▎      | 1010/3002 [05:58<18:09,  1.83it/s][A
Training:  34%|███▎      | 1011/3002 [05:59<15:53,  2.09it/s][A
Training:  34%|███▎      | 1012/3002 [05:59<13:43,  2.42it/s][A
Training:  34%|███▎      | 1013/3002 [05:59<12:59,  2.55it/s][A
Training:  34%|███▍      | 1014/3002 [06:00<12:27,  2.66it/s][A
Training:  34%|███▍      | 1015/3002 [06:00<11:43,  2.83it/s][A
Training:  3

>>>traing loss: 0.65237, valid loss: 0.62353, valid f1 score: 0.54069



Training:  50%|█████     | 1501/3002 [08:53<2:45:45,  6.63s/it][A
Training:  50%|█████     | 1502/3002 [08:53<1:57:52,  4.72s/it][A
Training:  50%|█████     | 1503/3002 [08:53<1:24:41,  3.39s/it][A
Training:  50%|█████     | 1504/3002 [08:54<1:01:23,  2.46s/it][A
Training:  50%|█████     | 1505/3002 [08:54<45:07,  1.81s/it]  [A
Training:  50%|█████     | 1506/3002 [08:54<34:14,  1.37s/it][A
Training:  50%|█████     | 1507/3002 [08:54<25:55,  1.04s/it][A
Training:  50%|█████     | 1508/3002 [08:55<20:20,  1.22it/s][A
Training:  50%|█████     | 1509/3002 [08:55<16:29,  1.51it/s][A
Training:  50%|█████     | 1510/3002 [08:55<13:42,  1.81it/s][A
Training:  50%|█████     | 1511/3002 [08:56<11:58,  2.07it/s][A
Training:  50%|█████     | 1512/3002 [08:56<10:46,  2.30it/s][A
Training:  50%|█████     | 1513/3002 [08:56<09:51,  2.52it/s][A
Training:  50%|█████     | 1514/3002 [08:57<09:01,  2.75it/s][A
Training:  50%|█████     | 1515/3002 [08:57<08:25,  2.94it/s][A
Training:  50%

>>>traing loss: 0.62490, valid loss: 0.61103, valid f1 score: 0.50578



Training:  67%|██████▋   | 2001/3002 [11:50<1:51:06,  6.66s/it][A
Training:  67%|██████▋   | 2002/3002 [11:50<1:19:07,  4.75s/it][A
Training:  67%|██████▋   | 2003/3002 [11:50<56:40,  3.40s/it]  [A
Training:  67%|██████▋   | 2004/3002 [11:51<41:09,  2.47s/it][A
Training:  67%|██████▋   | 2005/3002 [11:51<30:06,  1.81s/it][A
Training:  67%|██████▋   | 2006/3002 [11:51<22:36,  1.36s/it][A
Training:  67%|██████▋   | 2007/3002 [11:52<17:13,  1.04s/it][A
Training:  67%|██████▋   | 2008/3002 [11:52<13:19,  1.24it/s][A
Training:  67%|██████▋   | 2009/3002 [11:52<10:46,  1.54it/s][A
Training:  67%|██████▋   | 2010/3002 [11:52<09:03,  1.83it/s][A
Training:  67%|██████▋   | 2011/3002 [11:53<07:54,  2.09it/s][A
Training:  67%|██████▋   | 2012/3002 [11:53<07:00,  2.35it/s][A
Training:  67%|██████▋   | 2013/3002 [11:53<06:21,  2.59it/s][A
Training:  67%|██████▋   | 2014/3002 [11:54<05:56,  2.77it/s][A
Training:  67%|██████▋   | 2015/3002 [11:54<05:45,  2.86it/s][A
Training:  67%|███

>>>traing loss: 0.61215, valid loss: 0.60698, valid f1 score: 0.53335



Training:  83%|████████▎ | 2501/3002 [14:48<55:12,  6.61s/it]  [A
Training:  83%|████████▎ | 2502/3002 [14:48<39:20,  4.72s/it][A
Training:  83%|████████▎ | 2503/3002 [14:48<28:09,  3.39s/it][A
Training:  83%|████████▎ | 2504/3002 [14:49<20:26,  2.46s/it][A
Training:  83%|████████▎ | 2505/3002 [14:49<14:56,  1.80s/it][A
Training:  83%|████████▎ | 2506/3002 [14:49<11:09,  1.35s/it][A
Training:  84%|████████▎ | 2507/3002 [14:50<08:26,  1.02s/it][A
Training:  84%|████████▎ | 2508/3002 [14:50<06:39,  1.24it/s][A
Training:  84%|████████▎ | 2509/3002 [14:50<05:16,  1.56it/s][A
Training:  84%|████████▎ | 2510/3002 [14:50<04:28,  1.83it/s][A
Training:  84%|████████▎ | 2511/3002 [14:51<03:51,  2.12it/s][A
Training:  84%|████████▎ | 2512/3002 [14:51<03:24,  2.40it/s][A
Training:  84%|████████▎ | 2513/3002 [14:51<03:06,  2.63it/s][A
Training:  84%|████████▎ | 2514/3002 [14:52<02:51,  2.84it/s][A
Training:  84%|████████▍ | 2515/3002 [14:52<02:44,  2.96it/s][A
Training:  84%|███████

>>>traing loss: 0.58535, valid loss: 0.60679, valid f1 score: 0.52051



Training: 100%|█████████▉| 3001/3002 [17:44<00:06,  6.63s/it][A
Training: 100%|██████████| 3002/3002 [17:45<00:00,  2.82it/s]
100%|██████████| 1/1 [17:45<00:00, 1065.07s/it]


In [37]:
def prediction(config, id2label, model, test_dataloader):
  test_iterator = tqdm(test_dataloader, desc='Prediction', total = len(test_dataloader))
  model.eval()
  test_preds = []

  with torch.no_grad():
    for batch in test_iterator:
      batch = {item: value.to(config['device']) for item, value in batch.items()}
      logits = model(**batch)[1]
      test_preds.append(logits.argmax(dim=-1).detach().cpu())
  
  test_preds = torch.cat(test_preds, dim=0).numpy()
  test_preds = [id2label[id_] for id_ in test_preds]

  test_df = pd.read_csv(config['test_file_path'], sep=',')
  test_df.insert(1, column='label', value=test_preds)
  test_df.drop(['sentence'], 1,inplace=True)#1表示按列删除
  test_df.to_csv('submission_Nezha_FocalLoss.csv', index=False, encoding= 'utf8')

In [38]:
prediction(config, id2label, model, test_dataloader)

Prediction: 100%|██████████| 625/625 [00:53<00:00, 11.76it/s]
