<a href="https://colab.research.google.com/github/sooyun1202/NLP/blob/main/BERT%EB%AA%A8%EB%8D%B8_%EC%98%81%EC%96%B4(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install transformers --quiet # package installer for python

In [19]:
import torch
from transformers import BertModel, BertTokenizer

In [20]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

데이터셋

In [21]:
import json

data = {'train': {'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev', 'test']:
  for dialog in json.loads(open('friends_' + dtype + '.json').read()):
    for line in dialog:
      data[dtype]['speaker'].append(line['speaker'])
      data[dtype]['utterance'].append(line['utterance'])
      data[dtype]['emotion'].append(line['emotion'])

In [22]:
e2i_dict = dict((emo, i) for i, emo in enumerate(set(data['train']['emotion'])))
i2e_dict = {i: e for e, i in e2i_dict.items()}

In [23]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(768, len(e2i_dict))

  def forward(self, utterance):
    tokens = self.bert_tokenizer.tokenize(utterance)
    tokens = ['[CLS]'] + tokens + ['[SEP]'] # (len)
    ids = [tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    input_tensor = torch.tensor(ids).cuda()

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)
    logit = self.linear(hidden_tensor)
    return logit

In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  precision = precision_score(true_list, pred_list, average=None)
  recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')
  print('precision:\t', ['%.4f' % v for v in precision])
  print('recall:\t\t', ['%.4f' % v for v in recall])
  print('micro_f1: %.6f' % micro_f1)

In [32]:
pretrained_weights = 'bert-base-uncased'
learning_rate = 2e-5 
n_epoch = 3

In [33]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm import tqdm_notebook

model = Model()
model.cuda()
criterion = torch.nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), learning_rate) #Adam, Adagrad, SGD

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  model.train()
  for i_batch in tqdm_notebook(range(len(data['train']['utterance']))):
    logit = model(data['train']['utterance'][i_batch])
    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]]).cuda()
    loss = criterion(logit, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  
  model.eval()
  pred_list, true_list = [], []
  for i_batch in tqdm_notebook(range(len(data['dev']['utterance']))):
    logit = model(data['dev']['utterance'][i_batch])
    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['dev']['emotion'][i_batch]]]
  evaluate(pred_list, true_list) # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.0870', '0.5610', '0.0824', '0.9145', '0.1916', '0.0000', '0.4570', '0.2097']
recall:		 ['0.5000', '0.4259', '0.3889', '0.6059', '0.3534', '0.0000', '0.6273', '0.4815']
micro_f1: 0.551783
i_epoch: 1


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.2609', '0.5285', '0.2118', '0.8473', '0.1822', '0.0000', '0.6093', '0.3387']
recall:		 ['0.1875', '0.5652', '0.4000', '0.6450', '0.3451', '0.0000', '0.5169', '0.4200']
micro_f1: 0.557725
i_epoch: 2


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.2174', '0.5610', '0.2471', '0.8330', '0.2009', '0.0000', '0.6291', '0.3387']
recall:		 ['0.2632', '0.5847', '0.3559', '0.6554', '0.3333', '0.0000', '0.5220', '0.4667']
micro_f1: 0.562818


In [34]:
import csv

dialogs = []
dialogs.append([])

with open('en_data.csv', newline='') as csvfile:
  spamreader = csv.reader(csvfile)
  for i,row in enumerate(spamreader):
    if i!=0:
      dialogs[0].append({'id':row[0],'speaker':row[3], 'utterance':row[4]})

from collections import OrderedDict

labeled = []
for dialog in tqdm_notebook(dialogs):
  dialog_list = []
  for line in dialog:
    logit = model(line['utterance'])
    _, max_idx = torch.max(logit, dim=-1)
    pred_emotion = max_idx.tolist()[0]

    line_dict = OrderedDict()
    line_dict['Id'] = line['id']
    line_dict['speaker'] = line['speaker']
    line_dict['utterance'] = line['utterance']
    line_dict['emotion'] = i2e_dict[pred_emotion]
    dialog_list.append(line_dict)
  labeled.append(dialog_list)

with open('labeled.csv', 'w', newline='') as csvfile:
    fieldnames = ['Id', 'Predicted']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for row in dialog_list:
      writer.writerow({'Id': row['Id'], 'Predicted': row['emotion']})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




참고: https://colab.research.google.com/drive/1EMzEfTYjYLgEHjCCP1vEr9oOZLXMocGh?usp=sharing