
# Pytorch를 활용한 미국드라마 프렌즈 대사 감정 분석 모델 
## 모델 : Bert Base

## Dataset
EmotionLines 제공 프렌즈 대사 데이터셋<br>

## 개발 환경
  - Google Corab (With GPU)<br>
  - 구글 드라이브 연동 후 본인 경로 설정 필수<br>

### 본 자료는 Jangwon Park 님 제작 자료로써, <br>
### BERT 모델의 Accuracy 단순 참조용으로 첨부하였습니다.

In [None]:
# Hugging Face의 트랜스포머 모델을 설치
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 21.3MB/s eta 0:00:01[K     |▍                               | 20kB 27.9MB/s eta 0:00:01[K     |▋                               | 30kB 19.6MB/s eta 0:00:01[K     |▉                               | 40kB 22.9MB/s eta 0:00:01[K     |█                               | 51kB 26.0MB/s eta 0:00:01[K     |█▎                              | 61kB 28.5MB/s eta 0:00:01[K     |█▌                              | 71kB 29.9MB/s eta 0:00:01[K     |█▊                              | 81kB 27.2MB/s eta 0:00:01[K     |██                              | 92kB 24.3MB/s eta 0:00:01[K     |██▏                             | 102kB 23.1MB/s eta 0:00:01[K     |██▍                             | 112kB 23.1MB/s eta 0:00:01[K     |██▋                             | 

# 데이터 로드

In [None]:
!unzip Friends.zip

# train data 전처리

In [None]:
import pandas as pd
import json

In [None]:
with open('friends_train.json', encoding = 'utf-8', mode = 'r') as f:
  tempArray = json.load(f)

train = pd.DataFrame.from_dict(tempArray[0])

isFirst = True
for arr in tempArray:
  if isFirst:
    isFirst = False
    continue

  tempDf = pd.DataFrame.from_dict(arr)
  train = train.append(tempDf, ignore_index = True)

In [None]:
# 리뷰 문장 추출
train_sentences = train['utterance']
train_sentences[:10]

0    also I was the point person on my companys tr...
1                     You mustve had your hands full.
2                              That I did. That I did.
3        So lets talk a little bit about your duties.
4                               My duties?  All right.
5    Now youll be heading a whole division, so you...
6                                               I see.
7    But therell be perhaps 30 people under you so...
8                                        Good to know.
9                                We can go into detail
Name: utterance, dtype: object

In [None]:
# Electra의 입력 형식에 맞게 변환
train_sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in train_sentences]
train_sentences[:10]

['[CLS] also I was the point person on my company\x92s transition from the KL-5 to GR-6 system. [SEP]',
 '[CLS] You must\x92ve had your hands full. [SEP]',
 '[CLS] That I did. That I did. [SEP]',
 '[CLS] So let\x92s talk a little bit about your duties. [SEP]',
 '[CLS] My duties?  All right. [SEP]',
 '[CLS] Now you\x92ll be heading a whole division, so you\x92ll have a lot of duties. [SEP]',
 '[CLS] I see. [SEP]',
 '[CLS] But there\x92ll be perhaps 30 people under you so you can dump a certain amount on them. [SEP]',
 '[CLS] Good to know. [SEP]',
 '[CLS] We can go into detail [SEP]']

In [None]:
# 감정을 숫자로 변환
def emotion_labeling(emotion):
   return{'anger' : 0,'disgust':1,'fear':2, 'joy':3,'neutral':4,'non-neutral':5,'sadness':6,'surprise':7}[emotion]

emotion_labels = []

for e in train['emotion']:
   emotion_labels.append(emotion_labeling(e))

train['label'] = emotion_labels
train[:10]

Unnamed: 0,speaker,utterance,emotion,annotation,label
0,Chandler,also I was the point person on my companys tr...,neutral,4100000,4
1,The Interviewer,You mustve had your hands full.,neutral,5000000,4
2,Chandler,That I did. That I did.,neutral,5000000,4
3,The Interviewer,So lets talk a little bit about your duties.,neutral,5000000,4
4,Chandler,My duties? All right.,surprise,2000030,7
5,The Interviewer,"Now youll be heading a whole division, so you...",neutral,5000000,4
6,Chandler,I see.,neutral,3100010,4
7,The Interviewer,But therell be perhaps 30 people under you so...,neutral,4000100,4
8,Chandler,Good to know.,neutral,4100000,4
9,The Interviewer,We can go into detail,neutral,4000100,4


In [None]:
# label 추출
train_labels = train['label'].values
train_labels

array([4, 4, 4, ..., 7, 4, 5])

In [None]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import BertTokenizer,BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_tokenized_texts = [tokenizer.tokenize(sent) for sent in train_sentences]

print (train_sentences[0])
print (train_tokenized_texts[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…


[CLS] also I was the point person on my companys transition from the KL-5 to GR-6 system. [SEP]
['[CLS]', 'also', 'I', 'was', 'the', 'point', 'person', 'on', 'my', 'company', '##s', 'transition', 'from', 'the', 'K', '##L', '-', '5', 'to', 'GR', '-', '6', 'system', '.', '[SEP]']


In [None]:
from keras.preprocessing.sequence import pad_sequences

# 입력 토큰의 최대 시퀀스 길이
MAX_LEN = 128

# 토큰을 숫자 인덱스로 변환
train_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in train_tokenized_texts]

# 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

train_input_ids[0]


array([  101, 10379,   146, 10134, 10105, 12331, 15042, 10135, 15127,
       12100, 10107, 35959, 10188, 10105,   148, 11369,   118,   126,
       10114, 58787,   118,   127, 11787,   119,   102,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0]

In [None]:
# 어텐션 마스크 초기화
train_attention_masks = []

# 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
# 패딩 부분은 ELECTRA 모델에서 어텐션을 수행하지 않아 속도 향상
for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)

print(train_attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# pytorch 텐서로 변환
train_inputs = torch.tensor(train_input_ids)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_attention_masks)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# 배치 사이즈
batch_size = 32

# 파이토치의 DataLoader로 입력, 마스크, 라벨을 묶어 데이터 설정
# 학습시 배치 사이즈 만큼 데이터를 가져옴
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# 전처리- dev

In [None]:
with open('friends_dev.json', encoding = 'utf-8', mode = 'r') as f:
  tempArray = json.load(f)

dev = pd.DataFrame.from_dict(tempArray[0])

isFirst = True
for arr in tempArray:
  if isFirst:
    isFirst = False
    continue

  tempDf = pd.DataFrame.from_dict(arr)
  dev = dev.append(tempDf, ignore_index = True)

In [None]:
# 리뷰 문장 추출
dev_sentences = dev['utterance']
dev_sentences[:10]

0       Oh my God, hes lost it. Hes totally lost it.
1                                                What?
2    Or! Or, we could go to the bank, close our acc...
3                                     Youre a genius!
4              Aww, man, now we wont be bank buddies!
5                            Now, theres two reasons.
6                                                 Hey.
7                                                 Hey!
8    Ohh, you guys, remember that cute client I tol...
9                                              Where?!
Name: utterance, dtype: object

In [None]:
# Electra의 입력 형식에 맞게 변환
dev_sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in dev_sentences]
dev_sentences[:10]

['[CLS] Oh my God, he\x92s lost it. He\x92s totally lost it. [SEP]',
 '[CLS] What? [SEP]',
 '[CLS] Or! Or, we could go to the bank, close our accounts and cut them off at the source. [SEP]',
 '[CLS] You\x92re a genius! [SEP]',
 '[CLS] Aww, man, now we won\x92t be bank buddies! [SEP]',
 '[CLS] Now, there\x92s two reasons. [SEP]',
 '[CLS] Hey. [SEP]',
 '[CLS] Hey! [SEP]',
 '[CLS] Ohh, you guys, remember that cute client I told you about? I bit him. [SEP]',
 '[CLS] Where?! [SEP]']

In [None]:
# emotion으로 숫자로 변환
emotion_labels = []

for e in dev['emotion']:
   emotion_labels.append(emotion_labeling(e))

dev['label'] = emotion_labels
dev[:10]

Unnamed: 0,speaker,utterance,emotion,annotation,label
0,Phoebe,"Oh my God, hes lost it. Hes totally lost it.",non-neutral,2120,5
1,Monica,What?,surprise,1000130,7
2,Ross,"Or! Or, we could go to the bank, close our acc...",neutral,3000200,4
3,Chandler,Youre a genius!,joy,500000,3
4,Joey,"Aww, man, now we wont be bank buddies!",sadness,40100,6
5,Chandler,"Now, theres two reasons.",neutral,4000010,4
6,Phoebe,Hey.,neutral,3100010,4
7,All,Hey!,joy,1300010,3
8,Phoebe,"Ohh, you guys, remember that cute client I tol...",neutral,4100000,4
9,Rachel,Where?!,surprise,50,7


In [None]:
# 라벨 추출
dev_labels = dev['label'].values
dev_labels

array([5, 7, 4, ..., 6, 6, 6])

In [None]:
tokenizer =BertTokenizer.from_pretrained('bert-base-multilingual-cased')
dev_tokenized_texts = [tokenizer.tokenize(sent) for sent in dev_sentences]

print (dev_sentences[0])
print (dev_tokenized_texts[0])

[CLS] Oh my God, hes lost it. Hes totally lost it. [SEP]
['[CLS]', 'Oh', 'my', 'God', ',', 'he', '##s', 'lost', 'it', '.', 'He', '##s', 'totally', 'lost', 'it', '.', '[SEP]']


In [None]:
# 입력 토큰의 최대 시퀀스 길이
MAX_LEN = 128

# 토큰을 숫자 인덱스로 변환
dev_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in dev_tokenized_texts]

# 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
dev_input_ids = pad_sequences(dev_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

dev_input_ids[0]

array([   101,  22800,  15127,  14015,    117,  10261,  10107,  14172,
        10271,    119,  10357,  10107, 110240,  14172,  10271,    119,
          102,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [None]:
# 어텐션 마스크 초기화
dev_attention_masks = []

# 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
# 패딩 부분은 Electra 모델에서 어텐션을 수행하지 않아 속도 향상
for seq in dev_input_ids:
    seq_mask = [float(i>0) for i in seq]
    dev_attention_masks.append(seq_mask)

print(dev_attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# 데이터를 파이토치의 텐서로 변환

validation_inputs = torch.tensor(dev_input_ids)
validation_labels = torch.tensor(dev_labels)
validation_masks = torch.tensor(dev_attention_masks)				


print(validation_inputs[0])
print(validation_labels[0])
print(validation_masks[0])
		




tensor([   101,  22800,  15127,  14015,    117,  10261,  10107,  14172,  10271,
           119,  10357,  10107, 110240,  14172,  10271,    119,    102,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [None]:
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# 전처리 - test set

In [None]:
with open('friends_test.json', encoding = 'utf-8', mode = 'r') as f:
  tempArray = json.load(f)

test = pd.DataFrame.from_dict(tempArray[0])

isFirst = True
for arr in tempArray:
  if isFirst:
    isFirst = False
    continue

  tempDf = pd.DataFrame.from_dict(arr)
  test = test.append(tempDf, ignore_index = True)

In [None]:
emotion_labels = []

for e in test['emotion']:
   emotion_labels.append(emotion_labeling(e))

test['label'] = emotion_labels
test[:10]

Unnamed: 0,speaker,utterance,emotion,annotation,label
0,Mark,Why do all youre coffee mugs have numbers on ...,surprise,2000030,7
1,Rachel,Oh. Thats so Monica can keep track. That way ...,non-neutral,2100011,5
2,Rachel,Y'know what?,neutral,3000020,4
3,Ross,It didnt.,neutral,5000000,4
4,Frank,"Okay, so what you used to have with Rachel, is...",joy,1300010,3
5,Joey,"Now, wh-what, what is that like?",surprise,1000040,7
6,Frank,"Its so cool man, its so, its just cause be...",joy,2300000,3
7,Ross,"Yeah, yeah.",neutral,5000000,4
8,Joey,Why cant I find that?,non-neutral,20021,5
9,Ross,"Dont ask me, I had it and I blew it!",anger,302,0


In [None]:
# 리뷰 문장 추출
sentences = test['utterance']
sentences[:10]

0    Why do all youre coffee mugs have numbers on ...
1    Oh. Thats so Monica can keep track. That way ...
2                                         Y'know what?
3                                           It didnt.
4    Okay, so what you used to have with Rachel, is...
5                     Now, wh-what, what is that like?
6    Its so cool man, its so, its just cause be...
7                                          Yeah, yeah.
8                               Why cant I find that?
9                Dont ask me, I had it and I blew it!
Name: utterance, dtype: object

In [None]:
# ELECTRA의 입력 형식에 맞게 변환
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
sentences[:10]

['[CLS] Why do all you\x92re coffee mugs have numbers on the bottom? [SEP]',
 '[CLS] Oh. That\x92s so Monica can keep track. That way if one on them is missing, she can be like, \x91Where\x92s number 27?!\x92 [SEP]',
 "[CLS] Y'know what? [SEP]",
 '[CLS] It didn\x92t. [SEP]',
 '[CLS] Okay, so what you used to have with Rachel, is what I\x92ve got with Alice. [SEP]',
 '[CLS] Now, wh-what, what is that like? [SEP]',
 '[CLS] It\x92s so cool man, it\x92s so, it\x92s just \x91cause being with her is so much better than like not being with her. [SEP]',
 '[CLS] Yeah, yeah. [SEP]',
 '[CLS] Why can\x92t I find that? [SEP]',
 '[CLS] Don\x92t ask me, I had it and I blew it! [SEP]']

In [None]:
# 라벨 추출
labels = test['label'].values
labels

array([7, 5, 4, ..., 4, 4, 4])

In [None]:
tokenizer =BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print (sentences[0])
print (tokenized_texts[0])

[CLS] Why do all youre coffee mugs have numbers on the bottom? [SEP]
['[CLS]', 'Why', 'do', 'all', 'your', '##e', 'coffee', 'mu', '##gs', 'have', 'numbers', 'on', 'the', 'bottom', '?', '[SEP]']


In [None]:
# 입력 토큰의 최대 시퀀스 길이
MAX_LEN = 100

# 토큰을 숫자 인덱스로 변환
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

input_ids[0]

In [None]:
# 어텐션 마스크 초기화
attention_masks = []

# 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
# 데이터를 파이토치의 텐서로 변환
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

print(test_inputs[0])
print(test_labels[0])
print(test_masks[0])

In [None]:
# 배치 사이즈
batch_size = 32

# 파이토치의 DataLoader로 입력, 마스크, 라벨을 묶어 데이터 설정
# 학습시 배치 사이즈 만큼 데이터를 가져옴
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# 모델 생성

In [None]:
import tensorflow as tf

In [None]:
# GPU 디바이스 이름 구함
device_name = tf.test.gpu_device_name()

# GPU 디바이스 이름 검사
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# 디바이스 설정
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=8)
model.cuda()

In [None]:
from transformers import get_linear_schedule_with_warmup,AdamW

# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 4

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 학습률을 조금씩 감소시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
2e-5

2e-05

# 모델 학습

In [None]:
import numpy as np
import random
import time
import datetime

In [None]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# f1-score parameter
from sklearn.metrics import f1_score
f1_score_avg = []
trues = []
preds = []

In [None]:
# 재현을 위해 랜덤시드 고정
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]


        

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        #
        pred_flat = np.argmax(logits, axis=1).flatten()
        trues_flat = label_ids.flatten()
        trues.extend(trues_flat)
        preds.extend(pred_flat)

        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print(trues[:10])
    print(preds[:10])
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  f1 score macro: {0: 2f}".format(f1_score(y_true=trues,y_pred= preds, labels = [0,1,2,3,4,5,6,7], average='macro')))
    print("  f1 score micro: {0: 2f}".format(f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average='micro')))
    print("  f1 score weighted: {0: 2f}".format(f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average='weighted')))
    print(f"  f1 score none: {f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average=None)}")
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 1.33
  Training epcoh took: 0:04:34

Running Validation...
[5, 7, 4, 3, 6, 4, 4, 3, 4, 7]
[7, 7, 5, 3, 5, 4, 4, 3, 5, 7]
  Accuracy: 0.54
  f1 score macro:  0.282532
  f1 score micro:  0.544992
  f1 score weighted:  0.487760
  f1 score none: [0.08080808 0.         0.         0.50154799 0.73967684 0.22941176
 0.14925373 0.55955679]
  Validation took: 0:00:10

Training...

  Average training loss: 1.14
  Training epcoh took: 0:04:38

Running Validation...
[5, 7, 4, 3, 6, 4, 4, 3, 4, 7]
[7, 7, 5, 3, 5, 4, 4, 3, 5, 7]
  Accuracy: 0.56
  f1 score macro:  0.301831
  f1 score micro:  0.550934
  f1 score weighted:  0.501771
  f1 score none: [0.13592233 0.         0.         0.51960784 0.7438091  0.24423338
 0.20547945 0.56559767]
  Validation took: 0:00:10

Training...

  Average training loss: 1.01
  Training epcoh took: 0:04:38

Running Validation...
[5, 7, 4, 3, 6, 4, 4, 3, 4, 7]
[7, 7, 5, 3, 5, 4, 4, 3, 5, 7]
  Accuracy: 0.57
  f1 score macro:  0.3111

# 테스트셋 평가

In [None]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(test_dataloader):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

   #
    pred_flat = np.argmax(logits, axis=1).flatten()
    trues_flat = label_ids.flatten()
    trues.extend(trues_flat)
    preds.extend(pred_flat)
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("  f1 score macro: {0: 2f}".format(f1_score(y_true=trues,y_pred= preds, labels = [0,1,2,3,4,5,6,7], average='macro')))
print("  f1 score micro: {0: 2f}".format(f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average='micro')))
print("  f1 score weighted: {0: 2f}".format(f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average='weighted')))
print(f"  f1 score none: {f1_score(y_true=trues, y_pred= preds, labels = [0,1,2,3,4,5,6,7],average=None)}")
print("Test took: {:}".format(format_time(time.time() - t0)))


Accuracy: 0.60
  f1 score macro:  0.340905
  f1 score micro:  0.576378
  f1 score weighted:  0.546143
  f1 score none: [0.23931624 0.         0.         0.57464455 0.7634105  0.30752454
 0.27916667 0.5631769 ]
Test took: 0:00:22


# 새로운 문장 테스트

In [None]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 128

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    
    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [None]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
            
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)

    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [None]:
logits = test_sentences(['Nice job.'])

print(logits)
print(np.argmax(logits))

[[-0.869198   -0.95660347 -0.97209674  0.214642    3.1684265   1.7565498
  -0.58515584 -1.5049354 ]]
4


In [None]:
torch.save(model,)

TypeError: ignored