
# PyTorch를 활용한 미국 드라마 프렌즈 대사 감정 분석 모델
## 모델 : ELECTRA Base

## Dataset
EmotionLines 제공 프렌즈 대사 데이터셋<br>

## References
- https://github.com/jiwonny/nlp_emotion_classification/blob/master/friends_electra.ipynb (소스코드 참조)
- http://doraemon.iis.sinica.edu.tw/emotionlines/index.html
- https://huggingface.co/transformers/training.html
- http://wikidocs.net/book/2155 (데이터 구조 분석 부분 참조)

## 개발 환경
  - Google Colab (With GPU)<br>
  - 구글 드라이브 연동 후 본인 경로 설정 필수<br>


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install 'transformers', which has to be downloaded in the Google Colab environment
!pip install transformers



In [None]:
# Git 연동 작업하기

In [None]:
# Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 모델 생성에 필요한 각종 필수 도구 Import

import torch

from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import random
import time
import datetime
import json

In [None]:
# 판다스를 활용한 Json 데이터 프레임 생성 함수 설정

def jsonToDf(file_name):
  """Load a JSON file holding a top-level array into one flat DataFrame.

  Every element of the top-level array is converted with
  ``pd.DataFrame.from_dict`` and the resulting frames are stacked in file
  order.

  Args:
    file_name: Path of the UTF-8 encoded JSON file.

  Returns:
    A DataFrame containing the rows of all elements with a fresh
    ``0..n-1`` index. An empty top-level array yields an empty DataFrame.
  """
  with open(file_name, encoding='utf-8', mode='r') as file:
    json_array = json.load(file)

  # Nothing to stack: return an empty frame instead of failing on [0].
  if not json_array:
    return pd.DataFrame()

  # Build all partial frames first and concatenate once. DataFrame.append
  # was removed in pandas 2.0 and re-copied the accumulated frame per call.
  frames = [pd.DataFrame.from_dict(array) for array in json_array]
  return pd.concat(frames, ignore_index=True)

In [None]:
# Load the JSON splits (set the paths to match your own Drive)
train = jsonToDf('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/Freinds_eng/friends_train.json')
dev = jsonToDf('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/Freinds_eng/friends_dev.json')
test = jsonToDf('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/Freinds_eng/friends_test.json')

In [None]:
# Load the data used to evaluate the model (set the path to match your own Drive)
predict = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/Freinds_eng/en_data.csv', encoding = 'UTF-8')

In [None]:
# Get a feel for the training data
train.utterance
# Largest len() of any training utterance
max(len(l) for l in train.utterance)
# Mean len() over all training utterances
sum(map(len, train.utterance))/len(train.utterance)

39.68478363791308

In [None]:
# Inspect the utterance column of the training data
train.utterance

0        also I was the point person on my companys tr...
1                         You mustve had your hands full.
2                                  That I did. That I did.
3            So lets talk a little bit about your duties.
4                                   My duties?  All right.
                               ...                        
10556                                           You or me?
10557    I got it. Uh, Joey, women don't have Adam's ap...
10558                 You guys are messing with me, right?
10559                                                Yeah.
10560    That was a good one. For a second there, I was...
Name: utterance, Length: 10561, dtype: object

In [None]:
# Helper for analysing the length distribution of the training data
def below_threshold_len(max_len, nested_list):
  """Print the percentage of items whose ``len`` is at most *max_len*.

  Args:
    max_len: Inclusive length limit.
    nested_list: Sized iterable of sized items (e.g. strings).
  """
  within_limit = sum(1 for item in nested_list if len(item) <= max_len)
  share = (within_limit / len(nested_list))*100
  print('전체 대본 중 길이가 %s 이하인 데이터셋의 비율: %s'%(max_len, share))

In [None]:
# Run the length-coverage analysis on the training utterances
MAX_LEN = 100
below_threshold_len(MAX_LEN, train.utterance)

전체 대본 중 길이가 100 이하인 데이터셋의 비율: 94.59331502698608


In [None]:
# Check the size of every dataset
print(train.shape)
print(dev.shape)
print(test.shape)
print(predict.shape)

(10561, 4)
(1178, 4)
(2764, 4)
(1623, 5)


# Train, Dev, Test, Predict 데이터 전처리 과정

In [None]:
# Preprocessing helpers for the train / dev / test splits.
# MAX_LEN stays at 100, the value that covered roughly 95% of the training
# utterances in the length analysis above.
# NOTE(review): that analysis applied len() to the raw utterance strings,
# while MAX_LEN below caps the number of token ids per sequence — confirm
# the cutoff is still the intended one.

# Self-assignment: keeps the current value (NameError if it was never set).
MAX_LEN = MAX_LEN

def getInputsAndLabels(dataset, label_encoder=None):
  """Encode a labelled split as padded token ids, integer labels and masks.

  Args:
    dataset: DataFrame with an ``utterance`` and an ``emotion`` column.
      The frame itself is not modified.
    label_encoder: Optional, already fitted ``LabelEncoder``. Passing the
      encoder fitted on the training split keeps the label ids identical
      across splits even when a split is missing a class. When omitted, a
      new encoder is fitted on this split's own labels (the previous
      behaviour).

  Returns:
    A tuple ``(input_ids, labels, attention_masks)``: the array produced
    by ``pad_sequences`` with ``MAX_LEN`` columns, the encoded labels, and
    a list of per-sequence float masks (1.0 for real tokens, 0.0 for
    padding).
  """
  utterances = dataset['utterance'].str.lower()

  emotions = dataset['emotion'].values
  if label_encoder is None:
    label_encoder = LabelEncoder()
    label_encoder.fit(emotions)
  labels = label_encoder.transform(emotions)

  tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

  # Truncate the utterance tokens *before* adding the special tokens.
  # Truncating the finished sequence cut off the trailing [SEP] whenever
  # an utterance was longer than MAX_LEN.
  max_body_len = max(MAX_LEN - 2, 0)
  input_ids = []
  for utterance in utterances:
    tokens = tokenizer.tokenize(str(utterance))[:max_body_len]
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  # Padding is added as id 0, so every positive id marks a real token.
  attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

  return input_ids, labels, attention_masks

In [None]:
# Preprocessing helper for the unlabelled prediction data
def getInputsFromTest(dataset):
  """Encode an unlabelled split as padded token ids and attention masks.

  Args:
    dataset: DataFrame with an ``utterance`` column. The frame itself is
      not modified.

  Returns:
    A tuple ``(input_ids, attention_masks)``: the array produced by
    ``pad_sequences`` with ``MAX_LEN`` columns, and a list of per-sequence
    float masks (1.0 for real tokens, 0.0 for padding).
  """
  utterances = dataset['utterance'].str.lower()

  tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

  # Truncate the utterance tokens *before* adding the special tokens.
  # Truncating the finished sequence cut off the trailing [SEP] whenever
  # an utterance was longer than MAX_LEN.
  max_body_len = max(MAX_LEN - 2, 0)
  input_ids = []
  for utterance in utterances:
    tokens = tokenizer.tokenize(str(utterance))[:max_body_len]
    tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  # Padding is added as id 0, so every positive id marks a real token.
  attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

  return input_ids, attention_masks

In [None]:
def getIndex(dataset):
  """Return the values of the ``id`` column of *dataset* as a torch tensor."""
  return torch.tensor(dataset.id.tolist())

In [None]:
# Apply the matching preprocessing function to each of train, dev, test and predict

train_inputs, train_labels, train_masks = getInputsAndLabels(train)
dev_inputs, dev_labels, dev_masks = getInputsAndLabels(dev)
test_inputs, test_labels, test_masks = getInputsAndLabels(test)
predict_inputs, predict_masks = getInputsFromTest(predict)   

In [None]:
# Convert everything to PyTorch tensors
train_inputs, train_labels, train_masks = map(
    torch.tensor, (train_inputs, train_labels, train_masks))

dev_inputs, dev_labels, dev_masks = map(
    torch.tensor, (dev_inputs, dev_labels, dev_masks))

test_inputs, test_labels, test_masks = map(
    torch.tensor, (test_inputs, test_labels, test_masks))

# The prediction split has ids instead of labels
predict_index = getIndex(predict)
predict_inputs, predict_masks = map(
    torch.tensor, (predict_inputs, predict_masks))

In [None]:
# Peek at the encoded input tensors
train_inputs
dev_inputs
test_inputs

tensor([[  101,  2339,  2079,  ...,     0,     0,     0],
        [  101,  2821,  1012,  ...,     0,     0,     0],
        [  101,  1061,  1005,  ...,     0,     0,     0],
        ...,
        [  101,  3398,  2008,  ...,     0,     0,     0],
        [  101, 13814,  2054,  ...,     0,     0,     0],
        [  101,  2035,  2157,  ...,     0,     0,     0]])

In [None]:
# Wrap the tensors in PyTorch DataLoaders
# Each iteration yields one batch of `batch_size` examples

batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation splits are read in their stored order
dev_data = TensorDataset(dev_inputs, dev_masks, dev_labels)
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Inference gains nothing from shuffling, so the prediction split is read
# sequentially like dev and test. Each row carries its id as first tensor.
predict_data = TensorDataset(predict_index, predict_inputs, predict_masks)
predict_sampler = SequentialSampler(predict_data)
predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=batch_size)

In [None]:
# Select the compute device (checks whether a GPU is active)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
# Load the pretrained model with a sequence-classification head
# NOTE(review): num_labels=8 sizes the classifier output — presumably the
# number of distinct emotion labels; verify against the dataset.
model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=8)
# Move the model to the device chosen by the device-selection cell instead of
# calling .cuda() unconditionally, which fails on a CPU-only runtime.
model.to(device)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.d

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

# Train & Validation

In [None]:
# Optimizer setup
epochs = 5

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # learning rate
    eps=1e-8,  # guards against division by zero
)

# Total training steps = batches per epoch * epochs
total_steps = epochs * len(train_dataloader)

# Scheduler that lowers the learning rate step by step (no warm-up steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
from sklearn.metrics import f1_score

# Accuracy helper
def flat_accuracy(preds, labels):
    """Return the fraction of rows in *preds* whose arg-max equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids; it is flattened before comparing.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

In [None]:
# Elapsed-time formatter
def format_time(elapsed):
    """Render a duration given in seconds as text such as ``0:01:18``.

    The value is rounded to whole seconds and formatted the way
    ``str(datetime.timedelta)`` does.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
# Fix the random seeds for reproducibility
# NOTE(review): the model and the data loaders are created in earlier cells,
# i.e. before these seeds are set — confirm the run is reproducible end to end.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Start from zeroed gradients
model.zero_grad()

# One training pass and one validation pass per epoch
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_loss = 0
    model.train()

    # Pull one batch at a time from the data loader
    for step, batch in enumerate(train_dataloader):
        # Progress report every 300 batches
        if step % 300 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Move the batch to the device; every tensor is cast to long
        batch = tuple(t.to(device).long() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is used as the loss
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    # Mean loss over the batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    eval_correct, nb_eval_examples = 0, 0

    # Pull one batch at a time from the data loader
    for batch in dev_dataloader:
        batch = tuple(t.to(device).long() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels, the first output is used as the logits
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Count hits per example. Averaging per-batch accuracies would give
        # the (usually smaller) last batch the same weight as a full one.
        eval_correct += np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten())
        nb_eval_examples += len(label_ids)

    eval_accuracy = eval_correct / nb_eval_examples
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

# Reported once, after the last epoch (it used to print after every epoch)
print("")
print("Training complete!")

   




Training...
  Batch   300  of    331.    Elapsed: 0:01:11.

  Average training loss: 1.38
  Training epcoh took: 0:01:18

Running Validation...
  Accuracy: 0.55
  Validation took: 0:00:02

Training complete!

Training...
  Batch   300  of    331.    Elapsed: 0:01:11.

  Average training loss: 1.11
  Training epcoh took: 0:01:18

Running Validation...
  Accuracy: 0.57
  Validation took: 0:00:02

Training complete!

Training...
  Batch   300  of    331.    Elapsed: 0:01:11.

  Average training loss: 0.97
  Training epcoh took: 0:01:18

Running Validation...
  Accuracy: 0.59
  Validation took: 0:00:02

Training complete!

Training...
  Batch   300  of    331.    Elapsed: 0:01:11.

  Average training loss: 0.88
  Training epcoh took: 0:01:18

Running Validation...
  Accuracy: 0.58
  Validation took: 0:00:02

Training complete!


# Testing

In [None]:
    # ========================================
    #               Testing
    # ========================================

    print("")
    print("Testing...")

    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy, eval_f1 = 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in test_dataloader:
        batch = tuple(t.to(device).long() for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():     
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
     
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        #eval_f1 += tmp_eval_f1
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Testing took: {:}".format(format_time(time.time() - t0)))

print("")
print("Testing complete!")

print("All Processes are completed")


Testing...
  Accuracy: 0.62
  Testing took: 0:00:06

Testing complete!
All Processes are completed


# Predict


In [None]:
tmp_predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=1)
predict_result = predict.copy(deep = True)
predict_result = predict_result.drop(columns = ['i_dialog', 'i_utterance', 'speaker'])
predict_result['Predicted'] = 'default'

# Fit a label encoder on the training labels so predicted class indices
# can be mapped back to emotion names
encoder = LabelEncoder()
encoder.fit(train['emotion'].values)

# Make sure dropout is disabled even if this cell is run on its own
model.eval()

# Predicted emotion name per utterance id
predicted_by_id = {}

for batch in tmp_predict_dataloader:
    # Move the batch to the device
    batch = tuple(t.to(device).long() for t in batch)

    # Unpack the batch
    b_index, b_input_ids, b_input_mask = batch

    # No gradients are needed for inference
    with torch.no_grad():
        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # The first output is used as the logits; move them to the CPU
    logits = outputs[0].detach().cpu().numpy()

    # One arg-max per row, so the loop works for any batch size
    class_ids = np.argmax(logits, axis=1)
    for idx, class_id in zip(b_index.tolist(), class_ids):
        predicted_by_id[idx] = encoder.classes_[class_id]

# Assign in one step, matching on the id column. The previous chained
# assignment (predict_result['Predicted'][idx] = ...) raised pandas'
# SettingWithCopyWarning and used the id as if it were an index label.
predict_result['Predicted'] = predict_result['id'].map(predicted_by_id).fillna('default')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Look at the last five rows of the prediction results
predict_result.tail()

Unnamed: 0,id,utterance,Predicted
1618,1618,Nooo.,non-neutral
1619,1619,"Hi, Kate!",joy
1620,1620,"Hi, Lauren.",neutral
1621,1621,"Hi, Lauren.",neutral
1622,1622,"Hi, pig!",joy


# 캐글 업로드 자료 생성

In [None]:
# Build the CSV for the Kaggle upload from the prediction frame
# (set the output path to match your own Drive)

outfile_df = pd.DataFrame({
    'Id': predict_result['id'],
    'Expected': predict_result['Predicted'],
})

# Written without the index column
outfile_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/Freinds_eng/12241230.csv', index=False)


In [None]:
# Preview the first four rows of the submission frame
outfile_df.head(4)

Unnamed: 0,Id,Expected
0,0,neutral
1,1,surprise
2,2,anger
3,3,neutral
