In [1]:
!pip install -q transformers

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

from tqdm.notebook import tqdm

# Torch
import torch 
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


# Pre-trained ELECTRA 
from transformers import (ElectraTokenizerFast,
                          AutoTokenizer,
                          ElectraModel, 
                          ElectraForSequenceClassification,
                          AdamW)
                          
                          
# tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
# model_electra_pt = ElectraModel.from_pretrained("kykim/electra-kor-base")  


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# PATH_FOR_DATA = '/content/drive/MyDrive/SentimentClassifier/'
# path_train_corpus1 = f'{PATH_FOR_DATA}감성대화말뭉치(최종데이터)_Training.xlsx'

# path_validation = f'{PATH_FOR_DATA}감성대화말뭉치(최종데이터)_Validation.xlsx'

# train_corpus = pd.read_excel(path_train_corpus1)
# valid_corpus = pd.read_excel(path_validation)

# # train_corpus.head()
# # valid_corpus.head()
# # print(train_corpus['감정_대분류'].unique()) # ['기쁨' '불안' '당황' '슬픔' '분노' '상처' '불안 ' '기쁨 ']
# # print(valid_corpus['감정_대분류'].unique()) # ['분노' '슬픔' '불안' '당황' '상처' '기쁨']

# train_corpus = train_corpus[["사람문장1", "감정_대분류"]]
# train_corpus.rename(columns ={"사람문장1": "sentence", 
#                       "감정_대분류" : "emotion"}, inplace=True)
# miss_spelled = {
#     '기쁨 ' : '기쁨',
#     '불안 ' : '불안',
# }

# for key, value in miss_spelled.items():
#   train_corpus.loc[train_corpus1.emotion == key, 'emotion'] = value
#   valid_corpus = valid_corpus[["사람문장1", "감정_대분류"]]

# valid_corpus = valid_corpus[["사람문장1", "감정_대분류"]]
# valid_corpus.rename(columns ={"사람문장1": "sentence", 
#                       "감정_대분류" : "emotion"}, inplace=True)

# train_corpus.to_csv(f'{PATH_FOR_DATA}train.csv', index=False)
# valid_corpus.to_csv(f'{PATH_FOR_DATA}valid.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')

PATH_FOR_DATA = '/content/drive/MyDrive/SentimentClassifier/'
MAX_LEN = 512
EPOCHS = 3
BATCH_SIZE = 64
LABELS = ['부정','긍정','중립']
LABELS_ID = {key: idx for (idx, key) in enumerate(LABELS)}

In [5]:
class SentimentDataset(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = pd.read_csv(csv_file)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    # print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.loc[idx, ['title','emotion']].values
    text = row[0]
    y = LABELS_ID[row[1]]


    

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [6]:

train_set = SentimentDataset('/content/drive/MyDrive/traindata1.csv')
valid_set = SentimentDataset('/content/drive/MyDrive/testdata1.csv')





In [7]:
model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=3).to(device)
# model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=6)

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.dense.weight', 'classif

In [8]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(valid_set, batch_size=16, shuffle=True)



In [9]:
import gc
gc.collect()
torch.cuda.empty_cache()

losses = []
accuracies = []

for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    
    optimizer.zero_grad()
    y_batch = torch.tensor(y_batch)
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/513 [00:00<?, ?it/s]



Batch Loss: 106.14984881877899 Accuracy: tensor(0.4381, device='cuda:0')
Batch Loss: 195.44256174564362 Accuracy: tensor(0.5281, device='cuda:0')
Batch Loss: 269.15193685889244 Accuracy: tensor(0.5756, device='cuda:0')
Batch Loss: 337.97885832190514 Accuracy: tensor(0.6069, device='cuda:0')
Batch Loss: 405.2373937666416 Accuracy: tensor(0.6268, device='cuda:0')
Train Loss: 414.29849684238434 Accuracy: tensor(0.6284, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 56.706321984529495 Accuracy: tensor(0.7669, device='cuda:0')
Batch Loss: 110.98971238732338 Accuracy: tensor(0.7703, device='cuda:0')
Batch Loss: 164.68729610741138 Accuracy: tensor(0.7740, device='cuda:0')
Batch Loss: 216.64213556051254 Accuracy: tensor(0.7794, device='cuda:0')
Batch Loss: 268.45732520520687 Accuracy: tensor(0.7799, device='cuda:0')
Train Loss: 276.448626473546 Accuracy: tensor(0.7788, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 42.577916860580444 Accuracy: tensor(0.8388, device='cuda:0')
Batch Loss: 85.02100524306297 Accuracy: tensor(0.8391, device='cuda:0')
Batch Loss: 125.86768864840269 Accuracy: tensor(0.8388, device='cuda:0')
Batch Loss: 168.19613025337458 Accuracy: tensor(0.8370, device='cuda:0')
Batch Loss: 210.83931086212397 Accuracy: tensor(0.8365, device='cuda:0')
Train Loss: 216.5684531405568 Accuracy: tensor(0.8360, device='cuda:0')


In [16]:
torch.save(model.state_dict(), "model.pt")


In [17]:
torch.save(model.state_dict(), '/content/model.pt')

In [18]:
# LOAD FINE_TUNED WEIGHT && TEST DATA INSTANCE
model.load_state_dict(torch.load("model.pt"))
text, attention_mask, y = train_set[1]
model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

RuntimeError: ignored

In [15]:
# FOR VALIDATION
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/129 [00:00<?, ?it/s]



RuntimeError: ignored

In [None]:
tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")

In [None]:
input_text = "오늘 너무 우울해"
input = tokenizer(
        input_text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )


In [None]:
class SentimentClassifier():

  LABELS = ['분노', '슬픔', '불안', '당황', '상처', '기쁨',]
  ID_LABELS = {idx: key for (idx, key) in enumerate(LABELS)}
  
  def __init__(self):
    self.model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=6).to(device)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    model.load_state_dict(torch.load("model.pt"))
    # print(self.dataset.describe())

  def _get_prediction_input(self, text):
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

  def predict(self, text):
    input_ids, attention_mask = self._get_prediction_input(text)
    y_pred = model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    return SentimentClassifier.ID_LABELS[predicted.item()]


In [None]:
classifier = SentimentClassifier()

In [None]:
classifier.predict("너무 기여워서 지구 뿌셨다")