<a href="https://colab.research.google.com/github/subeenpark-io/sentiment-multiclass-classification-with-electra/blob/main/Electra_SentimentalAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers

[K     |████████████████████████████████| 3.3 MB 4.3 MB/s 
[K     |████████████████████████████████| 596 kB 73.0 MB/s 
[K     |████████████████████████████████| 61 kB 687 kB/s 
[K     |████████████████████████████████| 3.3 MB 52.3 MB/s 
[K     |████████████████████████████████| 895 kB 69.0 MB/s 
[?25h

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

from tqdm.notebook import tqdm

# Torch
import torch 
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


# Pre-trained ELECTRA 
from transformers import (ElectraTokenizerFast,
                          AutoTokenizer,
                          ElectraModel, 
                          ElectraForSequenceClassification,
                          AdamW)
                          
                          
# tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
# model_electra_pt = ElectraModel.from_pretrained("kykim/electra-kor-base")  


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# PATH_FOR_DATA = '/content/drive/MyDrive/SentimentClassifier/'
# path_train_corpus1 = f'{PATH_FOR_DATA}감성대화말뭉치(최종데이터)_Training.xlsx'

# path_validation = f'{PATH_FOR_DATA}감성대화말뭉치(최종데이터)_Validation.xlsx'

# train_corpus = pd.read_excel(path_train_corpus1)
# valid_corpus = pd.read_excel(path_validation)

# # train_corpus.head()
# # valid_corpus.head()
# # print(train_corpus['감정_대분류'].unique()) # ['기쁨' '불안' '당황' '슬픔' '분노' '상처' '불안 ' '기쁨 ']
# # print(valid_corpus['감정_대분류'].unique()) # ['분노' '슬픔' '불안' '당황' '상처' '기쁨']

# train_corpus = train_corpus[["사람문장1", "감정_대분류"]]
# train_corpus.rename(columns ={"사람문장1": "sentence", 
#                       "감정_대분류" : "emotion"}, inplace=True)
# miss_spelled = {
#     '기쁨 ' : '기쁨',
#     '불안 ' : '불안',
# }

# for key, value in miss_spelled.items():
#   train_corpus.loc[train_corpus1.emotion == key, 'emotion'] = value
#   valid_corpus = valid_corpus[["사람문장1", "감정_대분류"]]

# valid_corpus = valid_corpus[["사람문장1", "감정_대분류"]]
# valid_corpus.rename(columns ={"사람문장1": "sentence", 
#                       "감정_대분류" : "emotion"}, inplace=True)

# train_corpus.to_csv(f'{PATH_FOR_DATA}train.csv', index=False)
# valid_corpus.to_csv(f'{PATH_FOR_DATA}valid.csv', index=False)

Mounted at /content/drive


In [4]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')

PATH_FOR_DATA = '/content/drive/MyDrive/SentimentClassifier/'
MAX_LEN = 512
EPOCHS = 3
BATCH_SIZE = 128
LABELS = ['분노', '슬픔', '불안', '당황', '상처', '기쁨',]
LABELS_ID = {key: idx for (idx, key) in enumerate(LABELS)}

In [5]:
class SentimentDataset(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = pd.read_csv(csv_file)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    # print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 0:2].values
    text = row[0]
    y = LABELS_ID[row[1]]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [6]:
train_set = SentimentDataset(f"{PATH_FOR_DATA}train.csv")
valid_set = SentimentDataset(f"{PATH_FOR_DATA}valid.csv")

Downloading:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/620 [00:00<?, ?B/s]

In [7]:
model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=6).to(device)
# model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=6)

Downloading:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.weight', 'clas

In [8]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(valid_set, batch_size=16, shuffle=True)

In [9]:
import gc
gc.collect()
torch.cuda.empty_cache()

losses = []
accuracies = []

for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = torch.tensor(y_batch)
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/2555 [00:00<?, ?it/s]



Batch Loss: 178.32761347293854 Accuracy: tensor(0.2044, device='cuda:0')
Batch Loss: 347.574653506279 Accuracy: tensor(0.2606, device='cuda:0')
Batch Loss: 500.0118441581726 Accuracy: tensor(0.3077, device='cuda:0')
Batch Loss: 642.1530686616898 Accuracy: tensor(0.3447, device='cuda:0')
Batch Loss: 778.7958997488022 Accuracy: tensor(0.3696, device='cuda:0')
Batch Loss: 911.8902617692947 Accuracy: tensor(0.3904, device='cuda:0')
Batch Loss: 1040.0839197039604 Accuracy: tensor(0.4085, device='cuda:0')
Batch Loss: 1163.99873316288 Accuracy: tensor(0.4270, device='cuda:0')
Batch Loss: 1281.7187435626984 Accuracy: tensor(0.4443, device='cuda:0')
Batch Loss: 1397.317420721054 Accuracy: tensor(0.4581, device='cuda:0')
Batch Loss: 1508.814896941185 Accuracy: tensor(0.4712, device='cuda:0')
Batch Loss: 1616.6054911017418 Accuracy: tensor(0.4842, device='cuda:0')
Batch Loss: 1729.016072511673 Accuracy: tensor(0.4929, device='cuda:0')
Batch Loss: 1834.3400936722755 Accuracy: tensor(0.5028, device

  0%|          | 0/2555 [00:00<?, ?it/s]

Batch Loss: 93.5669729411602 Accuracy: tensor(0.6687, device='cuda:0')
Batch Loss: 186.8705930709839 Accuracy: tensor(0.6750, device='cuda:0')
Batch Loss: 280.0506112277508 Accuracy: tensor(0.6721, device='cuda:0')
Batch Loss: 373.48307770490646 Accuracy: tensor(0.6709, device='cuda:0')
Batch Loss: 465.97170689702034 Accuracy: tensor(0.6709, device='cuda:0')
Batch Loss: 557.8686669170856 Accuracy: tensor(0.6721, device='cuda:0')
Batch Loss: 649.7584983706474 Accuracy: tensor(0.6702, device='cuda:0')
Batch Loss: 739.4565274715424 Accuracy: tensor(0.6712, device='cuda:0')
Batch Loss: 829.0669512152672 Accuracy: tensor(0.6732, device='cuda:0')
Batch Loss: 922.6890593767166 Accuracy: tensor(0.6726, device='cuda:0')
Batch Loss: 1017.4815987348557 Accuracy: tensor(0.6719, device='cuda:0')
Batch Loss: 1107.3128602802753 Accuracy: tensor(0.6731, device='cuda:0')
Batch Loss: 1201.2247586846352 Accuracy: tensor(0.6726, device='cuda:0')
Batch Loss: 1291.6138689517975 Accuracy: tensor(0.6726, devi

  0%|          | 0/2555 [00:00<?, ?it/s]

Batch Loss: 84.67156001925468 Accuracy: tensor(0.7081, device='cuda:0')
Batch Loss: 167.41595801711082 Accuracy: tensor(0.7138, device='cuda:0')
Batch Loss: 251.20490497350693 Accuracy: tensor(0.7121, device='cuda:0')
Batch Loss: 337.58861792087555 Accuracy: tensor(0.7086, device='cuda:0')
Batch Loss: 424.49808728694916 Accuracy: tensor(0.7051, device='cuda:0')
Batch Loss: 506.05748999118805 Accuracy: tensor(0.7064, device='cuda:0')
Batch Loss: 593.4647657573223 Accuracy: tensor(0.7051, device='cuda:0')
Batch Loss: 676.390524238348 Accuracy: tensor(0.7069, device='cuda:0')
Batch Loss: 758.5921711623669 Accuracy: tensor(0.7069, device='cuda:0')
Batch Loss: 840.8072051107883 Accuracy: tensor(0.7071, device='cuda:0')
Batch Loss: 923.0242600440979 Accuracy: tensor(0.7073, device='cuda:0')
Batch Loss: 1003.6896657049656 Accuracy: tensor(0.7080, device='cuda:0')
Batch Loss: 1085.4709162414074 Accuracy: tensor(0.7083, device='cuda:0')
Batch Loss: 1168.8928917646408 Accuracy: tensor(0.7078, de

In [10]:
torch.save(model.state_dict(), "model.pt")

In [11]:
torch.save(model.state_dict(), f'{PATH_FOR_DATA}model.pt')

In [None]:
# LOAD FINE_TUNED WEIGHT && TEST DATA INSTANCE
model.load_state_dict(torch.load("model.pt"))
text, attention_mask, y = train_set[1]
model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

In [None]:
# FOR VALIDATION
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

In [20]:
tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")

In [None]:
input_text = "오늘 너무 우울해"
input = tokenizer(
        input_text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )


In [37]:
class SentimentClassifier():

  LABELS = ['분노', '슬픔', '불안', '당황', '상처', '기쁨',]
  ID_LABELS = {idx: key for (idx, key) in enumerate(LABELS)}
  
  def __init__(self):
    self.model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=6).to(device)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    model.load_state_dict(torch.load("model.pt"))
    # print(self.dataset.describe())

  def _get_prediction_input(self, text):
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

  def predict(self, text):
    input_ids, attention_mask = self._get_prediction_input(text)
    y_pred = model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    return SentimentClassifier.ID_LABELS[predicted.item()]


In [38]:
classifier = SentimentClassifier()

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.weight', 'clas

'슬픔'

In [47]:
classifier.predict("너무 기여워서 지구 뿌셨다")



'기쁨'