In [1]:
! pip install -q transformers

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

from tqdm.notebook import tqdm

# Torch
import torch 
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


# Pre-trained ELECTRA 
from transformers import (ElectraTokenizerFast,
                          AutoTokenizer,
                          ElectraModel, 
                          ElectraForSequenceClassification,
                          AdamW)
                          
                          
# tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
# model_electra_pt = ElectraModel.from_pretrained("kykim/electra-kor-base")  


In [3]:
! pip install torchmetrics
from torchmetrics import F1Score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')

In [5]:
MAX_LEN = 128
EPOCHS = 5
BATCH_SIZE = 32
#LABELS = ['부정', '긍정', '중립']
#LABELS_ID = {key : idx for (idx, key) in enumerate(LABELS)}

LABELS_ID = { '부정' : 0, '긍정' : 1, '중립' : 2 }

In [6]:
LABELS_ID['부정']

0

In [7]:
a = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/traindata1.csv', encoding="utf-8-sig", index_col=0)
a.iloc[10][['title','emotion']].values

array(['‘외형성장·수익개선’ 두 토끼 잡았다', '긍정'], dtype=object)

In [8]:
class SentimentDataset(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = pd.read_csv(csv_file)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    # print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx][['title','emotion']].values
    text = row[0]
    y = LABELS_ID[row[1]]

    inputs = self.tokenizer(
        text, 
        return_tensors = 'pt',
        truncation = True,
        max_length = MAX_LEN,
        pad_to_max_length = True,
        add_special_tokens = True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
train_set = SentimentDataset('/content/drive/MyDrive/Colab Notebooks/data/traindata1.csv')
valid_set = SentimentDataset('/content/drive/MyDrive/Colab Notebooks/data/testdata1.csv')

In [11]:
model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type = "multi_label_classification", num_labels = 3).to(device)

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.dense.bias', 'classifie

In [12]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(valid_set, batch_size=16, shuffle=True)



In [31]:
from torchmetrics import F1Score

def f1_scoring(X,L):
    max_vals, max_indices = torch.max(X, 1)
    L.extend(max_indices)


In [13]:
import gc
gc.collect()
torch.cuda.empty_cache()

losses = []
accuracies = []

for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0
  #f1Score = 0.0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = torch.tensor(y_batch)
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/513 [00:00<?, ?it/s]



Batch Loss: 104.88424676656723 Accuracy: tensor(0.4844, device='cuda:0')
Batch Loss: 192.28828090429306 Accuracy: tensor(0.5475, device='cuda:0')
Batch Loss: 267.06745475530624 Accuracy: tensor(0.5804, device='cuda:0')
Batch Loss: 336.8554077446461 Accuracy: tensor(0.6070, device='cuda:0')
Batch Loss: 402.1844075322151 Accuracy: tensor(0.6294, device='cuda:0')
Train Loss: 411.30363246798515 Accuracy: tensor(0.6304, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 58.49995794892311 Accuracy: tensor(0.7662, device='cuda:0')
Batch Loss: 115.18360197544098 Accuracy: tensor(0.7616, device='cuda:0')
Batch Loss: 173.342421323061 Accuracy: tensor(0.7598, device='cuda:0')
Batch Loss: 231.37995873391628 Accuracy: tensor(0.7598, device='cuda:0')
Batch Loss: 287.11097125709057 Accuracy: tensor(0.7608, device='cuda:0')
Train Loss: 295.0260239094496 Accuracy: tensor(0.7599, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 44.42800298333168 Accuracy: tensor(0.8319, device='cuda:0')
Batch Loss: 91.71183650940657 Accuracy: tensor(0.8153, device='cuda:0')
Batch Loss: 137.80729641765356 Accuracy: tensor(0.8140, device='cuda:0')
Batch Loss: 180.8282382786274 Accuracy: tensor(0.8164, device='cuda:0')
Batch Loss: 229.17443069815636 Accuracy: tensor(0.8129, device='cuda:0')
Train Loss: 235.25463190674782 Accuracy: tensor(0.8136, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 40.40763345360756 Accuracy: tensor(0.8319, device='cuda:0')
Batch Loss: 76.50962530821562 Accuracy: tensor(0.8487, device='cuda:0')
Batch Loss: 113.07654888182878 Accuracy: tensor(0.8521, device='cuda:0')
Batch Loss: 152.18627620488405 Accuracy: tensor(0.8516, device='cuda:0')
Batch Loss: 189.77506756037474 Accuracy: tensor(0.8519, device='cuda:0')
Train Loss: 194.22945216298103 Accuracy: tensor(0.8520, device='cuda:0')


  0%|          | 0/513 [00:00<?, ?it/s]

Batch Loss: 31.63361644744873 Accuracy: tensor(0.8856, device='cuda:0')
Batch Loss: 59.161829352378845 Accuracy: tensor(0.8931, device='cuda:0')
Batch Loss: 87.48370904102921 Accuracy: tensor(0.8950, device='cuda:0')
Batch Loss: 117.87719410285354 Accuracy: tensor(0.8920, device='cuda:0')
Batch Loss: 146.39353220909834 Accuracy: tensor(0.8928, device='cuda:0')
Train Loss: 150.2007869631052 Accuracy: tensor(0.8928, device='cuda:0')


In [14]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/model/0725_koelectra_batch32_lr1e-5.pt")

In [15]:
# LOAD FINE_TUNED WEIGHT && TEST DATA INSTANCE
model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model/0725_koelectra_batch32_lr1e-5.pt"))
text, attention_mask, y = train_set[1]
model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))



SequenceClassifierOutput([('logits',
                           tensor([[ 3.1258, -2.1579, -1.0010]], device='cuda:0',
                                  grad_fn=<AddmmBackward0>))])

In [16]:
# FOR VALIDATION
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

  0%|          | 0/129 [00:00<?, ?it/s]



Accuracy: tensor(0.7420, device='cuda:0')


In [17]:
tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")

In [18]:
input_text = "오늘 너무 우울해"
input = tokenizer(
        input_text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )




In [27]:
class SentimentClassifier():

  LABELS = ['부정', '긍정', '중립',]
  ID_LABELS = {idx: key for (idx, key) in enumerate(LABELS)}
  
  def __init__(self):
    self.model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=3).to(device)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model/0725_koelectra_batch32_lr1e-5.pt"))
    # print(self.dataset.describe())

  def _get_prediction_input(self, text):
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

  def predict(self, text):
    input_ids, attention_mask = self._get_prediction_input(text)
    y_pred = model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    return SentimentClassifier.ID_LABELS[predicted.item()]


In [28]:
classifier = SentimentClassifier()

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.dense.bias', 'classifie

In [40]:
classifier.predict("금강산 관광 재개")



'긍정'