In [1]:
! pip install -q transformers

[K     |████████████████████████████████| 4.4 MB 34.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 58.0 MB/s 
[K     |████████████████████████████████| 101 kB 12.6 MB/s 
[K     |████████████████████████████████| 596 kB 56.1 MB/s 
[?25h

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None

from tqdm.notebook import tqdm

# Torch
import torch 
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim


# Pre-trained ELECTRA 
from transformers import (ElectraTokenizerFast,
                          AutoTokenizer,
                          ElectraModel, 
                          ElectraForSequenceClassification,
                          AdamW)
                          
                          
# tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
# model_electra_pt = ElectraModel.from_pretrained("kykim/electra-kor-base")  


In [3]:
! pip install torchmetrics
from torchmetrics import F1Score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.3-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 26.9 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.3


In [4]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda')

In [5]:
MAX_LEN = 128
EPOCHS = 5
BATCH_SIZE = 32
#LABELS = ['부정', '긍정', '중립']
#LABELS_ID = {key : idx for (idx, key) in enumerate(LABELS)}

LABELS_ID = { '부정' : 0, '긍정' : 1, '중립' : 2 }

In [6]:
LABELS_ID['부정']

0

In [7]:
a = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/traindata1.csv', encoding="utf-8-sig", index_col=0)
a.iloc[10][['title','emotion']].values

array(['‘외형성장·수익개선’ 두 토끼 잡았다', '긍정'], dtype=object)

In [8]:
class SentimentDataset(Dataset):
  
  def __init__(self, csv_file):
    self.dataset = pd.read_csv(csv_file)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    # print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx][['title','emotion']].values
    text = row[0]
    y = LABELS_ID[row[1]]

    inputs = self.tokenizer(
        text, 
        return_tensors = 'pt',
        truncation = True,
        max_length = MAX_LEN,
        pad_to_max_length = True,
        add_special_tokens = True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
train_set = SentimentDataset('/content/drive/MyDrive/Colab Notebooks/traain.csv')
#valid_set = SentimentDataset('/content/drive/MyDrive/Colab Notebooks/valid.csv')
#test_set = SentimentDataset('/content/drive/MyDrive/Colab Notebooks/test.csv')

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/620 [00:00<?, ?B/s]

In [11]:
#print(len(valid_set))

820


In [12]:
model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type = "multi_label_classification", num_labels = 3).to(device)

Downloading:   0%|          | 0.00/451M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.bias', 'classi

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
#test_loader = DataLoader(test_set, batch_size=16, shuffle=True)
#valid_loader=DataLoader(valid_set, batch_size=16, shuffle=True)



In [14]:
from torchmetrics import F1Score

def f1_scoring(X,L):
    max_vals, max_indices = torch.max(X, 1)
    L.extend(max_indices)


In [15]:
import gc
gc.collect()
torch.cuda.empty_cache()

losses = []
accuracies = []

for i in range(EPOCHS):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0
  #f1Score = 0.0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = torch.tensor(y_batch)
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

  0%|          | 0/462 [00:00<?, ?it/s]



Batch Loss: 106.84426534175873 Accuracy: tensor(0.4375, device='cuda:0')
Batch Loss: 194.9212031364441 Accuracy: tensor(0.5178, device='cuda:0')
Batch Loss: 269.805168569088 Accuracy: tensor(0.5654, device='cuda:0')
Batch Loss: 334.39581257104874 Accuracy: tensor(0.6055, device='cuda:0')
Train Loss: 373.9809679389 Accuracy: tensor(0.6221, device='cuda:0')


  0%|          | 0/462 [00:00<?, ?it/s]

Batch Loss: 59.38121995329857 Accuracy: tensor(0.7487, device='cuda:0')
Batch Loss: 112.87760072946548 Accuracy: tensor(0.7625, device='cuda:0')
Batch Loss: 165.2391601651907 Accuracy: tensor(0.7721, device='cuda:0')
Batch Loss: 219.0781282633543 Accuracy: tensor(0.7727, device='cuda:0')
Train Loss: 251.24174420535564 Accuracy: tensor(0.7733, device='cuda:0')


  0%|          | 0/462 [00:00<?, ?it/s]

Batch Loss: 42.64637042582035 Accuracy: tensor(0.8388, device='cuda:0')
Batch Loss: 83.42259155213833 Accuracy: tensor(0.8391, device='cuda:0')
Batch Loss: 124.87752661854029 Accuracy: tensor(0.8385, device='cuda:0')
Batch Loss: 166.94845262914896 Accuracy: tensor(0.8403, device='cuda:0')
Train Loss: 191.58180351555347 Accuracy: tensor(0.8422, device='cuda:0')


  0%|          | 0/462 [00:00<?, ?it/s]

Batch Loss: 29.352949522435665 Accuracy: tensor(0.9044, device='cuda:0')
Batch Loss: 62.35759936645627 Accuracy: tensor(0.8931, device='cuda:0')
Batch Loss: 96.31065533682704 Accuracy: tensor(0.8879, device='cuda:0')
Batch Loss: 127.82032894715667 Accuracy: tensor(0.8886, device='cuda:0')
Train Loss: 147.14213216677308 Accuracy: tensor(0.8884, device='cuda:0')


  0%|          | 0/462 [00:00<?, ?it/s]

Batch Loss: 21.92127699404955 Accuracy: tensor(0.9337, device='cuda:0')
Batch Loss: 43.93967177718878 Accuracy: tensor(0.9322, device='cuda:0')
Batch Loss: 65.51488266699016 Accuracy: tensor(0.9298, device='cuda:0')
Batch Loss: 88.4102582372725 Accuracy: tensor(0.9300, device='cuda:0')
Train Loss: 100.43483321741223 Accuracy: tensor(0.9311, device='cuda:0')


In [16]:
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/0725_koelectra_batch32_lr1e-5.pt")

In [17]:
# LOAD FINE_TUNED WEIGHT && TEST DATA INSTANCE
model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/0725_koelectra_batch32_lr1e-5.pt"))
text, attention_mask, y = train_set[1]
model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))



SequenceClassifierOutput([('logits',
                           tensor([[ 0.8740, -2.9331,  1.8652]], device='cuda:0',
                                  grad_fn=<AddmmBackward0>))])

In [18]:
# FOR VALIDATION
from torchmetrics import F1Score

예측값=[]
실제값=[]
model.eval()
pred=[]
target=[]
test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(valid_loader):
  
  y_batch = y_batch.to(device)
  
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  
  _, predicted = torch.max(y_pred, 1)
  target.append(y_batch)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)
  pred.append(predicted)

print(pred)
print(target)

for i in pred:
  예측값.extend(i)

for j in target:
  실제값.extend(j)
예측값 = torch.tensor(예측값)
실제값 = torch.tensor(실제값)

print(예측값)
print(실제값)
f1 = F1Score(num_classes=3)
f1score=f1(예측값, 실제값)


print("Accuracy:", test_correct.float() / test_total)
print("f1score {} : ".format(f1score))




  0%|          | 0/52 [00:00<?, ?it/s]



[tensor([2, 2, 0, 1, 1, 0, 0, 0, 0, 1, 1, 2, 2, 0, 1, 0], device='cuda:0'), tensor([0, 0, 2, 1, 0, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 2], device='cuda:0'), tensor([2, 2, 0, 1, 2, 2, 0, 1, 0, 2, 0, 1, 0, 0, 2, 1], device='cuda:0'), tensor([0, 1, 0, 2, 2, 0, 0, 2, 1, 2, 1, 0, 2, 0, 0, 0], device='cuda:0'), tensor([0, 2, 1, 0, 2, 0, 0, 0, 0, 1, 1, 1, 2, 0, 1, 0], device='cuda:0'), tensor([2, 2, 0, 1, 0, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2], device='cuda:0'), tensor([2, 0, 1, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 1, 1], device='cuda:0'), tensor([0, 0, 1, 0, 1, 1, 2, 0, 0, 2, 0, 2, 2, 0, 1, 1], device='cuda:0'), tensor([2, 0, 0, 2, 2, 2, 2, 0, 2, 1, 2, 0, 0, 0, 0, 1], device='cuda:0'), tensor([0, 1, 2, 1, 2, 2, 2, 1, 2, 2, 0, 2, 0, 1, 2, 0], device='cuda:0'), tensor([2, 0, 1, 2, 0, 0, 2, 1, 0, 0, 2, 0, 0, 2, 0, 2], device='cuda:0'), tensor([2, 2, 0, 1, 1, 2, 2, 0, 2, 2, 2, 0, 0, 1, 2, 0], device='cuda:0'), tensor([1, 2, 1, 0, 1, 0, 2, 2, 0, 2, 0, 1, 2, 0, 1, 1], device='cuda:0'), tensor([2, 2, 0, 0, 2, 0

In [19]:
#for test data
예측값=[]
실제값=[]
model.eval()
pred=[]
target=[]
test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  
  y_batch = y_batch.to(device)
  
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  
  _, predicted = torch.max(y_pred, 1)
  target.append(y_batch)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)
  pred.append(predicted)

print(pred)
print(target)

for i in pred:
  예측값.extend(i)

for j in target:
  실제값.extend(j)
예측값 = torch.tensor(예측값)
실제값 = torch.tensor(실제값)

print(예측값)
print(실제값)
f1 = F1Score(num_classes=3)
f1score=f1(예측값, 실제값)


print("Accuracy:", test_correct.float() / test_total)
print("f1score {} : ".format(f1score))


  0%|          | 0/81 [00:00<?, ?it/s]



[tensor([1, 2, 2, 2, 1, 2, 2, 0, 0, 1, 2, 1, 2, 1, 0, 0], device='cuda:0'), tensor([2, 0, 0, 2, 0, 1, 0, 0, 1, 2, 1, 2, 2, 2, 0, 0], device='cuda:0'), tensor([1, 1, 1, 1, 2, 0, 1, 2, 1, 2, 0, 2, 2, 2, 0, 2], device='cuda:0'), tensor([2, 0, 0, 0, 2, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2], device='cuda:0'), tensor([2, 2, 1, 0, 1, 0, 2, 2, 2, 0, 2, 2, 1, 0, 0, 2], device='cuda:0'), tensor([1, 1, 0, 2, 1, 0, 0, 1, 2, 1, 1, 0, 2, 0, 2, 2], device='cuda:0'), tensor([0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 2, 2, 0, 0, 1], device='cuda:0'), tensor([1, 2, 1, 0, 2, 2, 2, 1, 0, 2, 1, 1, 2, 0, 0, 2], device='cuda:0'), tensor([2, 0, 2, 1, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 1, 0], device='cuda:0'), tensor([2, 2, 2, 2, 2, 2, 1, 2, 0, 0, 2, 0, 2, 0, 0, 2], device='cuda:0'), tensor([0, 2, 0, 1, 2, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 2], device='cuda:0'), tensor([0, 0, 1, 2, 0, 0, 2, 1, 0, 2, 2, 2, 0, 0, 0, 2], device='cuda:0'), tensor([2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 1, 2, 0, 0, 0, 1], device='cuda:0'), tensor([1, 0, 0, 2, 1, 0

In [20]:
tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")

In [21]:
input_text = "오늘 너무 우울해"
input = tokenizer(
        input_text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )




In [43]:
PATH='/content/drive/MyDrive/Colab Notebooks/train.csv'

In [45]:
class SentimentClassifier():

  LABELS = ['부정', '긍정', '중립',]
  ID_LABELS = {idx: key for (idx, key) in enumerate(LABELS)}
  
  def __init__(self):
    self.df=pd.read_csv(PATH) #추가
    self.model = ElectraForSequenceClassification.from_pretrained("kykim/electra-kor-base", problem_type="multi_label_classification", num_labels=3).to(device)
    self.tokenizer = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/0725_koelectra_batch32_lr1e-5.pt"))
    # print(self.dataset.describe())

  def _get_prediction_input(self, text):
    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

  def predict(self, text):
    input_ids, attention_mask = self._get_prediction_input(text)
    y_pred = model(input_ids.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    return SentimentClassifier.ID_LABELS[predicted.item()]

  def labeling(self, df):
    df['emotion']=df['title'].apply(self.predict)
    print(self.df)






In [46]:
classifier = SentimentClassifier()

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['classifier.out_proj.bias', 'classi

In [47]:
classifier.labeling(classifier.df)



      Unnamed: 0  level_0  index press  \
0              0        0   8719  한국경제   
1              1        1    563  한국경제   
2              2        3    167  한국경제   
3              3        4    237  경향신문   
4              4        5    373  경향신문   
...          ...      ...    ...   ...   
7372        7372     8190    162  매일경제   
7373        7373     8192   8432  경향신문   
7374        7374     8194   3034  경향신문   
7375        7375     8195     95  조선일보   
7376        7376     8196   1775  경향신문   

                                                  title      date emotion  
0                               구글 '인앱결제 꼼수'에…멜론도 가격 인상  20220608      부정  
1                      용적률에 집값 희비…분당·일산 '쑥쑥' 평촌·산본 '주춤'  20220607      중립  
2                              "인플레 시대…金보다 나이키·코카콜라 사라"  20220106      중립  
3                 이탈리아 코로나19 누적 확진자 7375명…북부 지역 '모두 멈춤'  20200310      부정  
4                                     ‘골폭죽’으로 팬들 마음 달래라  20220614      중립  
...                                    

In [28]:
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
df

Unnamed: 0.1,Unnamed: 0,level_0,index,press,title,date,emotion
0,0,0,8719,한국경제,구글 '인앱결제 꼼수'에…멜론도 가격 인상,20220608,부정
1,1,1,563,한국경제,용적률에 집값 희비…분당·일산 '쑥쑥' 평촌·산본 '주춤',20220607,중립
2,2,3,167,한국경제,"""인플레 시대…金보다 나이키·코카콜라 사라""",20220106,중립
3,3,4,237,경향신문,이탈리아 코로나19 누적 확진자 7375명…북부 지역 '모두 멈춤',20200310,부정
4,4,5,373,경향신문,‘골폭죽’으로 팬들 마음 달래라,20220614,중립
...,...,...,...,...,...,...,...
7372,7372,8190,162,매일경제,"민노총 택배노조, CJ대한통운 본사 기습점거",20220211,부정
7373,7373,8192,8432,경향신문,안철수 “선거지원금 440억 반납하고 투표 참여자들에게 마스크 지급하자”,20200404,중립
7374,7374,8194,3034,경향신문,"“공적 판매 돕는 마스크 업체 세무조사 유예” 김현준 국세청장, 제조·유통 현장 찾...",20200303,중립
7375,7375,8195,95,조선일보,GDP 대비 재정적자 비율 역대 최고… EU 권고기준의 2배 육박,20200604,부정


In [26]:
classifier.predict("삼성전자화이팅")



'긍정'

In [30]:
classifier.predict(df['title'][0])



'부정'

In [35]:
tex=['나나','보라돌이']
b=classifier.predict(tex)
b



'중립'