In [1]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3 # TypeError: dropout(): argument 'input' (position 1) must be Tensor, not str
!pip install torch

Collecting mxnet
  Downloading mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9 MB)
[K     |████████████████████████████████| 46.9 MB 45 kB/s 
Collecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-1.8.0.post0
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 5.2 MB/s 
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp37-cp37m-linux_x86_64.whl size=595733 sha256=59219ab59b1fdfd2490ea91535f516d7ee05b16cf799453874df3baa051e9abc
  Stored in directory: /root/.cache/pip/wheels/be/b4/06/7f3fdfaf707e6b5e98b79c041

In [2]:
#깃허브에서 KoBERT 파일 로드
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-hya491v1
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-hya491v1
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py) ... [?25l[?25hdone
  Created wheel for kobert: filename=kobert-0.1.2-py3-none-any.whl size=12771 sha256=ae496d876222f83e337bafd902b99889f6cbceb5620f0bdf0c277aa3dd2b32a2
  Stored in directory: /tmp/pip-ephem-wheel-cache-a7f1jc48/wheels/d3/68/ca/334747dfb038313b49cf71f84832a33372f3470d9ddfd051c0
Successfully built kobert
Installing collected packages: kobert
Successfully installed kobert-0.1.2


In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm as tqdm, tqdm_notebook

In [4]:
#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [5]:
#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [6]:
# confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [7]:
#GPU 사용
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [8]:
#BERT 모델, Vocabulary 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
#불러올 파일의 경로를 filename 변수에 저장
filename = '/content/drive/My Drive/ESG/news_train_sample.csv'

In [11]:
# 학습용 데이터셋 불러오기
import pandas as pd
dataset = pd.read_csv(filename)
dataset

Unnamed: 0,title,originallink,link,description,pubDate,corp_name,E,S,G,value
0,"백악관 또 반도체회의, 삼성전자 불러놓고 ""내부 정보 내놔""",https://www.joongang.co.kr/article/25009320,https://news.naver.com/main/read.naver?mode=LS...,"이날 회의에는 삼성전자를 비롯해 TSMC, 인텔, 마이크로소프트(MS), 애플, 제...","Fri, 24 Sep 2021 10:11:00 +0900",삼성전자,0,0,0,0
1,"백악관, 또 반도체 대응 회의···삼성전자도 참석",https://www.sedaily.com/NewsView/22RL705FUP,https://news.naver.com/main/read.naver?mode=LS...,미국 백악관이 반도체 부족 문제를 해결하기 위해 삼성전자와 애플 등 주요 기업 관계...,"Fri, 24 Sep 2021 07:47:00 +0900",삼성전자,0,0,0,0
2,"NAVER, 규제 노이즈에도 투자포인트 `여전`…추가 하락시 `매수` -한국",http://www.edaily.co.kr/news/newspath.asp?news...,https://news.naver.com/main/read.naver?mode=LS...,한국투자증권은 15일 NAVER(035420)에 대해 장기적인 투자포인트의 본질은 ...,"Wed, 15 Sep 2021 08:04:00 +0900",네이버,0,0,0,0
3,'빅테크 규제'에 카카오·네이버를 바라보는 온도차,http://www.newsis.com/view/?id=NISX20210924_00...,https://news.naver.com/main/read.naver?mode=LS...,"기사내용 요약카카오 목표가 하향, 네이버는 유지 ""카카오 방향성 맞지만 성장세 둔화...","Fri, 24 Sep 2021 11:06:00 +0900",카카오,0,0,0,0
4,"삼성바이오로직스, 의약품 위탁개발 플랫폼 개발·공개",http://www.fnnews.com/news/202109221321036446,https://news.naver.com/main/read.naver?mode=LS...,"삼성바이오로직스가 세포주 개발부터 시험계획서(IND) 제출, 그리고 공정 특성확인(...","Wed, 22 Sep 2021 13:29:00 +0900",삼성바이오로직스,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1920,"삼양식품, 미얀마 산업연수생에게 소송 취하 압력?",http://www.ohmynews.com/NWS_Web/view/at_pg.asp...,https://news.naver.com/main/read.naver?mode=LS...,[ 변호사와 이주노동자들이 (주)삼양식품을 근로기준법과 최저임금법을 위반한 혐의로 ...,"Thu, 12 Nov 2009 17:13:00 +0900",삼양식품,0,1,0,2
1921,"전인장 삼양식품 회장, 횡령으로 ‘징역 3년’ 구속…‘불닭볶음면 불매운동’...",http://www.ajunews.com/view/20190125145641625,http://www.ajunews.com/view/20190125145641625,"누리꾼들은 ""삼양식품 불닭볶음면 불매운동 해야겠다"" ""회장이라고 회삿돈을 마음대로 ...","Fri, 25 Jan 2019 15:10:00 +0900",삼양식품,0,0,1,2
1922,"[식품회사 도 넘은 대주주 배불리기 ①] 삼양식품, 오너회사와 불공정거래",,https://news.naver.com/main/read.naver?mode=LS...,"같다""며 ""불공정거래행위가 의심된다""고 말했다. 이에 대해 최남석 삼양식품 홍보팀장...","Wed, 14 Mar 2012 15:03:00 +0900",삼양식품,0,0,1,2
1923,"김정수 삼양식품 총괄사장, 대표이사 대신 ESG위원장…왜?",http://www.sisajournal.com/news/articleView.ht...,https://news.naver.com/main/read.naver?mode=LS...,업계에서는 비리로 삼양식품의 도덕성에 막대한 타격을 입힌 김 총괄사장이 윤리경영을 ...,"Tue, 09 Mar 2021 14:22:00 +0900",삼양식품,0,0,1,0


In [12]:
e_new_data = dataset[['title','E']]
s_new_data = dataset[['title','S']]
g_new_data = dataset[['title','G']]

In [13]:
e_list = []
for q, label in zip(e_new_data['title'], e_new_data['E']):
    data = [q, str(int(label))]
    e_list.append(data)

In [14]:
s_list = []
for q, label in zip(s_new_data['title'], s_new_data['S']):
    data = [q, str(int(label))]
    s_list.append(data)

In [15]:
g_list = []
for q, label in zip(g_new_data['title'], g_new_data['G']):
    data = [q, str(int(label))]
    g_list.append(data)

In [16]:
# Train / Test set 분리
from sklearn.model_selection import train_test_split

e_train, e_test = train_test_split(e_list, test_size=0.2, random_state=42)
s_train, s_test = train_test_split(s_list, test_size=0.2, random_state=42)
g_train, g_test = train_test_split(g_list, test_size=0.2, random_state=42)
print("e_train shape is:", len(e_train))
print("s_train shape is:", len(s_train))
print("g_train shape is:", len(g_train))
print("e_test shape is:", len(e_test))
print("s_test shape is:", len(s_test))
print("g_test shape is:", len(g_test))

e_train shape is: 1540
s_train shape is: 1540
g_train shape is: 1540
e_test shape is: 385
s_test shape is: 385
g_test shape is: 385


In [17]:
# 기본 Bert tokenizer 사용
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [18]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [19]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 6
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [20]:
e_data_train = BERTDataset(e_train, 0, 1, tok, max_len, True, False)
s_data_train = BERTDataset(s_train, 0, 1, tok, max_len, True, False)
g_data_train = BERTDataset(g_train, 0, 1, tok, max_len, True, False)

e_data_test = BERTDataset(e_test, 0, 1, tok, max_len, True, False)
s_data_test = BERTDataset(s_test, 0, 1, tok, max_len, True, False)
g_data_test = BERTDataset(g_test, 0, 1, tok, max_len, True, False)

In [21]:
# pytorch용 DataLoader 사용
e_train_dataloader = torch.utils.data.DataLoader(e_data_train, batch_size=batch_size, num_workers=2)
s_train_dataloader = torch.utils.data.DataLoader(s_data_train, batch_size=batch_size, num_workers=2)
g_train_dataloader = torch.utils.data.DataLoader(g_data_train, batch_size=batch_size, num_workers=2)

e_test_dataloader = torch.utils.data.DataLoader(e_data_test, batch_size=batch_size, num_workers=2)
s_test_dataloader = torch.utils.data.DataLoader(s_data_test, batch_size=batch_size, num_workers=2)
g_test_dataloader = torch.utils.data.DataLoader(g_data_test, batch_size=batch_size, num_workers=2)

In [22]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [23]:
e_model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
s_model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
g_model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [24]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
e_optimizer_grouped_parameters = [
    {'params': [p for n, p in e_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in e_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
s_optimizer_grouped_parameters = [
    {'params': [p for n, p in s_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in s_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
g_optimizer_grouped_parameters = [
    {'params': [p for n, p in g_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in g_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [25]:
# 옵티마이저 선언
e_optimizer = AdamW(e_optimizer_grouped_parameters, lr=learning_rate)
e_loss_fn = nn.CrossEntropyLoss() # softmax용 Loss Function 정하기 <- binary classification도 해당 loss function 사용 가능
# 옵티마이저 선언
s_optimizer = AdamW(s_optimizer_grouped_parameters, lr=learning_rate)
s_loss_fn = nn.CrossEntropyLoss()
# 옵티마이저 선언
g_optimizer = AdamW(g_optimizer_grouped_parameters, lr=learning_rate)
g_loss_fn = nn.CrossEntropyLoss()

In [26]:
e_t_total = len(e_train_dataloader) * num_epochs
e_warmup_step = int(e_t_total * warmup_ratio)

In [27]:
s_t_total = len(s_train_dataloader) * num_epochs
s_warmup_step = int(s_t_total * warmup_ratio)

In [28]:
g_t_total = len(g_train_dataloader) * num_epochs
g_warmup_step = int(g_t_total * warmup_ratio)

In [29]:
e_scheduler = get_cosine_schedule_with_warmup(e_optimizer, num_warmup_steps=e_warmup_step, num_training_steps=e_t_total)
s_scheduler = get_cosine_schedule_with_warmup(s_optimizer, num_warmup_steps=s_warmup_step, num_training_steps=s_t_total)
g_scheduler = get_cosine_schedule_with_warmup(g_optimizer, num_warmup_steps=g_warmup_step, num_training_steps=g_t_total)

In [30]:
# 학습 평가 지표인 accuracy 계산 -> 얼마나 타겟값을 많이 맞추었는가
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [31]:
# 모델 학습 시작
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    # 학습모드
    e_model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(e_train_dataloader)):
        e_optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        e_out = e_model(token_ids, valid_length, segment_ids)
        loss = e_loss_fn(e_out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(e_model.parameters(), max_grad_norm) # gradient clipping
        e_optimizer.step()
        e_scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(e_out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    # 평가모드
    e_model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(e_test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        e_out = e_model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(e_out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.8814731240272522 train acc 0.234375
epoch 1 train acc 0.746875


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 test acc 0.7767857142857143


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.16927340626716614 train acc 0.96875
epoch 2 train acc 0.89625


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 test acc 0.7767857142857143


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.15674327313899994 train acc 0.96875
epoch 3 train acc 0.92375


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 test acc 0.9665178571428571


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.04420075565576553 train acc 0.984375
epoch 4 train acc 0.9675


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 4 test acc 0.9665178571428571


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.018070725724101067 train acc 1.0
epoch 5 train acc 0.984375


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 5 test acc 0.9620535714285714


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.02121375873684883 train acc 1.0
epoch 6 train acc 0.99125


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 6 test acc 0.9754464285714286


In [32]:
commnetslist = [] # 텍스트 데이터를 담을 리스트
e_emo_list = [] # 감성 값을 담을 리스트
for c in e_test: # 모든 댓글
  commnetslist.append( [c, 5] ) # [댓글, 임의의 양의 정수값] 설정
  
pdData = pd.DataFrame(e_test, columns = [['title', 'E']] )
pdData = pdData.values
test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False) 
test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=5)

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_input):
  token_ids = token_ids.long().to(device)
  segment_ids = segment_ids.long().to(device)
  valid_length= valid_length 
  # 이때, out이 예측 결과 리스트
  e_test_out = e_model(token_ids, valid_length, segment_ids)

  # e는 2가지 실수 값으로 구성된 리스트
  # 0번 인덱스가 더 크면 부정, 긍정은 반대
  for e in e_test_out:
    if e[0]>=e[1]: # 부정
      value = 0
    else: #긍정
      value = 1
    e_emo_list.append(value)

  cpuset_checked))


In [33]:
e_checklist = []
for i in range(len(e_test)):
  e_checklist.append(int(e_test[i][1]))

In [34]:
# 테스트 결과 확인
confusion_matrix(e_emo_list, e_checklist)
print(classification_report(e_emo_list, e_checklist))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       351
           1       0.81      0.88      0.85        34

    accuracy                           0.97       385
   macro avg       0.90      0.93      0.91       385
weighted avg       0.97      0.97      0.97       385



In [35]:
torch.save(e_model.state_dict, './drive/MyDrive/ESG/e_model.pt')

In [36]:
import gc
del e_model, e_train, e_test
gc.collect()
torch.cuda.empty_cache()

In [37]:
del e_train_dataloader

In [38]:
# 모델 학습 시작
for s in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    # 학습모드
    s_model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(s_train_dataloader)):
        s_optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        s_out = s_model(token_ids, valid_length, segment_ids)
        loss = s_loss_fn(s_out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(s_model.parameters(), max_grad_norm) # gradient clipping
        s_optimizer.step()
        s_scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(s_out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(s+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(s+1, train_acc / (batch_id+1)))
    # 평가모드
    s_model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(s_test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        s_out = s_model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(s_out, label)
    print("epoch {} test acc {}".format(s+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.6464236974716187 train acc 0.6875
epoch 1 train acc 0.76625


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 test acc 0.8973214285714286


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.3392200171947479 train acc 0.859375
epoch 2 train acc 0.91375


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 test acc 0.9486607142857143


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.11230657249689102 train acc 0.96875
epoch 3 train acc 0.963125


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 test acc 0.9464285714285714


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.02105569653213024 train acc 1.0
epoch 4 train acc 0.98


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 4 test acc 0.9464285714285714


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.08225607872009277 train acc 0.984375
epoch 5 train acc 0.986875


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 5 test acc 0.9553571428571429


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.014177721925079823 train acc 1.0
epoch 6 train acc 0.989375


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 6 test acc 0.9553571428571429


In [39]:
commnetslist = [] # 텍스트 데이터를 담을 리스트
s_emo_list = [] # 감성 값을 담을 리스트
for c in s_test: # 모든 댓글
  commnetslist.append( [c, 5] ) # [댓글, 임의의 양의 정수값] 설정
  
pdData = pd.DataFrame(s_test, columns = [['title', 'S']] )
pdData = pdData.values
test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False) 
test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=5)

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_input):
  token_ids = token_ids.long().to(device)
  segment_ids = segment_ids.long().to(device)
  valid_length= valid_length 
  # 이때, out이 예측 결과 리스트
  s_test_out = s_model(token_ids, valid_length, segment_ids)

  # e는 2가지 실수 값으로 구성된 리스트
  # 0번 인덱스가 더 크면 부정, 긍정은 반대
  for e in s_test_out:
    if e[0]>=e[1]: # 부정
      value = 0
    else: #긍정
      value = 1
    s_emo_list.append(value)

  cpuset_checked))


In [40]:
s_checklist = []
for i in range(len(s_test)):
  s_checklist.append(int(s_test[i][1]))

In [41]:
# 테스트 결과 확인
confusion_matrix(s_emo_list, s_checklist)
print(classification_report(s_emo_list, s_checklist))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       281
           1       0.90      0.91      0.90       104

    accuracy                           0.95       385
   macro avg       0.93      0.94      0.93       385
weighted avg       0.95      0.95      0.95       385



In [42]:
torch.save(s_model.state_dict, './drive/MyDrive/ESG/s_model.pt')

In [43]:
import gc
del s_model, s_train, s_test
gc.collect()
torch.cuda.empty_cache()

In [44]:
del s_train_dataloader, s_data_train

In [45]:
# 모델 학습 시작
for g in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    g_model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(g_train_dataloader)):
        g_optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        g_out = g_model(token_ids, valid_length, segment_ids)
        loss = g_loss_fn(g_out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(g_model.parameters(), max_grad_norm) # gradient clipping
        g_optimizer.step()
        g_scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(g_out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(g+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(g+1, train_acc / (batch_id+1)))
    # 평가모드
    g_model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(g_test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        g_out = g_model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(g_out, label)
    print("epoch {} test acc {}".format(g+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.7175501585006714 train acc 0.515625
epoch 1 train acc 0.75


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 1 test acc 0.9308035714285714


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.2677616477012634 train acc 0.90625
epoch 2 train acc 0.9425


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 2 test acc 0.96875


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.22952337563037872 train acc 0.9375
epoch 3 train acc 0.966875


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 3 test acc 0.96875


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.21586944162845612 train acc 0.953125
epoch 4 train acc 0.985


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 4 test acc 0.9642857142857143


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.22453370690345764 train acc 0.9375
epoch 5 train acc 0.99125


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 5 test acc 0.9776785714285714


  0%|          | 0/25 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.13229596614837646 train acc 0.96875
epoch 6 train acc 0.993125


  0%|          | 0/7 [00:00<?, ?it/s]

epoch 6 test acc 0.9732142857142857


In [46]:
commnetslist = [] # 텍스트 데이터를 담을 리스트
g_emo_list = [] # 감성 값을 담을 리스트
for c in g_test: # 모든 댓글
  commnetslist.append( [c, 5] ) # [댓글, 임의의 양의 정수값] 설정
  
pdData = pd.DataFrame(g_test, columns = [['title', 'S']] )
pdData = pdData.values
test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False) 
test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=5)

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_input):
  token_ids = token_ids.long().to(device)
  segment_ids = segment_ids.long().to(device)
  valid_length= valid_length 
  # 이때, out이 예측 결과 리스트
  g_test_out = g_model(token_ids, valid_length, segment_ids)

  # e는 2가지 실수 값으로 구성된 리스트
  # 0번 인덱스가 더 크면 부정, 긍정은 반대
  for e in g_test_out:
    if e[0]>=e[1]: # 부정
      value = 0
    else: #긍정
      value = 1
    g_emo_list.append(value)

  cpuset_checked))


In [47]:
g_checklist = []
for i in range(len(g_test)):
  g_checklist.append(int(g_test[i][1]))

In [48]:
# 테스트 결과 확인
confusion_matrix(g_emo_list, g_checklist)
print(classification_report(g_emo_list, g_checklist))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       308
           1       0.96      0.88      0.92        77

    accuracy                           0.97       385
   macro avg       0.96      0.94      0.95       385
weighted avg       0.97      0.97      0.97       385



In [49]:
# 모델 저장
torch.save(g_model.state_dict, './drive/MyDrive/ESG/g_model.pt')