# 1. 필요 모듈 설치

In [None]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers
!pip install torch

!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-24mayy1j/kobert-tokenizer_eb3428439d03413f8480f2bb6347fe00
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-24mayy1j/kobert-tokenizer_eb3428439d03413f8480f2bb6347fe00
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... done


# 2. 필요 라이브러리 임포트

In [None]:
import numpy as np
np.bool = np.bool_
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
from tqdm import tqdm, tqdm_notebook
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# 3. 데이터 불러오기 및 feature 선택

In [None]:
chzzk_review = pd.read_csv("/content/drive/MyDrive/치지직리뷰.csv")

In [None]:
reviews = chzzk_review["content"]

In [None]:
review_list = reviews.tolist()

In [None]:
len(review_list)

# 4. 모델 불러오기

In [None]:
# kobert pt 파일이 들어있는 경로
path = "/content/drive/MyDrive/kobertfinetuning_epoch_3.pt"

In [None]:
# Load the KoBERT tokenizer from the 'skt/kobert-base-v1' checkpoint.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# Load the pretrained encoder; return_dict=False makes it return plain tuples,
# which is what BERTClassifier.forward unpacks.
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# Build a gluonnlp vocabulary from the tokenizer's sentencepiece file,
# using '[PAD]' as the padding token.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    projected to ``num_classes`` logits.

    Args:
        bert: Callable encoder. It is invoked with ``input_ids``,
            ``token_type_ids``, ``attention_mask`` and ``return_dict=False``
            and must return a 2-tuple whose second element is the pooled
            output.
        hidden_size: Size of the pooled output fed to the linear layer.
        num_classes: Number of output classes (adjust to the task).
        dr_rate: Dropout probability applied to the pooled output. When
            falsy (``None`` or ``0``) no dropout layer is created or applied.
        params: Accepted for backward compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=3,   # adjust the number of classes here
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with 1 for the first ``valid_length[i]`` tokens of row ``i``.

        Args:
            token_ids: 2-D tensor of token ids (rows are sequences).
            valid_length: One length per row of ``token_ids``.

        Returns:
            A float32 tensor with the same shape and device as ``token_ids``.
        """
        # Compare every column position against each row's length in a single
        # broadcasted operation instead of slicing row by row in Python.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits for a batch.

        Args:
            token_ids: 2-D tensor of token ids.
            valid_length: Number of real (non-padding) tokens per row.
            segment_ids: Token type ids; cast to ``long`` before use.

        Returns:
            The output of the linear classifier applied to the (optionally
            dropped-out) pooled encoder output.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device),
                              return_dict=False)
        # Without dropout the pooled output is classified directly; previously
        # `out` was left undefined in that case and forward() crashed.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:

tok = tokenizer.tokenize

class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with integer labels.

    Every row of ``dataset`` is indexed with ``sent_idx`` to get the sentence
    and with ``label_idx`` to get the label. Sentences are converted up front
    with ``nlp.data.BERTSentenceTransform``; labels are stored as ``np.int32``.

    Args:
        dataset: Iterable of indexable rows (iterated twice).
        sent_idx: Index of the sentence inside each row.
        label_idx: Index of the label inside each row.
        bert_tokenizer: Tokenizer handed to the sentence transform.
        vocab: Vocabulary handed to the sentence transform.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        encoded = []
        for row in dataset:
            # The transform takes a list of sentences; a single one is passed here.
            encoded.append(to_bert_input([row[sent_idx]]))

        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))

        self.sentences = encoded
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence fields followed by the label."""
        label_field = (self.labels[i], )
        return self.sentences[i] + label_field

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.labels)

In [None]:
# Load the fine-tuned KoBERT checkpoint. map_location="cpu" lets the file load
# on a runtime without a GPU even if it was saved from CUDA tensors.
states = torch.load(path, map_location="cpu")

In [None]:
# Build the classifier around the pretrained encoder and restore the
# fine-tuned weights. The checkpoint is already in `states`, so it is reused
# instead of being read from disk a second time.
model = BERTClassifier(bert=bertmodel, dr_rate=0.5)
model.load_state_dict(states)

<All keys matched successfully>

In [None]:
# Settings read by the inference code in this notebook.
max_len = 64        # maximum sequence length passed to BERTDataset
batch_size = 1990   # DataLoader batch size used by predict()

# Remaining hyperparameters. No code visible in this notebook reads them
# (presumably carried over from the fine-tuning run -- TODO confirm).
learning_rate = 5e-5
num_epochs = 5
warmup_ratio = 0.1
max_grad_norm = 1
log_interval = 200

In [None]:
def new_softmax(a):
    """Return the softmax of ``a`` as percentages rounded to three decimals.

    The maximum of ``a`` is subtracted before exponentiating so that large
    inputs do not overflow. The maximum and the normalising sum are taken
    over all elements of ``a``.
    """
    weights = np.exp(a - np.max(a))
    total = np.sum(weights)
    return np.round(weights / total * 100, 3)

# 5. 모델 사용 & output을 csv파일로 변경

In [None]:
import pandas as pd
def predict(predict_sentences):
    """Classify each sentence and return the results as a DataFrame.

    Relies on the notebook-level ``model``, ``tok``, ``vocab``, ``max_len``
    and ``batch_size``. The model is moved to CUDA when available, otherwise
    it stays on the CPU.

    Args:
        predict_sentences: Iterable of sentences to classify.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence, in input order.
        Column 0 holds the sentence and column 1 the label: "긍정" for class
        id 2, "부정" for class id 0 and "중립" for any other class id.
    """
    # BERTDataset expects (sentence, label) rows; '0' is a placeholder label.
    dataset_another = [(sentence, '0') for sentence in predict_sentences]
    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    label_names = {0: "부정", 2: "긍정"}
    predicted_ids = []

    # Inference only: skip autograd bookkeeping so no graph is kept per batch.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.to(device).long()
            segment_ids = segment_ids.to(device).long()
            valid_length = valid_length.to(device)

            out = model(token_ids, valid_length, segment_ids)
            # One argmax per batch instead of one (plus a host sync) per row.
            predicted_ids.extend(out.argmax(dim=1).tolist())

    # The DataLoader does not shuffle, so predictions line up with the input
    # order; pairing by position avoids manual batch-offset arithmetic.
    predictions = [
        (sentence, label_names.get(class_id, "중립"))
        for (sentence, _), class_id in zip(dataset_another, predicted_ids)
    ]
    return pd.DataFrame(predictions)


In [None]:
# Classify the reviews in consecutive slices of 200 items; the last slice takes
# everything from index 1800 onward. Each call returns its own DataFrame.
# NOTE(review): the slice bounds are hard-coded (the names suggest 1990 reviews
# in total) and the variable names below are consumed by the later concat cell.
predict200 = predict(review_list[:200])
predict400 = predict(review_list[200:400])
predict600 = predict(review_list[400:600])
predict800 = predict(review_list[600:800])
predict1000 = predict(review_list[800:1000])
predict1200 = predict(review_list[1000:1200])
predict1400 = predict(review_list[1200:1400])
predict1600 = predict(review_list[1400:1600])
predict1800 = predict(review_list[1600:1800])
predict1990 = predict(review_list[1800:])

In [None]:
chzzk_all= pd.concat([predict200,predict400,predict600, predict800, predict1000, predict1200, predict1400, predict1600, predict1800, predict1990 ],axis=0)

In [None]:
# 긍정만 모아놓는 csv와 전체를 다 모아놓는 csv 2개를 만들었다.
chzzk_all = chzzk_all.reset_index(drop=True)

In [None]:
# Keep only the review text of rows labelled "긍정" (positive). Previously this
# line read chzzk_positive before it was ever defined. Columns are addressed by
# position so this works whether or not chzzk_all's columns have been renamed.
chzzk_positive = chzzk_all[chzzk_all.iloc[:, 1] == "긍정"].iloc[:, [0]].reset_index(drop=True)

In [None]:
chzzk_positive.columns = ['content']

In [None]:
chzzk_positive.to_csv('chzzk_positive.csv', index=False)

In [None]:
chzzk_all.columns = ['content', 'emotion']

In [None]:
chzzk_all.to_csv('chzzk_all.csv', index=False)