Train


In [None]:
# !pip install ipywidgets  # faor vscode
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *

In [None]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
## CPU
# device = torch.device("cpu")

## GPU
device = torch.device("cuda:0")

In [None]:

bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [None]:

!wget -O .cache/ratings_train.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
!wget -O .cache/ratings_test.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt

--2022-11-14 08:50:38--  http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
Resolving skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)... 52.219.146.18
Connecting to skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)|52.219.146.18|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘.cache/ratings_train.txt’


2022-11-14 08:50:41 (6.59 MB/s) - ‘.cache/ratings_train.txt’ saved [14628807/14628807]

--2022-11-14 08:50:41--  http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt
Resolving skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)... 52.219.60.50
Connecting to skt-lsl-nlp-model.s3.amazonaws.com (skt-lsl-nlp-model.s3.amazonaws.com)|52.219.60.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [text/plain]
Saving to: ‘.cache/ratings_test.txt’


2022-11-14 08:50:43 (3.0

In [None]:

dataset_train = nlp.data.TSVDataset(".cache/ratings_train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset(".cache/ratings_test.txt", field_indices=[1,2], num_discard_samples=1)

In [None]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [None]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)


In [None]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  


In [None]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [None]:
# model = torch.load("/content/drive/MyDrive/community_classification.pt").to(device)

In [None]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [None]:

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [None]:

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)


def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [None]:
for e in range(2):
    train_acc = 0.0
    test_acc = 0.0
    pred_report = []
    label_report = []
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out, 1)

        test_acc += calc_accuracy(out, label)
        pred_report.extend(max_indices.to('cpu'))
        label_report.extend(label.to('cpu'))
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    print(classification_report(label_report,pred_report))

  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.331543892621994 train acc 0.890625
epoch 1 batch id 201 loss 0.07973811030387878 train acc 0.9560012437810945
epoch 1 batch id 401 loss 0.08517706394195557 train acc 0.958658042394015
epoch 1 batch id 601 loss 0.16614532470703125 train acc 0.960196547420965
epoch 1 batch id 801 loss 0.2405151128768921 train acc 0.9596207865168539
epoch 1 batch id 1001 loss 0.19142985343933105 train acc 0.9585102397602397
epoch 1 batch id 1201 loss 0.13941442966461182 train acc 0.9572621773522065
epoch 1 batch id 1401 loss 0.1145404800772667 train acc 0.9552551748750893
epoch 1 batch id 1601 loss 0.2505960762500763 train acc 0.9536032167395377
epoch 1 batch id 1801 loss 0.13284829258918762 train acc 0.9518670183231538
epoch 1 batch id 2001 loss 0.2241898626089096 train acc 0.9506106321839081
epoch 1 batch id 2201 loss 0.22525236010551453 train acc 0.9491353362108133
epoch 1 train acc 0.9479544404152447


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 1 test acc 0.8889066496163683
              precision    recall  f1-score   support

           0       0.88      0.90      0.89     24827
           1       0.90      0.88      0.89     25173

    accuracy                           0.89     50000
   macro avg       0.89      0.89      0.89     50000
weighted avg       0.89      0.89      0.89     50000



  


  0%|          | 0/2344 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.43915680050849915 train acc 0.828125
epoch 2 batch id 201 loss 0.12258817255496979 train acc 0.9318252487562189
epoch 2 batch id 401 loss 0.18266229331493378 train acc 0.9363700124688279
epoch 2 batch id 601 loss 0.2014966905117035 train acc 0.9403858153078203
epoch 2 batch id 801 loss 0.21020253002643585 train acc 0.9430789637952559
epoch 2 batch id 1001 loss 0.19780310988426208 train acc 0.9457885864135864
epoch 2 batch id 1201 loss 0.09334888309240341 train acc 0.9473485636969192
epoch 2 batch id 1401 loss 0.13588698208332062 train acc 0.9485858315488936
epoch 2 batch id 1601 loss 0.11643953621387482 train acc 0.9491821517801374
epoch 2 batch id 1801 loss 0.09317446500062943 train acc 0.9501405469183787
epoch 2 batch id 2001 loss 0.1230342909693718 train acc 0.9508370814592704
epoch 2 batch id 2201 loss 0.18321731686592102 train acc 0.950775215810995
epoch 2 train acc 0.9510896615472126


  0%|          | 0/782 [00:00<?, ?it/s]

epoch 2 test acc 0.8900255754475703
              precision    recall  f1-score   support

           0       0.91      0.87      0.89     24827
           1       0.87      0.91      0.89     25173

    accuracy                           0.89     50000
   macro avg       0.89      0.89      0.89     50000
weighted avg       0.89      0.89      0.89     50000



# 긍부정

In [12]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

device = torch.device("cuda:0")
model = torch.load('/content/drive/MyDrive/model/community_classification2.pt').to(device)

In [16]:
import pandas as pd
import re
from glob import glob
from tqdm import tqdm_notebook
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import *
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

def text_cleaning(x):
    mail_del = re.sub("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z-.]+)","",str(x))
    meta_del = re.sub("[\r\n\xa0]","",str(mail_del))
    name_del = re.sub("(\.\s+[ㄱ-ㅎ가-힣]+\s[기]+[자]+)","",str(meta_del))
    clean_text = re.sub("[^\w\s^.]"," ",name_del)
    return clean_text

def get_data(filepath,sheet_name):
  df = pd.read_excel(filepath,sheet_name=sheet_name)
  df.columns = df.loc[0]
  df.drop(index=0,inplace=True)
  df = df.reset_index(drop=True)  
  df['내용'] = df['내용'].map(text_cleaning)
  
  return df


def load_all_data(path,sheet_name="뉴스"):
    files = glob(path+'/*.xlsx')
    
    df = pd.DataFrame()
    for file in files:
        df = df.append(get_data(file,sheet_name), ignore_index=True)

    df = df.sort_values(by=['작성일'])


    return df

class CommunityDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, max_len,pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i]) for i in dataset]
        


    def __getitem__(self, i):
        return (self.sentences[i])

    def __len__(self):
        return (len(self.sentences))


_ , vocab = get_pytorch_kobert_model(cachedir=".cache")
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)


/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


  warn("Workbook contains no default style, apply openpyxl's default")


In [35]:

#paths,sheetnames
paths = ['/content/drive/MyDrive/data/임대차3법(54,752건)','/content/drive/MyDrive/data/중대재해처벌법(40,056건)','/content/drive/MyDrive/data/차별금지법(59,421건)','/content/drive/MyDrive/data/탄소중립(59,295건)']
sheetnames = ['커뮤니티','블로그','트위터']


for path in paths:
  topic = path.split('/')[-1]
  for sheet in sheetnames:
    df = load_all_data(path,sheet_name=sheet)
    dates = df['작성일'].unique()
    pos_neg = [[0,0,0] for _ in range(len(dates))]

    model.eval()

    for i,date in tqdm(enumerate(dates)):
      
      CommunityInput = CommunityDataset(df.loc[df['작성일'] == date,'내용'], tok, 256,True,False)
      Community_loader = DataLoader(CommunityInput,batch_size=16, num_workers=1)

      positive = 0
      negative = 0

      for batch_id, (token_ids, valid_length, segment_ids) in enumerate(Community_loader):
          token_ids = token_ids.long().to(device)
          segment_ids = segment_ids.long().to(device)
          valid_length= valid_length
          
          out = model(token_ids, valid_length, segment_ids)
          max_vals, max_indices = torch.max(out, 1)

          
          negative += (max_indices == 0).sum()
          positive += (max_indices == 1).sum()
      
      pos_neg[i][0] = date
      pos_neg[i][1] = positive.item()
      pos_neg[i][2] = negative.item()
    df_pos_neg = pd.DataFrame(pos_neg,columns=['date','Positive','Negative'])
    df_pos_neg.to_excel(f'/content/drive/MyDrive/data/result/{topic}_{sheet}.xlsx')


  warn("Workbook contains no default style, apply openpyxl's default")


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [51]:
sentence = "나는 너가 너무 좋아"

CommunityInput = CommunityDataset([sentence], tok, 256,True,False)
Community_loader = DataLoader(CommunityInput,batch_size=16, num_workers=1)
for batch_id, (token_ids, valid_length, segment_ids) in enumerate(Community_loader):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    
    out = model(token_ids, valid_length, segment_ids)
    max_vals, max_indices = torch.max(out, 1)
    print(max_indices)

tensor([1], device='cuda:0')
