## 1. Load Libraries

In [None]:
# !pip install transformers pytorch_lightning==1.4.9
# !pip install torchmetrics==0.6.0
# !pip install torchtext==0.6.0
# !pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
# !pip install rouge
# !pip install h5py
# !pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning==1.4.9
  Using cached pytorch_lightning-1.4.9-py3-none-any.whl (925 kB)
Installing collected packages: pytorch_lightning
Successfully installed pytorch_lightning-1.4.9


In [None]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, load_metric
from rouge import Rouge 
from keras.models import load_model
from konlpy.tag import Okt
from string import whitespace, punctuation

import matplotlib.pyplot as plt
import pytorch_lightning as pl
import seaborn as sns
import pandas as pd
import numpy as np
import torch
import json
import pandas as pd
import re, unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances

## 2. Load Data

In [None]:
# Mount Google Drive at /content/drive so the project files referenced below
# can be read and written (this import only exists inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive; later cells build CSV and checkpoint
# paths by appending a file name to it (note the trailing slash).
summ_path= '/content/drive/My Drive/23 U 4-1/텍스트마이닝/기말프로젝트/'

# Training pairs; downstream code reads the 'input_text' and 'target_text' columns.
summ_train= pd.read_csv(summ_path+ 'summ_train.csv')
# summ_join1= pd.read_csv(summ_path+ 'result2.csv')
# summ_join2= pd.read_csv(summ_path+ 'Sum6.csv')
# summ_join3= pd.read_csv(summ_path+ 'Sum7.csv')
# summ_join4= pd.read_csv(summ_path+ 'Sum8.csv')
# summ_join5= pd.read_csv(summ_path+ 'Sum9.csv')

# summ_join1.columns= ['input_text', 'target_text']
# summ_join2.columns= ['input_text', 'target_text']
# summ_join3.columns= ['input_text', 'target_text']
# summ_join4.columns= ['input_text', 'target_text']
# summ_join5.columns= ['input_text', 'target_text']

In [None]:
# summ_train= pd.concat([summ_train, summ_join1])
# summ_train.shape

In [None]:
# summ_train.to_csv('summ_train.csv', encoding= 'utf-8-sig', index= False)

## 3. KOBART fine-tuning

In [None]:
class DatasetFromDataframe(Dataset):
    """Map-style dataset that turns DataFrame rows into seq2seq training tensors.

    Args:
        df: DataFrame with an ``input_text`` (source) and a ``target_text``
            (target) column.
        dataset_args: dict with ``max_length`` (fixed sequence length of every
            returned tensor) and ``tokenizer`` (must provide ``encode`` and
            ``pad_token_id``).
    """

    def __init__(self, df, dataset_args):
        self.data = df
        self.max_length = dataset_args['max_length']
        self.tokenizer = dataset_args['tokenizer']
        self.start_token = '<s>'
        self.end_token = '</s>'

    def __len__(self):
        return len(self.data)

    def create_tokens(self, text):
        """Encode ``<s>text</s>`` and pad or truncate it to ``max_length``.

        Returns:
            ``(token_ids, attention_mask)``: two lists of length ``max_length``;
            the mask is 1 for real tokens and 0 for padding.
        """
        tokens = self.tokenizer.encode(self.start_token + text + self.end_token)

        token_count = len(tokens)
        remain = self.max_length - token_count

        if remain >= 0:
            tokens = tokens + [self.tokenizer.pad_token_id] * remain
            attention_mask = [1] * token_count + [0] * remain
        else:
            # Too long: keep the head and re-append the end token so the
            # sequence still terminates (assumes the end token encodes to a
            # single id -- TODO confirm for the tokenizer in use).
            tokens = tokens[: self.max_length - 1] + self.tokenizer.encode(self.end_token)
            attention_mask = [1] * self.max_length

        return tokens, attention_mask

    def __getitem__(self, index):
        """Return the tensors for one row.

        ``labels`` are the decoder inputs shifted left by one position. Every
        position that corresponds to padding is set to -100 so the loss skips
        it; previously the pad token id was left in the labels and therefore
        contributed to the loss.
        """
        record = self.data.iloc[index]

        question, answer = record['input_text'], record['target_text']

        input_id, input_mask = self.create_tokens(question)
        output_id, output_mask = self.create_tokens(answer)

        # Shift by one and mask with the attention mask (not by comparing ids),
        # so a real token that happens to share the pad id is never dropped.
        label = [
            token if is_real else -100
            for token, is_real in zip(output_id[1:], output_mask[1:])
        ]
        label = label + (self.max_length - len(label)) * [-100]

        return {
            'input_ids': torch.LongTensor(input_id),
            'attention_mask': torch.LongTensor(input_mask),
            'decoder_input_ids': torch.LongTensor(output_id),
            'decoder_attention_mask': torch.LongTensor(output_mask),
            "labels": torch.LongTensor(label)
        }

In [None]:
class OneSourceDataModule(pl.LightningDataModule):
    """Data module that splits a single DataFrame into train and held-out sets.

    Keyword Args:
        data: DataFrame holding every example (``input_text`` / ``target_text``).
        dataset_args: dict forwarded to ``DatasetFromDataframe``.
        batch_size: batch size for all loaders (defaults to 32).
        train_size: fraction of rows used for training (defaults to 0.9).
        random_state: optional seed for the split. ``None`` (the default)
            keeps the previous behaviour of a different split on every call.
    """

    def __init__(
        self,
        **kwargs
    ):
        super().__init__()

        self.data = kwargs.get('data')
        self.dataset_args = kwargs.get("dataset_args")
        self.batch_size = kwargs.get("batch_size") or 32
        self.train_size = kwargs.get("train_size") or 0.9
        self.random_state = kwargs.get("random_state")

    def setup(self, stage = ""):
        """Split ``self.data`` and wrap both parts in ``DatasetFromDataframe``."""
        # Split the frame handed to this module. The previous code split the
        # notebook-level ``summ_train`` global and silently ignored ``data``.
        trainset, testset = train_test_split(
            self.data,
            train_size=self.train_size,
            shuffle=True,
            random_state=self.random_state
        )

        self.trainset = DatasetFromDataframe(trainset, self.dataset_args)
        self.testset = DatasetFromDataframe(testset, self.dataset_args)

    def train_dataloader(self):
        """Loader over the training split."""
        train = DataLoader(
            self.trainset,
            batch_size=self.batch_size
        )
        return train

    def val_dataloader(self):
        """Loader over the held-out split (the same split serves as test set)."""
        val = DataLoader(
            self.testset,
            batch_size=self.batch_size
        )
        return val

    def test_dataloader(self):
        """Loader over the held-out split."""
        test = DataLoader(
            self.testset,
            batch_size=self.batch_size
        )
        return test

In [None]:
class KoBARTConditionalGeneration(pl.LightningModule):
    """Lightning wrapper that fine-tunes a BART conditional-generation model.

    Args:
        hparams: mapping merged into ``self.hparams``. The methods below read
            ``lr``, ``batch_size``, ``max_epochs``, ``warmup_ratio`` and
            ``max_length`` from it.
        **kwargs: must contain ``model`` (the seq2seq model to train) and
            ``tokenizer`` (used by :meth:`test` to encode and decode text).
    """

    def __init__(self, hparams, **kwargs):
        super(KoBARTConditionalGeneration, self).__init__()
        self.hparams.update(hparams)

        self.model = kwargs['model']
        # Use the GPU when one is present instead of failing on CPU-only hosts.
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.tokenizer = kwargs['tokenizer']

        self.model.train()

    def configure_optimizers(self):
        """Build AdamW plus a per-step cosine schedule with linear warm-up."""
        param_optimizer = list(self.model.named_parameters())
        # Parameters whose name contains one of these substrings get no weight
        # decay. The 'LayerNorm.*' entries follow BERT naming; Hugging Face BART
        # names its layer norms '*_layer_norm' / 'layernorm_embedding', so those
        # patterns are listed as well (biases are already caught by 'bias').
        no_decay = [
            'bias',
            'LayerNorm.bias',
            'LayerNorm.weight',
            'layer_norm.weight',
            'layernorm_embedding.weight',
        ]

        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]

        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,
            lr = self.hparams.lr
        )

        # num_workers = gpus * num_nodes
        # NOTE(review): relies on the trainer having attached the training
        # dataloader to this module before optimizers are configured -- confirm
        # for the Lightning version in use.
        data_len = len(self.train_dataloader().dataset)
        print(f'학습 데이터 양: {data_len}')

        num_train_steps = int(data_len / self.hparams.batch_size * self.hparams.max_epochs)
        print(f'Step 수: {num_train_steps}')

        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        print(f'Warmup Step 수: {num_warmup_steps}')

        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps
        )

        lr_scheduler = {
            'scheduler': scheduler,
            'monitor': 'loss',
            'interval': 'step',
            'frequency': 1
        }

        return [ optimizer ], [ lr_scheduler ]

    def forward(self, inputs):
        """Run the wrapped model on a batch dict and return its output object."""
        return self.model(
            input_ids = inputs['input_ids'],
            attention_mask = inputs['attention_mask'],
            decoder_input_ids = inputs['decoder_input_ids'],
            decoder_attention_mask = inputs['decoder_attention_mask'],
            labels = inputs['labels'],
            return_dict = True
        )

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch."""
        loss = self(batch).loss
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and return the validation loss for one batch.

        The loss used to be computed and then discarded, so validation runs
        produced no metric at all.
        """
        loss = self(batch).loss
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test(self, text):
        """Generate output text for ``text`` with beam search (10 beams).

        The input is wrapped in ``<s>``/``</s>`` and padded or truncated to
        ``hparams.max_length``. The decoded string is returned as-is, special
        tokens included.
        """
        tokens = self.tokenizer.encode("<s>" + text + "</s>")

        token_count = len(tokens)
        remain = self.hparams.max_length - token_count

        if remain >= 0:
            tokens = tokens + [ self.tokenizer.pad_token_id ] * remain
            attention_mask = [ 1 ] * token_count + [ 0 ] * remain
        else:
            tokens = tokens[: self.hparams.max_length - 1] + self.tokenizer.encode("</s>")
            attention_mask = [ 1 ] * self.hparams.max_length

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(device)
        input_ids = torch.LongTensor([ tokens ]).to(device)
        attention_mask = torch.LongTensor([ attention_mask ]).to(device)

        # __init__ puts the model in train mode; generating in that mode keeps
        # dropout active. Switch to eval for the call and restore afterwards.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                result = self.model.generate(
                    input_ids,
                    max_length = self.hparams.max_length,
                    attention_mask = attention_mask,
                    num_beams = 10
                )[0]
        finally:
            if was_training:
                self.model.train()

        return self.tokenizer.decode(result)

In [None]:
# Load the tokenizer published with the gogamza/kobart-base-v2 checkpoint and
# declare its special tokens explicitly; the dataset and model code above rely
# on '<s>', '</s>' and the pad token.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "gogamza/kobart-base-v2",
    bos_token="<s>",
    eos_token="</s>",
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>'
)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [None]:
def _longest_encoding(row):
    """Return the larger of the row's target and source token counts."""
    target_len = len(tokenizer.encode(row['target_text']))
    source_len = len(tokenizer.encode(row['input_text']))
    return max(target_len, source_len)


# Per-row token length of the longer side of each training pair.
token_lengthes = summ_train.apply(_longest_encoding, axis=1)

In [None]:
# Training configuration shared by the model, the data module and the trainer.
BATCH_SIZE, MAX_LENGTH, EPOCHS = 32, 128, 20

# Continue from the checkpoint directory stored in the project folder.
# Alternative starting point (public base checkpoint):
# KoBARTModel = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v2")
KoBARTModel = BartForConditionalGeneration.from_pretrained(summ_path+ 'final2.h5')

model = KoBARTConditionalGeneration(
    dict(
        lr=5e-6,
        warmup_ratio=0.1,
        batch_size=BATCH_SIZE,
        max_length=MAX_LENGTH,
        max_epochs=EPOCHS,
    ),
    tokenizer=tokenizer,
    model=KoBARTModel,
)

dm = OneSourceDataModule(
    data=summ_train,
    batch_size=BATCH_SIZE,
    train_size=0.9,
    dataset_args=dict(tokenizer=tokenizer, max_length=MAX_LENGTH),
)

trainer = pl.Trainer(max_epochs=EPOCHS, gpus=1)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs


In [None]:
# Fine-tune the wrapped model on the batches provided by the data module.
trainer.fit(model, dm)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.core.lightning:
  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 123 M 
-------------------------------------------------------
123 M     Trainable params
0         Non-trainable params
123 M     Total params
495.440   Total estimated model params size (MB)


학습 데이터 양: 26999
Step 수: 16874
Warmup Step 수: 1687


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Save the fine-tuned weights back to the project folder. summ_path already
# ends with '/', so the './' segment resolves to the same 'final2.h5'
# directory that the model was loaded from.
KoBARTModel.save_pretrained(summ_path+"./final2.h5")

In [None]:
# Smoke test: generate a title for a single sample sentence.
model.test('S-OIL은 지난 1일 업무 중 세상을 등진 소방관 유자녀 70명에게 300만 원씩 모드 2억 1000만 원의 학자금을 기부')

'</s> S-OIL, 소방관 유자녀에 장학금 2억 원 기부</s>'

## 4. KOBART validation

In [None]:
# Load the validation set; the generation cell below feeds column 0 to the
# model and uses the 'title' column as the reference title.
valid_summ= pd.read_csv(summ_path+ 'valid_summ.csv')
valid_summ.head()

Unnamed: 0,Summary,title
0,"중소기업 기술분야 최대 행사인 ‘세계로 뻗는 혁신기술, 세상을 바꾸는 기술인재’라는...",중소기업기술혁신대전 역대 최대 규모로 21일 개막
1,주영섭 중기청장을 비롯한 21개 회원국은 지난 9일 페루 리마에서 열린 APEC 회...,2016 APEC 중소기업장관회의 페루서 개최
2,대전 유성 유성구 용산동 387번지 일원(36만3800m2)에 기업형 임대주택 37...,‘대전 용산지구에 6000세대 뉴스테이’ 대전 첫 신청서 접수
3,대전지방조달청(청장 김종환)은 40년 동안 사과농장을 직영하며 양조기술을 배워 사과...,예산사과와인 우수 전통주로 거듭
4,대전도시공사는 13일 갑천지구 3블록 분양아파트 건설공사를 추정금액 3100억여원에...,"갑천지구 3블록 아파트공사 재입찰, 호수공원사업 본격화"


In [None]:
# Generate one title per validation row, using column 0 as the model input.
lzValidTitle = [
    str(model.test(valid_summ.iloc[row, 0]))
    for row in range(len(valid_summ))
]
print(len(lzValidTitle))

In [None]:
# Pair every reference title with the title generated for the same row.
valid_title = pd.DataFrame({
    'title': list(valid_summ['title']),
    'newTitle': lzValidTitle,
})

valid_title.head()

In [None]:
# Write into the Drive project folder: the next cell reloads the file from
# summ_path, whereas the previous relative path only wrote to the runtime's
# working directory, so a fresh run could not find what it had just saved.
valid_title.to_csv(summ_path + 'valid_title.csv', encoding= 'utf-8-sig', index= False)

In [None]:
# Reload the reference/generated title pairs from the Drive project folder.
valid_title= pd.read_csv(summ_path+ 'valid_title.csv')
valid_title.head()

In [None]:
# Whitespace clean-up: for both title columns, fill missing values with '',
# replace matches of the whitespace pattern with one space, apply Unicode NFC
# normalization and trim the ends.
pattern_whitespace = re.compile(f'[{whitespace}]+')

for _column in ('title', 'newTitle'):
    _values = valid_title[_column].fillna('')
    _values = _values.replace(pattern_whitespace, ' ')
    _values = _values.map(lambda x: unicodedata.normalize('NFC', x))
    valid_title[_column] = _values.str.strip()

def CleanEnd(text):
    """Return ``text`` with every e-mail address removed.

    Matching is case-insensitive. An address is a run of letters, digits,
    '-' or '_', an '@', another such run, and at least one '.suffix' part.
    """
    # Further clean-up passes that are currently disabled:
    # url = re.compile(r'(?:https?:\/\/)?[-_0-9a-z]+(?:\.[-_0-9a-z]+)+', flags=re.IGNORECASE)
    # etc = re.compile(r'\.([^\.]*(?:기자|특파원|교수|작가|대표|논설|고문|주필|부문장|팀장|장관|원장|연구원|이사장|위원|실장|차장|부장|에세이|화백|사설|소장|단장|과장|기획자|큐레이터|저작권|평론가|©|©|ⓒ|\@|\/|=|▶|무단|전재|재배포|금지|\[|\]|\(\))[^\.]*)$')
    # bracket = re.compile(r'^((?:\[.+\])|(?:【.+】)|(?:<.+>)|(?:◆.+◆)\s)')
    # result = url.sub('', result)
    # result = etc.sub('.', result)
    # result = bracket.sub('', result).strip()
    return re.sub(
        r'[-_0-9a-z]+@[-_0-9a-z]+(?:\.[0-9a-z]+)+',
        '',
        text,
        flags=re.IGNORECASE,
    )
    
# Strip e-mail addresses from the reference and the generated titles.
for _column in ('title', 'newTitle'):
    valid_title[_column] = valid_title[_column].map(CleanEnd)
valid_title.head()

In [None]:
# Runs of whitespace and ASCII punctuation, with '%' deliberately kept.
# The punctuation is escaped before it is placed inside the character class,
# so characters such as ']', '\\', '^' and '-' are matched literally instead
# of being interpreted as class syntax.
_SEPARATOR_RUN = re.compile(
    '[{}{}]+'.format(
        whitespace,
        re.escape(''.join(ch for ch in punctuation if ch != '%')),
    )
)
# Runs of anything that is not '%', a space or a Hangul jamo/syllable.
_NON_KOREAN_RUN = re.compile(r'[^\% ㄱ-ㅣ가-힣]+')


def TextFilter(text):
    """Keep only Hangul and '%' in ``text``, separated by single spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Every run of other characters (Latin letters,
        digits, punctuation, whitespace) becomes one space, and leading and
        trailing spaces are removed.
    """
    result = _SEPARATOR_RUN.sub(' ', text)
    result = _NON_KOREAN_RUN.sub(' ', result).strip()
    # The second pass can leave several spaces in a row; collapse them.
    return _SEPARATOR_RUN.sub(' ', result)

# Reduce both title columns to the filtered text used for scoring.
for _column in ('title', 'newTitle'):
    valid_title[_column] = valid_title[_column].map(TextFilter)
valid_title.head()

## 5. Evaluation - ROUGE

In [None]:
# Exclude the row labelled 123 before scoring.
# NOTE(review): the reason is not recorded here -- presumably one of its titles
# is empty after filtering, which the scorers below cannot handle; confirm.
# Reassigning with errors='ignore' keeps the cell re-runnable: the in-place
# drop raised KeyError on a second execution because the label was gone.
valid_title = valid_title.drop(index=[123], errors='ignore')
valid_title.head()

In [None]:
# Running sums of ROUGE recall / precision / F1 over the first 499 rows of
# valid_title. The averages are printed in the next cell.

# ROUGE-1
sumRecall_1= 0
sumPrecision_1= 0
sumF1_1= 0

# ROUGE-2
sumRecall_2= 0
sumPrecision_2= 0
sumF1_2= 0

# ROUGE-L
sumRecall_l= 0
sumPrecision_l= 0
sumF1_l= 0

# One scorer is enough; it was previously rebuilt on every iteration.
rouge = Rouge()

for i in range(499):
  # get_scores(hypothesis, reference): column 1 is passed as the hypothesis
  # and column 0 as the reference.
  scores = rouge.get_scores(valid_title.iloc[i,1], valid_title.iloc[i,0])[0]

  # Read the metrics by key. Positional access through list(values()) depends
  # on the dict order the rouge package happens to use, which is not stable
  # across versions and could silently swap recall and F1.
  sumRecall_1+= scores['rouge-1']['r']
  sumPrecision_1+= scores['rouge-1']['p']
  sumF1_1+= scores['rouge-1']['f']

  sumRecall_2+= scores['rouge-2']['r']
  sumPrecision_2+= scores['rouge-2']['p']
  sumF1_2+= scores['rouge-2']['f']

  sumRecall_l+= scores['rouge-l']['r']
  sumPrecision_l+= scores['rouge-l']['p']
  sumF1_l+= scores['rouge-l']['f']

In [None]:
# Average recall, precision and F1 over the 499 scored pairs.
for _label, _sums in (
    ('ROUGE-1:', (sumRecall_1, sumPrecision_1, sumF1_1)),
    ('ROUGE-2:', (sumRecall_2, sumPrecision_2, sumF1_2)),
    ('ROUGE-l:', (sumRecall_l, sumPrecision_l, sumF1_l)),
):
    print(_label, *[_total / 499 for _total in _sums])

In [None]:
# Load the cleansed validation data and keep its first 499 rows.
data_path = '/content/drive/My Drive/23 U 4-1/텍스트마이닝/기말프로젝트/Data/'
valid_origin = pd.read_csv(data_path+ 'valid_cleansing.csv').iloc[:499]
print(valid_origin.shape)

In [None]:
# Preview the first rows of the comparison data.
valid_origin.head()

In [None]:
# Running sums of ROUGE recall / precision / F1 over the 499 rows of
# valid_origin. The averages are printed in the next cell.

# ROUGE-1
sumRecall_1= 0
sumPrecision_1= 0
sumF1_1= 0

# ROUGE-2
sumRecall_2= 0
sumPrecision_2= 0
sumF1_2= 0

# ROUGE-L
sumRecall_l= 0
sumPrecision_l= 0
sumF1_l= 0

# One scorer is enough; it was previously rebuilt on every iteration.
rouge = Rouge()

for i in range(499):
  # get_scores(hypothesis, reference): column 4 is passed as the hypothesis
  # and column 0 as the reference.
  # NOTE(review): the column names are not visible in this cell -- confirm
  # that positions 4 and 0 are the intended pair.
  scores = rouge.get_scores(valid_origin.iloc[i,4], valid_origin.iloc[i,0])[0]

  # Read the metrics by key. Positional access through list(values()) depends
  # on the dict order the rouge package happens to use, which is not stable
  # across versions and could silently swap recall and F1.
  sumRecall_1+= scores['rouge-1']['r']
  sumPrecision_1+= scores['rouge-1']['p']
  sumF1_1+= scores['rouge-1']['f']

  sumRecall_2+= scores['rouge-2']['r']
  sumPrecision_2+= scores['rouge-2']['p']
  sumF1_2+= scores['rouge-2']['f']

  sumRecall_l+= scores['rouge-l']['r']
  sumPrecision_l+= scores['rouge-l']['p']
  sumF1_l+= scores['rouge-l']['f']

In [None]:
# Average recall, precision and F1 over the 499 scored pairs.
for _label, _sums in (
    ('ROUGE-1:', (sumRecall_1, sumPrecision_1, sumF1_1)),
    ('ROUGE-2:', (sumRecall_2, sumPrecision_2, sumF1_2)),
    ('ROUGE-l:', (sumRecall_l, sumPrecision_l, sumF1_l)),
):
    print(_label, *[_total / 499 for _total in _sums])

## 6. Evaluation - Cosine, Manhattan Score

In [None]:
# Average TF-IDF cosine similarity and Manhattan distance between each
# generated title (column 1) and its reference title (column 0), over the
# first 499 rows of valid_title.
sumCos= 0
sumMan= 0

for i in range(499):
  sentences= (valid_title.iloc[i,1], valid_title.iloc[i,0])
  # The vocabulary is fitted on this pair only, so the two rows of the matrix
  # are the pair's TF-IDF vectors.
  tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
  # Scale by the total weight of the pair before taking the L1 distance.
  tfidf_normalized = tfidf_matrix/np.sum(tfidf_matrix)

  # Both helpers return a 1x1 array; take the scalar so the sums stay plain
  # floats and the averages print as numbers rather than as [[x]].
  cos_similar = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
  sumCos+= float(cos_similar[0, 0])

  manhattan_d = manhattan_distances(tfidf_normalized[0:1], tfidf_normalized[1:2])
  sumMan+= float(manhattan_d[0, 0])

print('Cosine:', sumCos/499)
print('Manhattan:', sumMan/499)