# Package Import


In [1]:
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import ast

import numpy as np
import pandas as pd
from transformers import RobertaForMaskedLM

In [3]:
# Mount Google Drive at /content/drive; the training CSV and the saved models
# used later in this notebook are addressed through paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# DataLoader

In [4]:
import torch
from tqdm.auto import tqdm

In [5]:
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that stores pre-tokenized examples as tensors.

    Each element of ``dataset`` must be a mapping (it is read through
    ``.items()``). Every value is converted with ``torch.tensor`` once, at
    construction time, and the resulting dicts are kept in ``self.example``.
    """

    def __init__(self, dataset):
        # Convert everything up front so __getitem__ is a plain list lookup.
        self.example = [
            {field: torch.tensor(values) for field, values in record.items()}
            for record in dataset
        ]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.example)

    def __getitem__(self, idx):
        """Return the dict of tensors stored at position ``idx``."""
        return self.example[idx]

def tokenized_dataset(dataset, tokenizer):
    """Tokenize the ``"sentence"`` field of every row of ``dataset``.

    Args:
        dataset: object exposing ``iterrows()`` and ``len()`` (a DataFrame in
            this notebook) whose rows have a ``"sentence"`` entry.
        tokenizer: callable applied to each sentence with truncation at 256 tokens.

    Returns:
        A list with one tokenizer output per row, in row order.
    """
    rows = tqdm(dataset.iterrows(), desc="tokenizing", total=len(dataset))
    return [
        tokenizer(row["sentence"], padding=True, truncation=True, max_length=256, add_special_tokens=True)
        for _, row in rows
    ]

def load_dataset(tokenizer, data_path):
    """Read the CSV at ``data_path`` and return it as a tokenized ``RE_Dataset``.

    The first CSV column is used as the index; each row's sentence is tokenized
    by ``tokenized_dataset`` before being wrapped in the dataset.
    """
    frame = pd.read_csv(data_path, index_col=0)
    return RE_Dataset(tokenized_dataset(frame, tokenizer))

# Pre train을 시행하지 않고 데이터가 어떻게 생성되는지 관찰

In [6]:
from transformers import AutoTokenizer, BertForMaskedLM, pipeline, DataCollatorForLanguageModeling, LineByLineTextDataset

In [None]:
# Load the tokenizer and the pretrained masked-LM checkpoint.
model_name = 'klue/roberta-large'
# The checkpoint is a RoBERTa model, so it is loaded with the RoBERTa MLM class.
# BertForMaskedLM expects a different architecture and weight naming, so the
# pretrained weights would not be loaded properly and this baseline would not
# reflect the published checkpoint.
my_model = RobertaForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Make sure the tokenizer's mask token is '[MASK]', the form used in the text below.
tokenizer.add_special_tokens({'mask_token':'[MASK]'})

In [None]:
# Location of the training CSV on the mounted drive.
train_data = '/content/drive/MyDrive/level2_RE/train.csv'

# Masked-LM collator configured with mlm_probability=0.3.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.3,
)

# Tokenized training sentences wrapped as a torch Dataset.
dataset = load_dataset(tokenizer, train_data)

In [None]:
# Build the masked batch. mlm_probability is set to 0.3 so that the generated data is more diverse.
masked_data = data_collator(dataset.example)

In [10]:
# Decode every masked example back to text and remove the [CLS]/[SEP]/[PAD]
# markers; any other token (including [MASK]) is left in place.
masked_text = []
for token_ids in masked_data['input_ids']:
    decoded = tokenizer.decode(token_ids.tolist())
    for special in ('[CLS]', '[SEP]', '[PAD]'):
        decoded = decoded.replace(special, '')
    masked_text.append(decoded.strip())

In [11]:
# Fill-mask pipeline over the current model and tokenizer, requesting the top 5 candidates per mask.
nlp_fill = pipeline('fill-mask', top_k=5, model=my_model, tokenizer=tokenizer)

In [None]:
# Generate sentences with the model that has not been further pretrained: each
# [MASK] is replaced by one of the first three candidates returned by the
# pipeline, chosen uniformly at random.
sentence_wo_pretrain = []
for each in tqdm(masked_text):
    try:
        tmp_result = nlp_fill(each)
        # For a sentence with a single [MASK] the pipeline returns a flat list of
        # candidate dicts instead of one list per mask; wrap it so both shapes
        # are handled the same way.
        if tmp_result and isinstance(tmp_result[0], dict):
            tmp_result = [tmp_result]

        # One sampled token id per mask, in the order the masks occur.
        token_list = [k[np.random.randint(3)]['token'] for k in tmp_result]

        tmp = tokenizer.encode(each)

        count = 0
        for idx2, j in enumerate(tmp):
            if j == tokenizer.mask_token_id:
                tmp[idx2] = token_list[count]
                count += 1
        # Drop the first and last ids (the special tokens added by encode()).
        sentence_wo_pretrain.append(tokenizer.decode(tmp[1:-1]))
    except Exception:
        # Best effort: store a placeholder so the output stays aligned with
        # masked_text (e.g. when a sentence contains no [MASK] at all).
        sentence_wo_pretrain.append("None")

In [None]:
# Spot-check one generated sentence.
sentence_wo_pretrain[9]

## Redefine the data collator (mlm_probability 0.3 -> 0.15)


In [None]:
# Rebuild the masked-LM collator, this time with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Pretrain

In [None]:
from transformers import Trainer, TrainingArguments, RobertaForMaskedLM

# Number of passes over the training set.
epoch = 15
# NOTE: warmup_ratio is not passed to TrainingArguments; warm-up is set through
# warmup_steps below.
warmup_ratio = 0.1

# Start from the published checkpoint and continue masked-LM training on `dataset`.
model_name = 'klue/roberta-large'
my_model = RobertaForMaskedLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=epoch,
    learning_rate=2e-5,
    warmup_steps=1000,
    # per_gpu_train_batch_size is deprecated in favour of per_device_train_batch_size.
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100
)

trainer = Trainer(
    model=my_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

trainer.train()

In [None]:
# Persist the trained model to the mounted drive.
trainer.save_model('/content/drive/MyDrive/level2_RE')

In [None]:
# Check the generated data: reload the saved model and build a fill-mask
# pipeline that requests the top 3 candidates per mask.
my_model = RobertaForMaskedLM.from_pretrained('/content/drive/MyDrive/level2_RE')
nlp_fill = pipeline('fill-mask', top_k=3, model=my_model, tokenizer=tokenizer)

In [None]:
# Generate sentences with the further-pretrained model: each [MASK] is replaced
# by one of the first three candidates returned by the pipeline, chosen
# uniformly at random.
sentence_w_pretrain = []
for each in tqdm(masked_text):
    try:
        tmp_result = nlp_fill(each)
        # For a sentence with a single [MASK] the pipeline returns a flat list of
        # candidate dicts instead of one list per mask; wrap it so both shapes
        # are handled the same way.
        if tmp_result and isinstance(tmp_result[0], dict):
            tmp_result = [tmp_result]

        # One sampled token id per mask, in the order the masks occur.
        token_list = [k[np.random.randint(3)]['token'] for k in tmp_result]

        tmp = tokenizer.encode(each)

        count = 0
        for idx2, j in enumerate(tmp):
            if j == tokenizer.mask_token_id:
                tmp[idx2] = token_list[count]
                count += 1
        # Drop the first and last ids (the special tokens added by encode()).
        sentence_w_pretrain.append(tokenizer.decode(tmp[1:-1]))
    except Exception:
        # Best effort: store a placeholder so the output stays aligned with
        # masked_text (e.g. when a sentence contains no [MASK] at all).
        sentence_w_pretrain.append("None")

# pretrain with bert expand dataloader

In [None]:
def tokenized_dataset_expand(dataset, tokenizer):
    """Tokenize every row as a pair: (subject/object entities, sentence + label).

    Args:
        dataset: object exposing ``iterrows()`` and ``len()`` (a DataFrame in
            this notebook) whose rows have ``"sentence"``, ``"label"``,
            ``"subject_entity"`` and ``"object_entity"`` entries. The two
            entity entries are strings containing a dict literal with a
            ``"word"`` key.
        tokenizer: callable with a ``sep_token`` attribute; it receives the two
            text segments and truncates at 256 tokens.

    Returns:
        A list with one tokenizer output per row, in row order.

    Raises:
        ValueError / SyntaxError: if an entity entry is not a plain literal.
    """
    data = []
    for _, item in tqdm(dataset.iterrows(), desc="tokenizing", total=len(dataset)):

        # The entity cells come from the CSV file, so parse them with
        # literal_eval: unlike eval() it accepts only literals and cannot run
        # code embedded in the data.
        subj = ast.literal_eval(item["subject_entity"])["word"]
        obj = ast.literal_eval(item["object_entity"])["word"]
        # Include the label in the tokenized text as well.
        label = item['label']
        concat_entity = tokenizer.sep_token.join([subj, obj])
        sentence = tokenizer.sep_token.join([item['sentence'], label])

        output = tokenizer(concat_entity,
                           sentence,
                           padding=True, truncation=True, max_length=256, add_special_tokens=True)
        data.append(output)

    return data

def load_dataset_expand(tokenizer, data_path):
    """Read the CSV at ``data_path`` and return the label-expanded ``RE_Dataset``.

    Side effect: every distinct value of the ``label`` column is added to
    ``tokenizer`` via ``add_tokens`` (in order of first appearance) before the
    rows are tokenized with ``tokenized_dataset_expand``.
    """
    frame = pd.read_csv(data_path, index_col=0)

    # Add the labels to the vocab.
    tokenizer.add_tokens(list(frame.label.unique()))

    return RE_Dataset(tokenized_dataset_expand(frame, tokenizer))

In [None]:
# Build the label-expanded dataset; this call also adds the label strings to `tokenizer`.
new_dataset = load_dataset_expand(tokenizer,train_data)

tokenizing:   0%|          | 0/25976 [00:00<?, ?it/s]

In [None]:
# Check the tokens of the first expanded example and the tokenizer length.
# Inspect `new_dataset` (built after the label tokens were added) rather than
# the earlier `dataset`, which was tokenized before the vocabulary changed.
print(new_dataset.example[0])
print(len(tokenizer))

In [None]:
from transformers import Trainer, TrainingArguments, RobertaForMaskedLM

# Number of passes over the training set.
epoch = 15
# NOTE: warmup_ratio is not passed to TrainingArguments; warm-up is set through
# warmup_steps below.
warmup_ratio = 0.1

model_name = 'klue/roberta-large'
my_model = RobertaForMaskedLM.from_pretrained(model_name)
# Resize the embedding matrix to the current tokenizer length so that tokens
# added to the tokenizer have embeddings.
my_model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir='model_output',
    overwrite_output_dir=True,
    num_train_epochs=epoch,
    learning_rate=2e-5,
    warmup_steps=1000,
    # per_gpu_train_batch_size is deprecated in favour of per_device_train_batch_size.
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100
)

trainer = Trainer(
    model=my_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=new_dataset
)

trainer.train()

Embedding(32030, 1024)

In [None]:
# Persist the model trained on the label-expanded data.
# NOTE(review): the Trainer was built without a tokenizer, so presumably only the
# model is written here; confirm whether the tokenizer should be saved alongside it.
trainer.save_model('/content/drive/MyDrive/level2_RE/bert_expand')

In [None]:
# Generate data with the bert_expand model: reload it from the saved checkpoint.
my_model = RobertaForMaskedLM.from_pretrained('/content/drive/MyDrive/level2_RE/bert_expand')

In [None]:
# Fill-mask pipeline over the reloaded model, requesting the top 3 candidates per mask.
nlp_fill = pipeline('fill-mask', top_k=3, model=my_model, tokenizer=tokenizer)

In [None]:
# Generate sentences with the bert_expand model: each [MASK] is replaced by one
# of the first three candidates returned by the pipeline, chosen uniformly at
# random.
bert_expand_sentence = []
for each in tqdm(masked_text):
    try:
        tmp_result = nlp_fill(each)
        # For a sentence with a single [MASK] the pipeline returns a flat list of
        # candidate dicts instead of one list per mask; wrap it so both shapes
        # are handled the same way.
        if tmp_result and isinstance(tmp_result[0], dict):
            tmp_result = [tmp_result]

        # One sampled token id per mask, in the order the masks occur.
        token_list = [k[np.random.randint(3)]['token'] for k in tmp_result]

        tmp = tokenizer.encode(each)

        count = 0
        for idx2, j in enumerate(tmp):
            if j == tokenizer.mask_token_id:
                tmp[idx2] = token_list[count]
                count += 1
        # Drop the first and last ids (the special tokens added by encode()).
        bert_expand_sentence.append(tokenizer.decode(tmp[1:-1]))
    except Exception:
        # Best effort: store a placeholder so the output stays aligned with
        # masked_text. (The original appended to an undefined name here, which
        # raised NameError on the first failure.)
        bert_expand_sentence.append("None")

# Make csv file

In [None]:
# Generate the sentences that are written to the CSV: each [MASK] is replaced by
# one of the first three candidates returned by the current pipeline, chosen
# uniformly at random.
# NOTE(review): the original iterated over `result`, which is not defined
# anywhere in this notebook; `masked_text` is assumed to be the intended input
# because every other generation cell uses it. Confirm.
sentence = []
for each in tqdm(masked_text):
    try:
        tmp_result = nlp_fill(each)
        # For a sentence with a single [MASK] the pipeline returns a flat list of
        # candidate dicts instead of one list per mask; wrap it so both shapes
        # are handled the same way.
        if tmp_result and isinstance(tmp_result[0], dict):
            tmp_result = [tmp_result]

        # One sampled token id per mask, in the order the masks occur.
        token_list = [k[np.random.randint(3)]['token'] for k in tmp_result]

        tmp = tokenizer.encode(each)

        count = 0
        for idx2, j in enumerate(tmp):
            if j == tokenizer.mask_token_id:
                tmp[idx2] = token_list[count]
                count += 1
        # Drop the first and last ids (the special tokens added by encode()).
        sentence.append(tokenizer.decode(tmp[1:-1]))
    except Exception:
        # Best effort: store a placeholder so the output stays aligned with the input list.
        sentence.append("None")

  0%|          | 0/25976 [00:00<?, ?it/s]

In [None]:
# Number of generated sentences; it has to match the number of rows they are assigned to below.
len(sentence)

In [None]:
# Reload the original training CSV (first column as index) to attach the generated sentences.
train_df = pd.read_csv(train_data,index_col=0)

In [None]:
# Store the contents of `sentence` as the 'generated_1' column.
train_df['generated_1'] = sentence

In [None]:
# Store the sentences generated without further pretraining as their own column.
train_df['generated_wo_pretrain'] = sentence_wo_pretrain

In [None]:
# Write the augmented frame (original columns plus the generated ones) to the mounted drive.
train_df.to_csv('/content/drive/MyDrive/level2_RE/bert_generated.csv')