# Test_pretrained_ke-t5-small

## git

In [2]:
# !git config --global user.name candym1
# !git config --global user.email tmxk5283@gmail.com
# !git clone https://github.com/seuyon0101/saturi.git

In [3]:
# !cd saturi

## import

In [4]:
import torch
import os
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import nltk
from nltk.tokenize import sent_tokenize

import datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
from transformers import AutoModel, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments

## Test data upload

In [5]:
kor_path = os.getenv('HOME')+"/korean-english-park.train.ko"
eng_path = os.getenv('HOME')+"/korean-english-park.train.en"

In [6]:
with open(kor_path, "r") as f:
    kor = f.read().splitlines()

print("Data Size:", len(kor))
print("Example:")

for sen in kor[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> 개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"
>> 북한의 핵무기 계획을 포기하도록 하려는 압력이 거세지고 있는 가운데, 일본과 북한의 외교관들이 외교 관계를 정상화하려는 회담을 재개했다.
>> "경호 로보트가 침입자나 화재를 탐지하기 위해서 개인적으로, 그리고 전문적으로 사용되고 있습니다."
>> 수자원부 당국은 논란이 되고 있고, 막대한 비용이 드는 이 사업에 대해 내년에 건설을 시작할 계획이다.
>> 또한 근력 운동은 활발하게 걷는 것이나 최소한 20분 동안 뛰는 것과 같은 유산소 활동에서 얻는 운동 효과를 심장과 폐에 주지 않기 때문에, 연구학자들은 근력 운동이 심장에 큰 영향을 미치는지 여부에 대해 논쟁을 해왔다.


In [7]:
with open(eng_path, "r") as f:
    eng = f.read().splitlines()

print("Data Size:", len(eng))
print("Example:")

for sen in eng[0:100][::20]: print(">>", sen)

Data Size: 94123
Example:
>> Much of personal computing is about "can you top this?"
>> Amid mounting pressure on North Korea to abandon its nuclear weapons program Japanese and North Korean diplomats have resumed talks on normalizing diplomatic relations.
>> “Guard robots are used privately and professionally to detect intruders or fire,” Karlsson said.
>> Authorities from the Water Resources Ministry plan to begin construction next year on the controversial and hugely expensive project.
>> Researchers also have debated whether weight-training has a big impact on the heart, since it does not give the heart and lungs the kind of workout they get from aerobic activities such as brisk walking or running for at least 20 minutes.


In [8]:
cleaned_corpus = []
for i in range(len(kor)):
    set_corpus = []
    raw_sen = kor[i] + ' <TSL> ' + eng[i]
    set_corpus.append(raw_sen)
    for t in range(len(set_corpus)):
        set_corpus = list(set(set_corpus))
        for s in set_corpus:
            result = ""
            result += s
            cleaned_corpus.append(result)

In [9]:
cleaned_corpus[2]

"그러나 이것은 또한 책상도 필요로 하지 않는다. <TSL> Like all optical mice, But it also doesn't need a desk."

In [10]:
def preprocess_sentence_ko(sentence, s_token=False, e_token=False):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^ㄱ-ㅎ가-힣a-zA-Z?.!,]+", " ", sentence)

    sentence = sentence.strip()

    if s_token:
        sentence = '<start> ' + sentence

    if e_token:
        sentence += ' <end>'
    
    return sentence

In [11]:
def preprocess_sentence_en(sentence, s_token=False, e_token=False):
    sentence = sentence.lower().strip()
    
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)

    sentence = sentence.strip()

    if s_token:
        sentence = '<start> ' + sentence

    if e_token:
        sentence += ' <end>'
    
    return sentence

In [12]:
cleaned_corpus[0]

'개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?" <TSL> Much of personal computing is about "can you top this?"'

In [13]:
enc_corpus = []
dec_corpus = []

num_examples = 30000

for z in range(num_examples):
    ko, en = cleaned_corpus[z].split(" <TSL> ")
    
    enc_corpus.append(preprocess_sentence_ko(ko))
    dec_corpus.append(preprocess_sentence_en(en, s_token=True, e_token=True))
    
print("Korean:", enc_corpus[0])
print("English:", dec_corpus[0])

Korean: 개인용 컴퓨터 사용의 상당 부분은 이것보다 뛰어날 수 있느냐 ?
English: <start> much of personal computing is about can you top this ? <end>


### DataFreame 으로 변경

In [14]:
df = pd.DataFrame(zip(enc_corpus, dec_corpus))
df.columns = ['input', 'target']

In [15]:
df.head()

Unnamed: 0,input,target
0,개인용 컴퓨터 사용의 상당 부분은 이것보다 뛰어날 수 있느냐 ?,<start> much of personal computing is about ca...
1,모든 광마우스와 마찬가지 로 이 광마우스도 책상 위에 놓는 마우스 패드를 필요로 하...,<start> so a mention a few weeks ago about a r...
2,그러나 이것은 또한 책상도 필요로 하지 않는다 .,"<start> like all optical mice , but it also do..."
3,". 달러하는 이 최첨단 무선 광마우스는 허공에서 팔목 , 팔 , 그외에 어떤 부분이...",<start> uses gyroscopic sensors to control the...
4,정보 관리들은 동남 아시아에서의 선박들에 대한 많은 테러 계획들이 실패로 돌아갔음을...,<start> intelligence officials have revealed a...


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   30000 non-null  object
 1   target  30000 non-null  object
dtypes: object(2)
memory usage: 468.9+ KB


### train, test data

In [17]:
x_train, x_test = train_test_split(df, test_size=0.2, random_state=77)

In [18]:
print(len(x_train))
print(len(x_test))

24000
6000


In [19]:
all_doc_f = np.concatenate((x_train,x_test))

In [20]:
print(len(all_doc_f))

30000


In [21]:
all_doc_f[0]

array(['한편 터키는 독일 뒤스브르크에서 열린 평가전에서 핀란드를 으로 제압했다 .',
       '<start> meanwhile , turkey continued their preparations for the euro finals with a victory over finland in duisburg , germany . <end>'],
      dtype=object)

In [22]:
x_train['input'][1]

'모든 광마우스와 마찬가지 로 이 광마우스도 책상 위에 놓는 마우스 패드를 필요로 하지 않는다 .'

## pretrained_model_t5_small

### tset

In [23]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [24]:
task_prefix = "translate Ko to English: "
sentences = ["너는 누굴까나?.", "너는 누구니."]

inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
)

print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))

['Ko ?.', 'Ko.']


In [25]:
tokenizer = AutoTokenizer.from_pretrained("KETI-AIR/ke-t5-small")

In [26]:
test_text = x_train['input'][1]
test_t5 = tokenizer(test_text).tokens()
print(test_t5)

['▁모든', '▁광', '마우스', '와', '▁마찬가지', '▁', '로', '▁이', '▁광', '마우스', '도', '▁책상', '▁위에', '▁놓는', '▁마우스', '▁', '패드', '를', '▁필요로', '▁하지', '▁않는다', '▁', '.', '</s>']


In [27]:
print(len(x_train) == len(x_train['target']))
print(len(x_test) == len(x_test['target']))

True
True


In [28]:
tokenizer.model_max_length

512

In [29]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [30]:
train_ = pd.DataFrame({'input' : x_train['input'], 'target' : x_train['target']}).reset_index(drop=True)
test_ = pd.DataFrame({'input' : x_test['input'], 'target' : x_test['target']}).reset_index(drop=True)

In [31]:
train_d = Dataset.from_pandas(train_)
test_d = Dataset.from_pandas(test_)

# datasetdict형태로 transformation
dataset = datasets.DatasetDict({"train":train_d,"test":test_d})

In [32]:
# 데이터 값 최종 확인
dataset.set_format(type='pandas')
df = dataset['train'][:]
df

Unnamed: 0,input,target
0,한편 터키는 독일 뒤스브르크에서 열린 평가전에서 핀란드를 으로 제압했다 .,"<start> meanwhile , turkey continued their pre..."
1,이들은 소속당의 열성 지지자뿐만아니라 부동층 및 상대정당의 당원까지 공략하고 나섰다 .,<start> they are portraying themselves as unit...
2,그가 받은 처벌이 어떠한 것인지에 대해서는 알려지지 않았다 .,<start> the marine corps would not specify wha...
3,로딕은 처음부터 서비스가 잘 들어갔다 며 서비스가 위력적이어서 승리할 수 있었다 고...,"<start> one ace was . mph , breaking the dubai..."
4,수도 전역의 여러 개의 붕괴된 건물 속에 많은 사람들이 실종된 것으로 보도되면서 사...,<start> local media reports said one man died ...
...,...,...
23995,로열 아윈 병원의 렌 노타로스 박사는 호주 방송과의 인터뷰에서 대통령의 몸에 박힌 ...,<start> surgeons operated on ramos horta for t...
23996,노무현 대통령은 일 KTV 특집 인터뷰에서 제 차 남북정상회담을 언급하며 김정일 국...,<start> north korean leader kim jong il is the...
23997,년 LZ 힌데브르크가 뉴저지에서 이륙 직전 추락해 화재로 타버렸을 때 라디오 저널리...,"<start> london , england cnn oh , the humanity..."
23998,인도 헌법상 세습적 계급 제도를 근거로 한 신분 차별은 위법이며 대도시에선 이러한 ...,<start> india s constitution outlaws caste bas...


In [33]:
# 인코딩하여 최종 데이터 dict 저장
dataset.set_format(type=None)
def tokenize(batch):
    return tokenizer(batch['input'], padding=True, truncation=True)

In [34]:
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
print(dataset_encoded.column_names)
print(dataset_encoded["train"].column_names)

{'train': ['attention_mask', 'input', 'input_ids', 'target'], 'test': ['attention_mask', 'input', 'input_ids', 'target']}
['attention_mask', 'input', 'input_ids', 'target']


In [36]:
print(dataset_encoded["train"][0])

{'target': '<start> meanwhile , turkey continued their preparations for the euro finals with a victory over finland in duisburg , germany . <end>', 'input_ids': [228, 6211, 12, 1511, 261, 296, 43125, 1082, 37, 440, 42821, 22635, 21, 7, 45, 37920, 7, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input': '한편 터키는 독일 뒤스브르크에서 열린 평가전에서 핀란드를 으로 제압했다 .', 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [37]:
data_collator =  DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

### Model

In [38]:
num_labels = 2
num_epochs = 5
batch_size = 32

In [39]:
model = AutoModel.from_pretrained("KETI-AIR/ke-t5-small")

Some weights of the model checkpoint at KETI-AIR/ke-t5-small were not used when initializing T5Model: ['lm_head.weight']
- This IS expected if you are initializing T5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
torch.cuda.empty_cache()
torch.cuda.empty_cache()
torch.cuda.empty_cache()

In [41]:
def model_init():
    return AutoModel.from_pretrained("KETI-AIR/ke-t5-small").to(device)

In [42]:
args = TrainingArguments(
    output_dir = 'data_test',
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    save_steps=1e6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    disable_tqdm=True,
    load_best_model_at_end=True)

In [43]:
trainer = Trainer(model_init=model_init,
                  args=args,
                  data_collator=data_collator,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["test"],
                  tokenizer=tokenizer)

loading configuration file https://huggingface.co/KETI-AIR/ke-t5-small/resolve/main/config.json from cache at /aiffel/.cache/huggingface/transformers/a240b555451a28d400c0fcd042656bc28d18c553be5503a17a5fff9ab86ecf1b.cfa5a0bf5803bcceef6e8ff70f41932d7d5eb3b077c6885fadf7e912703f33e9
Model config T5Config {
  "_name_or_path": "hf/ke-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.0,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 64128
}

loading weights file https://huggingface.co/KETI-AIR/ke-t5-small/resolve/main/pytorch_model.bin fro

NameError: name 'device' is not defined

In [None]:
# trainer.train()

### pipeline

In [None]:
nltk.download("punkt")

test = "안녕 내이름은 곱등이. 곱등 곱등"
sent_tokenize(test)

In [None]:
sample_text = dataset['train']["input"][1]
summaries = {}

In [None]:
sample_text

In [None]:
test_text = "안녕하세요"

In [None]:
pipe = pipeline("translation_ko_to_en", model="KETI-AIR/ke-t5-large", tokenizer=tokenizer)

In [None]:
pipe_out = pipe(sample_text)

In [None]:
pipe_out

In [None]:
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["translation_text"]))

In [None]:
summaries

In [None]:
torch.cuda.empty_cache()
torch.cuda.empty_cache()
torch.cuda.empty_cache()