In [2]:
import os

# Project root: the current working directory resolved to an absolute path,
# so the same code works both locally and on a server.
ROOT_PATH = os.path.abspath(".")

# Folder that holds every dataset file.
DATA_ROOT_PATH = os.path.join(ROOT_PATH, "data")

# Absolute path of each dataset file, built from its file name.
LYRIC_PATH, BILLBOARD_PATH, GEULSTAGRAM_PATH = (
    os.path.join(DATA_ROOT_PATH, dataset_file)
    for dataset_file in (
        "lyrics_kor.txt",
        "rawdata_김지훈_201500844.tsv",
        "geulstagram.csv",
    )
)

print(ROOT_PATH)

/Users/noopy/Documents/BERT-PROJECTS/kor-3-line-poetry


In [3]:
from datetime import datetime
from easydict import EasyDict

# Create the configuration object, seeded with the dataset constants.
CFG = EasyDict(
    DEBUG=True,
    num_workers=4,
    train_batch_size=16,
)

# Train configuration: run identity, base model and optimiser settings.
CFG.user_name = "snoop2head"
today = datetime.now().strftime("%m%d_%H:%M")  # month+day, then hour:minute
CFG.file_base_name = "_".join((CFG.user_name, today))
# Model name as registered on Hugging Face:
# https://huggingface.co/skt/ko-gpt-trinity-1.2B-v0.5
CFG.model_dir = "skt/ko-gpt-trinity-1.2B-v0.5"
CFG.max_token_length = 42
CFG.learning_rate = 5e-5
CFG.weight_decay = 1e-2  # https://paperswithcode.com/method/weight-decay

# Step-based training schedule settings.
CFG.save_steps = 500
CFG.early_stopping_patience = 5
CFG.warmup_steps = 500
CFG.logging_steps = 100
CFG.evaluation_strategy = "epoch"
CFG.evaluation_steps = 500

# Output folders, all located directly under the project root.
CFG.result_dir = os.path.join(ROOT_PATH, "results")
CFG.saved_model_dir = os.path.join(ROOT_PATH, "best_models")
CFG.logging_dir = os.path.join(ROOT_PATH, "logs")
CFG.baseline_dir = os.path.join(ROOT_PATH, "baseline-code")

print(CFG)

{'DEBUG': True, 'num_workers': 4, 'train_batch_size': 16, 'user_name': 'snoop2head', 'file_base_name': 'snoop2head_1114_23:29', 'model_dir': 'skt/ko-gpt-trinity-1.2B-v0.5', 'max_token_length': 42, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'save_steps': 500, 'early_stopping_patience': 5, 'warmup_steps': 500, 'logging_steps': 100, 'evaluation_strategy': 'epoch', 'evaluation_steps': 500, 'result_dir': '/Users/noopy/Documents/BERT-PROJECTS/kor-3-line-poetry/results', 'saved_model_dir': '/Users/noopy/Documents/BERT-PROJECTS/kor-3-line-poetry/best_models', 'logging_dir': '/Users/noopy/Documents/BERT-PROJECTS/kor-3-line-poetry/logs', 'baseline_dir': '/Users/noopy/Documents/BERT-PROJECTS/kor-3-line-poetry/baseline-code'}


In [4]:
import random
import torch
import pandas as pd
import numpy as np

# Set both process-level environment variables in a single call.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmarking turned off.

    Args:
        seed: Seed value handed to each generator.
    """
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case

    # cuDNN flags
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)

In [5]:
# read a text file into a list of lines
def read_txt(path):
    """Return all lines of the UTF-8 text file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one string per line; line endings are kept.
    """
    with open(path, mode="r", encoding="utf-8") as text_file:
        return text_file.readlines()

# draw a random sample (with replacement) from a list
def sampling(list_lines: list, n: int) -> list:
    """Draw ``n`` random items from ``list_lines`` with replacement.

    Positions are sampled with NumPy's global generator (so the result follows
    ``np.random.seed``) and the original objects are looked up by position.
    Sampling positions rather than the list itself keeps every element's own
    Python type: passing the list to ``np.random.choice`` would first convert
    it to a NumPy array, turning strings into ``np.str_`` and failing on
    nested or ragged items.

    Args:
        list_lines: Items to sample from.
        n: Number of items to draw.

    Returns:
        A list of ``n`` items taken from ``list_lines``; an item may appear
        more than once.

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``list_lines`` is
            empty and ``n`` is greater than zero.
    """
    picked_positions = np.random.choice(len(list_lines), n)
    return [list_lines[position] for position in picked_positions]

In [6]:
import torch
from transformers import GPT2LMHeadModel

CFG.saved_model_dir = "./results"
# Fine-tuned checkpoint, located relative to the project root instead of a
# machine-specific absolute path.
model_path = os.path.join(ROOT_PATH, "results", "snoop2head_1114_05_58_loss_0.3655.pt")

# Attach Language model Head to the pretrained GPT model
model = GPT2LMHeadModel.from_pretrained(CFG.model_dir)  # KoGPT3 shares the same structure as KoGPT2.

# Pick the device: the first GPU only when CUDA is available and DEBUG is off,
# otherwise the CPU. Always assigning `device` avoids the NameError the
# previous if/elif raised when CUDA was unavailable and DEBUG was False.
use_gpu = torch.cuda.is_available() and not CFG.DEBUG
device = torch.device("cuda:0" if use_gpu else "cpu")

# map_location lets a checkpoint saved on a GPU be loaded on the chosen device.
model.load_state_dict(torch.load(model_path, map_location=device))

# move the model to device and switch to inference mode
model.to(device)
model.eval()
print(device)

cpu


In [7]:
# Run nvidia-smi only when the selected device is the first CUDA device.
if torch.device("cuda:0") == device:
    os.system("nvidia-smi")

In [8]:
from transformers import GPT2Tokenizer, PreTrainedTokenizerFast

# Preprocessing reference: https://huggingface.co/transformers/preprocessing.html
# "Fast" means that the tokenizer code is written in Rust.

# Keyword options passed to from_pretrained together with the checkpoint name.
# NOTE(review): padding / truncation / return_tensors look like encode-time
# options — confirm they have any effect when given at load time.
encoding_options = dict(
    max_len=CFG.max_token_length,
    padding="max_length",
    add_special_tokens=True,
    return_tensors="pt",
    truncation=True,
)

# Special tokens to register on the tokenizer.
special_tokens = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    CFG.model_dir,
    **encoding_options,
    **special_tokens,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [11]:
input_sentence = "그대 왜 내 꿈에"

def infer_sentence(input_sentence, k, output_token_length):
    """Sample distinct continuations of ``input_sentence`` from the model.

    The prompt is encoded once, then ``model.generate`` is called repeatedly
    with sampling enabled. Each round produces ``k`` sequences; every sequence
    is cut at its first pad token (when it has one), decoded, and merged into
    a de-duplicated result list. Rounds continue until at least ``k`` distinct
    sentences have been collected.

    Args:
        input_sentence: Prompt text the generated sentences start from.
        k: Minimum number of distinct sentences to collect; also the number of
            sequences requested from the model per round.
        output_token_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        A list of distinct decoded sentences in the order they were first
        generated. Because a whole round is merged at a time, the list can
        hold more than ``k`` entries (up to ``2 * k - 1``). An empty list is
        returned when ``k`` is not positive.
    """
    # Encode the prompt and move it to the model's device once; both are
    # loop-invariant.
    input_ids = tokenizer.encode(
        input_sentence,
        add_special_tokens=False,
        return_tensors="pt",
    ).to(device)
    pad_token_id = tokenizer.pad_token_id

    list_decoded_sequences = []
    while len(list_decoded_sequences) < k:
        # generate output sequences from the encoded input sequence
        output_sequences = model.generate(
            input_ids=input_ids,
            do_sample=True,
            max_length=output_token_length,
            num_return_sequences=k,
        )

        for generated_sequence in output_sequences.tolist():
            # Strip the padding tail. A sequence without any pad token is kept
            # whole; calling .index() unconditionally raised ValueError here.
            if pad_token_id in generated_sequence:
                generated_sequence = generated_sequence[:generated_sequence.index(pad_token_id)]
            decoded_sequence = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
            list_decoded_sequences.append(decoded_sequence)

        # Drop duplicates while keeping first-seen order.
        list_decoded_sequences = list(dict.fromkeys(list_decoded_sequences))

    return list_decoded_sequences

print(f"Inferred sentences given '{input_sentence}'")
infer_sentence(input_sentence, k=7, output_token_length=CFG.max_token_length)

Inferred sentences given '그대 왜 내 꿈에'


['그대 왜 내 꿈에 나타나지 않는 걸까',
 '그대 왜 내 꿈에 나타나지 않았나',
 '그대 왜 내 꿈에 나타나지 않았을까 이유 모를 눈물이 흐른다',
 '그대 왜 내 꿈에 나오지 않는 걸까?',
 '그대 왜 내 꿈에 나오질 않는 거야',
 '그대 왜 내 꿈에 안 나왔죠',
 '그대 왜 내 꿈에 오지 않으려 하는 걸까요',
 '그대 왜 내 꿈에 나타나지 않았을까',
 '그대 왜 내 꿈에 오지 않는 거죠',
 '그대 왜 내 꿈에 오지 않죠',
 '그대 왜 내 꿈에 나타나지 않았지',
 '그대 왜 내 꿈에 나타나지 않았을까 후회',
 '그대 왜 내 꿈에 안오죠 지나간 글']

In [10]:
def make_samhaengshi(input_letter, k, output_token_length):
    """Generate sentences for each character of ``input_letter``.

    ``infer_sentence`` is called once per character, using that character as
    the prompt, and the results are concatenated in character order.

    Args:
        input_letter: String whose characters each start a generated line.
        k: Forwarded to ``infer_sentence`` as ``k``.
        output_token_length: Forwarded to ``infer_sentence`` as
            ``output_token_length``.

    Returns:
        A flat list holding the sentences generated for every character.
    """
    return [
        generated_line
        for head_character in input_letter
        for generated_line in infer_sentence(
            head_character, k=k, output_token_length=output_token_length
        )
    ]

make_samhaengshi(input_letter="자탄풍", k=1, output_token_length=CFG.max_token_length)

['자라고 시간은 흐르고 당신은 변하고 난 당신으로부터 도망치고 있네',
 '탄탄한 인생보다 알찬 인생이 어디 있어',
 '풍파 없는 항해 얼마나 단조로운가.']