In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP/NLP project


In [1]:
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from pytorch_lightning.loggers import TensorBoardLogger

In [2]:
### Standard Imports
import os
import sys
import torch
import tensorflow as tf
from tensorflow.python.client import device_lib
import numpy as np

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils

In [57]:
### Text Parameters
start_token = '<cls>'
end_token = '<eos>'
pad_token = '<pad>'
unk_token = '<unk>'
newline_token = '<new>'

### General Parameters
random_seed = 42
model_folder = '../../../gpt2/finetuned/v2'
model_name = 'gpt2_ft'

### Model Parameters
window_len = 15
batch_size = 64
epochs = 3

In [5]:
os.makedirs(model_folder, exist_ok=True)

In [13]:
### Load Data
corpus = utils.load_corpus()
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [16]:
### Pre-Processing Text
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(corpus,
                                                                            window_length = window_len,
                                                                            end_token = end_token,
                                                                            start_token = start_token,
                                                                            pad_token = pad_token,
                                                                            unk_token = unk_token,
                                                                            newline_token = newline_token)
vocab_size = len(word_count)

train_words, _, _, _, train_songs, train_songs_token_ind = utils.tokenize_corpus(train_corpus,
                                                                       window_length = window_len,
                                                                       index_to_vocab = index_to_vocab,
                                                                       vocab_to_index = vocab_to_index,
                                                                       end_token = end_token,
                                                                       start_token = start_token,
                                                                       pad_token = pad_token,
                                                                       unk_token = unk_token,
                                                                       newline_token = newline_token)

val_words, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(val_corpus,
                                                           window_length = window_len,
                                                           index_to_vocab = index_to_vocab,
                                                           vocab_to_index = vocab_to_index,
                                                           end_token = end_token,
                                                           start_token = start_token,
                                                           pad_token = pad_token,
                                                           unk_token = unk_token,
                                                           newline_token = newline_token)

In [45]:
# with open(model_folder+'/train_corpus.txt', 'w') as f:
#     f.write(train_corpus)

In [62]:
traindata = TokenDataset(texts = train_words, block_size = 1024)

  0%|                                                                                        | 0/11383 [00:00<…

In [63]:
### Loading Model
hf_model = "gpt2"
ai = aitextgen(model=hf_model, to_gpu = False, verbose=True)
#ai.to_gpu()
out_dir = model_folder

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at aitextgen\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
   

In [64]:
steps = int(epochs * len(traindata) / batch_size) #5000 steps is about 30 epochs for this dataset

In [65]:
steps

1313

In [66]:
ai.train(
    traindata,
    n_gpu = 1,
    num_steps = steps,
    generate_every = 1000,
    output_dir = out_dir,
    loggers = [TensorBoardLogger(out_dir)],
    freeze_layers = True,
    num_layers_freeze = 9,
    line_by_line = False,
    header = False,
)

pytorch_model.bin already exists in /../../../gpt2/finetuned/v1 and will be overwritten!
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


  0%|                                                                                         | 0/1313 [00:00<…

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
Configuration saved in ../../../gpt2/finetuned/v1\config.json
Model weights saved in ../../../gpt2/finetuned/v1\pytorch_model.bin


In [70]:
prompts = ['Whenever I think back', 'And so this I know',
           'I am tired of being what you want me to be', 'Feeling so faithless, lost under the surface',
           'Relight our fire, we will find our way', 'We will rise stronger together']
result_strings = {}
for prompt in prompts:
    output = ai.generate(
        prompt = prompt,
        n = 1,
        min_len = 100,
        max_len = 500,
        temperature = 1,
        do_sample = True,
        use_cache = True,
        early_stopping = False,
        num_beams = 1,
        top_k = 50,
        top_p = 0.75,
        repetition_penalty = 1.2,
        length_penalty = 1.0,
        no_repeat_ngram_size = 0,
        num_beam_groups = 1,
        diversity_penalty = 0.0,
        remove_invalid_values = True,
        return_as_list = True,
        lstrip = False,
        skip_special_tokens = False
    )
    result_strings[prompt] = output

In [71]:
print(result_strings)

{'Whenever I think back': ['Whenever I think back<|endoftext|>'], 'And so this I know': ['And so this I know<|endoftext|>'], 'I am tired of being what you want me to be': ['I am tired of being what you want me to be<|endoftext|>'], 'Feeling so faithless, lost under the surface': ['Feeling so faithless, lost under the surface<|endoftext|>'], 'Relight our fire, we will find our way': ['Relight our fire, we will find our way<|endoftext|>'], 'We will rise stronger together': ['We will rise stronger together<|endoftext|>']}


In [None]:
for k, v in result_strings.items():
    with open(model_folder+f'/human_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w') as f:
        f.write(v)

In [None]:
import json
with open('../../output/prompt_ref.json', 'r') as f:
    eval_prompts = json.load(f)

In [None]:
result_strings = {}
for prompt, actual in eval_prompts.items():
    output = ai.generate(
        prompt = prompt,
        n = 1,
        min_len = 100,
        max_len = 500,
        temperature = 1,
        do_sample = True,
        use_cache = True,
        early_stopping = False,
        num_beams = 1,
        top_k = 50,
        top_p = 0.75,
        repetition_penalty = 1.2,
        length_penalty = 1.0,
        no_repeat_ngram_size = 0,
        num_beam_groups = 1,
        diversity_penalty = 0.0,
        remove_invalid_values = True,
        return_as_list = True,
        lstrip = False,
        skip_special_tokens = False
    )
    result_strings[prompt] = output

In [None]:
for k, v in result_strings.items():
    with open(model_folder+f'/br_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w') as f:
        f.write(v)