In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [None]:
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from pytorch_lightning.loggers import TensorBoardLogger

In [None]:
### Standard Imports
import os
import sys
import torch
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim import downloader

In [None]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
import lib.sentiment as sentiment

In [None]:
### Text Parameters
# Special-token strings handed to TokenDataset as bos/eos/pad/unk.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
# Only referenced by the disabled word-level tokenisation path in this notebook.
newline_token = '<new>'
# Sentiment labels prepended to training songs and to generation prompts.
neu_token, pos_token = '<neu>', '<pos>'

### General Parameters
random_seed = 42
model_name = 'gpt2_ft_sent'                  # used in the output file names
model_folder = '../../../gpt2/finetuned/v3'  # training output and generated samples go here

### Model Parameters
# epochs and batch_size feed the training-step computation further down.
epochs = 30
batch_size = 64
# Theme words passed to Sentiment.get_theme_vector when scoring songs.
sentiment_themes = [
    'unity', 'affection', 'aspiration',
    'home', 'relationship', 'motivation',
    'nationhood', 'life', 'celebrate',
]

In [None]:
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(model_folder, exist_ok=True)

In [None]:
### Embeddings for Sentiment Analysis
# Pre-trained vectors loaded through gensim's downloader; they are passed as the
# `embedding` argument of Sentiment.get_theme_vector when scoring songs.
# NOTE(review): gensim is expected to download and cache the model on first use — confirm
# that network access is available where this notebook runs.
glove_vectors = downloader.load('glove-wiki-gigaword-100')

In [None]:
### Load Data
# Whole corpus plus the train/validation partition, both supplied by the project helpers.
corpus = utils.load_corpus()
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

# Break each corpus into individual songs. The last element of every split is
# discarded (presumably an empty trailing fragment — TODO confirm against utils.split_text).
songs, train_songs, val_songs = (
    utils.split_text(text)[:-1]
    for text in (corpus, train_corpus, val_corpus)
)

In [None]:
### Scoring Songs for Sentiment
def _score_song(song_text):
    """Score one song and return (VADER scores, theme scores) from the 'generated' slot."""
    scorer = sentiment.Sentiment()
    # The scorer works on an original/generated pair; only the generated side is
    # read back here, so the original text is a dummy value.
    scorer.clean_text(original_text = 'placeholder', generated_text = song_text)
    scorer.get_theme_vector(sentiment_themes, embedding = glove_vectors, topn=10)
    scorer.score_vader_sentiment()
    scorer.score_word_vector_sentiment()
    return (
        scorer.vader_sentiment_scores['generated'],
        scorer.word_vector_sentiment_scores['generated'],
    )


# One entry per song in the full corpus, in corpus order.
vader_scores = []
sentiment_theme_scores = []
for song in songs:
    vader_entry, theme_entry = _score_song(song)
    vader_scores.append(vader_entry)
    sentiment_theme_scores.append(theme_entry)

In [None]:
# Tabulate the per-song VADER scores and plot the distribution of every column
# on a shared x-axis.
# Assumes each entry of vader_scores is a mapping of score name -> value (a 'pos'
# column is read from this frame later) — TODO confirm.
vader_df = pd.DataFrame(vader_scores)
vader_df.hist(figsize = (12,8), sharex = True)
plt.show()
# Observation: there is negligible variety of sentiment in the entire corpus.

In [None]:
# Same view for the word-vector theme scores: one histogram per column, shared x-axis.
sentiment_theme_df = pd.DataFrame(sentiment_theme_scores)
sentiment_theme_df.hist(figsize = (12,8), sharex = True)
plt.show()
# Observation: there is negligible variety of sentiment in the entire corpus.

In [None]:
### For illustrative purposes, we will use the mean positive value as the cutoff between a <neu> and <pos> context label.
# NOTE(review): avg_pos is the mean over the whole corpus (vader_df is built from
# `songs`), not just the training split — confirm that this is intended.
avg_pos = vader_df['pos'].mean()
train_words = []  # unused while the word-level variant stays disabled; kept so the name stays defined


def _strip_sentiment_label(song_text):
    """Return song_text without a leading <pos>/<neu> label, if it carries one."""
    for label in (pos_token, neu_token):
        if song_text.startswith(label):
            return song_text[len(label):]
    return song_text


for i, song in enumerate(train_songs):
    # train_songs is updated in place, so drop any label left by an earlier run of
    # this cell: re-running must neither stack labels nor score the label text.
    song = _strip_sentiment_label(song)
    sentiment_scorer = sentiment.Sentiment()
    # Only the 'generated' slot is scored; the original text is a dummy value.
    sentiment_scorer.clean_text(original_text = 'placeholder', generated_text = song)
    sentiment_scorer.score_vader_sentiment()
    pos_test = sentiment_scorer.vader_sentiment_scores['generated']['pos'] > avg_pos
    # Songs above the corpus mean get <pos>; everything else gets <neu>.
    train_songs[i] = (pos_token if pos_test else neu_token) + song

In [None]:
# Build the aitextgen training dataset from the in-memory list of training songs
# (each already prefixed with its sentiment label), reusing the notebook's
# special-token strings for BOS/EOS/UNK/PAD.
# NOTE(review): confirm how line_by_line=True and block_size interact with
# in-memory `texts` in the installed aitextgen version.
traindata = TokenDataset(texts = train_songs,
                         line_by_line = True,
                         block_size = 1024,
                         bos_token = start_token,
                         eos_token = end_token,
                         unk_token = unk_token,
                         pad_token = pad_token)

In [None]:
### Loading Model
# The model name "gpt2" is handed to aitextgen, which is asked to place the model
# on the GPU at construction time (to_gpu=True).
hf_model = "gpt2"
ai = aitextgen(model=hf_model, to_gpu = True, verbose=True)
#ai.to_gpu()
# Training output goes to the model folder set in the parameters cell.
out_dir = model_folder

In [None]:
# Turn the epoch budget into a number of training steps:
# epochs * len(traindata) / batch_size, truncated to an int.
# NOTE(review): batch_size only appears in this formula; the ai.train call does not
# receive it, so confirm that the effective training batch size matches.
steps = int(epochs * len(traindata) / batch_size) #5000 steps is about 30 epochs for this dataset
print(steps)

In [None]:
# Fine-tune on the labelled songs with layer freezing enabled (num_layers_freeze=9);
# checkpoints and TensorBoard logs both go to out_dir.
# NOTE(review): generate_every (10000) is larger than the ~5000 steps mentioned where
# `steps` is computed, so presumably no mid-training samples are produced — confirm.
# NOTE(review): line_by_line/header look like dataset-loading options; confirm they
# have any effect when a ready-made TokenDataset is passed.
ai.train(
    traindata,
    n_gpu = 1,
    num_steps = steps,
    seed = random_seed,  # apply the notebook-level seed so training runs are repeatable
    generate_every = 10000,
    output_dir = out_dir,
    loggers = [TensorBoardLogger(out_dir)],
    freeze_layers = True,
    num_layers_freeze = 9,
    line_by_line = True,
    header = False,
)

In [None]:
# Opening lines used to sample lyrics under each sentiment label.
prompts = ['Whenever I think back', 'And so this I know',
           'I am tired of being what you want me to be', 'Feeling so faithless, lost under the surface',
           'Relight our fire, we will find our way', 'We will rise stronger together']

# Sampling settings shared by both labels. Keeping them in one place guarantees the
# <pos> and <neu> generations differ only in the label token of the prompt.
generation_settings = dict(
    n = 1,
    min_len = 100,
    max_len = 500,
    temperature = 1,
    do_sample = True,
    use_cache = True,
    early_stopping = False,
    num_beams = 1,
    top_k = 50,
    top_p = 0.75,
    repetition_penalty = 1.2,
    length_penalty = 1.0,
    no_repeat_ngram_size = 0,
    num_beam_groups = 1,
    diversity_penalty = 0.0,
    remove_invalid_values = True,
    return_as_list = True,
    lstrip = False,
    skip_special_tokens = False,
)

# prompt -> generated text, one dict per sentiment label.
pos_result_strings = {}
neu_result_strings = {}
for prompt in prompts:
    # Generate the <pos> sample first, then the <neu> one, for every prompt.
    for label_token, label_results in ((pos_token, pos_result_strings),
                                       (neu_token, neu_result_strings)):
        # '<VERSE>' is presumably the section marker that opens songs in the
        # training corpus — TODO confirm against the corpus format.
        output = ai.generate(
            prompt = label_token+'<VERSE>'+prompt,
            **generation_settings
        )
        # return_as_list=True with n=1: keep the first (only requested) sample.
        label_results[prompt] = output[0]

In [None]:
# Show the generated samples for both sentiment labels (prompt -> text).
# `result_strings` is never assigned in the active cells, so the two dicts that
# the generation loop actually fills are printed instead.
print(pos_result_strings)
print(neu_result_strings)

In [None]:
# Write every generated sample to its own text file in the model folder.
# File name pattern: human_<model_name>-<pos|neu>-<prompt, lower-cased and passed
# through utils.remove_punct>.txt
for label, label_results in (('pos', pos_result_strings), ('neu', neu_result_strings)):
    for k, v in label_results.items():
        file_name = f'human_{model_name}-{label}-{utils.remove_punct(k.lower())}.txt'
        # Explicit UTF-8 so the output does not depend on the platform's default
        # encoding (generated text may contain non-ASCII characters).
        with open(model_folder+'/'+file_name, 'w', encoding='utf-8') as f:
            f.write(v)

In [None]:
# Load the reference prompts for evaluation from the shared output folder.
# In the visible notebook, eval_prompts is only consumed by the disabled
# (commented-out) generation cell that follows.
import json
with open('../../output/prompt_ref.json', 'r') as f:
    eval_prompts = json.load(f)

In [None]:
# result_strings = {}
# for prompt, actual in eval_prompts.items():
#     output = ai.generate(
#         prompt = prompt,
#         n = 1,
#         min_len = 100,
#         max_len = 500,
#         temperature = 1,
#         do_sample = True,
#         use_cache = True,
#         early_stopping = False,
#         num_beams = 1,
#         top_k = 50,
#         top_p = 0.75,
#         repetition_penalty = 1.2,
#         length_penalty = 1.0,
#         no_repeat_ngram_size = 0,
#         num_beam_groups = 1,
#         diversity_penalty = 0.0,
#         remove_invalid_values = True,
#         return_as_list = True,
#         lstrip = False,
#         skip_special_tokens = False
#     )
#     result_strings[prompt] = output

In [None]:
# for k, v in result_strings.items():
#     with open(model_folder+f'/br_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w') as f:
#         f.write(v)