In [1]:
import sys
sys.path.append('../core')

In [2]:
from transformers import GPT2Tokenizer, GPT2Model, AutoModelForCausalLM

import time
from typing import Dict, List
from argparse import ArgumentParser

import numpy as np
import optuna
import torch
import torch.nn.functional as F
from data_utils import format_time, save_stats
from dataloader import create_bert_dataloaders
from dataset_loader import dataset_loader
from optuna.trial import Trial
from torch.utils.data import DataLoader
from models.bert_discriminator import BERTDiscriminator, model_name
from models.generator import Generator
from transformers import AutoTokenizer
from util.early_stopping import EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
train_sentences, train_labels, _, _ = dataset_loader.load_dataset('aclImdb_001')

In [14]:
train_labels[-10:]

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']

In [23]:
train_sentences[-8]

'What was Steven Seagal thinking? I mean firstly I love Seagal. I love all his movies up to the mid 2000s. His early stuff is some of the best in the genre. This however does not live up to its excellent name. Attack Force (with protagonist Marshall Lawson {Seagal}) would be expected to be a mindless action movie with Seagal in typical one-liner ass kicking form. However, what we get is a crime mystery, bordering on a political thriller with little or no action. Seagal is always in shadows because of his weight. I could not follow this story. There\'s people who mutate to superhumans when they take a drug. What happened in this movie. The dubbing of Seagal is a disgrace, a shambles and a shame. Why dub the man? The story is terrible. This got a 2/10 from me because of the scene where Seagal asks for backup despite having an army with him, and an hilarious fight scene where seagal swings his hands like a girl facing the camera! "Revenge is a two way street" seagal says in this movie...w

In [26]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
generator = AutoModelForCausalLM.from_pretrained('gpt2')
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
positive_prompt = "Here are 3 positive movie review from IMDB website written in a user post:\n 1. For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan \"The Skipper\" Hale jr. as a police Sgt\n 2. A solid, if unremarkable film. Matthau, as Einstein, was wonderful. My favorite part, and the only thing that would make me go out of my way to see this again, was the wonderful scene with the physicists playing badmitton, I loved the sweaters and the conversation while they waited for Robbins to retrieve the birdie.\n 3. "
# positive_prompt = "İşte Türkçe olumlu bir ürün yorumu:\n\n"
negative_prompt = "Here are 3 negative movie review from IMDB website written in a user post:\n 1. Wow! I remember so many awful films that loosely revolved around high school from the early 1980s. They usually had someincredibly strained plot and lots of 27 year old actors pretending to be students. As I watched this film I felt a little of the nostalgia of growing up in the 1980s. However, then I find out that this film was made in 1989? Say what! Well, the nostalgia factor ends right there, this is just bad. The plot has the city preparing to close a high school and threatening to bus all of the students to inner city high schools. Which is odd, in that the students at this school are both wealthy and abundant. In fact, the main character lives in a mansion. Makes you wonder how they cannot find money to keep this school alive, have they never heard of property taxes. Oh, but here is the kicker. The school board says that they will keep the school alive, if the students can raise $200,000. So the seniors go about doing this. Hmmm, you raise $200,000 but instead of saving that for college, you put it towards saving the high school that you are a Senior in? And why exactly would they close an overpopulated school before the year is out? And...ahh forget it, this film was stupid and made in 1989!?\n 2. What was Steven Seagal thinking? I mean firstly I love Seagal. I love all his movies up to the mid 2000s. His early stuff is some of the best in the genre. This however does not live up to its excellent name. Attack Force (with protagonist Marshall Lawson {Seagal}) would be expected to be a mindless action movie with Seagal in typical one-liner ass kicking form. However, what we get is a crime mystery, bordering on a political thriller with little or no action. Seagal is always in shadows because of his weight. I could not follow this story. There\'s people who mutate to superhumans when they take a drug. What happened in this movie. The dubbing of Seagal is a disgrace, a shambles and a shame. Why dub the man? The story is terrible. This got a 2/10 from me because of the scene where Seagal asks for backup despite having an army with him, and an hilarious fight scene where seagal swings his hands like a girl facing the camera! \"Revenge is a two way street\" seagal says in this movie...well forget revenge Steven, you need redemption!\n 3. "
# negative_prompt = "İşte Türkçe olumsuz bir ürün yorumu:\n\n"
# prompt = "Here is an example of helpdesk emails texts about one of the topics General Inquiry, Human Resources, Billing and Payments, Sales and Pre-Sales, IT Support, Customer Service, Product Support, Returns and Exchanges, Service Outages and Maintenance or Technical Support: \n\n"
# prompt = 'ประโยคภาษาไทยต่อไปนี้มีเนื้อหาเชิงบวก เชิงลบ หรือเป็นกลาง:\n\n'
encoded_input = gpt_tokenizer([positive_prompt, negative_prompt], return_tensors='pt', padding=True)
output = generator.generate(**encoded_input, temperature=0.9, do_sample=True, max_length=800)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [27]:
gpt_tokenizer.batch_decode(output)

['<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftex

In [28]:
print_each_n_step = 50
num_train_epochs = 50
noise_size = 1
batch_size = 8
epsilon = 1e-8
initial_temp = 1.0
anneal_rate = 0.95
min_temp = 0.1

In [29]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# If there's a GPU available...
if torch.backends.mps.is_available():
    print('Using MPS backend')
    device = torch.device('mps')
elif torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


Using MPS backend


In [30]:
labels = dataset_loader.get_labels('task-oriented-dialog_th_001')

train_dataloader, test_dataloader, seq_size = create_bert_dataloaders(
    'task-oriented-dialog_th_001', batch_size=batch_size, device=device,
    tokenizer=tokenizer)

# Models
discriminator = BERTDiscriminator(1, seq_size, device, num_labels=len(labels))

# print(generator)
# print('generator parameters: ' + str(sum(p.numel() for p in generator.parameters() if p.requires_grad)))
# print(discriminator)
# print('discriminator parameters: ' + str(sum(p.numel() for p in discriminator.parameters() if p.requires_grad)))

generator.to(device)
discriminator.to(device)
if torch.cuda.is_available():
    generator.cuda()
    discriminator.cuda()

# Training
training_stats = []

g_vars = [v for v in generator.parameters()]
d_vars = [v for v in discriminator.parameters()]

gen_optimizer = torch.optim.AdamW(g_vars, lr=5e-5)
dis_optimizer = torch.optim.AdamW(d_vars, lr=5e-5)

early_stopping = EarlyStopping(patience=5, min_delta=0.001, verbose=True)

Using dataset task-oriented-dialog_th_001


In [34]:
positive_prompt_size = len(positive_prompt)
negative_prompt_size = len(negative_prompt)
# prompt_size = len(prompt)
prompts = [positive_prompt, negative_prompt] * (batch_size // 2)
encoded_input = gpt_tokenizer(prompts, return_tensors='pt', padding=True)
encoded_input.to(device)

def generate_fake() -> list[str]:
    output = generator.generate(**encoded_input, temperature=0.6, do_sample=True, max_length=800)
    texts = gpt_tokenizer.batch_decode(output, skip_special_tokens=True)
    samples =[]
    # for i in range(0, len(texts)):
    #     samples.append(texts[i][prompt_size:])
    for i in range(0, len(texts), 2):
        samples.append(texts[i][positive_prompt_size:])
        samples.append(texts[i+1][negative_prompt_size:])
    return samples

generate_fake()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  test_elements = torch.tensor(test_elements)


["\xa0This one was a must-watch for anyone who loves science fiction. It's a fantastic read.\n4. \xa0The movie has one of the best scenes of all time, which is how I found it. The ending is a beautiful one, but the movie is also a good film overall.\n5. \xa0This is one of my all time favorites.\n6. \xa0This is my favorite movie.\n7. \xa0This is my favorite movie, and I think it's the best movie of all time.\n8. \xa0This is by far the best movie ever made.\n9. \xa0This is by far the best movie ever made, and I think it's the best movie ever made.\n10. \xa0This is by far the best movie ever made.\n11. \xa0This is by far the best movie ever made.\n12. \xa0This is by far the best movie ever made.\n13. \xa0This is by far the best movie ever made.\n14. \xa0This is by far the best movie ever made.\n15. \xa0This is by far the best movie ever made.\n16. \xa0This is by far the best movie ever made.",
 '\xa0Sebastian is my favorite character in this movie. He is a good actor. He gets to see the c

In [35]:
def test(test_dataloader: DataLoader, epoch_i: int, avg_train_loss_g: float, avg_train_loss_d: float, training_time: int,
         training_stats: List[Dict]):
    """Perform test step at the end of one epoch"""

    print("")
    print("Running Test...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    discriminator.eval()

    # Tracking variables
    total_test_loss = 0
    all_preds = []
    all_labels_ids = []

    # loss
    nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

    # Evaluate data for one epoch
    for text, input_mask, label, label_mask in test_dataloader:
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            _, logits, probs = discriminator(text, input_mask)
            filtered_logits = logits[:, 0:-1]
            total_test_loss += nll_loss(filtered_logits, label)

        # Accumulate the predictions and the input labels
        _, preds = torch.max(filtered_logits, 1)
        all_preds += preds.detach().cpu()
        all_labels_ids += label.detach().cpu()

    # Report the final accuracy for this validation run.
    all_preds = torch.stack(all_preds).numpy()
    all_labels_ids = torch.stack(all_labels_ids).numpy()
    test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
    print("  Accuracy: {0:.3f}".format(test_accuracy))

    # Calculate the average loss over all of the batches.
    avg_test_loss = total_test_loss / len(test_dataloader)
    avg_test_loss = avg_test_loss.item()

    # Measure how long the validation run took.
    test_time = format_time(time.time() - t0)

    print("  Test Loss: {0:.3f}".format(avg_test_loss))
    print("  Test took: {:}".format(test_time))

    # Record all statistics from this epoch.
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss generator': avg_train_loss_g,
        'Training Loss discriminator': avg_train_loss_d,
        'Valid. Loss': avg_test_loss,
        'Valid. Accur.': test_accuracy,
        # 'Valid. F1': f1_score(all_labels_ids, all_preds),
        # 'Valid. Recall': recall_score(all_labels_ids, all_preds),
        # 'Valid. Precision': precision_score(all_labels_ids, all_preds),
        'Training Time': training_time,
        'Test Time': test_time
    })
    return test_accuracy


In [36]:
for epoch_i in range(0, num_train_epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_train_epochs))
    print('Training...')

    t0 = time.time()

    # Reset the total loss for this epoch.
    tr_g_loss = 0
    tr_d_loss = 0
    true_fakes = 0

    # Put the model into training mode.
    generator.train()
    discriminator.train()

    for step, (text, input_mask, label, label_mask) in enumerate(train_dataloader):
        # Progress update every print_each_n_step batches.
        if step % print_each_n_step == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))


        gen_samples = generate_fake()
        encode_result = tokenizer.batch_encode_plus(gen_samples, add_special_tokens=True, max_length=seq_size, padding="max_length", truncation=True, return_tensors='pt')
        gen_rep = encode_result['input_ids'].to(device)
        gen_att_mask = encode_result['attention_mask'].to(device)

        
        # Generate the output of the Discriminator for real and fake data.
        # First, we put together the output of the tranformer and the generator
        disciminator_input = torch.cat([text, gen_rep], dim=0)
        # Also, join with the fake sentences mask

        input_mask = torch.cat([input_mask, gen_att_mask], dim=0)
        # Then, we select the output of the disciminator
        features, logits, probs = discriminator(disciminator_input, input_mask)

        # Finally, we separate the discriminator's output for the real and fake
        # data
        split_size = batch_size
        features_list = torch.split(features, split_size)
        # Splits the tensor into chunks. Each chunk is a view of the original tensor
        D_real_features = features_list[0]
        D_fake_features = features_list[1]

        logits_list = torch.split(logits, split_size)
        D_real_logits = logits_list[0]

        probs_list = torch.split(probs, split_size)
        D_real_probs = probs_list[0]
        D_fake_probs = probs_list[1]

        # Fake labels counting
        true_fakes_batch = (torch.argmax(D_fake_probs, dim=1) == len(labels)).sum().item()
        true_fakes += true_fakes_batch

        # ---------------------------------
        #  LOSS evaluation
        # ---------------------------------
        # Generator's LOSS estimation
        g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:, -1] + epsilon))
        g_feat_reg = 0 * torch.mean(
            torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2)
            )
        g_loss = g_loss_d + g_feat_reg
        # print(g_loss_d, g_feat_reg)

        # Disciminator's LOSS estimation
        logits = D_real_logits[:, 0:-1]
        log_probs = F.log_softmax(logits, dim=-1)

        # The discriminator provides an output for labeled and unlabeled real data
        # so the loss evaluated for unlabeled data is ignored (masked)
        label2one_hot = torch.nn.functional.one_hot(label, len(labels))
        per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
        per_example_loss = torch.masked_select(per_example_loss, label_mask)
        labeled_example_count = per_example_loss.type(torch.float32).numel()

        # It may be the case that a batch does not contain labeled examples,
        # so the "supervised loss" in this case is not evaluated
        if labeled_example_count == 0:
            D_L_Supervised = 0
        else:
            D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

        D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
        D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
        d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U
        # print(D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U)

        # ---------------------------------
        #  OPTIMIZATION
        # ---------------------------------
        # Avoid gradient accumulation
        gen_optimizer.zero_grad()
        dis_optimizer.zero_grad()

        # Calculate weigth updates
        # retain_graph=True is required since the underlying graph will be deleted after backward
        g_loss.backward(retain_graph=True)
        d_loss.backward(retain_graph=True)

        # Apply modifications
        gen_optimizer.step()
        dis_optimizer.step()

        # Save the losses to print them later
        tr_g_loss += g_loss.item()
        tr_d_loss += d_loss.item()


    # Calculate the average loss over all of the batches.
    avg_train_loss_g = tr_g_loss / len(train_dataloader)
    avg_train_loss_d = tr_d_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
    print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
    print("  Training epoch took: {:}".format(training_time))
    print("  Fakes correct discriminared: {}".format(true_fakes))

    print("Saving the models...............................")
    # Saving the model
    torch.save(generator, '../models/generator')
    torch.save(discriminator, '../models/discriminator')

    test_accuracy = test(
        test_dataloader, epoch_i,
        avg_train_loss_g, avg_train_loss_d, training_time, training_stats
    )
    training_stats[-1]['True fakes'] = true_fakes

    # save_stats(training_stats, trial)

    # check early stopping
    early_stopping(test_accuracy)
    if early_stopping.early_stop:
        print('early stopping. Training Stopped')
        break

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.277
  Average training loss discriminator: 2.795
  Training epoch took: 0:00:46
  Fakes correct discriminared: 0
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.385
  Test Loss: 1.094
  Test took: 0:00:10
Initial score set at 0.385071

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.298
  Average training loss discriminator: 2.718
  Training epoch took: 0:00:34
  Fakes correct discriminared: 4
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.261
  Test Loss: 1.105
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.348
  Average training loss discriminator: 2.573
  Training epoch took: 0:00:30
  Fakes correct discriminared: 11
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.264
  Test Loss: 1.097
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.456
  Average training loss discriminator: 2.401
  Training epoch took: 0:00:31
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.486
  Test Loss: 1.008
  Test took: 0:00:10
Improvement found: 0.485782 (previous best: 0.385071)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.563
  Average training loss discriminator: 2.069
  Training epoch took: 0:00:31
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.628
  Test Loss: 0.877
  Test took: 0:00:10
Improvement found: 0.627962 (previous best: 0.485782)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.654
  Average training loss discriminator: 1.905
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.594
  Test Loss: 0.843
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.717
  Average training loss discriminator: 1.566
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.722
  Test Loss: 0.815
  Test took: 0:00:10
Improvement found: 0.721564 (previous best: 0.627962)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.745
  Average training loss discriminator: 1.530
  Training epoch took: 0:00:31
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.656
  Test Loss: 0.781
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.694
  Average training loss discriminator: 1.463
  Training epoch took: 0:00:31
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.710
  Test Loss: 0.693
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.640
  Average training loss discriminator: 1.366
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.757
  Test Loss: 0.669
  Test took: 0:00:10
Improvement found: 0.756517 (previous best: 0.721564)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.681
  Average training loss discriminator: 1.203
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.771
  Test Loss: 0.731
  Test took: 0:00:10
Improvement found: 0.771327 (previous best: 0.756517)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.711
  Average training loss discriminator: 1.071
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.751
  Test Loss: 0.827
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.730
  Average training loss discriminator: 0.989
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.742
  Test Loss: 0.845
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.701
  Average training loss discriminator: 0.929
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.794
  Test Loss: 0.804
  Test took: 0:00:10
Improvement found: 0.794431 (previous best: 0.771327)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.679
  Average training loss discriminator: 0.888
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.815
  Test Loss: 0.755
  Test took: 0:00:10
Improvement found: 0.814573 (previous best: 0.794431)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.675
  Average training loss discriminator: 0.903
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.847
  Test Loss: 0.557
  Test took: 0:00:10
Improvement found: 0.846564 (previous best: 0.814573)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.699
  Average training loss discriminator: 0.830
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.855
  Test Loss: 0.546
  Test took: 0:00:10
Improvement found: 0.855450 (previous best: 0.846564)

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.710
  Average training loss discriminator: 0.872
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.792
  Test Loss: 0.628
  Test took: 0:00:11

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.674
  Average training loss discriminator: 0.808
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.727
  Test Loss: 0.816
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.714
  Average training loss discriminator: 0.914
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.742
  Test Loss: 0.849
  Test took: 0:00:10

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.705
  Average training loss discriminator: 0.877
  Training epoch took: 0:00:30
  Fakes correct discriminared: 16
Saving the models...............................

Running Test...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


  Accuracy: 0.753
  Test Loss: 0.847
  Test took: 0:00:11

Training...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



  Average training loss generetor: 0.645
  Average training loss discriminator: 0.849
  Training epoch took: 0:00:31
  Fakes correct discriminared: 15
Saving the models...............................

Running Test...
  Accuracy: 0.799
  Test Loss: 0.823
  Test took: 0:00:10
Early stopping triggered after 5 epochs with no improvement.
early stopping. Training Stopped
