### GAN Attempt!

In [None]:
!pip install transformers
!pip install datasets

In [None]:
"""
Setup
"""

import torch
import torch.nn as nn
import torch.optim as optim
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
import pandas as pd
from datasets import Dataset

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd drive/MyDrive/CS\ 224N\ Project
%ls # verify that you are in the right directory

In [None]:
"""
Define the generator (use the pre-trained BART implementation)
"""

# Generator weights: a bart-base checkpoint pre-trained on our dataset.
# (A generically pre-trained bart-base is an alternative worth trying.)
model_dir = 'bart-base-checkpoint-204000'

# Model and tokenizer are both loaded from the same checkpoint directory.
netG = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Print the module tree of the loaded generator.
print(netG)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [None]:
"""
Define transformer discriminator
"""

# NOTE(review): `nc` and `ndf` are not referenced by any visible cell; they look like
# leftovers from an image-GAN template. Confirm before removing.
nc = 1
ndf = 64


class Discriminator(nn.Module):
    """Transformer-encoder binary classifier used as the GAN discriminator.

    The last dimension of the input is resized to ``d_model`` with
    ``nn.Upsample``, passed through a stack of transformer encoder layers,
    mean-pooled over the sequence dimension, and mapped to one probability per
    batch item by a linear layer followed by a sigmoid.

    Args:
        ngpu: Stored as ``self.ngpu``; not otherwise used by this class.
        d_model: Size the last input dimension is resized to, and the
            transformer width. Must be divisible by ``nhead``.
        nhead: Number of attention heads in every encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, ngpu, d_model=64, nhead=8, num_layers=6):
        super().__init__()
        self.ngpu = ngpu

        # Resizes the last dimension of a 3-D input to `d_model`.
        self.upsample = nn.Upsample(size=d_model)

        # batch_first=True makes the encoder read inputs as (batch, seq, d_model),
        # which is the layout `forward` documents and pools over. With the default
        # (batch_first=False) attention would run across the batch dimension and the
        # mean in `forward` would average over batch items instead of positions.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 1),
            nn.Sigmoid()
        )

    def forward(self, input):
        """Score a batch of sequences.

        Args:
            input: Float tensor of shape (batch_size, seq_len, features); the
                last dimension is resized to ``d_model`` before encoding.

        Returns:
            Tensor of shape (batch_size, 1) with values in (0, 1).
        """
        upsampled_input = self.upsample(input)  # (batch_size, seq_len, d_model)
        transformer_output = self.transformer_encoder(upsampled_input)
        # Mean-pool over the sequence dimension, then classify each batch item.
        discriminator_output = self.classifier(transformer_output.mean(dim=1))  # (batch_size, 1)

        return discriminator_output


In [None]:
# Build the discriminator, then move its parameters onto the selected device.
ngpu = 1
netD = Discriminator(ngpu)
netD = netD.to(device)

In [None]:
"""
Loss functions and optimizers
"""
# --- Hyperparameters -------------------------------------------------------
nz = 512       # size of generator input
lr = 0.0002    # Adam learning rate (shared by G and D)
beta1 = 0.5    # Adam first-moment coefficient (shared by G and D)

# --- Label convention for real / fake samples during training ---------------
real_label, fake_label = 1., 0.

# --- Loss: binary cross-entropy on the discriminator's probability ----------
criterion = nn.BCELoss()

# --- Fixed batch of latent vectors -------------------------------------------
# NOTE(review): intended for visualising generator progress, but no visible cell
# reads `fixed_noise` — confirm whether it is still needed.
fixed_noise = torch.randn(64, nz, 1, 1, device=device)

# --- One Adam optimizer per network -------------------------------------------
optimizerD = optim.Adam(netD.parameters(), betas=(beta1, 0.999), lr=lr)
optimizerG = optim.Adam(netG.parameters(), betas=(beta1, 0.999), lr=lr)

In [None]:
# Read the train / validation / test CSVs, keeping only the post text and its comments.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)[['text', 'comments']]
    for csv_path in ('aita_train_set.csv', 'aita_valid_set.csv', 'aita_test_set.csv')
)

In [None]:
# Wrap every pandas split in a `datasets.Dataset`.
train_data_txt, validation_data_txt, test_data_txt = (
    Dataset.from_pandas(frame) for frame in (train_df, valid_df, test_df)
)

# Print each split on its own line(s).
print(train_data_txt, validation_data_txt, test_data_txt, sep="\n")

Dataset({
    features: ['text', 'comments'],
    num_rows: 81614
})
Dataset({
    features: ['text', 'comments'],
    num_rows: 998
})
Dataset({
    features: ['text', 'comments'],
    num_rows: 998
})


In [None]:
"""
Preprocess
"""

encoder_max_length = 256  # max number of tokens kept from each source post
decoder_max_length = 64  # max number of tokens kept from each target comment


def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of (text, comments) pairs into seq2seq model inputs.

    Args:
        batch: Mapping with a "text" list (sources) and a "comments" list (targets).
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called with
            ``return_tensors="pt"`` and must return a mapping whose "input_ids"
            entry is a tensor.
        max_source_length: Length the sources are padded/truncated to.
        max_target_length: Length the targets are padded/truncated to.

    Returns:
        dict with every entry of the tokenized source, plus "labels": a nested list
        of plain ints holding the target token ids, with each padding position
        replaced by -100.
    """
    source, target = batch["text"], batch["comments"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length, return_tensors="pt"
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length, return_tensors="pt"
    )

    # Build a fresh dict rather than rebinding the `batch` argument.
    features = dict(source_tokenized.items())

    # Ignore padding in the loss: swap every pad id for -100 in one vectorised step.
    # `.tolist()` yields plain ints; looping over the tensor element by element is slow
    # and leaves 0-dim tensors mixed with ints inside the label lists.
    target_ids = target_tokenized["input_ids"]
    features["labels"] = target_ids.masked_fill(
        target_ids == tokenizer.pad_token_id, -100
    ).tolist()
    return features


def _tokenize_batch(batch):
    """Tokenize one batch using the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Tokenize the training split in batches, dropping its original text columns.
train_data = train_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=train_data_txt.column_names,
)

# Same treatment for the validation split.
validation_data = validation_data_txt.map(
    _tokenize_batch,
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

In [None]:
# Sanity check: show the tokenized training split (its features and row count).
print(train_data)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 81614
})


In [None]:
# Sanity check: number of input ids in the first training example
# (the recorded output is 256, the value of encoder_max_length).
print(len(train_data[0]['input_ids']))

256


In [None]:
# One fixed validation post, tokenized once, reused to sample from the generator
# at checkpoints during training.
fixed_validation_index = 17
fixed_validation_inputs = valid_df['text'].iloc[fixed_validation_index]
fixed_validation_data = tokenizer(
    fixed_validation_inputs,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors="pt",
)

In [None]:
batch_size = 4
dataloader = torch.utils.data.DataLoader(
    train_data,
    shuffle=True,
    batch_size=batch_size,
)

# Look at the first batch only: print the shape of its stacked attention mask.
for i, data in enumerate(dataloader):
    print(torch.stack(data['attention_mask']).shape)
    break


torch.Size([256, 4])


In [None]:
# Training Loop
#
# Alternates one discriminator update and one generator update per example.
# Examples are taken one at a time from `validation_data` (no batching yet).

# Lists to keep track of progress
img_list = []
G_losses = []
D_losses = []
iters = 0
num_epochs = 1
max_input_length = 512

print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # todo: batch this/use a dataloader
    num_examples = len(validation_data)
    for i in range(num_examples):
        data = validation_data[i]

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with the real comment
        netD.zero_grad()
        # The real comment's token ids as floats, shaped (1, 1, n_tokens) for D.
        # NOTE(review): padding positions hold -100 here (batch_tokenize_preprocess
        # replaces pad ids), while generated sequences hold raw token ids, so D may be
        # able to separate real from fake on that alone. Confirm this is intended.
        real_cpu = torch.tensor(data['labels'], dtype=torch.float32)
        real_cpu = real_cpu.view(1, 1, -1)
        real_cpu = real_cpu.to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, dtype=torch.float, device=device)

        # Forward pass of the real comment through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on the real sample
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
        errD_real.backward()
        D_x = output.mean().item()

        ## Train with the generated (fake) comment
        inputs = valid_df.iloc[i]['text']
        gen_inputs = tokenizer(
            inputs, max_length=max_input_length, padding='max_length',
            truncation=True, return_tensors="pt",
        )
        # Generate a fake comment for the same post
        fake = netG.generate(**gen_inputs, num_beams=8, do_sample=True, min_length=10, max_length=64)
        label.fill_(fake_label)
        # Same layout as the real sample: float token ids shaped (1, 1, n_tokens)
        fake = fake.type(torch.float32)
        fake = fake.view(1, 1, -1)
        fake = fake.detach().to(device)
        # Classify the fake sample with D
        output = netD(fake).view(-1)
        # Calculate D's loss on the fake sample
        errD_fake = criterion(output, label)
        # Gradients accumulate (sum) with those from the real sample
        errD_fake.backward()
        D_G_z1 = output.mean().item()
        # Compute error of D as sum over the fake and the real samples
        errD = errD_real + errD_fake
        # Update D
        optimizerD.step()

        ############################
        # (2) Update G network: maximize log(D(G(z)))
        ###########################
        netG.zero_grad()
        label.fill_(real_label)  # fake labels are real for generator cost
        # Since we just updated D, perform another forward pass of the fake sample through D
        output = netD(fake).view(-1)
        # Calculate G's loss based on this output
        errG = criterion(output, label)
        # NOTE(review): `fake` was detached above and was produced by `generate`, so
        # this backward pass has no graph path into netG. Confirm whether
        # optimizerG.step() changes any generator weight at all.
        errG.backward()
        D_G_z2 = output.mean().item()
        # Update G
        optimizerG.step()

        # Output training stats
        if i % 5 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                  % (epoch, num_epochs, i, num_examples,
                     errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

        # Save Losses for plotting later
        G_losses.append(errG.item())
        D_losses.append(errD.item())

        if iters == 5:
            netG.save_pretrained('pineapple/pineapple-initial-save')

        # Periodic checkpoint: every 500 iterations and on the very last step.
        # The last-step test uses the length of the split being iterated
        # (validation_data); comparing against len(train_data) could never match.
        is_last_step = (epoch == num_epochs - 1) and (i == num_examples - 1)
        if (iters != 0 and iters % 500 == 0) or is_last_step:
            netG.save_pretrained('pineapple/pineapple-halfway')
            # Sample from G on the fixed validation post (the result is not stored).
            with torch.no_grad():
                fake = netG.generate(**fixed_validation_data, num_beams=8, do_sample=True, min_length=10, max_length=64).detach()
            # img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1

netG.save_pretrained('pineapple/pineapple-full')

Starting Training Loop...
[0/1][0/998]	Loss_D: 1.3979	Loss_G: 1.0473	D(x): 0.5657	D(G(z)): 0.5632 / 0.3509
[0/1][5/998]	Loss_D: 2.1694	Loss_G: 0.7613	D(x): 0.4147	D(G(z)): 0.7245 / 0.4671
[0/1][10/998]	Loss_D: 1.7153	Loss_G: 0.7108	D(x): 0.4136	D(G(z)): 0.5650 / 0.4912
[0/1][15/998]	Loss_D: 1.2648	Loss_G: 1.2467	D(x): 0.3801	D(G(z)): 0.2572 / 0.2874
[0/1][20/998]	Loss_D: 1.9756	Loss_G: 1.2152	D(x): 0.3145	D(G(z)): 0.5590 / 0.2967
[0/1][25/998]	Loss_D: 1.6681	Loss_G: 0.5402	D(x): 0.6364	D(G(z)): 0.7036 / 0.5827
[0/1][30/998]	Loss_D: 1.7606	Loss_G: 0.4695	D(x): 0.5831	D(G(z)): 0.7051 / 0.6253
[0/1][35/998]	Loss_D: 0.8031	Loss_G: 1.8999	D(x): 0.6768	D(G(z)): 0.3382 / 0.1496
[0/1][40/998]	Loss_D: 0.5026	Loss_G: 2.7055	D(x): 0.6407	D(G(z)): 0.0558 / 0.0668
[0/1][45/998]	Loss_D: 1.6420	Loss_G: 0.7098	D(x): 0.5200	D(G(z)): 0.6278 / 0.4918
[0/1][50/998]	Loss_D: 1.5490	Loss_G: 2.0841	D(x): 0.3529	D(G(z)): 0.3979 / 0.1244
[0/1][55/998]	Loss_D: 2.6368	Loss_G: 0.8763	D(x): 0.0801	D(G(z)): 0.1057 /