In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
file_path = '/content/drive/My Drive/masked_examples_LARGE.json'
!pip install git+https://github.com/google-research/bleurt.git
!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip

Mounted at /content/drive
Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-r3h18wr2
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-r3h18wr2
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: BLEURT
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone
  Created wheel for BLEURT: filename=BLEURT-0.0.2-py3-none-any.whl size=16456763 sha256=0ee5eaaa0ca9aad56c9fcd8cbb764389ea69a2117fee83423fb30e3c13e586f5
  Stored in directory: /tmp/pip-ephem-wheel-cache-ipplc7y2/wheels/64/f4/2c/509a6c31b8ebde891a81029fd94f199b1b92f0e7cfc20d417a
Successfully built BLEURT
Installing collected packages: BLEURT
Successfully installed BLEURT-0.0.2
--2024-06-09 00:31:39--  https://storage.googleapi

NameError: name 'variables' is not defined

# FINE-TUNE BART

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from sklearn.model_selection import train_test_split
import json
import random

# Load and prepare data
file_path = '/content/drive/My Drive/masked_examples_LARGE.json'
with open(file_path, 'r') as file:
    data = json.load(file)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add cross-attention layer
class GPT2WithCrossAttention(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.cross_attention = torch.nn.MultiheadAttention(config.n_embd, config.n_head, dropout=config.attn_pdrop)

    def forward(self, input_ids, attention_mask=None, cross_attention_ids=None, cross_attention_mask=None, labels=None):
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            head_mask=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            past_key_values=None,
            use_cache=False,
            output_attentions=False,
            output_hidden_states=False,
            return_dict=False,
        )

        hidden_states = transformer_outputs[0]

        if cross_attention_ids is not None:
            cross_attention_outputs = self.transformer(
                cross_attention_ids,
                attention_mask=cross_attention_mask,
                head_mask=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                past_key_values=None,
                use_cache=False,
                output_attentions=False,
                output_hidden_states=False,
                return_dict=False,
            )
            cross_attention_hidden = cross_attention_outputs[0]
            cross_attention_outputs, _ = self.cross_attention(hidden_states, cross_attention_hidden, cross_attention_hidden)
            hidden_states = hidden_states + cross_attention_outputs

        lm_logits = self.lm_head(hidden_states)

        outputs = (lm_logits,) + transformer_outputs[1:]
        if labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            outputs = (loss,) + outputs

        return outputs

model = GPT2WithCrossAttention.from_pretrained('gpt2')

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
model = model.to(device)

class DialogueDataset(Dataset):
    def __init__(self, tokenizer, inputs, targets, max_len=512):
        self.tokenizer = tokenizer
        self.inputs = inputs
        self.targets = targets
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        input_encoding = tokenizer(input_text, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')
        target_encoding = tokenizer(target_text, padding='max_length', max_length=self.max_len, truncation=True, return_tensors='pt')

        labels = target_encoding['input_ids']
        labels[labels == tokenizer.pad_token_id] = -100  # Set padding token id to -100 so that it is ignored in loss computation

        return input_encoding['input_ids'].squeeze(), input_encoding['attention_mask'].squeeze(), labels.squeeze()

# Prepare the dataset
inputs = [item['input'] for item in data]
targets = [item['target'] for item in data]
input_train, input_val, target_train, target_val = train_test_split(inputs, targets, test_size=0.2, random_state=42)

train_dataset = DialogueDataset(tokenizer, input_train, target_train)
val_dataset = DialogueDataset(tokenizer, input_val, target_val)

# Parameters
batch_size = 1  # Reduced batch size to avoid out of memory error
num_epochs = 16
num_workers = 4  # Assuming a multi-core machine

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)

# Setup the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Extract game state information from the input
        game_state_ids = input_ids.clone()
        game_state_mask = attention_mask.clone()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, cross_attention_ids=game_state_ids, cross_attention_mask=game_state_mask, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Training loss: {avg_train_loss}")

# Save the model and tokenizer
model.save_pretrained('trained_gpt2_model')
tokenizer.save_pretrained('trained_gpt2_tokenizer')


Some weights of GPT2WithCrossAttention were not initialized from the model checkpoint at gpt2 and are newly initialized: ['cross_attention.in_proj_bias', 'cross_attention.in_proj_weight', 'cross_attention.out_proj.bias', 'cross_attention.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using cuda device


KeyboardInterrupt: 

# EVAL PART

In [13]:
!pip install datasets
!pip install rouge_score
!pip install bleurt
import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_metric
import nltk
from bleurt import score

# Ensure that nltk's resources are downloaded
nltk.download('punkt')

# Check if CUDA is available and set the default device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

# Load pretrained tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('trained_gpt2_tokenizer')
model = GPT2WithCrossAttention.from_pretrained('trained_gpt2_model')
model.to(device)
model.eval()

# Assuming validation data is loaded correctly into these variables
val_dataset = DialogueDataset(tokenizer, input_val, target_val)
val_loader = DataLoader(val_dataset, batch_size=8)

# Load metrics
bleu_metric = load_metric('bleu')
rouge_metric = load_metric('rouge')
accuracy_metric = load_metric('accuracy')

# Initialize BLEURT scorer
bleurt_scorer = score.BleurtScorer("bleurt-base-128")

def compute_metrics(pred_ids, labels):
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_mod = labels.clone()
    labels_mod[labels_mod == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_mod, skip_special_tokens=True)

    # Tokenize predictions and references for BLEU
    predictions_tokens = [nltk.word_tokenize(pred) for pred in pred_str]
    references_tokens = [[nltk.word_tokenize(ref)] for ref in label_str]

    # Update metrics
    bleu_metric.add_batch(predictions=predictions_tokens, references=references_tokens)
    rouge_metric.add_batch(predictions=pred_str, references=label_str)
    accuracy_metric.add_batch(predictions=pred_ids.flatten().tolist(), references=labels_mod.flatten().tolist())

    # Compute BLEURT scores
    bleurt_scores = bleurt_scorer.score(references=label_str, candidates=pred_str)

    return pred_str, label_str, bleurt_scores

# Evaluation loop
model.eval()
total_loss = 0
all_bleurt_scores = []
pred_str_list = []
label_str_list = []
bleurt_scores_list = []

for input_ids, attention_mask, labels in val_loader:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    # Extract game state information from the input
    game_state_ids = input_ids.clone()
    game_state_mask = attention_mask.clone()

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, cross_attention_ids=game_state_ids, cross_attention_mask=game_state_mask, labels=labels)
        logits = outputs[0]  # The logits are the first output in GPT-2
        pred_ids = torch.argmax(logits, dim=-1)
        loss = outputs[0]
        total_loss += loss.item()

        # Compute metrics
        pred_str, label_str, bleurt_scores = compute_metrics(pred_ids, labels)
        all_bleurt_scores.extend(bleurt_scores)
        pred_str_list.extend(pred_str)
        label_str_list.extend(label_str)
        bleurt_scores_list.extend(bleurt_scores)

# Calculate final scores
final_bleu = bleu_metric.compute()
final_rouge = rouge_metric.compute()
final_accuracy = accuracy_metric.compute()
avg_bleurt = sum(all_bleurt_scores) / len(all_bleurt_scores)
avg_loss = total_loss / len(val_loader)

print(f"Validation Loss: {avg_loss}")
print(f"BLEU Score: {final_bleu['bleu']}")
print(f"ROUGE Score: {final_rouge}")
print(f"Accuracy: {final_accuracy['accuracy']}")
print(f"BLEURT Score: {avg_bleurt}")

# Print some predictions and their corresponding targets for qualitative analysis
for i in range(5):
    print(f"Prediction: {pred_str_list[i]}")
    print(f"Reference: {label_str_list[i]}")
    print(f"BLEURT Score: {bleurt_scores_list[i]}")


Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=7e17179642b2315f1bb145dfc9fcfc23ab72ec1837a6f50d3ee1685889e4da03
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Using cuda device


OSError: trained_gpt2_tokenizer is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load DialogRPT model and tokenizer
tokenizer_rpt = AutoTokenizer.from_pretrained("microsoft/DialogRPT-human-vs-rand")
model_rpt = AutoModelForSequenceClassification.from_pretrained("microsoft/DialogRPT-human-vs-rand")
model_rpt.to(device)  # Move the model to GPU if available

def generate_responses(model, tokenizer, dataloader, device):
    model.eval()
    responses = []
    contexts = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)

            # Decode the input_ids to text for context
            context = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
            contexts.extend(context)

            # Generate responses
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=50)
            decoded_responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            responses.extend(decoded_responses)

    return contexts, responses

def evaluate_with_dialogrpt(model_rpt, tokenizer_rpt, contexts, responses, device):
    model_rpt.eval()
    scores = []
    with torch.no_grad():
        for context, response in zip(contexts, responses):
            inputs = tokenizer_rpt.encode_plus(context, response, return_tensors="pt", max_length=512, truncation=True, padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}  # Ensure tensor is on the correct device
            outputs = model_rpt(**inputs)
            score = torch.sigmoid(outputs.logits).squeeze().item()  # Use sigmoid if the logits are not already probabilities
            scores.append(score)
    return scores

# Generate responses using the GPT-2 model with cross-attention
contexts, responses = generate_responses(model, tokenizer, val_loader, device)

# Evaluate the responses using DialogRPT
scores = evaluate_with_dialogrpt(model_rpt, tokenizer_rpt, contexts, responses, device)

print("Average DialogRPT Score:", sum(scores) / len(scores))


# DIALOGUE RPT BUT FOR HUMAN VS MACHINE

config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Average DialogRPT Score: 0.9973594482643489


# BERTSCORE EVAL METRICS

# ALL CODE + EVAL FOR DEFAULT BART NO FINETUNING