# Installation

In [None]:
# pip install pandas transformers 

In [None]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [1]:
import torch

print("torch + cuda:", torch.__version__) # Check PyTorch version
print("Is cuda avialable:", torch.cuda.is_available())

torch + cuda: 2.5.1+cu121
Is cuda avialable: True


# Dataset Pre-processing

In [11]:
DATASET = "FoCus"

In [12]:
import pandas as pd
import json

with open(f'./datasets/{DATASET}/valid_focus.json') as f:
    valid_data = json.load(f)

In [13]:
def convertToDialogue(my_list):
    formatted_string = ""
    for index, item in enumerate(my_list):
        if index % 2 == 0:
            user = "User1"
        else:
            user = "User2"
        formatted_string += f"{user}: {item}\n"
    formatted_string = formatted_string.rstrip("\n")
    return formatted_string

flattened_data = []
data_list = valid_data['data']
for entry in data_list:
    persona =  "".join(entry['persona'])
    list_length = len(entry["utterance"])
    last_utterance = entry["utterance"][-1]
    dialogue_key = f"dialogue{list_length}"
    last_item = last_utterance[dialogue_key]
    flattened_data.append({
                'dialogID': entry['dialogID'],
                'persona': persona,
                'utterance': convertToDialogue(last_item)
            })

df = pd.DataFrame(flattened_data)

In [14]:
df = df.replace(r'\*\*', '', regex=True)
df = df.replace(r'\r', '', regex=True)
df = df.replace("'", "", regex=True)

df.dropna(inplace=True)

# Function to split the conversation
def split_conversation(conv_str):
    utterances = conv_str.split("\n")
    context = "\n".join(utterances[:-1])
    response = utterances[-1]
    return context, response

new_rows = []
for index, row in df.iterrows():
    context, response = split_conversation(row['utterance'])
    new_row = {
        'personas': row['persona'],
        'context': context,
        'act_response': response
    }
    new_rows.append(new_row)

new_df = pd.DataFrame(new_rows)

new_df.head(4)

Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...


In [15]:
# Calculate minimum and maximum number of words in each column
min_persona_length = new_df['personas'].apply(lambda x: len(x.split())).min()
max_persona_length = new_df['personas'].apply(lambda x: len(x.split())).max()

min_context_length = new_df['context'].apply(lambda x: len(x.split())).min()
max_context_length = new_df['context'].apply(lambda x: len(x.split())).max()

min_response_length = new_df['act_response'].apply(lambda x: len(x.split())).min()
max_response_length = new_df['act_response'].apply(lambda x: len(x.split())).max()

# Print the lengths in min-max format
print(f"Persona Length (in words): {min_persona_length}-{max_persona_length}")
print(f"Context Length (in words): {min_context_length}-{max_context_length}")
print(f"Response Length (in words): {min_response_length}-{max_response_length}")

Persona Length (in words): 11-61
Context Length (in words): 56-513
Response Length (in words): 4-108


In [16]:
# Save the new DataFrame to a CSV file
new_df.to_csv(f'./datasets/{DATASET}/ds_cleaned.csv', index=False)

In [17]:
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset

In [18]:
# Step 1: Load the CSV file - Preprocessed FoCuFs Dataset

df = pd.read_csv(f'./datasets/{DATASET}/ds_cleaned.csv')
df.head()

Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


# Custome Torch Dataset Class


In [19]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, AdamW
import torch.nn as nn
from transformers import GPT2Config, GPT2Model

In [20]:
# Section 1: Custom Dataset for FoCus

class FoCusDataset(Dataset):
    def __init__(self, csv_path, tokenizer, max_len=512):
        """
        Args:
            csv_path (str): Path to the cleaned FoCus dataset (ds_cleaned.csv).
            tokenizer (GPT2Tokenizer): Pretrained GPT-2 tokenizer.
            max_len (int): Maximum sequence length for padding/truncation.
        """
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Extract columns
        persona = row['personas']
        context = row['context']
        response = row['act_response']

        # Tokenize persona, context, and response
        persona_tokens = self.tokenizer(
            persona,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        context_tokens = self.tokenizer(
            context,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )
        response_tokens = self.tokenizer(
            response,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            'persona': persona_tokens['input_ids'].squeeze(0),
            'context': context_tokens['input_ids'].squeeze(0),
            'response': response_tokens['input_ids'].squeeze(0),
            'persona_mask': persona_tokens['attention_mask'].squeeze(0),
            'context_mask': context_tokens['attention_mask'].squeeze(0),
            'response_mask': response_tokens['attention_mask'].squeeze(0),
        }

In [21]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", cache_dir="downloaded_LM")  # GPT-2 tokenizer with caching

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})  # Use eos_token as pad_token


In [22]:
# Initialize tokenizer and dataset
DATASET = "FoCus"

csv_path = f"./datasets/{DATASET}/ds_cleaned.csv"  # Path to your dataset

dataset = FoCusDataset(csv_path, tokenizer)

# Display a sample in a readable format
sample = dataset[0]

persona_text = tokenizer.decode(sample['persona'], skip_special_tokens=True)
context_text = tokenizer.decode(sample['context'], skip_special_tokens=True)
response_text = tokenizer.decode(sample['response'], skip_special_tokens=True)

print("Persona:", persona_text)
print("Context:", context_text)
print("Response:", response_text)

Persona: I would like to visit the Nazareth House again.I love Benevolent institutions.I am interested in History.I have curiosity about the Description of this place.I would like to know when it was Built.
Context: User1: I think Ive been there before but I dont remember the name of this place.
User2: This place is the Nazareth House, which you would like to visit again.
User1: Can you describe this house to me?
User2: You have curiosity about the description of Nazareth House and I will tell you. Nazareth House is prominently located on an elevation along Wynnum North Road. The complex consists of a number of buildings including the original building, the Convent and Chapel and two more recent additions, St Josephs Hostel and the nursing home known as Larmeniere.
User1: Does this house look old to me, when it was built?
User2: This house is relatively old, but since you would like to know when it was built, I will explain it to you. Nazareth House was built from 1924 to 1939.
User1: 

# Encoders

In [23]:
import torch.nn as nn
from transformers import GPT2Config, GPT2Model

# Section 2: Persona and Context Encoders
class TransformerEncoder(nn.Module):
    def __init__(self, hidden_size, num_layers, num_heads):
        super(TransformerEncoder, self).__init__()
        # Initialize transformer configuration
        self.config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_heads,
        )
        self.transformer = GPT2Model(self.config)
    
    def forward(self, input_ids, attention_mask):
        # Pass inputs through the transformer
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state  # Return hidden states


# Hyperparameters for the encoders
hidden_size = 768  # Match GPT-2 small
num_layers = 4  # As described in the paper
num_heads = 4

# Initialize Persona and Context Encoders
persona_encoder = TransformerEncoder(hidden_size, num_layers, num_heads)
context_encoder = TransformerEncoder(hidden_size, num_layers, num_heads)

## Example Input

In [24]:
dummy_input_ids = torch.randint(0, 50256, (1, 128))  # Example input IDs
dummy_attention_mask = torch.ones_like(dummy_input_ids)  # Example attention mask

# Forward pass through the encoders
persona_hidden_states = persona_encoder(dummy_input_ids, dummy_attention_mask)
context_hidden_states = context_encoder(dummy_input_ids, dummy_attention_mask)

print("Persona Hidden States Shape:", persona_hidden_states.shape)
print("Context Hidden States Shape:", context_hidden_states.shape)

Persona Hidden States Shape: torch.Size([1, 128, 768])
Context Hidden States Shape: torch.Size([1, 128, 768])


# Dialog Decoder with PAA

In [30]:
class PersonaAdaptiveAttention(nn.Module):
    def __init__(self, hidden_size):
        super(PersonaAdaptiveAttention, self).__init__()
        self.hidden_size = hidden_size
        self.weight_fc = nn.Linear(2 * hidden_size, hidden_size)  # For generating weights
        self.sigmoid = nn.Sigmoid()

    def forward(self, persona_states, context_states, decoder_states):
        """
        Args:
            persona_states: Hidden states from the persona encoder (batch_size, seq_len, hidden_size).
            context_states: Hidden states from the context encoder (batch_size, seq_len, hidden_size).
            decoder_states: Hidden states from the decoder's self-attention (batch_size, seq_len, hidden_size).
        Returns:
            Balanced representation of persona and context (batch_size, seq_len, hidden_size).
        """
        print("decoder_states shape:", decoder_states.shape)  # (batch, seq_len, hidden_size)
        print("persona_states shape:", persona_states.shape)  # Expected: (batch, seq_len, hidden_size)
        print("context_states shape:", context_states.shape)  # Expected: (batch, seq_len, hidden_size)


        # Attention alignment using dot product
        persona_attention = torch.bmm(decoder_states, persona_states.transpose(1, 2))  # (batch, seq_len, seq_len)
        context_attention = torch.bmm(decoder_states, context_states.transpose(1, 2))  # (batch, seq_len, seq_len)

        # Weighted sum of encoder hidden states
        persona_weighted = torch.bmm(persona_attention, persona_states)  # Align persona states
        context_weighted = torch.bmm(context_attention, context_states)  # Align context states

        # Concatenate aligned persona and context states with decoder states
        combined_states = torch.cat((persona_weighted, context_weighted), dim=-1)  # (batch, seq_len, 2*hidden_size)

        # Compute dynamic weights for persona and context
        persona_weights = self.sigmoid(self.weight_fc(combined_states))
        context_weights = 1.0 - persona_weights

        # Weighted contributions
        persona_contribution = persona_weights * persona_weighted
        context_contribution = context_weights * context_weighted

        # Combine contributions
        combined_representation = persona_contribution + context_contribution
        return combined_representation




class DialogDecoderWithPAA(nn.Module):
    def __init__(self, hidden_size, num_layers, num_heads, vocab_size):
        super(DialogDecoderWithPAA, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        
        # GPT-2 decoder
        self.config = GPT2Config(
            n_embd=hidden_size,
            n_layer=num_layers,
            n_head=num_heads,
            vocab_size=vocab_size,
        )
        self.decoder = GPT2Model(self.config)
        
        # Persona-Adaptive Attention
        self.paa = PersonaAdaptiveAttention(hidden_size)
        
        # Final output layer for generating responses
        self.output_layer = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input_ids, attention_mask, persona_states, context_states):
        """
        Args:
            input_ids: Input IDs for the decoder (batch_size, seq_len).
            attention_mask: Attention mask for the decoder (batch_size, seq_len).
            persona_states: Hidden states from the persona encoder (batch_size, seq_len, hidden_size).
            context_states: Hidden states from the context encoder (batch_size, seq_len, hidden_size).
        Returns:
            Token logits (batch_size, seq_len, vocab_size).
        """
        # Pass through the decoder
        decoder_outputs = self.decoder(input_ids=input_ids, attention_mask=attention_mask)
        decoder_states = decoder_outputs.last_hidden_state
        
        # Apply Persona-Adaptive Attention
        balanced_representation = self.paa(persona_states, context_states, decoder_states)
        
        # Generate token logits
        token_logits = self.output_layer(balanced_representation)
        return token_logits


## Example usage

In [31]:
vocab_size = 50257  # GPT-2 vocabulary size
decoder = DialogDecoderWithPAA(hidden_size, num_layers, num_heads, vocab_size)

dummy_decoder_input_ids = torch.randint(0, 50256, (1, 128))  # Example input IDs for the decoder
dummy_decoder_attention_mask = torch.ones_like(dummy_decoder_input_ids)  # Example attention mask

# Forward pass through the dialog decoder
token_logits = decoder(
    input_ids=dummy_decoder_input_ids,
    attention_mask=dummy_decoder_attention_mask,
    persona_states=persona_hidden_states,
    context_states=context_hidden_states,
)

print("Token Logits Shape:", token_logits.shape)

decoder_states shape: torch.Size([1, 128, 768])
persona_states shape: torch.Size([1, 128, 768])
context_states shape: torch.Size([1, 128, 768])
Token Logits Shape: torch.Size([1, 128, 50257])


# Training Loop

In [32]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        persona = batch['persona'].to(device)
        context = batch['context'].to(device)
        response = batch['response'].to(device)
        persona_mask = batch['persona_mask'].to(device)
        context_mask = batch['context_mask'].to(device)
        response_mask = batch['response_mask'].to(device)

        # Shift response tokens for decoder input and target alignment
        decoder_input_ids = response[:, :-1]
        decoder_target_ids = response[:, 1:]

        # Forward pass
        optimizer.zero_grad()
        outputs = model(
            input_ids=decoder_input_ids,
            attention_mask=response_mask[:, :-1],
            persona_states=persona,
            context_states=context
        )

        # Compute loss
        loss = criterion(outputs.view(-1, outputs.size(-1)), decoder_target_ids.reshape(-1))
        loss.backward()

        # Update weights
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [33]:
# Model initialization
vocab_size = len(tokenizer)
hidden_size = 768
num_layers = 4
num_heads = 4
model = DialogDecoderWithPAA(hidden_size, num_layers, num_heads, vocab_size).to("cuda")

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [34]:
for epoch in range(3):
    avg_loss = train_model(model, dataloader, optimizer, criterion, device)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

decoder_states shape: torch.Size([8, 511, 768])
persona_states shape: torch.Size([8, 512])
context_states shape: torch.Size([8, 512])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)