In [None]:
# The model consists of the following key components:

# Persona Encoder - Applies self-attention to persona embeddings.
# Context Encoder - Applies self-attention to dialogue history embeddings.
# Persona-Adaptive Attention (PAA) - Uses cross-attention between persona and context, dynamically adjusting their influence.
# Dialogue Decoder - Generates responses using a transformer-based model, incorporating persona-aware representations.

In [None]:
# pip install torch transformers datasets

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SelfAttention(nn.Module):
    """Single-head self-attention mechanism."""
    def __init__(self, embed_size):
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.shape[-1] ** 0.5)
        attention_weights = self.softmax(attention_scores)

        output = torch.matmul(attention_weights, V)
        return output


In [3]:
class PersonaEncoder(nn.Module):
    def __init__(self, embed_size):
        super(PersonaEncoder, self).__init__()
        self.attention = SelfAttention(embed_size)
        self.norm = nn.LayerNorm(embed_size)

    def forward(self, persona_embeds):
        attended = self.attention(persona_embeds)
        return self.norm(attended + persona_embeds)  # Residual Connection


In [4]:
class ContextEncoder(nn.Module):
    def __init__(self, embed_size):
        super(ContextEncoder, self).__init__()
        self.attention = SelfAttention(embed_size)
        self.norm = nn.LayerNorm(embed_size)

    def forward(self, context_embeds):
        attended = self.attention(context_embeds)
        return self.norm(attended + context_embeds)  # Residual Connection


In [5]:
class PersonaAdaptiveAttention(nn.Module):
    def __init__(self, embed_size):
        super(PersonaAdaptiveAttention, self).__init__()
        self.context_attention = SelfAttention(embed_size)
        self.persona_attention = SelfAttention(embed_size)

        self.weighting = nn.Linear(embed_size, 1)  # Adaptive weighting
        self.norm = nn.LayerNorm(embed_size)

    def forward(self, persona_embeds, context_embeds):
        attended_context = self.context_attention(context_embeds)
        attended_persona = self.persona_attention(persona_embeds)

        # Adaptive Weighting (From PAA Mechanism)
        weights = torch.sigmoid(self.weighting(attended_persona))
        weighted_persona = weights * attended_persona

        # Fusion: Combining Persona and Context
        fused_representation = attended_context + weighted_persona
        return self.norm(fused_representation)


In [6]:
class DialogueDecoder(nn.Module):
    def __init__(self, model_name="gpt2", embed_size=768):
        super(DialogueDecoder, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_name)

    def forward(self, input_embeds, response_tokens):
        outputs = self.gpt2(inputs_embeds=input_embeds, labels=response_tokens)
        return outputs.loss, outputs.logits


In [7]:
class PersonaAdaptiveChatbot(nn.Module):
    def __init__(self, embed_size=768, model_name="gpt2"):
        super(PersonaAdaptiveChatbot, self).__init__()

        self.persona_encoder = PersonaEncoder(embed_size)
        self.context_encoder = ContextEncoder(embed_size)
        self.paa = PersonaAdaptiveAttention(embed_size)
        self.decoder = DialogueDecoder(model_name, embed_size)

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    def forward(self, persona_tokens, context_tokens, response_tokens):
        # Token Embeddings
        persona_embeds = self.decoder.gpt2.transformer.wte(persona_tokens)
        context_embeds = self.decoder.gpt2.transformer.wte(context_tokens)

        # Encoders
        encoded_persona = self.persona_encoder(persona_embeds)
        encoded_context = self.context_encoder(context_embeds)

        # Persona-Adaptive Attention
        fused_representation = self.paa(encoded_persona, encoded_context)

        # Use PAA Output for GPT-2 Decoder
        response_embeds = self.decoder.gpt2.transformer.wte(response_tokens)
        response_embeds = response_embeds + fused_representation  # Inject PAA info

        return self.decoder(response_embeds, response_tokens)


In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer


csv_path = "datasets\FoCus\ds_cleaned.csv" #focus
df = pd.read_csv(csv_path)

assert "personas" in df.columns and "context" in df.columns and "act_response" in df.columns, "Missing columns in dataset!"


In [9]:
df.head()

Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


In [10]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token

# Tokenization function
def tokenize_texts(persona, context, response, max_length=50):
    persona_tokens = tokenizer(persona, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"]
    context_tokens = tokenizer(context, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"]
    response_tokens = tokenizer(response, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")["input_ids"]
    return persona_tokens.squeeze(0), context_tokens.squeeze(0), response_tokens.squeeze(0)

# Apply tokenization to the entire dataset
df["tokenized"] = df.apply(lambda row: tokenize_texts(row["personas"], row["context"], row["act_response"]), axis=1)

# Extract tokenized tensors
persona_tensors = torch.stack([x[0] for x in df["tokenized"]])
context_tensors = torch.stack([x[1] for x in df["tokenized"]])
response_tensors = torch.stack([x[2] for x in df["tokenized"]])


In [11]:
class FoCusDataset(Dataset):
    def __init__(self, persona_tensors, context_tensors, response_tensors):
        self.persona_tensors = persona_tensors
        self.context_tensors = context_tensors
        self.response_tensors = response_tensors

    def __len__(self):
        return len(self.persona_tensors)

    def __getitem__(self, idx):
        return {
            "personas": self.persona_tensors[idx],
            "context": self.context_tensors[idx],
            "response": self.response_tensors[idx],
        }

# Create dataset
dataset = FoCusDataset(persona_tensors, context_tensors, response_tensors)

# Create DataLoader
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)


In [12]:
# Define model
model = PersonaAdaptiveChatbot()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Define loss function (GPT-2 uses CrossEntropyLoss)
criterion = nn.CrossEntropyLoss()


In [13]:
device="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [14]:
import torch
from tqdm import tqdm

# Set device for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, dataloader, optimizer, num_epochs=20):
    model.to(device)  # Move model to GPU
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True)

        for batch in progress_bar:
            # Move batch tensors to GPU
            persona_tokens = batch["personas"].to(device)
            context_tokens = batch["context"].to(device)
            response_tokens = batch["response"].to(device)

            optimizer.zero_grad()
            loss, logits = model(persona_tokens, context_tokens, response_tokens)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Update tqdm progress bar with current loss
            progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

# Train the model with CSV data
train(model, train_loader, optimizer)


Epoch 1/20: 100%|██████████| 125/125 [00:13<00:00,  9.05it/s, loss=4.45]


Epoch 1/20, Loss: 556.5154


Epoch 2/20: 100%|██████████| 125/125 [00:13<00:00,  8.95it/s, loss=3.35]


Epoch 2/20, Loss: 418.2602


Epoch 3/20: 100%|██████████| 125/125 [00:13<00:00,  8.95it/s, loss=2.15]


Epoch 3/20, Loss: 268.3800


Epoch 4/20: 100%|██████████| 125/125 [00:13<00:00,  8.95it/s, loss=1.87]


Epoch 4/20, Loss: 233.9319


Epoch 5/20: 100%|██████████| 125/125 [00:13<00:00,  9.17it/s, loss=1.69]


Epoch 5/20, Loss: 211.1146


Epoch 6/20: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s, loss=1.53]


Epoch 6/20, Loss: 191.1770


Epoch 7/20: 100%|██████████| 125/125 [00:13<00:00,  9.05it/s, loss=1.36]


Epoch 7/20, Loss: 170.2746


Epoch 8/20: 100%|██████████| 125/125 [00:13<00:00,  9.17it/s, loss=1.2] 


Epoch 8/20, Loss: 150.2616


Epoch 9/20: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s, loss=1.05]


Epoch 9/20, Loss: 131.5594


Epoch 10/20: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s, loss=0.914]


Epoch 10/20, Loss: 114.2637


Epoch 11/20: 100%|██████████| 125/125 [00:13<00:00,  9.14it/s, loss=0.78] 


Epoch 11/20, Loss: 97.5561


Epoch 12/20: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s, loss=0.676]


Epoch 12/20, Loss: 84.5215


Epoch 13/20: 100%|██████████| 125/125 [00:13<00:00,  9.14it/s, loss=0.583]


Epoch 13/20, Loss: 72.8321


Epoch 14/20: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s, loss=0.501]


Epoch 14/20, Loss: 62.6469


Epoch 15/20: 100%|██████████| 125/125 [00:13<00:00,  9.24it/s, loss=0.44] 


Epoch 15/20, Loss: 54.9808


Epoch 16/20: 100%|██████████| 125/125 [00:13<00:00,  9.28it/s, loss=0.385]


Epoch 16/20, Loss: 48.1615


Epoch 17/20: 100%|██████████| 125/125 [00:13<00:00,  9.22it/s, loss=0.352]


Epoch 17/20, Loss: 44.0043


Epoch 18/20: 100%|██████████| 125/125 [00:13<00:00,  9.22it/s, loss=0.321]


Epoch 18/20, Loss: 40.0975


Epoch 19/20: 100%|██████████| 125/125 [00:13<00:00,  8.94it/s, loss=0.294]


Epoch 19/20, Loss: 36.7722


Epoch 20/20: 100%|██████████| 125/125 [00:13<00:00,  9.20it/s, loss=0.278]

Epoch 20/20, Loss: 34.7669





In [16]:
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def generate_response(model, persona_text, context_text, max_length=50):
    model.to(device)  # Ensure model is on GPU
    model.eval()

    # Tokenize and move to GPU, ensuring padding for alignment
    persona_tokens = tokenizer(persona_text, return_tensors="pt", padding=True, truncation=True)["input_ids"].to(device)
    context_tokens = tokenizer(context_text, return_tensors="pt", padding=True, truncation=True)["input_ids"].to(device)

    with torch.no_grad():
        persona_embeds = model.decoder.gpt2.transformer.wte(persona_tokens)
        context_embeds = model.decoder.gpt2.transformer.wte(context_tokens)

        # Determine max sequence length for padding
        max_seq_len = max(persona_embeds.shape[1], context_embeds.shape[1])

        # Pad tensors to the same length
        pad_size_persona = max_seq_len - persona_embeds.shape[1]
        pad_size_context = max_seq_len - context_embeds.shape[1]

        persona_embeds = torch.nn.functional.pad(persona_embeds, (0, 0, 0, pad_size_persona), "constant", 0)
        context_embeds = torch.nn.functional.pad(context_embeds, (0, 0, 0, pad_size_context), "constant", 0)

        # Encode using persona and context encoders
        encoded_persona = model.persona_encoder(persona_embeds)
        encoded_context = model.context_encoder(context_embeds)

        # Apply Persona-Adaptive Attention (PAA)
        fused_representation = model.paa(encoded_persona, encoded_context)

    # Generate response ensuring input is on GPU
    generated = model.decoder.gpt2.generate(input_ids=context_tokens, max_length=max_length).to(device)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Example
persona_text = "I love sci-fi movies."
context_text = "user1: What's your favorite movie?"
response = generate_response(model, persona_text, context_text)
print("Generated Response:", response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response: user1: What's your favorite movie?


In [20]:
print(df.iloc[29]['personas'])
print( )                                                                                                             

'I like construction.I like historic places.I like things in the army.I like architecture.I would like to visit San Angelo, Texas.'