## Imports

In [17]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

## Submission Flag

In [18]:
is_submission = False

## Read Datasets

In [19]:
train_path = r"data\train_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = r"data\test_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [20]:
train_texts, test_texts, train_labels, test_labels = train_test_split(train_data['text'].values, train_data['generated'].values, test_size=0.33, random_state=42)

## Tokenize Text

In [21]:
# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta = RobertaModel.from_pretrained('roberta-base')

# Tokenize the text data
train_tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')[0] for text in train_texts]
test_tokenized_texts = [tokenizer.encode(text, add_special_tokens=True, return_tensors='pt')[0] for text in test_texts]


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors


In [24]:
# Get the maximum sequence length
max_len = max(max(len(tokenized_text) for tokenized_text in train_tokenized_texts),
              max(len(tokenized_text) for tokenized_text in test_tokenized_texts))

# Pad tokenized sequences to the maximum length
padded_train_tokenized_texts = [torch.cat((tokenized_text, torch.zeros((max_len - tokenized_text.shape[0]), dtype=torch.long)))[:max_len] for tokenized_text in train_tokenized_texts]
padded_test_tokenized_texts = [torch.cat((tokenized_text, torch.zeros((max_len - tokenized_text.shape[0]), dtype=torch.long)))[:max_len] for tokenized_text in test_tokenized_texts]

# Convert to PyTorch tensors
train_input_ids = torch.stack(padded_train_tokenized_texts)
test_input_ids = torch.stack(padded_test_tokenized_texts)

train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

# Create TensorDataset
train_dataset = TensorDataset(train_input_ids, train_labels)
test_dataset = TensorDataset(test_input_ids, test_labels)


## Define Model

In [25]:
class GATLayer(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(GATLayer, self).__init__()
        self.W = nn.Linear(in_dim, out_dim)
        self.a = nn.Linear(2 * out_dim, 1)

    def forward(self, h, adj):
        Wh = self.W(h)
        a_input = self.prepare_attention_input(Wh)
        e = self.leaky_relu(self.a(a_input).squeeze(-1))
        attention = self.softmax(e, adj)
        h_prime = self.aggregate(Wh, attention)
        return h_prime

    def prepare_attention_input(self, Wh):
        N = Wh.size(0)
        Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)
        Wh_repeated_alternating = Wh.repeat(N, 1)
        a_input = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)
        return a_input

    def leaky_relu(self, input_tensor, alpha=0.2):
        return F.leaky_relu(input_tensor, alpha)

    def softmax(self, e, adj):
        attention = e.view(-1, 1)
        attention = torch.exp(attention - torch.max(attention))
        attention = attention * adj
        attention = attention / (torch.sum(attention, dim=1, keepdim=True) + 1e-7)
        return attention

    def aggregate(self, Wh, attention):
        return torch.matmul(attention, Wh)

class GAT(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, num_heads):
        super(GAT, self).__init__()
        self.attention_heads = nn.ModuleList([
            GATLayer(in_dim, hidden_dim) for _ in range(num_heads)
        ])
        self.out_layer = nn.Linear(hidden_dim * num_heads, out_dim)

    def forward(self, h, adj):
        head_outs = [attn_head(h, adj) for attn_head in self.attention_heads]
        concat_heads = torch.cat(head_outs, dim=1)
        out = self.out_layer(concat_heads)
        return out

# Define the GAT model
model = GAT(in_dim=roberta.config.hidden_size, hidden_dim=64, out_dim=1, num_heads=2)

## Train Model

In [26]:
# Assuming tokenized_train_texts is a list of lists, each containing token IDs
num_texts = len(padded_train_tokenized_texts)
adjacency_matrix = np.zeros((num_texts, num_texts))

# Calculate shared tokens between texts
for i in range(num_texts):
    for j in range(num_texts):
        shared_tokens = set(padded_train_tokenized_texts[i]) & set(padded_train_tokenized_texts[j])
        similarity_score = len(shared_tokens)
        adjacency_matrix[i][j] = similarity_score

print(adjacency_matrix)

In [16]:
# Training configurations
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# DataLoader setup
batch_size = 32  # Adjust batch size as needed
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Lists to store accuracy values during training
train_accuracy_values = []

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set the model to train mode
    
    epoch_losses = []
    predicted_labels = []
    true_labels = []
    
    for input_ids_batch, labels_batch in train_loader:  # Iterate over batches
        optimizer.zero_grad()  # Clear gradients
        
        # Forward pass
        with torch.no_grad():
            outputs = roberta(input_ids_batch)
            last_hidden_states = outputs.last_hidden_state
            
        output = model(last_hidden_states, adjacency_matrix)
        
        # Compute loss
        loss = criterion(output.squeeze(), labels_batch.float())
        epoch_losses.append(loss.item())
        
        # Calculate accuracy
        predicted = (torch.sigmoid(output) > 0.5).detach().cpu().numpy().astype(int)
        true = labels_batch.cpu().numpy()
        predicted_labels.extend(predicted)
        true_labels.extend(true)
    
        # Backpropagation
        loss.backward()
        optimizer.step()
    
    # Calculate accuracy for the epoch
    accuracy = accuracy_score(true_labels, predicted_labels)
    train_accuracy_values.append(accuracy)
    
    # Print epoch loss and accuracy
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {np.mean(epoch_losses):.4f} Accuracy: {accuracy:.4f}")

# Plotting accuracy
plt.figure(figsize=(8, 6))
plt.plot(range(1, num_epochs + 1), train_accuracy_values, label='Training Accuracy')
plt.title('Training Accuracy Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

TypeError: list indices must be integers or slices, not tuple

## Predict Test Set

## Create Submission

In [None]:
submission = pd.DataFrame({"id": test_data["id"], "generated": predictions})
submission_path = r"data\submission.csv" if not is_submission else r"/kaggle/working/submission.csv"
submission.to_csv(submission_path, index=False)