## Imports

In [2]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
import random
from sklearn.metrics import roc_auc_score
from transformers import RobertaModel, RobertaTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


## Submission Flag

In [3]:
# Run-mode switch for the whole notebook.
# False: read the local CSV copies and carve a hold-out set out of the training data.
# True:  read from the /kaggle/input mounts and treat test_essays.csv as the real test set.
is_submission = False

## Read Datasets

In [3]:
# Dataset locations: local copies under data/ while experimenting, Kaggle input
# mounts for a submission run. The local paths use forward slashes so they
# resolve on Linux/macOS as well as on Windows (a backslash is only a path
# separator on Windows).
train_path1 = "data/train_v2_drcat_02.csv" if not is_submission else "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
train_path2 = "data/train_essays.csv" if not is_submission else "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = "data/test_essays.csv" if not is_submission else "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"

# The first training file calls its target column 'label'; rename it to
# 'generated' so both training frames expose the same column name.
# (rename returns a new frame, so re-running the cell is side-effect free.)
train_data1 = pd.read_csv(train_path1).rename(columns={'label': 'generated'})
train_data2 = pd.read_csv(train_path2)
test_data = pd.read_csv(test_path)

In [4]:
# Stack both training sources, keeping only the essay text and its label.
train = pd.concat(
    [frame[['text', 'generated']] for frame in (train_data1, train_data2)]
)

# Remove newline characters from every essay (train and test alike).
# NOTE(review): newlines are deleted, not replaced by a space, so words that
# were separated only by a line break end up joined - confirm this is intended.
train['text'] = train['text'].str.replace('\n', '')
test_data['text'] = test_data['text'].str.replace('\n', '')

# Class balance of the combined training data (displayed as the cell output).
train['generated'].value_counts()

generated
0    28746
1    17500
Name: count, dtype: int64

In [5]:
# Resample the training data with a fixed-seed RandomUnderSampler so both
# classes end up with the same number of rows (counts are printed below).
rus = RandomUnderSampler(random_state=42)
train_text, train_label = rus.fit_resample(
    train['text'].to_numpy().reshape(-1, 1),       # sampler is fed a 2-D column of texts
    train['generated'].to_numpy().reshape(-1, 1),
)
print('0: ', np.count_nonzero(train_label == 0))
print('1: ', np.count_nonzero(train_label == 1))

# Flatten the resampled arrays back to 1-D columns and rebuild a DataFrame.
data = {'text': train_text.reshape(-1), 'generated': train_label.reshape(-1)}
train_data = pd.DataFrame(data)

if not is_submission:
    # Local run: carve a random ~20% hold-out set out of the balanced data.
    # It replaces the test_data frame that was read from disk.
    seed = 202
    random.seed(seed)
    np.random.seed(seed)
    mask = np.random.rand(len(train_data)) < 0.8
    train_data, test_data = train_data[mask], train_data[~mask]

0:  17500
1:  17500


## Embeddings

In [6]:
# Load the pre-trained RoBERTa tokenizer and encoder
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Tokenize the text in train data (padded, truncated, returned as PyTorch tensors)
tokenized_train_texts = tokenizer(
    train_data['text'].to_list(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Tokenize the text in test data with the same settings
tokenized_test_texts = tokenizer(
    test_data['text'].to_list(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
## Create Dataloader
# Training labels as a tensor, row-aligned with the tokenized training texts
train_labels_tensor = torch.tensor(train_data['generated'].values)

# Train dataset: (input_ids, attention_mask, label) per essay
train_dataset = TensorDataset(
    tokenized_train_texts['input_ids'],
    tokenized_train_texts['attention_mask'],
    train_labels_tensor
)

if is_submission:
    # Submission run: the test dataset is built without labels,
    # so each item is just (input_ids, attention_mask)
    test_dataset = TensorDataset(
        tokenized_test_texts['input_ids'],
        tokenized_test_texts['attention_mask']
    )
else:
    # Local run: the hold-out split carries its labels along
    test_labels_tensor = torch.tensor(test_data['generated'].values)

    test_dataset = TensorDataset(
        tokenized_test_texts['input_ids'],
        tokenized_test_texts['attention_mask'],
        test_labels_tensor
    )

# Define batch size
batch_size = 16  # You can adjust this based on your system's memory capacity

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# The test loader must keep the original row order: in a submission run the
# extracted features carry no labels or ids, so row position is the only link
# back to test_data. Shuffling here would silently scramble that mapping.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
def _embed_batches(concrete_data_loader, encoder, with_labels):
    """Run every batch through ``encoder`` and mean-pool the token embeddings.

    Shared implementation behind ``extract_features`` and
    ``extract_features_test_submission``.

    Args:
        concrete_data_loader: iterable of batches; ``batch[0]`` is the
            input-id tensor, ``batch[1]`` the attention mask and, when
            ``with_labels`` is true, ``batch[2]`` the label tensor.
        encoder: callable accepting ``input_ids=`` / ``attention_mask=`` and
            returning an object with a ``last_hidden_state`` tensor.
        with_labels: whether to collect ``batch[2]`` as labels.

    Returns:
        ``(features, labels)`` as NumPy arrays concatenated over all batches;
        ``labels`` is ``None`` when ``with_labels`` is false.

    Raises:
        ValueError: from ``np.concatenate`` if the loader yields no batches.
    """
    features_list = []
    labels_list = []

    with torch.no_grad():  # inference only - no autograd graph needed
        for batch in concrete_data_loader:
            input_ids, attention_mask = batch[0], batch[1]

            # Extract token embeddings for the batch
            batch_outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
            token_embeddings = batch_outputs.last_hidden_state

            # Mean-pool over real tokens only. A plain .mean(dim=1) would also
            # average the embeddings at padding positions, so the result would
            # depend on how much padding each sequence received.
            token_weights = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * token_weights).sum(dim=1)
            token_counts = token_weights.sum(dim=1).clamp(min=1.0)  # avoid 0-division on an all-padding row
            features_list.append((summed / token_counts).cpu().numpy())

            if with_labels:
                # Get labels for the batch
                labels_list.append(batch[2].cpu().numpy())

    # Concatenate features (and labels) from all batches
    features = np.concatenate(features_list, axis=0)
    labels = np.concatenate(labels_list, axis=0) if with_labels else None
    return features, labels


def extract_features(concrete_data_loader, encoder=None):
    """Embed a labelled DataLoader.

    Args:
        concrete_data_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        encoder: optional model to use; defaults to the notebook-level ``model``.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays, one row/entry per sample,
        in the order the loader yielded them.
    """
    features, labels = _embed_batches(
        concrete_data_loader,
        model if encoder is None else encoder,
        with_labels=True,
    )
    return features, labels


def extract_features_test_submission(concrete_data_loader, encoder=None):
    """Embed an unlabelled DataLoader (submission-mode test set).

    Args:
        concrete_data_loader: iterable of ``(input_ids, attention_mask)`` batches.
        encoder: optional model to use; defaults to the notebook-level ``model``.

    Returns:
        NumPy array of features, one row per sample, in loader order.
    """
    features, _ = _embed_batches(
        concrete_data_loader,
        model if encoder is None else encoder,
        with_labels=False,
    )
    return features

# Extract features using DataLoader
# Training features are needed in both run modes.
train_features, train_labels = extract_features(train_loader)

if is_submission:
    # Feed the DataLoader, not the raw tokenizer output: the extractor indexes
    # each batch as batch[0] / batch[1], which only works on loader batches.
    test_features = extract_features_test_submission(test_loader)
    print('Len train_features: ', len(train_features), ' Len train_labels: ', len(train_labels))
    print('Len test_features: ', len(test_features))
# In a local run the labelled hold-out features are extracted in the next cell.


In [9]:
# Labelled hold-out extraction only makes sense for a local run: in submission
# mode the test dataset has no label tensor, so extract_features would fail on
# batch[2], and test_features has already been computed in the previous cell.
if not is_submission:
    test_features, test_labels = extract_features(test_loader)
    print('Len train_features: ', len(train_features), ' Len train_labels: ', len(train_labels))
    print('Len test_features: ', len(test_features), ' Len test_labels: ', len(test_labels))

Len train_features:  27983  Len train_labels:  27983
Len train_features:  7017  Len train_labels:  7017


In [1]:
# Sanity check: display the feature vector of the first training sample.
# Requires `train_features` from the feature-extraction cell; the stored
# NameError output presumably comes from running this cell before that one.
train_features[0]

NameError: name 'train_features' is not defined

## Create Graph

In [10]:
# Libraries used only by the graph-building step
import networkx as nx
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Threshold for considering edges: pairs must be more similar than this
threshold = 0.8

# Pairwise cosine similarity between all training embeddings
similarity_matrix = cosine_similarity(train_features, train_features)

# Binarise the similarities into an adjacency matrix (1 = edge, 0 = no edge).
# NOTE(review): the comparison includes the diagonal, so a sample whose
# self-similarity exceeds the threshold gets a self-loop - confirm intended.
adjacency_matrix = (similarity_matrix > threshold).astype(int)

# Build the NetworkX graph from the adjacency matrix
graph = nx.from_numpy_array(adjacency_matrix)

In [None]:
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GATConv

# Convert the NetworkX graph to a DGL graph.
# No node/edge attributes are requested here; node features are handed to the
# model separately in its forward call.
dgl_graph = dgl.from_networkx(graph)

class GATClassifier(nn.Module):
    """Two stacked GATConv layers that map node features to per-node class scores.

    Args:
        in_dim: size of each input node feature vector.
        hidden_dim: per-head output size of the first attention layer.
        num_classes: output size of the second (single-head) attention layer.
    """

    def __init__(self, in_dim, hidden_dim, num_classes):
        super().__init__()
        first_layer_heads = 4
        self.gatconv1 = GATConv(in_dim, hidden_dim, num_heads=first_layer_heads)
        # The second layer consumes the first layer's heads concatenated
        # together, hence hidden_dim * number of heads as its input size.
        self.gatconv2 = GATConv(hidden_dim * first_layer_heads, num_classes, num_heads=1)

    def forward(self, g, h):
        """Return class scores for every node of graph ``g`` given node features ``h``."""
        hidden = self.gatconv1(g, h)
        # Merge all dimensions after the node axis into one, then apply ELU.
        hidden = F.elu(hidden.flatten(1))
        scores = self.gatconv2(g, hidden)
        # Average over dim 1 (assumed to be the head axis of GATConv's output - TODO confirm).
        return scores.mean(1)

# Node features and labels for the graph. The graph was built from
# train_features, so node i corresponds to row i of train_features/train_labels.
sentence_embeddings = torch.as_tensor(train_features, dtype=torch.float32)
labels = torch.as_tensor(train_labels, dtype=torch.long)  # CrossEntropyLoss expects integer class ids

# Number of full-graph training passes (untuned starting value - adjust as needed)
NUM_EPOCHS = 50

# Define GAT model
gat_model = GATClassifier(in_dim=sentence_embeddings.shape[1], hidden_dim=64, num_classes=2)

# Training
optimizer = torch.optim.Adam(gat_model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Random ~80/20 split of the graph nodes into train and evaluation sets.
# A dedicated seeded generator keeps the split identical across re-runs.
mask_rng = torch.Generator().manual_seed(42)
train_mask = torch.rand(sentence_embeddings.shape[0], generator=mask_rng) < 0.8
test_mask = ~train_mask

def train_model(model, optimizer, criterion, train_mask, test_mask, epochs=NUM_EPOCHS):
    """Train ``model`` on the whole graph and report the held-out loss each epoch.

    Reads the notebook-level ``dgl_graph``, ``sentence_embeddings`` and
    ``labels``. Every epoch does one full-graph forward/backward pass with the
    loss restricted to ``train_mask`` nodes, then prints the loss on
    ``test_mask`` nodes.

    Args:
        model: module called as ``model(graph, node_features)`` returning per-node logits.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(logits, labels)``.
        train_mask: boolean tensor selecting the nodes used for the training loss.
        test_mask: boolean tensor selecting the nodes used for the reported loss.
        epochs: number of training passes; defaults to ``NUM_EPOCHS``.
    """
    for epoch in range(epochs):
        model.train()
        logits = model(dgl_graph, sentence_embeddings)
        optimizer.zero_grad()
        loss = criterion(logits[train_mask], labels[train_mask])
        loss.backward()
        optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            test_logits = model(dgl_graph, sentence_embeddings)
            test_loss = criterion(test_logits[test_mask], labels[test_mask])
            print(f"Epoch {epoch + 1}/{epochs}, Test Loss: {test_loss.item()}")

train_model(gat_model, optimizer, criterion, train_mask, test_mask)
