## Imports

In [1]:
import pandas as pd
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from itertools import combinations
import numpy as np
from scipy.spatial.distance import euclidean
from transformers import RobertaTokenizer, RobertaModel
from imblearn.under_sampling import RandomUnderSampler
import random
from sklearn.metrics import roc_auc_score
import dgl
import dgl.nn.pytorch as dglnn
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from dgl.nn import GATConv
import matplotlib.pyplot as plt



  from .autonotebook import tqdm as notebook_tqdm


## Submission Flag

In [2]:
is_submission = False

## Read Datasets

In [3]:
train_path1 = r"data\train_v2_drcat_02.csv" if not is_submission else r"/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
train_path2 = r"data\train_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = r"data\test_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
train_data1 = pd.read_csv(train_path1)
train_data1.rename(columns={'label': 'generated'}, inplace=True)
train_data2 = pd.read_csv(train_path2)
test_data = pd.read_csv(test_path)

In [4]:
train = pd.concat([train_data1[['text','generated']], train_data2[['text','generated']]])
train['text'] = train['text'].str.replace('\n', '')
test_data['text'] = test_data['text'].str.replace('\n', '')
train['generated'].value_counts()

generated
0    28746
1    17500
Name: count, dtype: int64

In [5]:
rus = RandomUnderSampler(random_state=42)
train_text, train_label = rus.fit_resample(train['text'].to_numpy().reshape(-1,1), train['generated'].to_numpy().reshape(-1,1))
print('0: ', np.count_nonzero(train_label == 0))
print('1: ', np.count_nonzero(train_label == 1))

data = {'text': train_text.reshape(-1), 'generated': train_label.reshape(-1)}
all_data = pd.DataFrame(data)

if not is_submission:
    seed=202
    random.seed(seed)
    np.random.seed(seed)
    mask = np.random.rand(len(all_data)) < 0.8
    test_data = all_data[~mask]
    train_data = all_data[mask]

0:  17500
1:  17500


## Embeddings

In [15]:
# Step 1: Data Preparation
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta = RobertaModel.from_pretrained('roberta-base')
# Sample list of texts with labels
texts_list = train_data['text'].to_list()

labels_list = train_data['generated'].to_list()  # Sample labels (0 or 1 for classification)

# Step 2: RoBERTa Embedding

# Tokenize and obtain embeddings for each text entry
roberta_embeddings_list = []


for text in texts_list:
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = roberta(**encoded_input)
    last_hidden_states = output.last_hidden_state
    # For simplicity, let's use the [CLS] token representation as the sentence embedding
    cls_embedding = last_hidden_states[:, 0, :].squeeze().numpy()
    roberta_embeddings_list.append(cls_embedding)
# Step 3: Graph Construction

# Step 1: Convert texts to TaggedDocuments
tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(texts_list)]

# Step 2: Train Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=768, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Step 3: Calculate similarity/distance
doc2vec_embeddings_list = [doc2vec_model.infer_vector(doc.words) for doc in tagged_data]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create Model

In [13]:
class TwoInputClassifier(nn.Module):
    def __init__(self, bert_input_size, doc2vec_input_size, hidden_size):
        super(TwoInputClassifier, self).__init__()
        
        # Define layers for BERT input
        self.bert_fc = nn.Linear(bert_input_size, hidden_size)
        
        # Define layers for Doc2Vec input
        self.doc2vec_fc = nn.Linear(doc2vec_input_size, hidden_size)
        
        # Stack 10 hidden layers
        hidden_layers = []
        hidden_layers.append(nn.Linear(hidden_size * 2, hidden_size * 2))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(hidden_size * 2, hidden_size * 2))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(hidden_size * 2, hidden_size))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(hidden_size, 600))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(600, 500))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(500, 400))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(400, 300))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(300, 200))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(200, 100))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(100, 80))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(80, 60))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        hidden_layers.append(nn.Linear(60, 40))  # Adjusted size
        hidden_layers.append(nn.ReLU())
        self.hidden_layers = nn.Sequential(*hidden_layers)
        
        # Output layer
        self.output = nn.Linear(40, 1)  # Adjusted size
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, bert_input, doc2vec_input):
        # Pass BERT input through fully connected layers and ReLU activation
        bert_output = F.relu(self.bert_fc(bert_input))
        
        # Pass Doc2Vec input through fully connected layers and ReLU activation
        doc2vec_output = F.relu(self.doc2vec_fc(doc2vec_input))
        
        # Concatenate the outputs from BERT and Doc2Vec
        combined_output = torch.cat((bert_output, doc2vec_output), dim=1)
        
        # Pass through 10 hidden layers
        combined_output = self.hidden_layers(combined_output)
        
        # Output layer with softmax activation
        output = self.sigmoid(self.output(combined_output), dim=1)
        
        return output


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Assuming you have your data in the form of BERT and Doc2Vec embeddings, and labels
# Replace these tensors with your actual data
bert_data = torch.tensor(roberta_embeddings_list)  # Replace with your BERT data
print(bert_data.shape)
doc2vec_data = torch.tensor(doc2vec_embeddings_list)  # Replace with your Doc2Vec data
print(doc2vec_data.shape)
labels = torch.tensor(labels_list)  # Replace with your labels
print(labels.shape)

# Hyperparameters
bert_input_size = 768  # Example input size for BERT embeddings
doc2vec_input_size = 768  # Example input size for Doc2Vec embeddings
hidden_size = 768
num_classes = 2
learning_rate = 0.001
batch_size = 32
epochs = 10

# Create DataLoader
dataset = TensorDataset(bert_data, doc2vec_data, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model
model = TwoInputClassifier(bert_input_size, doc2vec_input_size, hidden_size, num_classes)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        # Get the inputs; data is a list of [bert_input, doc2vec_input, labels]
        bert_input, doc2vec_input, labels = data
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward + backward + optimize
        outputs = model(bert_input, doc2vec_input)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        # Print statistics
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 mini-batches
            print(f"Epoch [{epoch + 1}/{epochs}], "
                  f"Batch [{i + 1}/{len(dataloader)}], "
                  f"Loss: {running_loss / 10:.4f}")
            running_loss = 0.0

print("Finished Training")


torch.Size([27983, 768])
torch.Size([27983, 100])
torch.Size([27983])
Epoch [1/10], Batch [10/875], Loss: 0.6953
Epoch [1/10], Batch [20/875], Loss: 0.5778
Epoch [1/10], Batch [30/875], Loss: 0.7695
Epoch [1/10], Batch [40/875], Loss: 0.8351
Epoch [1/10], Batch [50/875], Loss: 0.7851
Epoch [1/10], Batch [60/875], Loss: 0.7914
Epoch [1/10], Batch [70/875], Loss: 0.8164
Epoch [1/10], Batch [80/875], Loss: 0.7851
Epoch [1/10], Batch [90/875], Loss: 0.8101
Epoch [1/10], Batch [100/875], Loss: 0.7976
Epoch [1/10], Batch [110/875], Loss: 0.8289
Epoch [1/10], Batch [120/875], Loss: 0.8508
Epoch [1/10], Batch [130/875], Loss: 0.7789
Epoch [1/10], Batch [140/875], Loss: 0.8008
Epoch [1/10], Batch [150/875], Loss: 0.8383
Epoch [1/10], Batch [160/875], Loss: 0.8039
Epoch [1/10], Batch [170/875], Loss: 0.8726
Epoch [1/10], Batch [180/875], Loss: 0.7945
Epoch [1/10], Batch [190/875], Loss: 0.8195
Epoch [1/10], Batch [200/875], Loss: 0.8445
Epoch [1/10], Batch [210/875], Loss: 0.8976
Epoch [1/10], B

## Predict Test Set

In [None]:
# Tokenize and obtain embeddings for each text entry
test_roberta_embeddings_list = []

test_text_list = test_data['text'].to_list()
for text in test_text_list:
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = roberta(**encoded_input)
    last_hidden_states = output.last_hidden_state
    # For simplicity, let's use the [CLS] token representation as the sentence embedding
    cls_embedding = last_hidden_states[:, 0, :].squeeze().numpy()
    roberta_embeddings_list.append(cls_embedding)
# Step 3: Graph Construction
    
unlabeled_features = torch.tensor(test_roberta_embeddings_list, dtype=torch.float32)

# Step 2: Set model to evaluation mode
model.eval()

# Step 3: Predict logits for unlabeled nodes
with torch.no_grad():
    logits_unlabeled = model(dgl_G, unlabeled_features)
    logits_unlabeled = logits_unlabeled.squeeze(1)  # Squeeze unnecessary dimensions

# Step 4: Apply threshold (e.g., sigmoid) for binary predictions
preds_val = (torch.sigmoid(logits_unlabeled) > 0.5).float()


## Performance and Create Submission

In [None]:
if not is_submission:
    print('ROC AUC val:', roc_auc_score(test_data.generated, preds_val))
else:
    submission = pd.DataFrame({'id':test_data["id"], 'generated':predictions})
    submission_path = r"/kaggle/working/submission.csv"
    submission.to_csv(submission_path, index=False)

ROC AUC train: 0.9930497268286299
ROC AUC val: 0.5104081652548534
