## Imports

In [14]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
import random
from sklearn.metrics import roc_auc_score
from transformers import RobertaModel, RobertaTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset

## Submission Flag

In [15]:
is_submission = False

## Read Datasets

In [16]:
train_path1 = r"data\train_v2_drcat_02.csv" if not is_submission else r"/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
train_path2 = r"data\train_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = r"data\test_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
train_data1 = pd.read_csv(train_path1)
train_data1.rename(columns={'label': 'generated'}, inplace=True)
train_data2 = pd.read_csv(train_path2)
test_data = pd.read_csv(test_path)

In [17]:
train = pd.concat([train_data1[['text','generated']], train_data2[['text','generated']]])
train['text'] = train['text'].str.replace('\n', '')
test_data['text'] = test_data['text'].str.replace('\n', '')
train['generated'].value_counts()

generated
0    28746
1    17500
Name: count, dtype: int64

In [18]:
rus = RandomUnderSampler(random_state=42)
train_text, train_label = rus.fit_resample(train['text'].to_numpy().reshape(-1,1), train['generated'].to_numpy().reshape(-1,1))
print('0: ', np.count_nonzero(train_label == 0))
print('1: ', np.count_nonzero(train_label == 1))

data = {'text': train_text.reshape(-1), 'generated': train_label.reshape(-1)}
train_data = pd.DataFrame(data)

if not is_submission:
    seed=202
    random.seed(seed)
    np.random.seed(seed)
    mask = np.random.rand(len(train_data)) < 0.8
    test_data = train_data[~mask]
    train_data = train_data[mask]

0:  17500
1:  17500


## Embeddings

In [19]:
# Load pre-trained RoBERTa model and tokenizer
model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the text in train data
tokenized_train_texts = tokenizer(train_data['text'].to_list(), padding=True, truncation=True, return_tensors='pt')

# Tokenize the text in train data
tokenized_test_texts = tokenizer(test_data['text'].to_list(), padding=True, truncation=True, return_tensors='pt')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
## Create Dataloader
# Convert train labels to tensor
train_labels_tensor = torch.tensor(train_data['generated'].values)

# Create a train TensorDataset
train_dataset = TensorDataset(
    tokenized_train_texts['input_ids'],
    tokenized_train_texts['attention_mask'],
    train_labels_tensor
)

if is_submission:

    # Create a test TensorDataset
    test_dataset = TensorDataset(
        tokenized_test_texts['input_ids'],
        tokenized_test_texts['attention_mask']
    )

else:
    # Convert text labels to tensor
    test_labels_tensor = torch.tensor(test_data['generated'].values)

    # Create a test TensorDataset
    test_dataset = TensorDataset(
        tokenized_test_texts['input_ids'],
        tokenized_test_texts['attention_mask'],
        test_labels_tensor
    )

# Define batch size
batch_size = 16  # You can adjust this based on your system's memory capacity

# Create a DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [21]:
def extract_features(concrete_data_loader):
    features_list = []
    labels_list = []

    with torch.no_grad():
        for batch in concrete_data_loader:
            batch_tokenized_texts = {
                'input_ids': batch[0],
                'attention_mask': batch[1]
            }
            
            # Extract token embeddings for the batch
            batch_outputs = model(**batch_tokenized_texts)
            batch_embeddings = batch_outputs.last_hidden_state
            
            # Flatten the embeddings to use as features for the batch
            batch_features = batch_embeddings.mean(dim=1).numpy()
            features_list.append(batch_features)
            
            # Get labels for the batch
            batch_labels = batch[2].numpy()
            labels_list.append(batch_labels)
    
    # Concatenate features from all batches
    return np.concatenate(features_list, axis=0), np.concatenate(labels_list, axis=0)

def extract_features_test_submission(concrete_data_loader):
    features_list = []

    with torch.no_grad():
        for batch in concrete_data_loader:
            batch_tokenized_texts = {
                'input_ids': batch[0],
                'attention_mask': batch[1]
            }
            
            # Extract token embeddings for the batch
            batch_outputs = model(**batch_tokenized_texts)
            batch_embeddings = batch_outputs.last_hidden_state
            
            # Flatten the embeddings to use as features for the batch
            batch_features = batch_embeddings.mean(dim=1).numpy()
            features_list.append(batch_features)
            
    
    # Concatenate features from all batches
    return np.concatenate(features_list, axis=0)

# Extract features using DataLoader
if is_submission:
    train_features, train_labels = extract_features(train_loader)
    test_features = extract_features_test_submission(tokenized_test_texts)
    print('Len train_features: ', len(train_features), ' Len train_labels: ', len(train_labels))
    print('Len train_features: ', len(test_features))
else: 
    train_features, train_labels = extract_features(train_loader)


In [None]:
test_features, test_labels = extract_features(test_loader)
print('Len train_features: ', len(train_features), ' Len train_labels: ', len(train_labels))
print('Len train_features: ', len(test_features), ' Len train_labels: ', len(test_labels))

Len train_features:  27983  Len train_labels:  27983
Len train_features:  7017  Len train_labels:  7017


## Create Model

In [None]:
classifier = XGBClassifier(objective = 'binary:logistic', n_estimators = 100, n_jobs = -1)

## Fit Model

In [None]:
classifier.fit(train_features, train_data.generated)

## Predict Test Set

In [None]:
predictions = classifier.predict_proba(test_features)[:,1]

## Performance and Create Submission

In [None]:
if not is_submission:
    preds_train = classifier.predict_proba(train_features)[:,1]
    preds_val = classifier.predict_proba(test_features)[:,1]
    print('ROC AUC train:', roc_auc_score(train_data.generated, preds_train))
    print('ROC AUC val:', roc_auc_score(test_data.generated, preds_val))
else:
    submission = pd.DataFrame({'id':test_data["id"], 'generated':predictions})
    submission_path = r"/kaggle/working/submission.csv"
    submission.to_csv(submission_path, index=False)

ROC AUC train: 0.9930497268286299
ROC AUC val: 0.5104081652548534
