## Imports

In [1]:
import pandas as pd
import numpy as np
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
import random
from sklearn.metrics import roc_auc_score

## Submission Flag

In [2]:
is_submission = False

## Read Datasets

In [3]:
train_path1 = r"data\train_v2_drcat_02.csv" if not is_submission else r"/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
train_path2 = r"data\train_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
test_path = r"data\test_essays.csv" if not is_submission else r"/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
train_data1 = pd.read_csv(train_path1)
train_data1.rename(columns={'label': 'generated'}, inplace=True)
train_data2 = pd.read_csv(train_path2)
test_data = pd.read_csv(test_path)

In [4]:
train = pd.concat([train_data1[['text','generated']], train_data2[['text','generated']]])
train['text'] = train['text'].str.replace('\n', '')
test_data['text'] = test_data['text'].str.replace('\n', '')
train['generated'].value_counts()

generated
0    28746
1    17500
Name: count, dtype: int64

In [5]:
rus = RandomUnderSampler(random_state=42)
train_text, train_label = rus.fit_resample(train['text'].to_numpy().reshape(-1,1), train['generated'].to_numpy().reshape(-1,1))
print('0: ', np.count_nonzero(train_label == 0))
print('1: ', np.count_nonzero(train_label == 1))

data = {'text': train_text.reshape(-1), 'generated': train_label.reshape(-1)}
train_data = pd.DataFrame(data)

if not is_submission:
    seed=202
    random.seed(seed)
    np.random.seed(seed)
    mask = np.random.rand(len(train_data)) < 0.8
    test_data = train_data[~mask]
    train_data = train_data[mask]

0:  17500
1:  17500


## Embeddings

In [6]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             tokenizer=lambda x: re.findall(r'[^\W]+', x),
                             token_pattern=None,
                             strip_accents='unicode',)
vectorized_train_data = vectorizer.fit_transform(train_data["text"])
vectorized_test_data = vectorizer.transform(test_data["text"])

## Create Model

In [7]:
classifier = XGBClassifier(objective = 'binary:logistic', n_estimators = 100, n_jobs = -1)

## Fit Model

In [8]:
classifier.fit(vectorized_train_data[:train_data.shape[0]], train_data.generated)

## Predict Test Set

In [9]:
predictions = classifier.predict_proba(vectorized_test_data)[:,1]

## Performance and Create Submission

In [10]:
if not is_submission:
    preds_train = classifier.predict_proba(vectorized_train_data)[:,1]
    preds_val = classifier.predict_proba(vectorized_test_data)[:,1]
    print('ROC AUC train:', roc_auc_score(train_data.generated, preds_train))
    print('ROC AUC val:', roc_auc_score(test_data.generated, preds_val))
else:
    submission = pd.DataFrame({'id':test_data["id"], 'generated':predictions})
    submission_path = r"/kaggle/working/submission.csv"
    submission.to_csv(submission_path, index=False)

ROC AUC train: 1.0
ROC AUC val: 0.9985557179449892
