In [1]:
import pandas as pd
import numpy as np

import re

In [2]:
# Seed handed to np.random.seed before the random undersampling step, so the sampled rows are reproducible
RANDOM_SEED = 42

In [3]:
# Loading datasets
# External DAIGT v2 training set, used to supplement the competition training data
daigtv2_train = pd.read_csv('/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv')

# Files from the 'llm-detect-ai-generated-text' competition input directory
sample_submission = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
train_prompts = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')
train_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
test_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# Data Exploration

In [4]:
# Raw counts per value of the 'generated' column in the competition training essays
train_essays['generated'].value_counts(normalize = False)

generated
0    1375
1       3
Name: count, dtype: int64

In [5]:
# Since there are only 3 samples of the positive class, I will import an external dataset to help us train a model.

In [6]:
# Preview the first rows of the external DAIGT v2 dataset
daigtv2_train.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [7]:
# The external dataset is far more balanced: show each label's share of the rows
daigtv2_train['label'].value_counts(normalize = True)

label
0    0.610034
1    0.389966
Name: proportion, dtype: float64

In [8]:
# Lots of samples! Same breakdown as raw counts per label
daigtv2_train['label'].value_counts(normalize = False)

label
0    27371
1    17497
Name: count, dtype: int64

In [9]:
# 'persuade_corpus' is by far the largest source (this notebook treats it as the human-written text, not as a model);
# let's undersample it to reduce any potential bias
daigtv2_train['source'].value_counts()

source
persuade_corpus                       25996
mistral7binstruct_v1                   2421
mistral7binstruct_v2                   2421
chat_gpt_moth                          2421
llama2_chat                            2421
kingki19_palm                          1384
train_essays                           1378
llama_70b_v1                           1172
falcon_180b_v1                         1055
darragh_claude_v6                      1000
darragh_claude_v7                      1000
radek_500                               500
NousResearch/Llama-2-7b-chat-hf         400
mistralai/Mistral-7B-Instruct-v0.1      400
cohere-command                          350
palm-text-bison1                        349
radekgpt4                               200
Name: count, dtype: int64

In [10]:
# Exclude every row whose source is 'train_essays' so that data cannot leak into training
daigtv2_train = daigtv2_train.loc[daigtv2_train['source'].ne('train_essays')]

# Creating an artificial dataset

## Resampling `persuade_corpus` examples

`persuade_corpus` contains the examples written by humans (not generated by LLMs).

In [11]:
# Partition the rows by source: the 'persuade_corpus' rows on one side, every other source on the other
is_persuade = daigtv2_train['source'] == 'persuade_corpus'
persuade_corpus = daigtv2_train[is_persuade]
non_persuade_corpus = daigtv2_train[~is_persuade]

print(len(non_persuade_corpus), len(persuade_corpus))

17494 25996


In [12]:
# Number of 'persuade_corpus' rows kept by the undersampling step
N_PERSUADE_SAMPLES = 15000

# Seed NumPy's global RNG so the same rows are drawn on every run
np.random.seed(RANDOM_SEED)

# np.random.choice draws index *labels* from persuade_corpus.index, so the rows must be
# fetched with label-based .loc. Positional .iloc is only correct while the labels happen
# to equal the row positions; otherwise it raises IndexError or silently picks other rows.
undersampled_persuade_corpus_indices = np.random.choice(persuade_corpus.index, size=N_PERSUADE_SAMPLES, replace=False)
daigtv2_train_usamp = pd.concat([non_persuade_corpus, persuade_corpus.loc[undersampled_persuade_corpus_indices]])

In [13]:
# Much more balanced! Per-source counts after capping 'persuade_corpus' at the sampled size
daigtv2_train_usamp['source'].value_counts(normalize = False)

source
persuade_corpus                       15000
mistral7binstruct_v2                   2421
chat_gpt_moth                          2421
mistral7binstruct_v1                   2421
llama2_chat                            2421
kingki19_palm                          1384
llama_70b_v1                           1172
falcon_180b_v1                         1055
darragh_claude_v7                      1000
darragh_claude_v6                      1000
radek_500                               500
NousResearch/Llama-2-7b-chat-hf         400
mistralai/Mistral-7B-Instruct-v0.1      400
cohere-command                          350
palm-text-bison1                        349
radekgpt4                               200
Name: count, dtype: int64

In [14]:
# Label counts after undersampling
daigtv2_train_usamp['label'].value_counts()

label
1    17494
0    15000
Name: count, dtype: int64

# Cleaning the text

In [15]:
# Lower-cased character vocabulary of the undersampled corpus.
# NOTE: despite its name, this set is built from every row, the human-written 'persuade_corpus' ones included.
llm_vocab = set(''.join(daigtv2_train_usamp['text']).lower())
print(sorted(llm_vocab))

['\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x82', '\x83', '\x85', '\x88', '\x92', '\x93', '\x94', '\x97', '\x99', '\x9f', '\xa0', '¢', '£', '¨', '©', '«', '¬', '\xad', '®', '°', '²', '´', '¶', '¸', '¹', 'º', 'á', 'â', 'ã', 'å', 'ç', 'é', 'ê', 'ë', 'í', 'ï', 'ó', '÷', 'ü', 'þ', 'ā', 'а', 'д', 'е', 'з', 'о', 'п', 'р', 'с', '\u200b', '\u200d', '–', '—', '‘', '’', '“', '”', '…', '⏰', '─', '╯', '□', '☀', '☹', '♀', '♂', '⚽', '✨', '❄', '。', 'あ', 'う', 'が', 'こ', 'す', 'せ', 'ち', 'と', 'に', 'は', 'ま', 'み', 'り', 'ん', '一', '上', '中', '使', '保', '全', '力', '取', '司', '合', '和', '响', '唯', '在', '安', '完', '将', '应', '影', '必', '意', '所', '手', '护', '择', '时', '是', '有', '机', '止', '法', '注', '用', '的

In [16]:
# Lower-cased character vocabulary of the human-written ('persuade_corpus') rows only
human_df = daigtv2_train_usamp.loc[daigtv2_train_usamp['source'].eq('persuade_corpus')]

human_vocab = set(''.join(human_df['text']).lower())
print(sorted(human_vocab))

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x82', '\x83', '\x85', '\x88', '\x92', '\x93', '\x94', '\x97', '\x99', '\x9f', '\xa0', '¢', '£', '¨', '©', '«', '\xad', '®', '²', '´', '¶', '¸', '¹', 'º', 'á', 'â', 'ã', 'å', 'ë', 'ï', 'ó', 'þ']


In [17]:
# Some characters show up in the corpus but never in the human-written essays, i.e. only LLM-generated text uses them.
# These should be removed: the model ought to learn something more profound than emojis or characters from another language.

characters_to_strip = sorted(llm_vocab.difference(human_vocab))
print(characters_to_strip)

['\r', '^', '¬', '°', 'ç', 'é', 'ê', 'í', '÷', 'ü', 'ā', 'а', 'д', 'е', 'з', 'о', 'п', 'р', 'с', '\u200b', '\u200d', '–', '—', '‘', '’', '“', '”', '…', '⏰', '─', '╯', '□', '☀', '☹', '♀', '♂', '⚽', '✨', '❄', '。', 'あ', 'う', 'が', 'こ', 'す', 'せ', 'ち', 'と', 'に', 'は', 'ま', 'み', 'り', 'ん', '一', '上', '中', '使', '保', '全', '力', '取', '司', '合', '和', '响', '唯', '在', '安', '完', '将', '应', '影', '必', '意', '所', '手', '护', '择', '时', '是', '有', '机', '止', '法', '注', '用', '的', '禁', '者', '该', '路', '选', '道', '部', '都', '集', '须', '驶', '驾', '️', '�', '🇧', '🇪', '🇫', '🇯', '🇵', '🇷', '🇸', '🇺', '🌃', '🌄', '🌅', '🌈', '🌊', '🌌', '🌎', '🌏', '🌐', '🌞', '🌟', '🌠', '🌧', '🌨', '🌫', '🌭', '🌮', '🌯', '🌱', '🌲', '🌳', '🌴', '🌷', '🌸', '🌻', '🌽', '🌿', '🍁', '🍄', '🍋', '🍎', '🍓', '🍔', '🍕', '🍖', '🍗', '🍜', '🍝', '🍞', '🍟', '🍣', '🍭', '🍮', '🍰', '🍲', '🍳', '🍴', '🍷', '🍽', '🍿', '🎃', '🎄', '🎅', '🎈', '🎉', '🎊', '🎓', '🎠', '🎢', '🎣', '🎤', '🎥', '🎧', '🎨', '🎩', '🎬', '🎭', '🎮', '🎯', '🎵', '🎶', '🎸', '🎹', '🎾', '🏀', '🏃', '🏄', '🏆', '🏈', '🏊', '🏋', '🏏', '🏔', '🏕', '🏖', '🏙', '🏛', '🏜'

In [18]:
# I will also strip any non-ASCII character remnants (e.g. \x80, \xad), whether they appear
# as real characters or as literal backslash-x escape sequences left in the text.

# First alternative: a literal "\xN" / "\xNN" escape plus anything glued to it up to the next whitespace.
# Second alternative: a run of real non-ASCII characters. Non-ASCII whitespace (e.g. a no-break space)
# is excluded from the class so that str.split() below still treats it as a word boundary.
_NON_ASCII_PATTERN = re.compile(r'\\x[0-9a-fA-F]{1,2}\S*|[^\x00-\x7F\s]+')


def remove_non_ascii_characters(string: str, repl: str = '') -> str:
    """Remove non-ASCII remnants from ``string`` and normalise its whitespace.

    Two kinds of remnant are replaced with ``repl``:

    * literal escape sequences such as the four characters ``\\xad`` (together
      with any non-whitespace characters directly following them), and
    * actual non-ASCII characters such as U+0080 or U+00E9.

    Afterwards every run of whitespace is collapsed to a single space and
    leading/trailing whitespace is dropped.

    Args:
        string: Text to clean.
        repl: Replacement inserted for each removed remnant (default: nothing).

    Returns:
        The cleaned, whitespace-normalised text.
    """
    subbed_string = _NON_ASCII_PATTERN.sub(repl, string)

    # split() with no arguments already discards empty tokens and surrounding whitespace
    return ' '.join(subbed_string.split())

# Test: the input is a raw string, so it holds literal backslash-x sequences (e.g. the four
# characters \xad) rather than the non-ASCII characters those escapes denote in a normal string
remove_non_ascii_characters(r"\xad HELLO! \x0 \xa0 My name is \x99 Shivam")

'HELLO! My name is Shivam'