In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled essays from a CSV expected in the current working directory.
train_essays = pd.read_csv('train_essays.csv')

# No separate dev file is loaded: this notebook assumes `train_essays` holds all
# labelled data, and the dev split is carved out of it with train_test_split below.
# If a separate dev file exists, load it and merge the two frames first, e.g.:
#   merged_data = pd.concat([train_essays, dev_essays], ignore_index=True)
# Last expression of the cell: display the column names to confirm the schema.
train_essays.columns

Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')

The `id` and `prompt_id` columns are not needed for classification, so we drop them from the data.

In [9]:

# Drop the identifier columns; only 'text' and 'generated' are used from here on.
# errors='ignore' keeps this cell idempotent: because the result is assigned back
# to `train_essays`, a plain drop raised KeyError when the cell was run a second
# time (the columns were already gone).
train_essays = train_essays.drop(columns=['prompt_id', 'id'], errors='ignore')

# Hold out 20% of the essays as the dev set; the fixed seed makes the split reproducible.
train_data, dev_data = train_test_split(train_essays, test_size=0.2, random_state=42)

# A bare `train_data.shape, dev_data.shape` in the middle of a cell is never
# displayed (only the last expression is), so check the split sizes explicitly.
assert len(train_data) + len(dev_data) == len(train_essays)

# Last expression of the cell: confirm the remaining columns.
train_data.columns

Index(['text', 'generated'], dtype='object')

In [10]:
# Class balance of the training split; per the note below, 1 marks an AI-generated essay and 0 a human one.
train_data['generated'].value_counts()

0    1100
1       2
Name: generated, dtype: int64

Only 2 of the training essays are AI-generated; the remaining 1,100 are human-written. The classes are severely imbalanced, so we need to add more AI-generated essays to bring the two classes closer to balance.

In [12]:
# new_data = pd.read_csv("train_v2_drcat_02.csv")

In [21]:
# Provenance note (presumably — TODO confirm): 'ai_generated_data.csv' looks like the
# saved result of the sampling below, taken from "train_v2_drcat_02.csv", with the
# `label` column renamed to `generated`:
# selected_data = new_data[new_data['label'] == 1].sample(n=600, random_state=42)[['text', 'label']]
# Load the extra AI-generated essays from a CSV expected in the working directory.
ai_generated_data = pd.read_csv('ai_generated_data.csv')


In [22]:
# Check that the schema matches `train_data` (['text', 'generated']) before the two frames are concatenated.
ai_generated_data.columns

Index(['text', 'generated'], dtype='object')

In [28]:
# Append the extra AI-generated essays to the training split only; `dev_data` is not touched here.
# ignore_index=True renumbers the rows 0..n-1 so indices from the two frames cannot collide.
df = pd.concat([train_data, ai_generated_data], ignore_index=True)


In [29]:
# Sanity check: row count should equal len(train_data) + len(ai_generated_data), with 2 columns.
df.shape

(1702, 2)

In [30]:
# Class balance of the combined training frame after adding the AI-generated essays.
df['generated'].value_counts()

0    1100
1     602
Name: generated, dtype: int64

In [31]:
from collections import Counter

# Tokenise the whole corpus: join every essay with a single space, lower-case the
# result, then split on whitespace (punctuation stays attached to its token).
all_words = (
    ' '.join(df['text'])
    .lower()
    .split()
)

# Tally how often each token occurs across all essays.
word_counts = Counter()
word_counts.update(all_words)

# Vocabulary: tokens seen at least 5 times, kept in first-seen order, plus a
# lookup from each kept token to its position in `vocab`.
vocab = [token for token in word_counts if word_counts[token] >= 5]
vocab_reverse_index = dict(zip(vocab, range(len(vocab))))

In [32]:
# Word-level probability estimates, all defined as document frequencies:
# P[word] = fraction of essays that contain the word at least once.
#
# Fixes relative to the previous version of this cell:
#   * P[word | LLM] was computed over every essay in `df`; it is now computed
#     over `llm_data` only, so it is actually conditioned on the LLM class.
#   * P[word] was (total occurrences) / (number of essays), which is an average
#     count per essay and can exceed 1; it is now a document frequency in [0, 1],
#     the same kind of quantity as P[word | LLM].
#   * Membership was tested with `word in essay.lower()`, a substring match
#     (e.g. "a" matches almost every essay). Essays are now split into the same
#     lower-cased whitespace tokens that were used to build `vocab`.
#   * Each essay is tokenised once instead of being lower-cased once per
#     vocabulary word.
def _document_frequencies(texts, words):
    """Return ``{word: fraction of texts containing word as a whole token}``.

    Parameters
    ----------
    texts : iterable of str
        Essays; each one is lower-cased and split on whitespace.
    words : iterable of str
        Tokens to report on (expected to be lower-cased already).

    Returns
    -------
    dict
        One entry per element of ``words``. Every value is 0.0 when ``texts``
        is empty, so an empty class does not produce NaN or a division error.
    """
    containing = Counter()
    total = 0
    for text in texts:
        # A set, so a word counts at most once per essay.
        containing.update(set(text.lower().split()))
        total += 1
    if total == 0:
        return {word: 0.0 for word in words}
    return {word: containing[word] / total for word in words}


# Calculate P[word] over every essay, for every distinct token in the corpus
# (same keys as `word_counts`).
num_documents = len(df)
word_probabilities = _document_frequencies(df['text'], word_counts)

# Assuming `generated` column is the target
# Separate data into human and LLM essays
human_data = df[df['generated'] == 0]
llm_data = df[df['generated'] == 1]

# Calculate P[word | LLM] over the LLM essays only, for the vocabulary words.
llm_word_probabilities = _document_frequencies(llm_data['text'], vocab)
