In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load data
train_essays = pd.read_csv('train_essays.csv')

# Merge dataset
# Assuming `train_essays` contains both train and dev data
# If not, load the dev dataset separately and merge
# merged_data = pd.concat([train_essays, dev_essays], ignore_index=True)
train_essays.columns

Index(['id', 'prompt_id', 'text', 'generated'], dtype='object')

Here ID and prompt id are not required, hence drop them from the data

In [9]:

# Divide into train and dev
train_essays = train_essays.drop(['prompt_id','id'],axis=1)

train_data, dev_data = train_test_split(train_essays, test_size=0.2, random_state=42)
train_data.shape, dev_data.shape

train_data.columns

Index(['text', 'generated'], dtype='object')

In [10]:
train_data['generated'].value_counts()

0    1100
1       2
Name: generated, dtype: int64

So Only 2 essays are AI Generated and the remaining are Human Essays, so we need to make the number of AI Generated Essays and Human Essays equal

In [12]:
# new_data = pd.read_csv("train_v2_drcat_02.csv")

In [21]:
# selected_data = new_data[new_data['label'] == 1].sample(n=600, random_state=42)[['text', 'label']]
ai_generated_data = pd.read_csv('ai_generated_data.csv')


In [22]:
ai_generated_data.columns

Index(['text', 'generated'], dtype='object')

In [28]:
df = pd.concat([train_data, ai_generated_data], ignore_index=True)


In [29]:
df.shape

(1702, 2)

In [30]:
df['generated'].value_counts()

0    1100
1     602
Name: generated, dtype: int64

In [31]:
from collections import Counter

# Create a list of all words in the dataset
all_words = ' '.join(df['text']).lower().split()

# Count word occurrences
word_counts = Counter(all_words)

# Build vocabulary
vocab = [word for word, count in word_counts.items() if count >= 5]
vocab_reverse_index = {word: idx for idx, word in enumerate(vocab)}

In [33]:
# Calculate P[word]
num_documents = len(df)
word_probabilities = {word: count / num_documents for word, count in word_counts.items()}

human_data = df[df['generated'] == 0]
llm_data = df[df['generated'] == 1]

# Calculate P[word | LLM]
llm_word_probabilities = {word: df['text'].apply(lambda essay: word in essay.lower()).mean() for word in vocab}


In [38]:
llm_word_probabilities

{'cars,': 0.2044653349001175,
 'they': 0.7914218566392479,
 'make': 0.554641598119859,
 'life': 0.35663924794359575,
 'so': 0.9876615746180963,
 'much': 0.43772032902467684,
 'easier,': 0.010575793184488837,
 'or,': 0.2009400705052879,
 'do': 0.8584018801410106,
 'them': 0.5282021151586369,
 'the': 1.0,
 'amount': 0.27262044653349,
 'of': 0.9964747356051704,
 'green': 0.22796709753231492,
 'house': 0.32784958871915393,
 'gasses': 0.031139835487661575,
 'has': 0.6650998824911868,
 'increased': 0.04230317273795535,
 'over': 0.5916568742655699,
 'past': 0.0846063454759107,
 'years,': 0.06462984723854288,
 'due': 0.26498237367802585,
 'to': 0.999412455934195,
 'emitted': 0.012338425381903642,
 'through': 0.23795534665099882,
 'cars.': 0.20329024676850763,
 'however,': 0.23149236192714454,
 'there': 0.690951821386604,
 'been': 0.40481786133960046,
 'ways': 0.4782608695652174,
 'that': 0.964159811985899,
 'people': 0.7802585193889542,
 'are': 0.9512338425381903,
 'trying': 0.1139835487661574

In [34]:
train_data, dev_data = train_test_split(df, test_size=0.2, random_state=42)


In [35]:
# Function to classify essays based on probabilities
def classify_essays(essays, word_probabilities, llm_word_probabilities, vocab):
    predictions = []

    for essay in essays:
        # Tokenize essay into words
        words = essay.lower().split()

        # Initialize probabilities for each class
        prob_human = 1.0
        prob_llm = 1.0

        for word in words:
            # Check if the word is in the vocabulary
            if word in vocab:
                # Calculate P(word | Human) using Laplace smoothing
                prob_word_human = (word_probabilities.get(word, 0) + 1) / (len(vocab) + len(words))

                # Calculate P(word | LLM) using Laplace smoothing
                prob_word_llm = (llm_word_probabilities.get(word, 0) + 1) / (len(vocab) + len(words))

                # Update class probabilities
                prob_human *= prob_word_human
                prob_llm *= prob_word_llm

        # Classify based on the probabilities
        prediction = 1 if prob_llm > prob_human else 0
        predictions.append(prediction)

    return predictions

# Apply the classifier on the dev dataset
dev_predictions = classify_essays(dev_data['text'], word_probabilities, llm_word_probabilities, vocab)

# Calculate accuracy
accuracy = (dev_predictions == dev_data['generated']).mean()
print(f"Accuracy on dev dataset: {accuracy}")


Accuracy on dev dataset: 0.6598240469208211


In [39]:
import numpy as np

# Laplace Smoothing function
def laplace_smoothing(count, total_count, vocab_size, alpha=1):
    return (count + alpha) / (total_count + alpha * vocab_size)

# Implement Naive Bayes Classifier with Laplace Smoothing
def classify_essays_with_smoothing(essays, word_probabilities, llm_word_probabilities, vocab, alpha=1):
    predictions = []

    for essay in essays:
        # Initialize probabilities
        human_prob = 0.0
        llm_prob = 0.0

        for word in essay.lower().split():
            if word in vocab:
                human_prob += np.log(laplace_smoothing(word_probabilities[word], len(train_data), len(vocab), alpha))
                llm_prob += np.log(laplace_smoothing(llm_word_probabilities[word], len(llm_data), len(vocab), alpha))

        # Assign the class with higher probability
        predictions.append(0 if human_prob > llm_prob else 1)

    return np.array(predictions)

# Apply the classifier with Laplace smoothing on the dev dataset
dev_predictions_with_smoothing = classify_essays_with_smoothing(dev_data['text'], word_probabilities, llm_word_probabilities, vocab, alpha=1)

# Find accuracy
accuracy_with_smoothing = (dev_predictions_with_smoothing == dev_data['generated']).mean()
print(f"Accuracy on dev dataset with Laplace smoothing: {accuracy_with_smoothing}")

# Top 10 words predicting each class
top_10_human_words = sorted(vocab, key=lambda word: -np.log(laplace_smoothing(word_probabilities[word], len(train_data), len(vocab))))
top_10_llm_words = sorted(vocab, key=lambda word: -np.log(laplace_smoothing(llm_word_probabilities[word], len(llm_data), len(vocab))))

print("\nTop 10 words predicting Human essays:")
print(top_10_human_words[:10])

print("\nTop 10 words predicting LLM essays:")
print(top_10_llm_words[:10])


Accuracy on dev dataset with Laplace smoothing: 0.6598240469208211

Top 10 words predicting Human essays:
['the', 'to', 'of', 'and', 'a', 'in', 'is', 'that', 'for', 'it']

Top 10 words predicting LLM essays:
['the', 'a', 'he', 'i', 'u', 't', 'c', 'to', 'in', 'and']


In [40]:
# Apply the classifier with Laplace smoothing on the test dataset

test_data = pd.read_csv('test_data.csv')
test_predictions = classify_essays_with_smoothing(test_data['text'], word_probabilities, llm_word_probabilities, vocab, alpha=1)

# Prepare the Kaggle submission
kaggle_submission = pd.DataFrame({'id': test_data['id'], 'generated': test_predictions})

# Save the submission to a CSV file
kaggle_submission.to_csv('kaggle_submission.csv', index=False)

# Print the submission
print(kaggle_submission)


FileNotFoundError: [Errno 2] No such file or directory: 'test_data.csv'