In [1]:
# Step 1: Data Preprocessing
import pandas as pd

# Load the datasets
train_essays = pd.read_csv('train_essays.csv')
test_essays = pd.read_csv('test_essays.csv')
train_prompts = pd.read_csv('train_prompts.csv')

# Merge training essays and prompts based on prompt_id
train_data = train_essays.merge(train_prompts, on='prompt_id')

# Check for missing data
train_data.isnull().sum()  # Check for missing values in the merged dataset
train_data.dropna(subset=['text'], inplace=True)  # Remove rows with missing text

In [3]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Removing punctuation and special characters
    table = str.maketrans('', '', string.punctuation)
    tokens = [word.translate(table) for word in tokens]
    
    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

In [4]:
# Apply text preprocessing to the 'text' column
train_data['text'] = train_data['text'].apply(preprocess_text)
test_essays['text'] = test_essays['text'].apply(preprocess_text)

# Encode labels (0 for student essays, 1 for LLM-generated essays)
train_data['generated'] = train_data['generated'].astype(int)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit and transform on training data
X = tfidf_vectorizer.fit_transform(train_data['text'])
y = train_data['generated']

In [6]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Building
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier()

In [7]:
# Train the model
rf_classifier.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the validation set
y_pred = rf_classifier.predict(X_val)

# Assuming y_test and predictions are available from your previous code
print("Classification Report:")
print(classification_report(y_val, y_pred, zero_division='warn'))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       274
           1       0.00      0.00      0.00         2

    accuracy                           0.99       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      0.99      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Transform test data using the same TF-IDF vectorizer
X_test = tfidf_vectorizer.transform(test_essays['text'])

# Use the trained model to predict on the test data
test_predictions = rf_classifier.predict(X_test)

# Prepare for submission
submission = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})

# Save the submission to a CSV file
submission.to_csv('essay_predictions.csv', index=False)