In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['combined_text', 'word_count', 'avg_word_length']],
    train_data['generated'], test_size=0.2, random_state=42
)

# Create a pipeline with a TF-IDF vectorizer and a Random Forest classifier
model = make_pipeline(
    TfidfVectorizer(tokenizer=tokenize),
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Feature engineering for test data
test_essays['word_count'] = test_essays['text'].apply(lambda x: len(x.split()))
test_essays['avg_word_length'] = test_essays['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Make predictions on the test set
test_predictions = model.predict_proba(test_essays[['combined_text', 'word_count', 'avg_word_length']])[:, 1]

# Save the predictions to a CSV file
result_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
result_df.to_csv("predictions.csv", index=False)




ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Filter out rows with empty texts
train_data = train_data[train_data['combined_text'].notna()]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['combined_text', 'word_count', 'avg_word_length']],
    train_data['generated'], test_size=0.2, random_state=42
)

# Create a pipeline with a TF-IDF vectorizer and a Random Forest classifier
model = make_pipeline(
    TfidfVectorizer(tokenizer=tokenize),
    RandomForestClassifier(n_estimators=100, random_state=42)
)

# Train the model
model.fit(X_train['combined_text'], y_train)

# Make predictions on the test set
predictions = model.predict(X_test['combined_text'])

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Feature engineering for test data
test_essays['word_count'] = test_essays['text'].apply(lambda x: len(x.split()))
test_essays['avg_word_length'] = test_essays['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Filter out rows with empty texts
test_essays = test_essays[test_essays['combined_text'].notna()]

# Make predictions on the test set
test_predictions = model.predict_proba(test_essays[['combined_text', 'word_count', 'avg_word_length']])[:, 1]

# Save the predictions to a CSV file
result_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
result_df.to_csv("predictions.csv", index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Filter out rows with empty texts
train_data = train_data[train_data['combined_text'].notna()]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['combined_text', 'word_count', 'avg_word_length']],
    train_data['generated'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, max_features=5000)  # You can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])

# Create a LightGBM dataset
train_dataset = lgb.Dataset(X_train_tfidf, label=y_train)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
    'min_child_samples': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'scale_pos_weight': 1,
    'metric': 'binary_logloss',
    'num_boost_round': 100
}

# Train the model
model = lgb.train(params, train_dataset, num_boost_round=params['num_boost_round'])

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])
predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Convert probability predictions to binary
binary_predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

# Evaluate the model
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Feature engineering for test data
test_essays['word_count'] = test_essays['text'].apply(lambda x: len(x.split()))
test_essays['avg_word_length'] = test_essays['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Filter out rows with empty texts
test_essays = test_essays[test_essays['combined_text'].notna()]

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(test_essays['combined_text'])
test_predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Save the predictions to a CSV file
result_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
result_df.to_csv("predictions_lightgbm.csv", index=False)


