In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Filter out rows with empty texts
train_data = train_data[train_data['combined_text'].notna()]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['combined_text', 'word_count', 'avg_word_length']],
    train_data['generated'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, max_features=5000)  # You can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])

# Create a LightGBM dataset
train_dataset = lgb.Dataset(X_train_tfidf, label=y_train)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
    'min_child_samples': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'scale_pos_weight': 1,
    'metric': 'binary_logloss',
    'num_boost_round': 100
}

# Train the model
model = lgb.train(params, train_dataset, num_boost_round=params['num_boost_round'])

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])
predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Convert probability predictions to binary
binary_predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

# Evaluate the model
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Feature engineering for test data
test_essays['word_count'] = test_essays['text'].apply(lambda x: len(x.split()))
test_essays['avg_word_length'] = test_essays['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Filter out rows with empty texts
test_essays = test_essays[test_essays['combined_text'].notna()]

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(test_essays['combined_text'])
test_predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Save the predictions to a CSV file
result_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
result_df.to_csv("predictions_lightgbm.csv", index=False)


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Filter out rows with empty texts
train_data = train_data[train_data['combined_text'].notna()]

# Use a smaller subset for initial testing
train_data_subset, _ = train_test_split(train_data, test_size=0.9, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data_subset[['combined_text', 'word_count', 'avg_word_length']],
    train_data_subset['generated'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization with reduced features
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, max_features=1000)  # Adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])

# Create a LightGBM dataset
train_dataset = lgb.Dataset(X_train_tfidf, label=y_train)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
    'min_child_samples': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'scale_pos_weight': 1,
    'metric': 'binary_logloss',
    'num_boost_round': 50  # Reduced the number of boosting rounds
}

# Train the model
model = lgb.train(params, train_dataset, num_boost_round=params['num_boost_round'])

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])
predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Convert probability predictions to binary
binary_predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Model Accuracy on Test Set: {accuracy}")
print("Classification Report on Test Set:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Feature engineering for test data
test_essays['word_count'] = test_essays['text'].apply(lambda x: len(x.split()))
test_essays['avg_word_length'] = test_essays['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Filter out rows with empty texts
test_essays = test_essays[test_essays['combined_text'].notna()]

# Make predictions on the test set
X_test_tfidf_test = tfidf_vectorizer.transform(test_essays['combined_text'])
test_predictions = model.predict(X_test_tfidf_test, num_iteration=model.best_iteration)

# Save the predictions to a CSV file
result_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
result_df.to_csv("submissions.csv", index=False)




[LightGBM] [Info] Number of positive: 0, number of negative: 109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21257
[LightGBM] [Info] Number of data points in the train set: 109, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776
Model Accuracy on Test Set: 1.0
Classification Report on Test Set:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        28

    accuracy                           1.00        28
   macro avg       1.00      1.00      1.00        28
weighted avg       1.00      1.00      1.00        28



ValueError: Found array with 0 sample(s) (shape=(0, 1000)) while a minimum of 1 is required by TfidfTransformer.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import spacy

# Load spaCy model for tokenization
nlp = spacy.load("en_core_web_sm")

# Load training data
train_essays = pd.read_csv("train_essays.csv")
train_prompts = pd.read_csv("train_prompts.csv")
train_data = pd.merge(train_essays, train_prompts, on="prompt_id")

# Tokenization function using spaCy
def tokenize(text):
    return [token.text.lower() for token in nlp(text) if not token.is_stop and token.is_alpha]

# Feature engineering
train_data['word_count'] = train_data['text'].apply(lambda x: len(x.split()))
train_data['avg_word_length'] = train_data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Combine essay text and prompt information
train_data['combined_text'] = train_data['text'] + ' ' + train_data['instructions'] + ' ' + train_data['source_text']

# Filter out rows with empty texts
train_data = train_data[train_data['combined_text'].notna()]

# Check if there are still samples in the dataset
if train_data.empty:
    raise ValueError("No samples remaining after preprocessing.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    train_data[['combined_text', 'word_count', 'avg_word_length']],
    train_data['generated'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, max_features=5000)  # You can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['combined_text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])

# Create a LightGBM dataset
train_dataset = lgb.Dataset(X_train_tfidf, label=y_train)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
    'min_child_samples': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'scale_pos_weight': 1,
    'metric': 'binary_logloss',
    'num_boost_round': 50  # Reduced the number of boosting rounds
}

# Train the model
model = lgb.train(params, train_dataset, num_boost_round=params['num_boost_round'])

# Make predictions on the test set
X_test_tfidf = tfidf_vectorizer.transform(X_test['combined_text'])
predictions = model.predict(X_test_tfidf, num_iteration=model.best_iteration)

# Convert probability predictions to binary
binary_predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

# Evaluate the model
accuracy = accuracy_score(y_test, binary_predictions)
report = classification_report(y_test, binary_predictions)

print(f"Model Accuracy: {accuracy}")
print("Classification Report:\n", report)

# Load test data
test_essays = pd.read_csv("test_essays.csv")
test_essays = pd.merge(test_essays, train_prompts, on="prompt_id")
test_essays['combined_text'] = test_essays['text'] + ' ' + test_essays['instructions'] + ' ' + test_essays['source_text']

# Filter out rows with empty texts
test_essays = test_essays[test_essays['combined_text'].notna()]

# Make predictions on the test set
X_test_tfidf_test = tfidf_vectorizer.transform(test_essays['combined_text'])
test_predictions = model.predict(X_test_tfidf_test, num_iteration=model.best_iteration)

# Save the predictions to a CSV file in the required format
submission_df = pd.DataFrame({'id': test_essays['id'], 'generated': test_predictions})
submission_df['generated'] = submission_df['generated'].round(1)  # Round to one decimal place
submission_df.to_csv("submissions.csv", index=False)




[LightGBM] [Info] Number of positive: 1, number of negative: 1101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 228569
[LightGBM] [Info] Number of data points in the train set: 1102, number of used features: 2385
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000907 -> initscore=-7.003974
[LightGBM] [Info] Start training from score -7.003974


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Accuracy: 0.9927536231884058
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00       274
           1       0.00      0.00      0.00         2

    accuracy                           0.99       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      0.99      0.99       276



ValueError: Found array with 0 sample(s) (shape=(0, 5000)) while a minimum of 1 is required by TfidfTransformer.

In [5]:
final=pd.read_csv('submission.csv')
final.head()

Unnamed: 0,id,generated
0,0000aaaa,0.012896
1,1111bbbb,0.012896
2,2222cccc,0.012896
