## Imports

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier

## Submission Flag

In [None]:
# Toggle between a local run (False) and a Kaggle submission run (True);
# the cells below use it to choose input and output file locations.
is_submission: bool = False

## Read Datasets

In [None]:
# Choose where the essays live: the Kaggle competition input directory for a
# submission run, otherwise the local "data" folder.
# The local paths use forward slashes because they resolve on Windows as well
# as on Linux/macOS, whereas a backslash is a path separator on Windows only.
if is_submission:
    train_path = "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
    test_path = "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv"
else:
    train_path = "data/train_essays.csv"
    test_path = "data/test_essays.csv"

# Load both splits into DataFrames for the cells that follow.
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

## Embeddings

In [None]:
# Turn the raw essay text into TF-IDF feature matrices.
vectorizer = TfidfVectorizer()
train_texts = train_data["text"]
test_texts = test_data["text"]

# The vocabulary and IDF weights are learned from the training essays only;
# the test essays are then projected with that same fitted vectorizer.
vectorized_train_data = vectorizer.fit_transform(train_texts)
vectorized_test_data = vectorizer.transform(test_texts)

## Create Model

In [None]:
# Fixed seed so that the per-epoch data shuffling (and the validation split
# taken when early_stopping=True) is repeatable. Without it every run trains
# slightly different models and produces a different submission.
RANDOM_SEED = 42

# All three classifiers use the "modified_huber" loss, which is one of the
# SGD losses that provides predict_proba (needed for soft voting).
# Baseline configuration.
sgd_model1 = SGDClassifier(
    max_iter=5000,
    tol=1e-3,
    loss="modified_huber",
    random_state=RANDOM_SEED,
)
# Same as the baseline, but class weights are adjusted inversely to the class
# frequencies found in the training labels.
sgd_model2 = SGDClassifier(
    max_iter=5000,
    tol=1e-3,
    loss="modified_huber",
    class_weight="balanced",
    random_state=RANDOM_SEED,
)
# Larger iteration budget and tighter tolerance; training stops once the score
# on an internally held-out validation split stops improving.
sgd_model3 = SGDClassifier(
    max_iter=10000,
    tol=5e-4,
    loss="modified_huber",
    early_stopping=True,
    random_state=RANDOM_SEED,
)

In [None]:
# Ensemble of the three SGD classifiers. With voting="soft" the predicted
# class probabilities of the members are averaged rather than counting
# their hard label votes.
ensemble_members = [
    ("sgd1", sgd_model1),
    ("sgd2", sgd_model2),
    ("sgd3", sgd_model3),
]
model = VotingClassifier(estimators=ensemble_members, voting="soft")

## Fit Model

In [None]:
# Train every ensemble member on the TF-IDF features, using the "generated"
# column of the training data as the target.
train_labels = train_data["generated"]
model.fit(vectorized_train_data, train_labels)

## Predict Test Set

In [None]:
# Emit the ensemble's averaged probability for the positive class instead of
# hard labels: model.predict() would collapse the soft-vote output to one
# label per essay and throw away the confidence information.
# NOTE(review): the competition is understood to be scored by ROC AUC, which
# rewards graded scores over 0/1 labels - confirm on the evaluation page.
# Column 1 corresponds to model.classes_[1]; this assumes "generated" is a
# binary label whose larger value marks AI-generated text - TODO confirm.
predictions = model.predict_proba(vectorized_test_data)[:, 1].astype(float)

## Create Submission

In [None]:
# Two-column table: the essay id and the model's prediction for that essay.
submission = pd.DataFrame({"id": test_data["id"], "generated": predictions})

# Write to the Kaggle working directory for a submission run, otherwise to the
# local "data" folder. The local path uses a forward slash because it resolves
# on Windows as well as on Linux/macOS, whereas a backslash is a path
# separator on Windows only.
if is_submission:
    submission_path = "/kaggle/working/submission.csv"
else:
    submission_path = "data/submission.csv"

# index=False keeps the DataFrame's row index out of the CSV.
submission.to_csv(submission_path, index=False)