In [1]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Train a TF-IDF + logistic-regression classifier on the prompt dataset and
# pickle both fitted artifacts so they can be reloaded together later.

# Paths are relative to the notebook's working directory.
DATA_PATH = Path("../data/prompt_data.csv")
MODEL_DIR = Path("../models")

# Load dataset
# Rows missing a prompt or a label are dropped up front: TfidfVectorizer
# rejects NaN documents and the classifier cannot fit NaN targets, so leaving
# them in would abort the run partway through.
df = pd.read_csv(DATA_PATH).dropna(subset=["prompt", "label"])

# Split (80/20, fixed seed so the partition is reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(
    df["prompt"], df["label"], test_size=0.2, random_state=42
)

# Vectorize: unigrams + bigrams, vocabulary capped at 3000 features.
# The vectorizer is fitted on the training split only and then applied to the
# test split, so test text does not influence the learned vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Evaluate
# NOTE(review): the recorded output shows 1.0 on both splits. That may mean
# duplicate prompts are shared between train and test, or that the dataset is
# trivially separable — confirm before trusting the test score.
print("Train Accuracy:", model.score(X_train_vec, y_train))
print("Test Accuracy:", model.score(X_test_vec, y_test))

# Save vectorizer and model
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError after training has already completed.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

with open(MODEL_DIR / "vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open(MODEL_DIR / "classifier.pkl", "wb") as f:
    pickle.dump(model, f)


Train Accuracy: 1.0
Test Accuracy: 1.0
