# Modeling demo

Train a simple classifier (TF-IDF + LogisticRegression) on the synthetic dataset and inspect cross-validation results and feature importance.

In [None]:
from pathlib import Path
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from src.analysis import preprocess_df, predictive_terms_logistic
# Location of the synthetic corpus CSV, relative to the working directory.
DATA = Path('data', 'synthetic_texts.csv')

# When the CSV is missing, call the project generator with 600 as its argument.
# NOTE(review): presumably generate() writes the file at DATA — confirm
# against generate_data, since read_csv below relies on it existing.
if not DATA.exists():
    from generate_data import generate
    generate(600)

# Load the raw CSV and pass it straight through the project preprocessing helper.
df = preprocess_df(pd.read_csv(DATA))

# Model input is the 'joined_tokens' column; the target is the 'label' column.
X, y = df['joined_tokens'], df['label']
print('Shape:', X.shape)

In [None]:
# Cross-validation accuracy
# Build the two stages separately, then chain them into a single pipeline so
# the vectorizer and the classifier are evaluated together as one estimator.
tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
clf = LogisticRegression(max_iter=500)
pipe = make_pipeline(tfidf, clf)

# Score the pipeline with cv=5 using accuracy as the metric.
scores = cross_val_score(pipe, X, y, scoring='accuracy', cv=5)
print('CV accuracies:', scores)
print('Mean accuracy:', scores.mean())

In [None]:
# Train / test split and classification report
# Hold out 20% of the rows, stratified on y, with a fixed random_state so the
# split is reproducible between runs.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the pipeline on the training portion and predict the held-out portion.
pred = pipe.fit(Xtr, ytr).predict(Xte)

report = classification_report(yte, pred)
print(report)

cm = confusion_matrix(yte, pred)
print('Confusion matrix:', cm)

In [None]:
# Inspect the predictive terms reported by the project helper.
# NOTE(review): the previous comment said these are the top terms for the
# negative class and that the helper returns top_k features; neither is
# visible from this notebook — confirm against
# src.analysis.predictive_terms_logistic.
preds = predictive_terms_logistic(X, y, top_k=40)

# pandas is already imported as pd in the first cell, so it is not re-imported
# here. top_k=40 is requested, but head(12) displays only the first 12 entries.
pd.DataFrame(preds, columns=['term', 'coef']).head(12)