In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from scipy import sparse
from unicodedata import normalize as uni_norm
import re
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def simple_clean(s):
    if pd.isna(s): return ""
    s = str(s)
    s = s.replace("’","'") .replace("‘","'") .replace("“","\"") .replace("”","\"") .replace("—","-") .replace("–","-")
    s = uni_norm("NFKC", s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train['text'] = train['text'].map(simple_clean)
test['text']  = test['text'].map(simple_clean)

print('Data loaded and cleaned. Train shape:', train.shape, 'Classes:', classes)

# 1. Word TF-IDF + LR (strong baseline)
def run_word_tfidf_lr(ngram=(1,2), min_df=2, C=2.0):
    vec = TfidfVectorizer(
        analyzer='word',
        ngram_range=ngram,
        lowercase=True,
        sublinear_tf=True,
        min_df=min_df,
        max_df=1.0,
        strip_accents='unicode',
        stop_words=None,
        token_pattern=r'(?u)\b\w+\b',   # keep 1-char tokens
        preprocessor=simple_clean,
        dtype=np.float32
    )
    oof = np.zeros((len(train), 3), dtype=np.float32)
    test_preds = np.zeros((len(test), 3), dtype=np.float32)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
        X_tr = vec.fit_transform(train['text'].iloc[tr_idx])
        X_va = vec.transform(train['text'].iloc[va_idx])
        X_te = vec.transform(test['text'])
        clf = LogisticRegression(
            solver='saga', C=C, penalty='l2',
            max_iter=5000, tol=1e-4, multi_class='multinomial',
            random_state=42+fold
        )
        clf.fit(X_tr, y[tr_idx])
        oof[va_idx] = clf.predict_proba(X_va)
        test_preds += clf.predict_proba(X_te)
        sc = log_loss(y[va_idx], oof[va_idx]); scores.append(sc)
        print(f'Word TF-IDF LR Fold {fold}: {sc:.4f}')
    test_preds /= skf.n_splits
    oof_score = float(np.mean(scores))
    print(f'Word TF-IDF LR OOF: {oof_score:.4f}')
    return oof_score, oof, test_preds

for C in [1.0, 2.0, 4.0]:
    sc, oof_lr, te_lr = run_word_tfidf_lr(ngram=(1,2), min_df=2, C=C)
    if sc < 0.40:
        pd.DataFrame(oof_lr, columns=classes).to_csv(f'oof_word_tfidf_lr_C{C}.csv', index=False)
        pd.DataFrame(te_lr, columns=classes).to_csv(f'test_word_tfidf_lr_C{C}.csv', index=False)

# 2. Word NB-SVM (count binary, class-normalized log-ratio, try SVC+calibration; try with/without L2 row-norm)
def run_word_nbsvm(ngram=(1,2), min_df=2, alpha=0.5, C=2.0, use_svc=True, l2norm=True):
    vec = CountVectorizer(
        analyzer='word',
        ngram_range=ngram,
        lowercase=True,
        min_df=min_df,
        max_df=1.0,
        binary=True,
        token_pattern=r'(?u)\b\w+\b',
        strip_accents='unicode',
        preprocessor=simple_clean,
        dtype=np.float32
    )
    oof = np.zeros((len(train), 3), dtype=np.float32)
    test_preds = np.zeros((len(test), 3), dtype=np.float32)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
        X_tr = vec.fit_transform(train['text'].iloc[tr_idx])
        X_va = vec.transform(train['text'].iloc[va_idx])
        X_te = vec.transform(test['text'])

        R_cols = []
        for c in range(3):
            y_bin = (y[tr_idx] == c).astype(int)
            pos = np.array(X_tr[y_bin==1].sum(axis=0)).ravel() + alpha
            neg = np.array(X_tr[y_bin==0].sum(axis=0)).ravel() + alpha
            pos /= pos.sum()
            neg /= neg.sum()
            r = np.log(pos / neg)
            R_cols.append(r.astype(np.float32))
        R = np.vstack(R_cols).T  # (vocab, classes)

        def apply_r(X):
            blocks = [X.multiply(R[:, c]) for c in range(3)]
            Xr = sparse.hstack(blocks).tocsr()
            if l2norm:
                norms = np.sqrt(Xr.power(2).sum(axis=1).A.ravel() + 1e-8)
                Xr = Xr.multiply(1.0 / norms[:, None])
            return Xr

        X_tr_nb = apply_r(X_tr)
        X_va_nb = apply_r(X_va)
        X_te_nb = apply_r(X_te)

        if use_svc:
            base = LinearSVC(C=C, max_iter=5000, random_state=42+fold)
            clf = CalibratedClassifierCV(base, method='sigmoid', cv=3)
        else:
            clf = LogisticRegression(solver='saga', C=C, max_iter=5000, tol=1e-4,
                                     multi_class='ovr', random_state=42+fold)
        clf.fit(X_tr_nb, y[tr_idx])
        oof[va_idx] = clf.predict_proba(X_va_nb)
        test_preds += clf.predict_proba(X_te_nb)
        sc = log_loss(y[va_idx], oof[va_idx]); scores.append(sc)
        print(f'Word NB-SVM Fold {fold}: {sc:.4f}')
    test_preds /= skf.n_splits
    oof_score = float(np.mean(scores))
    print(f'Word NB-SVM OOF: {oof_score:.4f}')
    return oof_score, oof, test_preds

# Small sweep: alpha and normalization/backend
configs = [
    dict(alpha=0.5, C=2.0, use_svc=True,  l2norm=True),
    dict(alpha=0.75,C=2.0, use_svc=True,  l2norm=True),
    dict(alpha=0.5, C=2.0, use_svc=True,  l2norm=False),  # A/B the norm
    dict(alpha=0.75,C=2.0, use_svc=False, l2norm=False),  # LR backend, no norm
]
for cfg in configs:
    sc, oof_nb, te_nb = run_word_nbsvm(ngram=(1,2), min_df=2, **cfg)
    tag = f"a{cfg['alpha']}_C{cfg['C']}_svc{cfg['use_svc']}_l2{cfg['l2norm']}"
    if sc < 0.41:
        pd.DataFrame(oof_nb, columns=classes).to_csv(f'oof_word_nbsvm_{tag}.csv', index=False)
        pd.DataFrame(te_nb, columns=classes).to_csv(f'test_word_nbsvm_{tag}.csv', index=False)

print('Word models sweep complete. Check for OOF <0.40 files to integrate into final_ensemble.ipynb. If none, drop words and focus on char/ensemble improvements for bronze.')

Data loaded and cleaned. Train shape: (17621, 3) Classes: ['EAP', 'HPL', 'MWS']


Word TF-IDF LR Fold 1: 0.5604


Word TF-IDF LR Fold 2: 0.5474


Word TF-IDF LR Fold 3: 0.5521


Word TF-IDF LR Fold 4: 0.5681


Word TF-IDF LR Fold 5: 0.5656


Word TF-IDF LR Fold 6: 0.5678


Word TF-IDF LR Fold 7: 0.5560


Word TF-IDF LR Fold 8: 0.5450


Word TF-IDF LR Fold 9: 0.5530


Word TF-IDF LR Fold 10: 0.5469
Word TF-IDF LR OOF: 0.5562


Word TF-IDF LR Fold 1: 0.4974


Word TF-IDF LR Fold 2: 0.4831


Word TF-IDF LR Fold 3: 0.4889


Word TF-IDF LR Fold 4: 0.5075


Word TF-IDF LR Fold 5: 0.5047


Word TF-IDF LR Fold 6: 0.5059


KeyboardInterrupt: 