In [None]:
import subprocess
import sys
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print(result.stdout if result.returncode == 0 else 'GPU not available')

In [None]:
import subprocess
import sys
import os
import shutil
from pathlib import Path

def pip_install(*args):
    print('>', *args, flush=True)
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

# Hard reset any prior torch stacks
for pkg in ('torch', 'torchvision', 'torchaudio'):
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)

# Clean stray site dirs
stray_dirs = [
    '/app/.pip-target/torch',
    '/app/.pip-target/torch-2.8.0.dist-info',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchvision-0.23.0.dist-info',
    '/app/.pip-target/torchvision-0.19.1.dist-info',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torchaudio-2.8.0.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
    '/app/.pip-target/torchgen',
    '/app/.pip-target/functorch'
]
for d in stray_dirs:
    if os.path.exists(d):
        print('Removing', d)
        shutil.rmtree(d, ignore_errors=True)

# Install the EXACT cu121 torch stack
pip_install('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# Create constraints file
Path('constraints.txt').write_text(
    'torch==2.4.1\n'
    'torchvision==0.19.1\n'
    'torchaudio==2.4.1\n')

# Install other deps
pip_install('install', '-c', 'constraints.txt',
    'transformers==4.44.2', 'accelerate==0.34.2',
    'datasets==2.21.0', 'evaluate==0.4.2',
    'sentencepiece', 'scikit-learn',
    '--upgrade-strategy', 'only-if-needed')

# Sanity check
import torch
print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('ERROR: CUDA not available')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

# Load data
train = pd.read_csv('train.csv')
print('Train shape:', train.shape)
print(train.head())

# Author distribution
print('\nAuthor distribution:')
print(train['author'].value_counts(normalize=True))

# Text length stats
train['text_length'] = train['text'].str.len()
print('\nText length stats:')
print(train['text_length'].describe())

# Word count stats
train['word_count'] = train['text'].str.split().str.len()
print('\nWord count stats:')
print(train['word_count'].describe())

# Punctuation rate by author
def punct_rate(text):
    if len(text) == 0:
        return 0
    return sum(1 for c in text if c in string.punctuation) / len(text)

train['punct_rate'] = train['text'].apply(punct_rate)
print('\nPunctuation rate by author:')
print(train.groupby('author')['punct_rate'].agg(['mean', 'std']).round(4))

# Sample texts per author
print('\nSample texts:')
for author in train['author'].unique():
    sample = train[train['author'] == author]['text'].iloc[0]
    print(f'{author}: {sample[:200]}...')

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import numpy as np, pandas as pd

# Fresh scope for Char NB baseline - using CountVectorizer with binary=False and norm=True
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
print('Label map:', dict(zip(le.classes_, le.transform(le.classes_))))
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe, X, y, name):
    oof = np.zeros((len(X), 3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X, y)):
        pipe.fit(X.iloc[tr], y[tr])
        if f==0:
            step = next((k for k in pipe.named_steps if hasattr(pipe.named_steps[k],'get_feature_names_out')), None)
            if step:
                print(name, 'features:', len(pipe.named_steps[step].get_feature_names_out()))
        p = pipe.predict_proba(X.iloc[va]); oof[va]=p
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
        if f==0:
            print('Sample probs Fold 1:', np.round(p[:3],3))
    score = float(np.mean(scores))
    print(f'{name} OOF: {score:.4f} | prob sum sanity:', np.allclose(oof.sum(1).mean(), 1.0, atol=1e-6))
    return score, oof, pipe

# Char Count NB baseline (ComplementNB alpha=0.5, norm=True, binary=False, expect 0.33-0.38 OOF)
char_nb = Pipeline([
    ('cv', CountVectorizer(analyzer='char', ngram_range=(3,5), lowercase=False, min_df=3, max_features=200000, binary=False)),
    ('nb', ComplementNB(alpha=0.5, norm=True))
])
sc, oof, pipe = run_cv(char_nb, train['text'], y, 'Char Count NB')

print('Char NB OOF:', round(sc,4))
oof_preds_char = oof
char_pipe = pipe
pd.DataFrame(oof_preds_char, columns=le.classes_).to_csv('oof_char.csv', index=False)

Label map: {'EAP': 0, 'HPL': 1, 'MWS': 2}


Char Count NB features: 105935


Char Count NB Fold 1: 1.0986
Sample probs Fold 1: [[0.333 0.333 0.333]
 [0.333 0.333 0.333]
 [0.333 0.333 0.333]]


Char Count NB Fold 2: 1.0986


Char Count NB Fold 3: 1.0986


Char Count NB Fold 4: 1.0986


Char Count NB Fold 5: 1.0986
Char Count NB OOF: 1.0986 | prob sum sanity: True
Char NB OOF: 1.0986


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import numpy as np, pandas as pd

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe, X, y, name):
    oof = np.zeros((len(X), 3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X, y)):
        pipe.fit(X.iloc[tr], y[tr])
        p = pipe.predict_proba(X.iloc[va]); oof[va]=p
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
    score = float(np.mean(scores)); print(f'{name} OOF: {score:.4f}')
    return score, oof, pipe

word_lr = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                              lowercase=True, sublinear_tf=True,
                              min_df=2, max_df=0.95)),  # float64 default
    ('lr', LogisticRegression(solver='lbfgs', C=1.5,
                              max_iter=5000, tol=1e-4,
                              random_state=42, n_jobs=1))
])
sc_word, oof_word, word_pipe = run_cv(word_lr, train['text'], y, 'Tweaked Word LR')
pd.DataFrame(oof_word, columns=le.classes_).to_csv('oof_word_tweaked.csv', index=False)

# Fit full and save test preds for blending
word_pipe.fit(train['text'], y)
test_word = word_pipe.predict_proba(test['text'])
pd.DataFrame(test_word, columns=le.classes_).to_csv('test_word_tweaked.csv', index=False)

Tweaked Word LR Fold 1: 0.5316


Tweaked Word LR Fold 2: 0.5416


Tweaked Word LR Fold 3: 0.5424


Tweaked Word LR Fold 4: 0.5308


Tweaked Word LR Fold 5: 0.5312
Tweaked Word LR OOF: 0.5355


In [39]:
import numpy as np, pandas as pd
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import itertools

# Load OOF predictions for the three best models
oof_char_lr_df = pd.read_csv('oof_char_lr.csv')
oof_char_lr_df = oof_char_lr_df.reindex(columns=['EAP', 'HPL', 'MWS'])
oof_char_lr = oof_char_lr_df.values

oof_char_wb_lr_df = pd.read_csv('oof_char_wb_lr.csv')
oof_char_wb_lr_df = oof_char_wb_lr_df.reindex(columns=['EAP', 'HPL', 'MWS'])
oof_char_wb_lr = oof_char_wb_lr_df.values

oof_word_df = pd.read_csv('oof_word.csv')
oof_word_df = oof_word_df.reindex(columns=['EAP', 'HPL', 'MWS'])
oof_word = oof_word_df.values

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])

# Grid search for optimal weights (3 models, sum to 1)
best_score = float('inf')
best_weights = None
oof_list = [oof_char_lr, oof_char_wb_lr, oof_word]
labels = ['char_lr', 'char_wb_lr', 'word']

for weights in itertools.product(np.linspace(0, 1, 11), repeat=3):
    if abs(sum(weights) - 1.0) > 1e-6: continue
    blend_oof = sum(w * oof for w, oof in zip(weights, oof_list))
    score = log_loss(y, blend_oof)
    if score < best_score:
        best_score = score
        best_weights = weights

print(f'Best blend weights: {dict(zip(labels, best_weights))}', f'Blended OOF: {best_score:.4f}')

# Refit original word_pipe (from early Cell 4 config)
word_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2),
                              lowercase=True, sublinear_tf=True,
                              min_df=3, max_df=0.95, dtype=np.float32)),
    ('lr', LogisticRegression(solver='lbfgs',
                              C=4.0, max_iter=3000, tol=1e-3,
                              random_state=42, n_jobs=1))
])
word_pipe.fit(train['text'], y)

# Refit original char_wb_lr pipe (from early Cell 9 config)
char_wb_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(3,5),
        lowercase=False,
        sublinear_tf=True,
        min_df=2,
        max_df=0.98,
        max_features=200000
    )),
    ('lr', LogisticRegression(
        solver='lbfgs',
        C=4.0,
        max_iter=3000,
        tol=1e-3,
        random_state=42,
        n_jobs=1
    ))
])
char_wb_pipe.fit(train['text'], y)

# Use char_pipe from Cell 12 (already fitted)

# Generate test predictions
test_char_lr_df = pd.DataFrame(char_pipe.predict_proba(test['text']), columns=le.classes_)
test_char_lr_df = test_char_lr_df.reindex(columns=['EAP', 'HPL', 'MWS'])
test_char_lr = test_char_lr_df.values

test_char_wb_lr_df = pd.DataFrame(char_wb_pipe.predict_proba(test['text']), columns=le.classes_)
test_char_wb_lr_df = test_char_wb_lr_df.reindex(columns=['EAP', 'HPL', 'MWS'])
test_char_wb_lr = test_char_wb_lr_df.values

test_word_df = pd.DataFrame(word_pipe.predict_proba(test['text']), columns=le.classes_)
test_word_df = test_word_df.reindex(columns=['EAP', 'HPL', 'MWS'])
test_word = test_word_df.values

test_list = [test_char_lr, test_char_wb_lr, test_word]
blend_test = sum(w * test_preds for w, test_preds in zip(best_weights, test_list))

# Ensure probs sum to 1 and clip extremes
blend_test = np.clip(blend_test, 1e-9, 1 - 1e-9)
blend_test /= blend_test.sum(axis=1, keepdims=True)

# Submission
sub = pd.read_csv('sample_submission.csv')
sub[['EAP', 'HPL', 'MWS']] = blend_test
sub.to_csv('submission.csv', index=False)
print('Blended submission saved with OOF:', round(best_score, 4))
print('Test probs shape:', blend_test.shape)

Best blend weights: {'char_lr': 0.2, 'char_wb_lr': 0.5, 'word': 0.30000000000000004} Blended OOF: 0.4219


Blended submission saved with OOF: 0.4219
Test probs shape: (1958, 3)


In [1]:
import subprocess
import sys

# Uninstall current scikit-learn
subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', 'scikit-learn'], check=False)
print('Uninstalled scikit-learn')

# Install stable version 1.3.0 with force-reinstall to overwrite existing
subprocess.run([sys.executable, '-m', 'pip', 'install', '--force-reinstall', 'scikit-learn==1.3.0'], check=True)
print('Installed scikit-learn==1.3.0 with force-reinstall')

# Verify
import sklearn
print('Sklearn version after install:', sklearn.__version__)

Found existing installation: scikit-learn 1.3.0
Uninstalling scikit-learn-1.3.0:
  Successfully uninstalled scikit-learn-1.3.0
Uninstalled scikit-learn


Collecting scikit-learn==1.3.0
  Downloading scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.9/10.9 MB 120.4 MB/s eta 0:00:00


Collecting scipy>=1.5.0
  Downloading scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.9 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 173.4 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)


Collecting numpy>=1.17.3
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 270.4 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 308.4/308.4 KB 513.3 MB/s eta 0:00:00


Installing collected packages: threadpoolctl, numpy, joblib, scipy, scikit-learn


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.21.0 requires fsspec[http]<=2024.6.1,>=2023.1.0, but you have fsspec 2025.9.0 which is incompatible.


Successfully installed joblib-1.5.2 numpy-1.26.4 scikit-learn-1.3.0 scipy-1.16.2 threadpoolctl-3.6.0


Installed scikit-learn==1.3.0 with force-reinstall
Sklearn version after install: 1.7.2


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import numpy as np, pandas as pd, os

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
le=LabelEncoder(); y=le.fit_transform(train['author'])
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe,X,y,name):
    oof=np.zeros((len(X),3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X,y)):
        pipe.fit(X.iloc[tr], y[tr])
        p=pipe.predict_proba(X.iloc[va]); oof[va]=p
        s=log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
    sc=float(np.mean(scores)); print(f'{name} OOF: {sc:.4f}'); return sc,oof,pipe

char_wb_lr=Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(3,6),
                              lowercase=False, sublinear_tf=True,
                              min_df=7, max_df=0.95, max_features=120_000)),
    ('lr', LogisticRegression(solver='saga', multi_class='multinomial',
                              C=1.2, max_iter=2500, tol=1e-3,
                              random_state=42, n_jobs=1))
])
sc,oof,pipe=run_cv(char_wb_lr, train['text'], y, 'Char_wb LR')
pd.DataFrame(oof, columns=le.classes_).to_csv('oof_char_wb.csv', index=False)
char_wb_pipe=pipe



Char_wb LR Fold 1: 0.4860




Char_wb LR Fold 2: 0.4936




Char_wb LR Fold 3: 0.4960




In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import numpy as np, pandas as pd, os

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
le=LabelEncoder(); y=le.fit_transform(train['author'])
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe,X,y,name):
    oof=np.zeros((len(X),3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X,y)):
        pipe.fit(X.iloc[tr], y[tr])
        p=pipe.predict_proba(X.iloc[va]); oof[va]=p
        s=log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
    sc=float(np.mean(scores)); print(f'{name} OOF: {sc:.4f}'); return sc,oof,pipe

char_sgd=Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(3,5),
                              lowercase=False, sublinear_tf=True,
                              min_df=7, max_df=0.95, max_features=150_000)),
    ('sgd', SGDClassifier(loss='log_loss', penalty='l2',
                          alpha=1.2e-4, max_iter=2000, tol=1e-3,
                          early_stopping=True, validation_fraction=0.1,
                          n_iter_no_change=5, random_state=42))
])
sc,oof,pipe=run_cv(char_sgd, train['text'], y, 'Char SGD')
pd.DataFrame(oof, columns=le.classes_).to_csv('oof_char.csv', index=False)
char_pipe=pipe

Char SGD Fold 1: 0.6422


Char SGD Fold 2: 0.6453


Char SGD Fold 3: 0.6455


Char SGD Fold 4: 0.6416


Char SGD Fold 5: 0.6333
Char SGD OOF: 0.6416


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import pandas as pd, numpy as np, os

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe, X, y, name):
    oof = np.zeros((len(X), 3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X, y)):
        pipe.fit(X.iloc[tr], y[tr])
        p = pipe.predict_proba(X.iloc[va]); oof[va]=p
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
    score = float(np.mean(scores)); print(f'{name} OOF: {score:.4f}')
    return score, oof, pipe

char_wb_lr = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(3,6),
                              lowercase=False, sublinear_tf=True,
                              min_df=6, max_df=0.95)),  # prune more
    ('lr', LogisticRegression(solver='lbfgs', C=1.5,
                              max_iter=5000, tol=1e-4,
                              random_state=42, n_jobs=1))
])
sc_char, oof_char, fitted_pipe = run_cv(char_wb_lr, train['text'], y, 'Tweaked Char_wb LR')
pd.DataFrame(oof_char, columns=le.classes_).to_csv('oof_char_wb_tweaked.csv', index=False)

# Fit full and save test preds for blending
fitted_pipe.fit(train['text'], y)
test_char = fitted_pipe.predict_proba(test['text'])
pd.DataFrame(test_char, columns=le.classes_).to_csv('test_char_wb_tweaked.csv', index=False)

Tweaked Char_wb LR Fold 1: 0.4684


Tweaked Char_wb LR Fold 2: 0.4775


Tweaked Char_wb LR Fold 3: 0.4787


Tweaked Char_wb LR Fold 4: 0.4747


Tweaked Char_wb LR Fold 5: 0.4664
Tweaked Char_wb LR OOF: 0.4731


In [35]:
import numpy as np, pandas as pd, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec = CountVectorizer(analyzer='char_wb', ngram_range=(3,5),
                      lowercase=False, min_df=3, binary=True)

def nb_ratio(X, y_bin, alpha=0.1):
    pos = np.asarray(X[y_bin==1].sum(axis=0)).ravel() + alpha
    neg = np.asarray(X[y_bin==0].sum(axis=0)).ravel() + alpha
    return np.log(pos/neg)

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y)):
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])

    R = []
    for c in range(3):
        yb = (y[tr]==c).astype(int)
        R.append(nb_ratio(Xtr, yb, alpha=0.1))
    R = np.vstack(R).T  # shape: n_features x 3

    # Transform features per class by elementwise multiply, then stack
    Xtr_nb = []
    Xva_nb = []
    for c in range(3):
        Xtr_nb.append(Xtr.multiply(R[:,c]))
        Xva_nb.append(Xva.multiply(R[:,c]))
    Xtr_nb = hstack(Xtr_nb)
    Xva_nb = hstack(Xva_nb)

    clf = LogisticRegression(solver='lbfgs', C=1.0, max_iter=4000, tol=1e-4, random_state=42, n_jobs=1)
    clf.fit(Xtr_nb, y[tr])

    p = clf.predict_proba(Xva_nb)
    oof[va] = p
    s = log_loss(y[va], p); scores.append(s); print(f'NB-SVM Fold {f+1}: {s:.4f}')

nbsvm_oof = float(np.mean(scores))
print(f'NB-SVM OOF: {nbsvm_oof:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_nbsvm.csv', index=False)

# Fit full for test
Xfull = vec.fit_transform(train['text'])
R = []
for c in range(3):
    yb = (y==c).astype(int)
    R.append(nb_ratio(Xfull, yb, alpha=0.1))
R = np.vstack(R).T
Xfull_nb = hstack([Xfull.multiply(R[:,c]) for c in range(3)])
Xtest_nb = hstack([vec.transform(test['text']).multiply(R[:,c]) for c in range(3)])

clf_full = LogisticRegression(solver='lbfgs', C=1.0, max_iter=4000, tol=1e-4, random_state=42, n_jobs=1)
clf_full.fit(Xfull_nb, y)
test_nbsvm = clf_full.predict_proba(Xtest_nb)
pd.DataFrame(test_nbsvm, columns=classes).to_csv('test_nbsvm.csv', index=False)

NB-SVM Fold 1: 0.6337


NB-SVM Fold 2: 0.6080


NB-SVM Fold 3: 0.6855


NB-SVM Fold 4: 0.6477


NB-SVM Fold 5: 0.6429
NB-SVM OOF: 0.6436


In [36]:
import numpy as np, pandas as pd, os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import issparse

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec = CountVectorizer(analyzer='word', ngram_range=(1,2),
                      lowercase=True, min_df=2, max_df=0.98,
                      max_features=200_000, binary=True)

def log_count_ratio(X, y_bin, alpha=0.5):
    pos = np.asarray(X[y_bin==1].sum(axis=0)).ravel() + alpha
    neg = np.asarray(X[y_bin==0].sum(axis=0)).ravel() + alpha
    return np.log(pos/neg)

def normalize_ovr_probs(P, eps=1e-9):
    P = np.clip(P, eps, 1 - eps)
    odds = P / (1.0 - P)
    return odds / (odds.sum(axis=1, keepdims=True) + eps)

oof = np.zeros((len(train), 3)); scores = []
X = vec.fit_transform(train['text']); Xtest = vec.transform(test['text'])

for f,(tr,va) in enumerate(skf.split(X, y)):
    Xtr, Xva = X[tr], X[va]
    Pva = np.zeros((len(va), 3))
    for c in range(3):
        yb = (y[tr]==c).astype(int)
        r = log_count_ratio(Xtr, yb, alpha=0.5)
        clf = LogisticRegression(solver='liblinear', C=2.0, penalty='l2',
                                 max_iter=2000, tol=1e-4, random_state=42+c)
        clf.fit(Xtr.multiply(r), yb)
        Pva[:, c] = clf.predict_proba(Xva.multiply(r))[:,1]
    Pva = normalize_ovr_probs(Pva)
    oof[va] = Pva
    s = log_loss(y[va], Pva); scores.append(s); print(f'NB-SVM-LR Fold {f+1}: {s:.4f}')

sc_nb = float(np.mean(scores)); print(f'NB-SVM-LR OOF: {sc_nb:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_nbsvm_lr.csv', index=False)

# Fit full and predict test
Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    r = log_count_ratio(X, yb, alpha=0.5)
    clf = LogisticRegression(solver='liblinear', C=2.0, penalty='l2',
                             max_iter=2000, tol=1e-4, random_state=42+c)
    clf.fit(X.multiply(r), yb)
    Ptest[:, c] = clf.predict_proba(Xtest.multiply(r))[:,1]
Ptest = normalize_ovr_probs(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_nbsvm_lr.csv', index=False)

  Ground truth (correct) target values.


NB-SVM-LR Fold 1: 0.7337


  Ground truth (correct) target values.


NB-SVM-LR Fold 2: 0.6839


  Ground truth (correct) target values.


NB-SVM-LR Fold 3: 0.7686


  Ground truth (correct) target values.


NB-SVM-LR Fold 4: 0.7149


  Ground truth (correct) target values.


NB-SVM-LR Fold 5: 0.7448
NB-SVM-LR OOF: 0.7292


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import pandas as pd, numpy as np, os

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe, X, y, name):
    oof = np.zeros((len(X), 3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X, y)):
        pipe.fit(X.iloc[tr], y[tr])
        p = pipe.predict_proba(X.iloc[va]); oof[va]=p
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f+1}: {s:.4f}')
    score = float(np.mean(scores)); print(f'{name} OOF: {score:.4f}')
    return score, oof, pipe

char_lr = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(3,5),
                              lowercase=False, sublinear_tf=True,
                              min_df=1, max_df=1.0)),  # no max_features
    ('lr', LogisticRegression(solver='lbfgs', C=4.0,
                              max_iter=3000, tol=1e-3,
                              random_state=42, n_jobs=1))
])
sc_char, oof_char, char_pipe = run_cv(char_lr, train['text'], y, 'Char LR')
pd.DataFrame(oof_char, columns=le.classes_).to_csv('oof_char_lr.csv', index=False)

char_pipe.fit(train['text'], y)
test_char = char_pipe.predict_proba(test['text'])
pd.DataFrame(test_char, columns=le.classes_).to_csv('test_char_lr.csv', index=False)

Char LR Fold 1: 0.4327


Char LR Fold 2: 0.4519


Char LR Fold 3: 0.4496


Char LR Fold 4: 0.4313


Char LR Fold 5: 0.4414
Char LR OOF: 0.4414


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import pandas as pd, numpy as np, os

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run_cv(pipe, X, y, name):
    oof = np.zeros((len(X), 3)); scores=[]
    print(f'--- {name} ---')
    for f,(tr,va) in enumerate(skf.split(X, y)):
        pipe.fit(X.iloc[tr], y[tr])
        p = pipe.predict_proba(X.iloc[va]); oof[va]=p
        s = log_loss(y[va], p); scores.append(s); print(f'Fold {f+1}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'OOF: {sc:.4f}')
    return sc, oof, pipe

lr_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,3),
                              lowercase=True, sublinear_tf=True,
                              min_df=3, max_df=0.9, max_features=150_000)),
    ('lr', LogisticRegression(solver='lbfgs', C=2.0,
                              max_iter=3000, tol=1e-4, random_state=42, n_jobs=1))
])

sc_lr, oof_lr, fitted_lr_pipe = run_cv(lr_pipe, train['text'], y, 'LR_word')
pd.DataFrame(oof_lr, columns=le.classes_).to_csv('oof_lr_word.csv', index=False)

fitted_lr_pipe.fit(train['text'], y)
test_lr = fitted_lr_pipe.predict_proba(test['text'])
pd.DataFrame(test_lr, columns=le.classes_).to_csv('test_lr_word.csv', index=False)

--- LR_word ---


Fold 1: 0.5062


Fold 2: 0.5160


Fold 3: 0.5146


Fold 4: 0.5049


Fold 5: 0.5050
OOF: 0.5094


In [40]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['MKL_NUM_THREADS']='1'
os.environ['NUMEXPR_NUM_THREADS']='1'

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Manual Platt-calibrated LinearSVC
class ManualPlattSVC:
    def __init__(self, C=0.5, max_iter=4000, tol=1e-4, random_state=42):
        self.svc = LinearSVC(C=C, dual='auto', max_iter=max_iter, tol=tol, random_state=random_state)
        self.calibs = []
    def fit(self, X, y):
        self.svc.fit(X, y)
        F = self.svc.decision_function(X)
        if F.ndim == 1: F = F[:, None]
        self.n_classes_ = F.shape[1]
        self.calibs = []
        for c in range(self.n_classes_):
            lr = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000)
            lr.fit(F[:, [c]], (y==c).astype(int))
            self.calibs.append(lr)
        return self
    def predict_proba(self, X):
        F = self.svc.decision_function(X)
        if F.ndim == 1: F = F[:, None]
        P = np.zeros((F.shape[0], self.n_classes_), dtype=float)
        for c, lr in enumerate(self.calibs):
            P[:, c] = lr.predict_proba(F[:, [c]])[:, 1]
        P = np.clip(P, 1e-9, 1-1e-9)
        odds = P/(1-P)
        return odds/(odds.sum(axis=1, keepdims=True)+1e-12)

def odds_normalize(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P/(1-P)
    return odds/(odds.sum(axis=1, keepdims=True)+eps)

In [43]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Threads
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['MKL_NUM_THREADS']='1'
os.environ['NUMEXPR_NUM_THREADS']='1'

# Data
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf_outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def odds_normalize(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P/(1-P)
    return odds/(odds.sum(axis=1, keepdims=True)+eps)

# Strong, safe vectorizer and SVC settings
vec_params = dict(
    analyzer='char_wb', ngram_range=(2,5),
    lowercase=False, sublinear_tf=True,
    min_df=3, max_df=0.98, max_features=250_000
)
svc_params = dict(C=0.5, loss='squared_hinge', dual='auto', max_iter=3000, tol=1e-4, random_state=42)
inner_cv_splits = 3  # cross-fit Platt calibrators

def run_calsvc_ovr_platt(X_text, y):
    oof = np.zeros((len(X_text), 3)); scores=[]
    for f,(tr,va) in enumerate(skf_outer.split(X_text, y), 1):
        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(X_text.iloc[tr]); Xva = vec.transform(X_text.iloc[va])

        Pva = np.zeros((len(va), 3))
        for c in range(3):
            yb_tr = (y[tr]==c).astype(int)

            # Inner CV: build Platt on out-of-fold decision scores
            skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42+c)
            F_cal = []; z_cal = []
            for itr, iva in skf_inner.split(Xtr, yb_tr):
                svc = LinearSVC(**svc_params)
                svc.fit(Xtr[itr], yb_tr[itr])
                s = svc.decision_function(Xtr[iva])
                if s.ndim > 1: s = s[:,0]
                F_cal.append(s.reshape(-1,1)); z_cal.append(yb_tr[iva])
            F_cal = np.vstack(F_cal); z_cal = np.concatenate(z_cal)

            platt = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=1337)
            platt.fit(F_cal, z_cal)

            # Final binary SVC on full tr -> score outer va -> calibrate
            svc_full = LinearSVC(**svc_params)
            svc_full.fit(Xtr, yb_tr)
            s_va = svc_full.decision_function(Xva)
            if s_va.ndim > 1: s_va = s_va[:,0]
            Pva[:, c] = platt.predict_proba(s_va.reshape(-1,1))[:,1]

        Pva = odds_normalize(Pva)
        oof[va] = Pva
        fold_ll = log_loss(y[va], Pva); scores.append(fold_ll)
        print(f'CalSVC(OvR+Platt) Fold {f}: {fold_ll:.4f}')

    sc = float(np.mean(scores))
    print(f'CalSVC(OvR+Platt) OOF: {sc:.4f}')
    return sc, oof

# CV + save OOF
sc, oof = run_calsvc_ovr_platt(train['text'], y)
pd.DataFrame(oof, columns=classes).to_csv('oof_calsvc_char.csv', index=False)

# Fit full for test: cross-fit Platt on full train, then final SVCs and test preds
vec_full = TfidfVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text']); Xtest = vec_full.transform(test['text'])

Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=4242+c)
    F_cal = []; z_cal = []
    for tr_in, va_in in skf_inner.split(Xfull, yb):
        svc = LinearSVC(**svc_params)
        svc.fit(Xfull[tr_in], yb[tr_in])
        s = svc.decision_function(Xfull[va_in])
        if s.ndim > 1: s = s[:,0]
        F_cal.append(s.reshape(-1,1)); z_cal.append(yb[va_in])
    F_cal = np.vstack(F_cal); z_cal = np.concatenate(z_cal)
    platt = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=2025+c)
    platt.fit(F_cal, z_cal)

    svc_final = LinearSVC(**svc_params)
    svc_final.fit(Xfull, yb)
    s_test = svc_final.decision_function(Xtest)
    if s_test.ndim > 1: s_test = s_test[:,0]
    Ptest[:, c] = platt.predict_proba(s_test.reshape(-1,1))[:,1]

Ptest = odds_normalize(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_calsvc_char.csv', index=False)

CalSVC(OvR+Platt) Fold 1: 0.4303


CalSVC(OvR+Platt) Fold 2: 0.4316


CalSVC(OvR+Platt) Fold 3: 0.4623


CalSVC(OvR+Platt) Fold 4: 0.4453


CalSVC(OvR+Platt) Fold 5: 0.4318
CalSVC(OvR+Platt) OOF: 0.4403


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, y_bin, alpha=0.5):
    pos = np.asarray(X[y_bin==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[y_bin==0].sum(0)).ravel() + alpha
    return np.log(pos / neg)

def odds_norm(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P/(1-P)
    return odds/(odds.sum(axis=1, keepdims=True)+eps)

def run_nbsvm(X_text, y, analyzer='char_wb', ngram_range=(3,5), min_df=3, max_features=200000, C=3.0, alpha=0.5):
    oof = np.zeros((len(X_text), 3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(X_text, y), 1):
        vec = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range, lowercase=False,
                              min_df=min_df, max_features=max_features, binary=True)
        Xtr = vec.fit_transform(X_text.iloc[tr])
        Xva = vec.transform(X_text.iloc[va])
        y_tr, y_va = y[tr], y[va]
        Pva = np.zeros((len(va), 3))
        for c in range(3):
            yb = (y_tr==c).astype(int)
            r = log_count_ratio(Xtr, yb, alpha=alpha)
            clf = LogisticRegression(solver='liblinear', C=C, max_iter=2000, tol=1e-4, random_state=42+c)
            Xtr_r = Xtr.multiply(csr_matrix(r))
            Xva_r = Xva.multiply(csr_matrix(r))
            clf.fit(Xtr_r, yb)
            Pva[:, c] = clf.predict_proba(Xva_r)[:, 1]
        Pva = odds_norm(Pva)
        Pva = Pva / Pva.sum(axis=1, keepdims=True)  # Force exact sum to 1
        oof[va] = Pva
        s = log_loss(y_va, Pva); scores.append(s); print(f'NB-SVM Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'NB-SVM OOF: {sc:.4f}')
    # Full fit
    vec_full = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range, lowercase=False,
                               min_df=min_df, max_features=max_features, binary=True)
    Xfull = vec_full.fit_transform(X_text)
    Xtest = vec_full.transform(test['text'])
    Ptest = np.zeros((len(test), 3))
    for c in range(3):
        yb = (y==c).astype(int)
        r = log_count_ratio(Xfull, yb, alpha=alpha)
        clf = LogisticRegression(solver='liblinear', C=C, max_iter=2000, tol=1e-4, random_state=42+c)
        clf.fit(Xfull.multiply(csr_matrix(r)), yb)
        Ptest[:, c] = clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:, 1]
    Ptest = odds_norm(Ptest)
    Ptest = Ptest / Ptest.sum(axis=1, keepdims=True)  # Force exact sum to 1
    pd.DataFrame(oof, columns=classes).to_csv('oof_nbsvm_charwb.csv', index=False)
    pd.DataFrame(Ptest, columns=classes).to_csv('test_nbsvm_charwb.csv', index=False)
    return sc

# Run NB-SVM
sc_nbsvm = run_nbsvm(train['text'], y, analyzer='char_wb', ngram_range=(3,5), min_df=3, max_features=200000, C=3.0, alpha=0.5)

NB-SVM Fold 1: 0.8142


NB-SVM Fold 2: 0.7629


NB-SVM Fold 3: 0.8488


NB-SVM Fold 4: 0.8000


NB-SVM Fold 5: 0.8269
NB-SVM OOF: 0.8106


In [47]:
import os, numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

os.environ['OPENBLAS_NUM_THREADS']='1'; os.environ['MKL_NUM_THREADS']='1'; os.environ['NUMEXPR_NUM_THREADS']='1'

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, y_bin, alpha=1.0):
    pos = np.asarray(X[y_bin==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[y_bin==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg)
    r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P/(1-P)
    return odds/(odds.sum(axis=1, keepdims=True)+eps)

# Robust TF-IDF setup for NB-SVM
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,5), lowercase=False,
                      min_df=1, max_df=0.99, max_features=200_000, sublinear_tf=True)
X_all = vec.fit_transform(train['text']); X_test = vec.transform(test['text'])

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(X_all, y), 1):
    Xtr, Xva = X_all[tr], X_all[va]
    Pva = np.zeros((len(va), 3))
    for c in range(3):
        yb = (y[tr]==c).astype(int)
        r = log_count_ratio(Xtr, yb, alpha=1.0)
        clf = LogisticRegression(solver='liblinear', C=1.5, penalty='l2',
                                 max_iter=2000, tol=1e-4, random_state=42+c)
        clf.fit(Xtr.multiply(csr_matrix(r)), yb)
        Pva[:, c] = clf.predict_proba(Xva.multiply(csr_matrix(r)))[:, 1]
    Pva = odds_norm(Pva)  # single, correct normalization
    oof[va] = Pva
    s = log_loss(y[va], Pva); scores.append(s); print(f'NB-SVM Fold {f}: {s:.4f}')
sc = float(np.mean(scores)); print(f'NB-SVM OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_nbsvm_fixed.csv', index=False)

# Full fit for test
Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    r = log_count_ratio(X_all, yb, alpha=1.0)
    clf = LogisticRegression(solver='liblinear', C=1.5, penalty='l2',
                             max_iter=2000, tol=1e-4, random_state=99+c)
    clf.fit(X_all.multiply(csr_matrix(r)), yb)
    Ptest[:, c] = clf.predict_proba(X_test.multiply(csr_matrix(r)))[:, 1]
Ptest = odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_nbsvm_fixed.csv', index=False)

NB-SVM Fold 1: 0.4757


NB-SVM Fold 2: 0.4817


NB-SVM Fold 3: 0.5009


NB-SVM Fold 4: 0.4813


NB-SVM Fold 5: 0.4701
NB-SVM OOF: 0.4819


In [63]:
import os, numpy as np, pandas as pd, itertools
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)

# Ensure test_word.csv exists (refit if needed)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
vec_w = TfidfVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True, sublinear_tf=True, min_df=3, max_df=0.95, dtype=np.float32)
Xw = vec_w.fit_transform(train['text']); Xw_te = vec_w.transform(test['text'])
lr_w = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-3, n_jobs=1, random_state=42).fit(Xw, y)
test_word = lr_w.predict_proba(Xw_te)
pd.DataFrame(test_word, columns=classes).to_csv('test_word.csv', index=False)

# Load best OOFs and tests
oof1 = pd.read_csv('oof_10fold_uncal_char_wb_lr.csv')[classes].values  # 10-fold char_wb 0.4082
test1 = pd.read_csv('test_10fold_uncal_char_wb_lr.csv')[classes].values
oof2 = pd.read_csv('oof_char_wb_lr.csv')[classes].values  # 0.4361
test2 = pd.read_csv('test_char_wb_lr.csv')[classes].values if os.path.exists('test_char_wb_lr.csv') else pd.read_csv('test_char_wb_tweaked.csv')[classes].values
oof3 = pd.read_csv('oof_calsvc_char.csv')[classes].values  # 0.4403
test3 = pd.read_csv('test_calsvc_char.csv')[classes].values
oof4 = pd.read_csv('oof_word.csv')[classes].values  # 0.4602
test4 = pd.read_csv('test_word.csv')[classes].values
oof5 = pd.read_csv('oof_10fold_char_lr.csv')[classes].values  # 10-fold char 0.4284
test5 = pd.read_csv('test_10fold_char_lr.csv')[classes].values

oof_list = [oof1, oof2, oof3, oof4, oof5]
test_list = [test1, test2, test3, test4, test5]
labels = ['10fold_Uncal_Char_wb', 'Char_wb_LR', 'CalSVC', 'Word_LR', '10fold_Char_LR']

# 5-way grid search weights summing to 1
best_score = 1e9; best_w = None
for w in itertools.product(np.arange(0, 1.05, 0.05), repeat=5):
    if abs(sum(w) - 1.0) > 1e-9: continue
    blend_oof = sum(ww * oof for ww, oof in zip(w, oof_list))
    s = log_loss(y, blend_oof)
    if s < best_score:
        best_score, best_w = s, w

print(f'Best 5-way weights: {{dict(zip(labels, best_w))}}  Blended OOF: {best_score:.4f}')

if best_score <= 0.34:
    blend_test = sum(ww * tst for ww, tst in zip(best_w, test_list))
    blend_test = np.clip(blend_test, 1e-15, 1-1e-15)
    blend_test /= blend_test.sum(axis=1, keepdims=True)
    sub = pd.read_csv('sample_submission.csv')
    sub[classes] = blend_test
    sub.to_csv('submission.csv', index=False)
    print('Medal-ready submission.csv saved! Ready to submit_final_answer.')
else:
    print('Blended OOF >0.34; need further improvement (e.g., stylometrics or 10-fold).')

Best 4-way weights: {dict(zip(labels, best_w))}  Blended OOF: 0.3783
Blended OOF >0.34; need further improvement (e.g., stylometrics or 10-fold).


In [54]:
# Cell 19 — Fixed Calibrated Char_wb LR (true OvR binary base + leak-free inner-CV sigmoid calibration)
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def odds_norm(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P/(1-P)
    return odds/(odds.sum(axis=1, keepdims=True)+eps)

vec_params = dict(analyzer='char_wb', ngram_range=(2,5), lowercase=False,
                  sublinear_tf=True, min_df=2, max_df=0.98, max_features=250_000)

# Calibration: 'sigmoid' (Platt)
calibration = 'sigmoid'
inner_cv_splits = 3

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = TfidfVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])

    Pva = np.zeros((len(va), 3))
    for c in range(3):
        yb_tr = (y[tr]==c).astype(int)

        # Inner CV for leak-free calibration data
        skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42 + c*10)
        F_cal = []
        z_cal = []
        for i_tr, i_va in skf_inner.split(Xtr, yb_tr):
            base_inner = LogisticRegression(solver='liblinear', penalty='l2', C=2.0, max_iter=2000, tol=1e-4, random_state=42 + c)
            base_inner.fit(Xtr[i_tr], yb_tr[i_tr])
            s_inner = base_inner.decision_function(Xtr[i_va])
            F_cal.append(s_inner.reshape(-1, 1))
            z_cal.append(yb_tr[i_va])
        F_cal = np.vstack(F_cal)
        z_cal = np.concatenate(z_cal)

        # Fit calibrator on inner OOF margins
        calib = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=2025 + c).fit(F_cal, z_cal)

        # Final base on full outer tr
        base = LogisticRegression(solver='liblinear', penalty='l2', C=2.0, max_iter=2000, tol=1e-4, random_state=42 + c)
        base.fit(Xtr, yb_tr)
        s_va = base.decision_function(Xva).reshape(-1, 1)

        Pva[:, c] = calib.predict_proba(s_va)[:, 1]

    Pva = odds_norm(Pva)
    oof[va] = Pva
    s = log_loss(y[va], Pva); scores.append(s)
    print(f'Fixed Cal-Char_wb-LR Fold {f}: {s:.4f}')

sc = float(np.mean(scores)); print(f'Fixed Cal-Char_wb-LR OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_cal_lr_char_wb_fixed.csv', index=False)

# Full fit for test
vec_full = TfidfVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text'])
Xtest = vec_full.transform(test['text'])
Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    # Inner CV on full train
    skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42 + c*10)
    F_cal = []
    z_cal = []
    indices = np.arange(len(train))
    for i_tr, i_va in skf_inner.split(indices, yb):
        base_inner = LogisticRegression(solver='liblinear', penalty='l2', C=2.0, max_iter=2000, tol=1e-4, random_state=42 + c)
        base_inner.fit(Xfull[i_tr], yb[i_tr])
        s_inner = base_inner.decision_function(Xfull[i_va])
        F_cal.append(s_inner.reshape(-1, 1))
        z_cal.append(yb[i_va])
    F_cal = np.vstack(F_cal)
    z_cal = np.concatenate(z_cal)

    # Fit calibrator
    calib = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=2025 + c).fit(F_cal, z_cal)

    # Final base on full
    base = LogisticRegression(solver='liblinear', penalty='l2', C=2.0, max_iter=2000, tol=1e-4, random_state=42 + c)
    base.fit(Xfull, yb)
    s_te = base.decision_function(Xtest).reshape(-1, 1)

    Ptest[:, c] = calib.predict_proba(s_te)[:, 1]

Ptest = odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_cal_lr_char_wb_fixed.csv', index=False)

Fixed Cal-Char_wb-LR Fold 1: 0.4673


Fixed Cal-Char_wb-LR Fold 2: 0.4650


Fixed Cal-Char_wb-LR Fold 3: 0.4981


Fixed Cal-Char_wb-LR Fold 4: 0.4785


Fixed Cal-Char_wb-LR Fold 5: 0.4625
Fixed Cal-Char_wb-LR OOF: 0.4743


In [56]:
# Cell 20 — Word NB-SVM (strong settings, enforced normalization)
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, y_bin, alpha=1.0):
    pos = np.asarray(X[y_bin==1].sum(axis=0)).ravel() + alpha
    neg = np.asarray(X[y_bin==0].sum(axis=0)).ravel() + alpha
    r = np.log(pos / neg)
    r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-12):
    P = np.clip(P, eps, 1-eps)
    odds = P / (1 - P)
    Pn = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Pn / Pn.sum(axis=1, keepdims=True)  # force exact sum=1

vec_params = dict(analyzer='word', ngram_range=(1,3), lowercase=True,
                  min_df=1, max_df=0.99, binary=False, max_features=None)

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = CountVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])

    Pva = np.zeros((len(va), 3))
    for c in range(3):
        yb_tr = (y[tr]==c).astype(int)
        r = log_count_ratio(Xtr, yb_tr, alpha=1.0)
        Xtr_r = Xtr.multiply(csr_matrix(r)); Xva_r = Xva.multiply(csr_matrix(r))
        clf = LogisticRegression(solver='liblinear', penalty='l2',
                                 C=8.0, max_iter=3000, tol=1e-4, random_state=4242+c)
        clf.fit(Xtr_r, yb_tr)
        Pva[:, c] = clf.predict_proba(Xva_r)[:, 1]

    Pva = odds_norm(Pva)
    oof[va] = Pva
    s = log_loss(y[va], Pva); scores.append(s)
    print(f'Word NB-SVM Fold {f}: {s:.4f}')

sc = float(np.mean(scores)); print(f'Word NB-SVM OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm_fixed.csv', index=False)

# Full fit for test
vec_full = CountVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text'])
Xtest = vec_full.transform(test['text'])
Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    r = log_count_ratio(Xfull, yb, alpha=1.0)
    clf = LogisticRegression(solver='liblinear', penalty='l2',
                             C=8.0, max_iter=3000, tol=1e-4, random_state=999+c)
    clf.fit(Xfull.multiply(csr_matrix(r)), yb)
    Ptest[:, c] = clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:, 1]

Ptest = odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm_fixed.csv', index=False)

Word NB-SVM Fold 1: 0.5732


Word NB-SVM Fold 2: 0.5448


Word NB-SVM Fold 3: 0.5853


Word NB-SVM Fold 4: 0.5419


Word NB-SVM Fold 5: 0.5700
Word NB-SVM OOF: 0.5630


In [57]:
# Cell 21 — Uncalibrated Char_wb LR (wide ngrams, low min_df, large vocab, no calibration)
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                  sublinear_tf=True, min_df=1, max_df=0.98, max_features=400_000)

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = TfidfVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])

    clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4,
                             random_state=42, n_jobs=1)
    clf.fit(Xtr, y[tr])
    p = clf.predict_proba(Xva)
    oof[va] = p
    s = log_loss(y[va], p); scores.append(s)
    print(f'Uncal Char_wb LR Fold {f}: {s:.4f}')

sc = float(np.mean(scores)); print(f'Uncal Char_wb LR OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_uncal_char_wb_lr.csv', index=False)

# Full fit for test
vec_full = TfidfVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text'])
Xtest = vec_full.transform(test['text'])
clf_full = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4,
                              random_state=42, n_jobs=1)
clf_full.fit(Xfull, y)
ptest = clf_full.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_uncal_char_wb_lr.csv', index=False)

Uncal Char_wb LR Fold 1: 0.4125


Uncal Char_wb LR Fold 2: 0.4132


Uncal Char_wb LR Fold 3: 0.4296


Uncal Char_wb LR Fold 4: 0.4183


Uncal Char_wb LR Fold 5: 0.4132
Uncal Char_wb LR OOF: 0.4173


In [60]:
# Cell 22 — Word TF-IDF + Stylometrics LR (manual stacking to avoid sklearn version issues)
import numpy as np, pandas as pd, re, string
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)

def compute_stylo(series):
    def feats(t):
        L = len(t); 
        if L == 0: return [0,0,0,0,0,0,0,0]
        punct = sum(c in string.punctuation for c in t)
        exclam = t.count('!'); semi = t.count(';')
        digits = sum(c.isdigit() for c in t)
        letters = sum(c.isalpha() for c in t)
        words = t.split(); wc = len(words)
        avg_wlen = (sum(len(w) for w in words)/wc) if wc else 0.0
        sents = [s for s in re.split(r'[.!?]+', t) if s.strip()]
        sc = len(sents); avg_sent_wc = (wc/sc) if sc else wc
        cap_ratio = (sum(c.isupper() for c in t)/letters) if letters else 0.0
        return [punct/L, exclam/L, semi/L, digits/L, cap_ratio, avg_wlen, avg_sent_wc, wc]
    X = [feats(t) for t in series]
    return np.array(X)

train_sty = compute_stylo(train['text']); test_sty = compute_stylo(test['text'])
scaler = MaxAbsScaler().fit(train_sty)
train_sty_scaled = scaler.transform(train_sty)
test_sty_scaled = scaler.transform(test_sty)

tfidf_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True,
                    sublinear_tf=True, min_df=2, max_df=0.95, max_features=200_000)
clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=42)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = TfidfVectorizer(**tfidf_params)
    Xtr_tfidf = vec.fit_transform(train['text'].iloc[tr])
    Xva_tfidf = vec.transform(train['text'].iloc[va])
    Xtr_sty = csr_matrix(train_sty_scaled[tr])
    Xva_sty = csr_matrix(train_sty_scaled[va])
    Xtr = hstack([Xtr_tfidf, Xtr_sty])
    Xva = hstack([Xva_tfidf, Xva_sty])
    clf.fit(Xtr, y[tr])
    p = clf.predict_proba(Xva)
    dfp = pd.DataFrame(p, columns=clf.classes_).reindex(columns=classes).fillna(0.0).values
    dfp = np.clip(dfp, 1e-15, 1-1e-15); dfp /= dfp.sum(axis=1, keepdims=True)
    oof[va] = dfp
    s = log_loss(y[va], dfp); scores.append(s); print(f'Word+Stylo LR Fold {f}: {s:.4f}')
print('Word+Stylo LR OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_stylo_word_lr.csv', index=False)

# Full fit
vec_full = TfidfVectorizer(**tfidf_params)
Xfull_tfidf = vec_full.fit_transform(train['text'])
Xtest_tfidf = vec_full.transform(test['text'])
Xfull_sty = csr_matrix(train_sty_scaled)
Xtest_sty = csr_matrix(test_sty_scaled)
Xfull = hstack([Xfull_tfidf, Xfull_sty])
Xtest = hstack([Xtest_tfidf, Xtest_sty])
clf.fit(Xfull, y)
ptest = clf.predict_proba(Xtest)
dfp = pd.DataFrame(ptest, columns=clf.classes_).reindex(columns=classes).fillna(0.0).values
dfp = np.clip(dfp, 1e-15, 1-1e-15); dfp /= dfp.sum(axis=1, keepdims=True)
pd.DataFrame(dfp, columns=classes).to_csv('test_stylo_word_lr.csv', index=False)

Word+Stylo LR Fold 1: 1.0986


Word+Stylo LR Fold 2: 1.0986


Word+Stylo LR Fold 3: 1.0986


Word+Stylo LR Fold 4: 1.0986


Word+Stylo LR Fold 5: 1.0986
Word+Stylo LR OOF: 1.0986122886681096


In [61]:
# Cell 23 — Strong Word NB-SVM (fix the broken one)
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=1.0):
    pos = np.asarray(X[yb==1].sum(axis=0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(axis=0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-12):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Pn = odds/(odds.sum(axis=1, keepdims=True)+eps)
    return Pn / Pn.sum(axis=1, keepdims=True)

vec_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_df=0.9, binary=True)
clf_params = dict(solver='liblinear', penalty='l2', C=4.0, max_iter=3000, tol=1e-4)

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = CountVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr]); Xva = vec.transform(train['text'].iloc[va])
    Pva = np.zeros((len(va), 3))
    for c in range(3):
        yb = (y[tr]==c).astype(int)
        r = log_count_ratio(Xtr, yb, alpha=1.0)
        clf = LogisticRegression(**clf_params, random_state=42+c)
        clf.fit(Xtr.multiply(csr_matrix(r)), yb)
        Pva[:, c] = clf.predict_proba(Xva.multiply(csr_matrix(r)))[:, 1]
    Pva = odds_norm(Pva); oof[va] = Pva
    s = log_loss(y[va], Pva); scores.append(s); print(f'Word NB-SVM Fold {f}: {s:.4f}')
print('Word NB-SVM OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm.csv', index=False)

vec_full = CountVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text']); Xtest = vec_full.transform(test['text'])
Ptest = np.zeros((len(test), 3))
for c in range(3):
    yb = (y==c).astype(int)
    r = log_count_ratio(Xfull, yb, alpha=1.0)
    clf = LogisticRegression(**clf_params, random_state=999+c)
    clf.fit(Xfull.multiply(csr_matrix(r)), yb)
    Ptest[:, c] = clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:, 1]
Ptest = odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm.csv', index=False)

Word NB-SVM Fold 1: 0.5832


Word NB-SVM Fold 2: 0.5586


Word NB-SVM Fold 3: 0.5976


Word NB-SVM Fold 4: 0.5428


In [62]:
# Cell 24 — 10-fold Uncalibrated Char_wb LR (for stability, wide ngrams, low min_df)
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                  sublinear_tf=True, min_df=1, max_df=0.98, max_features=400_000)

oof = np.zeros((len(train), 3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y), 1):
    vec = TfidfVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])

    clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4,
                             random_state=42, n_jobs=1)
    clf.fit(Xtr, y[tr])
    p = clf.predict_proba(Xva)
    oof[va] = p
    s = log_loss(y[va], p); scores.append(s)
    print(f'10-fold Uncal Char_wb LR Fold {f}: {s:.4f}')

sc = float(np.mean(scores)); print(f'10-fold Uncal Char_wb LR OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_10fold_uncal_char_wb_lr.csv', index=False)

# Full fit for test (average 10-fold predictions for bagging)
skf_test = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
Ptest = np.zeros((len(test), 3))
for f,(tr,va) in enumerate(skf_test.split(train['text'], y), 1):
    vec = TfidfVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xtest_f = vec.transform(test['text'])
    clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4,
                             random_state=42 + f, n_jobs=1)
    clf.fit(Xtr, y[tr])
    Ptest += clf.predict_proba(Xtest_f)
Ptest /= 10.0  # average bagged predictions
Ptest = np.clip(Ptest, 1e-15, 1-1e-15)
Ptest /= Ptest.sum(axis=1, keepdims=True)
pd.DataFrame(Ptest, columns=classes).to_csv('test_10fold_uncal_char_wb_lr.csv', index=False)

10-fold Uncal Char_wb LR Fold 1: 0.4062


10-fold Uncal Char_wb LR Fold 2: 0.4029


10-fold Uncal Char_wb LR Fold 3: 0.3967


10-fold Uncal Char_wb LR Fold 4: 0.4097


10-fold Uncal Char_wb LR Fold 5: 0.4305


10-fold Uncal Char_wb LR Fold 6: 0.4137


10-fold Uncal Char_wb LR Fold 7: 0.4188


10-fold Uncal Char_wb LR Fold 8: 0.4012


10-fold Uncal Char_wb LR Fold 9: 0.4058


10-fold Uncal Char_wb LR Fold 10: 0.3968
10-fold Uncal Char_wb LR OOF: 0.4082


In [65]:
# Cell 25 — Generate test predictions for 10-fold Char LR (full fit only, CV already done)
import numpy as np, pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)

vec_params = dict(analyzer='char', ngram_range=(3,7), lowercase=False,
                  sublinear_tf=True, min_df=1, max_df=0.98, max_features=400_000)

# Full fit for test (single fit to save time)
vec_full = TfidfVectorizer(**vec_params)
Xfull = vec_full.fit_transform(train['text'])
Xtest = vec_full.transform(test['text'])
clf_full = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4,
                              random_state=42, n_jobs=1)
clf_full.fit(Xfull, y)
ptest = clf_full.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_10fold_char_lr.csv', index=False)
print('Test predictions saved to test_10fold_char_lr.csv')

10-fold Char LR Fold 1: 0.4327


10-fold Char LR Fold 2: 0.4260


10-fold Char LR Fold 3: 0.4240


10-fold Char LR Fold 4: 0.4299


10-fold Char LR Fold 5: 0.4460


KeyboardInterrupt: 

In [67]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=0.5):
    pos=np.asarray(X[yb==1].sum(0)).ravel()+alpha
    neg=np.asarray(X[yb==0].sum(0)).ravel()+alpha
    r=np.log(pos/neg); r[~np.isfinite(r)]=0.0
    return r

def odds_norm(P, eps=1e-12):
    P=np.clip(P,eps,1-eps); odds=P/(1-P)
    Pn=odds/(odds.sum(axis=1, keepdims=True)+eps)
    return Pn / Pn.sum(axis=1, keepdims=True)

vec_params=dict(analyzer='word', ngram_range=(1,3), lowercase=True, min_df=3, max_df=0.95, binary=True, max_features=200000)
clf_params=dict(solver='liblinear', penalty='l2', C=6.0, max_iter=3000, tol=1e-4)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=CountVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    Pva=np.zeros((len(va),3))
    for c in range(3):
        yb=(y[tr]==c).astype(int)
        r=log_count_ratio(Xtr, yb, alpha=0.5)
        clf=LogisticRegression(**clf_params, random_state=42+c)
        clf.fit(Xtr.multiply(csr_matrix(r)), yb)
        Pva[:,c]=clf.predict_proba(Xva.multiply(csr_matrix(r)))[:,1]
    Pva=odds_norm(Pva); oof[va]=Pva
    s=log_loss(y[va], Pva); scores.append(s); print(f'Word NB-SVM Fold {f}: {s:.4f}')
print('Word NB-SVM OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm.csv', index=False)

vec_full=CountVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    r=log_count_ratio(Xfull, yb, alpha=0.5)
    clf=LogisticRegression(**clf_params, random_state=999+c)
    clf.fit(Xfull.multiply(csr_matrix(r)), yb)
    Ptest[:,c]=clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:,1]
Ptest=odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm.csv', index=False)

Word NB-SVM Fold 1: 0.6271


Word NB-SVM Fold 2: 0.5958


Word NB-SVM Fold 3: 0.6400


Word NB-SVM Fold 4: 0.5812


Word NB-SVM Fold 5: 0.6242
Word NB-SVM OOF: 0.613671142726013


In [68]:
import numpy as np, pandas as pd, re, string
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MaxAbsScaler
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def stylo(series):
    rows=[]
    for t in series:
        L=len(t); words=t.split(); wc=len(words)
        letters=sum(c.isalpha() for c in t) or 1
        sents=[s for s in re.split(r'[.!?]+', t) if s.strip()]; sc=len(sents) or 1
        rows.append([
            (sum(c in string.punctuation for c in t)/L) if L else 0.0,
            t.count('!')/L if L else 0.0,
            t.count(';')/L if L else 0.0,
            (sum(c.isdigit() for c in t)/L) if L else 0.0,
            (sum(c.isupper() for c in t)/letters),
            (sum(len(w) for w in words)/wc) if wc else 0.0,
            (wc/sc) if sc else 0.0,
            wc
        ])
    return np.array(rows, dtype=float)

train_sty=stylo(train['text']); test_sty=stylo(test['text'])

tfidf_params=dict(analyzer='word', ngram_range=(1,2), lowercase=True, sublinear_tf=True,
                  min_df=2, max_df=0.95, max_features=200_000)
clf=LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=42)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=TfidfVectorizer(**tfidf_params)
    Xtr_tfidf=vec.fit_transform(train['text'].iloc[tr]); Xva_tfidf=vec.transform(train['text'].iloc[va])
    scaler=MaxAbsScaler().fit(train_sty[tr])
    Xtr_sty=csr_matrix(scaler.transform(train_sty[tr])); Xva_sty=csr_matrix(scaler.transform(train_sty[va]))
    Xtr=hstack([Xtr_tfidf, Xtr_sty]); Xva=hstack([Xva_tfidf, Xva_sty])
    clf.fit(Xtr, y[tr])
    p=clf.predict_proba(Xva)             # already aligned with y’s label order
    oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Stylo+Word LR Fold {f}: {s:.4f}')
print('Stylo+Word LR OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_stylo_word_lr.csv', index=False)

vec_full=TfidfVectorizer(**tfidf_params)
Xfull_tfidf=vec_full.fit_transform(train['text']); Xtest_tfidf=vec_full.transform(test['text'])
scaler_full=MaxAbsScaler().fit(train_sty)
Xfull_sty=csr_matrix(scaler_full.transform(train_sty)); Xtest_sty=csr_matrix(scaler_full.transform(test_sty))
Xfull=hstack([Xfull_tfidf, Xfull_sty]); Xtest=hstack([Xtest_tfidf, Xtest_sty])
clf.fit(Xfull, y); ptest=clf.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_stylo_word_lr.csv', index=False)

Stylo+Word LR Fold 1: 0.4600


Stylo+Word LR Fold 2: 0.4684


Stylo+Word LR Fold 3: 0.4766


Stylo+Word LR Fold 4: 0.4528


Stylo+Word LR Fold 5: 0.4541
Stylo+Word LR OOF: 0.46236992048108905


In [69]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_files=[
    ('oof_10fold_uncal_char_wb_lr.csv','test_10fold_uncal_char_wb_lr.csv'),
    ('oof_char_lr.csv','test_char_lr.csv'),
    ('oof_calsvc_char.csv','test_calsvc_char.csv'),
    ('oof_word_nbsvm.csv','test_word_nbsvm.csv'),
    ('oof_stylo_word_lr.csv','test_stylo_word_lr.csv')
]

meta_train=np.hstack([pd.read_csv(o)[classes].values for o,_ in model_files])
meta_test=np.hstack([pd.read_csv(t)[classes].values for _,t in model_files])

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
    meta=LogisticRegression(solver='lbfgs', C=1.5, max_iter=2000, random_state=2025)
    meta.fit(meta_train[tr], y[tr])
    p=meta.predict_proba(meta_train[va]); oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Meta-LR Fold {f}: {s:.4f}')
meta_sc=float(np.mean(scores)); print('Meta-LR OOF:', meta_sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_stacked_meta_lr.csv', index=False)

meta.fit(meta_train, y)
ptest=meta.predict_proba(meta_test)
ptest=np.clip(ptest,1e-15,1-1e-15); ptest/=ptest.sum(axis=1, keepdims=True)
pd.DataFrame(ptest, columns=classes).to_csv('test_stacked_meta_lr.csv', index=False)

Meta-LR Fold 1: 0.3551


Meta-LR Fold 2: 0.3549


Meta-LR Fold 3: 0.3712


Meta-LR Fold 4: 0.3511


Meta-LR Fold 5: 0.3561
Meta-LR OOF: 0.35768948685097157


In [71]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fallback: Strong Word TF-IDF LR (since NB-SVM configs failing, use this for diversity ~0.45 OOF)
word_lr_pipe = TfidfVectorizer(analyzer='word', ngram_range=(1,3), lowercase=True, sublinear_tf=True, min_df=2, max_df=0.95, max_features=200000)
X_word = word_lr_pipe.fit_transform(train['text']); X_word_test = word_lr_pipe.transform(test['text'])
lr = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)
oof_word_lr = np.zeros((len(train),3)); scores_word=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    lr.fit(X_word[tr], y[tr])
    p = lr.predict_proba(X_word[va])
    oof_word_lr[va] = p
    s = log_loss(y[va], p); scores_word.append(s); print(f'Word TF-IDF LR Fold {f}: {s:.4f}')
sc_word = float(np.mean(scores_word)); print('Word TF-IDF LR OOF:', sc_word)
pd.DataFrame(oof_word_lr, columns=classes).to_csv('oof_word_lr_strong.csv', index=False)
lr.fit(X_word, y)
ptest_word = lr.predict_proba(X_word_test)
pd.DataFrame(ptest_word, columns=classes).to_csv('test_word_lr_strong.csv', index=False)

print('Fallback Word LR OOF:', sc_word)

Word TF-IDF LR Fold 1: 0.4859


Word TF-IDF LR Fold 2: 0.4912


Word TF-IDF LR Fold 3: 0.4947


Word TF-IDF LR Fold 4: 0.4811


Word TF-IDF LR Fold 5: 0.4850
Word TF-IDF LR OOF: 0.48757614282367695


Fallback Word LR OOF: 0.48757614282367695


In [72]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_files=[
    ('oof_10fold_uncal_char_wb_lr.csv','test_10fold_uncal_char_wb_lr.csv'),
    ('oof_char_lr.csv','test_char_lr.csv'),
    ('oof_calsvc_char.csv','test_calsvc_char.csv'),
    ('oof_stylo_word_lr.csv','test_stylo_word_lr.csv'),
    ('oof_word_lr_strong.csv','test_word_lr_strong.csv')
]

meta_train=np.hstack([pd.read_csv(o)[classes].values for o,_ in model_files])
meta_test=np.hstack([pd.read_csv(t)[classes].values for _,t in model_files])

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
    meta=LogisticRegression(solver='lbfgs', C=1.5, max_iter=2000, random_state=2025)
    meta.fit(meta_train[tr], y[tr])
    p=meta.predict_proba(meta_train[va]); oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Meta-LR Fold {f}: {s:.4f}')
meta_sc=float(np.mean(scores)); print('Meta-LR OOF:', meta_sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_stacked_meta_lr.csv', index=False)

meta.fit(meta_train, y)
ptest=meta.predict_proba(meta_test)
ptest=np.clip(ptest,1e-15,1-1e-15); ptest/=ptest.sum(axis=1, keepdims=True)
pd.DataFrame(ptest, columns=classes).to_csv('test_stacked_meta_lr.csv', index=False)

Meta-LR Fold 1: 0.3583


Meta-LR Fold 2: 0.3639


Meta-LR Fold 3: 0.3754


Meta-LR Fold 4: 0.3564


Meta-LR Fold 5: 0.3613
Meta-LR OOF: 0.3630690794929753


In [73]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec_params=dict(analyzer='char', ngram_range=(3,7), lowercase=False,
                sublinear_tf=True, min_df=2, max_df=0.98, max_features=300_000)
clf=LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=TfidfVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    clf.fit(Xtr, y[tr])
    p=clf.predict_proba(Xva)
    oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Char Variant LR Fold {f}: {s:.4f}')
sc=float(np.mean(scores)); print('Char Variant LR OOF:', sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_char_variant.csv', index=False)

# Full fit for test
vec_full=TfidfVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
clf.fit(Xfull, y)
ptest=clf.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_char_variant.csv', index=False)

Char Variant LR Fold 1: 0.4389


Char Variant LR Fold 2: 0.4409


Char Variant LR Fold 3: 0.4489


Char Variant LR Fold 4: 0.4400


Char Variant LR Fold 5: 0.4278
Char Variant LR OOF: 0.43930143225025275


In [74]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_files=[
    ('oof_10fold_uncal_char_wb_lr.csv','test_10fold_uncal_char_wb_lr.csv'),
    ('oof_char_lr.csv','test_char_lr.csv'),
    ('oof_calsvc_char.csv','test_calsvc_char.csv'),
    ('oof_stylo_word_lr.csv','test_stylo_word_lr.csv'),
    ('oof_word_lr_strong.csv','test_word_lr_strong.csv'),
    ('oof_char_variant.csv','test_char_variant.csv')
]

meta_train=np.hstack([pd.read_csv(o)[classes].values for o,_ in model_files])
meta_test=np.hstack([pd.read_csv(t)[classes].values for _,t in model_files])

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
    meta=LogisticRegression(solver='lbfgs', C=1.5, max_iter=2000, random_state=2025)
    meta.fit(meta_train[tr], y[tr])
    p=meta.predict_proba(meta_train[va]); oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Meta-LR Fold {f}: {s:.4f}')
meta_sc=float(np.mean(scores)); print('Meta-LR OOF:', meta_sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_stacked_meta_lr.csv', index=False)

meta.fit(meta_train, y)
ptest=meta.predict_proba(meta_test)
ptest=np.clip(ptest,1e-15,1-1e-15); ptest/=ptest.sum(axis=1, keepdims=True)
pd.DataFrame(ptest, columns=classes).to_csv('test_stacked_meta_lr.csv', index=False)

Meta-LR Fold 1: 0.3565


Meta-LR Fold 2: 0.3595
Meta-LR Fold 3: 0.3732


Meta-LR Fold 4: 0.3561


Meta-LR Fold 5: 0.3574
Meta-LR OOF: 0.36055009858050163


In [75]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=1.0):
    pos=np.asarray(X[yb==1].sum(0)).ravel()+alpha
    neg=np.asarray(X[yb==0].sum(0)).ravel()+alpha
    r=np.log(pos/neg); r[~np.isfinite(r)]=0.0
    return r

def odds_norm(P, eps=1e-12):
    P=np.clip(P,eps,1-eps); odds=P/(1-P)
    Pn=odds/(odds.sum(axis=1, keepdims=True)+eps)
    return Pn / Pn.sum(axis=1, keepdims=True)

vec_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, min_df=1, max_df=0.99, binary=False, max_features=None)
clf_params=dict(solver='liblinear', penalty='l2', C=6.0, max_iter=3000, tol=1e-4)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=CountVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    Pva=np.zeros((len(va),3))
    for c in range(3):
        yb=(y[tr]==c).astype(int)
        r=log_count_ratio(Xtr, yb, alpha=1.0)
        clf=LogisticRegression(**clf_params, random_state=42+c)
        clf.fit(Xtr.multiply(csr_matrix(r)), yb)
        Pva[:,c]=clf.predict_proba(Xva.multiply(csr_matrix(r)))[:,1]
    Pva=odds_norm(Pva); oof[va]=Pva
    s=log_loss(y[va], Pva); scores.append(s); print(f'Char_wb NB-SVM Fold {f}: {s:.4f}')
print('Char_wb NB-SVM OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_char_wb_nbsvm.csv', index=False)

# Full fit -> test
vec_full=CountVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    r=log_count_ratio(Xfull, yb, alpha=1.0)
    clf=LogisticRegression(**clf_params, random_state=999+c)
    clf.fit(Xfull.multiply(csr_matrix(r)), yb)
    Ptest[:,c]=clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:,1]
Ptest=odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_char_wb_nbsvm.csv', index=False)

Char_wb NB-SVM Fold 1: 0.9719


Char_wb NB-SVM Fold 2: 0.9026


Char_wb NB-SVM Fold 3: 1.0630


Char_wb NB-SVM Fold 4: 0.9351


Char_wb NB-SVM Fold 5: 0.9420
Char_wb NB-SVM OOF: 0.9629195888044508


In [76]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)

# Seed bagging for top models: 3 seeds on full data, average test preds
n_seeds = 3
seeds = [42, 123, 2025]

# 1. Bag 10fold char_wb LR (your best: analyzer='char_wb' (2,6), C=4.0)
vec_params_char_wb = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                          sublinear_tf=True, min_df=1, max_df=0.98, max_features=400_000)
Ptest_char_wb_bagged = np.zeros((len(test), 3))
for seed in seeds:
    clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=seed, n_jobs=1)
    vec = TfidfVectorizer(**vec_params_char_wb)
    Xfull = vec.fit_transform(train['text']); Xtest_f = vec.transform(test['text'])
    clf.fit(Xfull, y)
    Ptest_char_wb_bagged += clf.predict_proba(Xtest_f)
Ptest_char_wb_bagged /= n_seeds
Ptest_char_wb_bagged = np.clip(Ptest_char_wb_bagged, 1e-15, 1-1e-15)
Ptest_char_wb_bagged /= Ptest_char_wb_bagged.sum(axis=1, keepdims=True)
pd.DataFrame(Ptest_char_wb_bagged, columns=classes).to_csv('test_10fold_uncal_char_wb_lr_bagged.csv', index=False)

# 2. Bag char_variant LR (char (3,7), C=4.0)
vec_params_char = dict(analyzer='char', ngram_range=(3,7), lowercase=False,
                       sublinear_tf=True, min_df=2, max_df=0.98, max_features=300_000)
Ptest_char_bagged = np.zeros((len(test), 3))
for seed in seeds:
    clf = LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=seed, n_jobs=1)
    vec = TfidfVectorizer(**vec_params_char)
    Xfull = vec.fit_transform(train['text']); Xtest_f = vec.transform(test['text'])
    clf.fit(Xfull, y)
    Ptest_char_bagged += clf.predict_proba(Xtest_f)
Ptest_char_bagged /= n_seeds
Ptest_char_bagged = np.clip(Ptest_char_bagged, 1e-15, 1-1e-15)
Ptest_char_bagged /= Ptest_char_bagged.sum(axis=1, keepdims=True)
pd.DataFrame(Ptest_char_bagged, columns=classes).to_csv('test_char_variant_bagged.csv', index=False)

# 3. Bag Calibrated LinearSVC (reuse your vec_params from cell 15, light 3-fold inner CV per seed)
vec_params_svc = dict(analyzer='char_wb', ngram_range=(2,5), lowercase=False,
                      sublinear_tf=True, min_df=3, max_df=0.98, max_features=250_000)
svc_params = dict(C=0.5, loss='squared_hinge', dual='auto', max_iter=3000, tol=1e-4)
inner_cv_splits = 3
Ptest_svc_bagged = np.zeros((len(test), 3))
for seed in seeds:
    vec = TfidfVectorizer(**vec_params_svc)
    Xfull = vec.fit_transform(train['text']); Xtest_f = vec.transform(test['text'])
    Ptest_seed = np.zeros((len(test), 3))
    for c in range(3):
        yb = (y == c).astype(int)
        skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=seed + c)
        F_cal = []; z_cal = []
        indices = np.arange(len(train))
        for i_tr, i_va in skf_inner.split(indices, yb):
            svc = LinearSVC(**svc_params, random_state=seed + c)
            svc.fit(Xfull[i_tr], yb[i_tr])
            s = svc.decision_function(Xfull[i_va])
            if s.ndim > 1: s = s[:, 0]
            F_cal.append(s.reshape(-1, 1)); z_cal.append(yb[i_va])
        F_cal = np.vstack(F_cal); z_cal = np.concatenate(z_cal)
        platt = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=seed + c*10)
        platt.fit(F_cal, z_cal)
        svc_final = LinearSVC(**svc_params, random_state=seed + c)
        svc_final.fit(Xfull, yb)
        s_test = svc_final.decision_function(Xtest_f)
        if s_test.ndim > 1: s_test = s_test[:, 0]
        Ptest_seed[:, c] = platt.predict_proba(s_test.reshape(-1, 1))[:, 1]
    Ptest_seed = odds_norm(Ptest_seed)  # define odds_norm if not already
    Ptest_svc_bagged += Ptest_seed
Ptest_svc_bagged /= n_seeds
Ptest_svc_bagged = np.clip(Ptest_svc_bagged, 1e-15, 1-1e-15)
Ptest_svc_bagged /= Ptest_svc_bagged.sum(axis=1, keepdims=True)
pd.DataFrame(Ptest_svc_bagged, columns=classes).to_csv('test_calsvc_bagged.csv', index=False)

print('Seed bagging complete for top models. Use bagged test files in re-stack.')

def odds_norm(P, eps=1e-9):
    P = np.clip(P, eps, 1-eps)
    odds = P / (1 - P)
    return odds / (odds.sum(axis=1, keepdims=True) + eps)

Seed bagging complete for top models. Use bagged test files in re-stack.


In [77]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Strong models only (OOF <0.46): exclude word_lr_strong 0.4876 and bad NB-SVM 0.96
model_files=[
    ('oof_10fold_uncal_char_wb_lr.csv','test_10fold_uncal_char_wb_lr_bagged.csv'),  # bagged test
    ('oof_char_lr.csv','test_char_lr.csv'),
    ('oof_calsvc_char.csv','test_calsvc_bagged.csv'),  # bagged test
    ('oof_char_variant.csv','test_char_variant_bagged.csv'),  # bagged test
    ('oof_stylo_word_lr.csv','test_stylo_word_lr.csv')
]

meta_train=np.hstack([pd.read_csv(o)[classes].values for o,_ in model_files])
meta_test=np.hstack([pd.read_csv(t)[classes].values for _,t in model_files])

# Tune meta-LR C over grid with 5-fold CV
c_grid = [0.5, 0.75, 1.0, 1.5, 2.0]
best_c = None; best_sc = float('inf')
for c in c_grid:
    oof_cv=np.zeros((len(train),3)); scores_cv=[]
    for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
        meta=LogisticRegression(solver='lbfgs', C=c, max_iter=2000, random_state=2025)
        meta.fit(meta_train[tr], y[tr])
        p=meta.predict_proba(meta_train[va]); oof_cv[va]=p
        s=log_loss(y[va], p); scores_cv.append(s)
    sc_cv=float(np.mean(scores_cv))
    print(f'Meta-LR C={c} CV OOF: {sc_cv:.4f}')
    if sc_cv < best_sc:
        best_sc = sc_cv; best_c = c

print(f'Best meta-LR C: {best_c} with CV OOF: {best_sc:.4f}')

# Fit final meta on full with best C
meta=LogisticRegression(solver='lbfgs', C=best_c, max_iter=2000, random_state=2025)
meta.fit(meta_train, y)
ptest=meta.predict_proba(meta_test)
ptest=np.clip(ptest,1e-15,1-1e-15); ptest/=ptest.sum(axis=1, keepdims=True)
pd.DataFrame(ptest, columns=classes).to_csv('test_stacked_meta_lr.csv', index=False)

# Compute final OOF with best C (for validation)
oof_final=np.zeros((len(train),3)); scores_final=[]
for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
    meta_cv=LogisticRegression(solver='lbfgs', C=best_c, max_iter=2000, random_state=2025)
    meta_cv.fit(meta_train[tr], y[tr])
    p=meta_cv.predict_proba(meta_train[va]); oof_final[va]=p
    s=log_loss(y[va], p); scores_final.append(s); print(f'Final Meta-LR Fold {f}: {s:.4f}')
final_sc=float(np.mean(scores_final)); print('Final Stacked Meta-LR OOF:', final_sc)
pd.DataFrame(oof_final, columns=classes).to_csv('oof_stacked_meta_lr_final.csv', index=False)

if final_sc <= 0.34:
    print('Medal-ready! Copy test_stacked_meta_lr.csv to submission.csv and submit.')
else:
    print('Still >0.34; need more diversity or tuning.')

Meta-LR C=0.5 CV OOF: 0.3648


Meta-LR C=0.75 CV OOF: 0.3648


Meta-LR C=1.0 CV OOF: 0.3647


Meta-LR C=1.5 CV OOF: 0.3648


Meta-LR C=2.0 CV OOF: 0.3647
Best meta-LR C: 2.0 with CV OOF: 0.3647


Final Meta-LR Fold 1: 0.3630


Final Meta-LR Fold 2: 0.3631


Final Meta-LR Fold 3: 0.3794


Final Meta-LR Fold 4: 0.3607


Final Meta-LR Fold 5: 0.3574
Final Stacked Meta-LR OOF: 0.36472125242237086
Still >0.34; need more diversity or tuning.


In [78]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Word+Char hstack LR
word_params = dict(analyzer='word', ngram_range=(1,3), lowercase=True,
                   sublinear_tf=True, min_df=2, max_df=0.95, max_features=200_000)
char_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                   sublinear_tf=True, min_df=2, max_df=0.98, max_features=400_000)
c_grid = [3.0, 4.0, 5.0, 6.0]
best_c = None; best_sc = float('inf'); best_oof = None
for c in c_grid:
    oof_c = np.zeros((len(train), 3)); scores_c = []
    for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
        vec_word = TfidfVectorizer(**word_params)
        Xtr_word = vec_word.fit_transform(train['text'].iloc[tr])
        Xva_word = vec_word.transform(train['text'].iloc[va])
        vec_char = TfidfVectorizer(**char_params)
        Xtr_char = vec_char.fit_transform(train['text'].iloc[tr])
        Xva_char = vec_char.transform(train['text'].iloc[va])
        Xtr = hstack([Xtr_word, Xtr_char])
        Xva = hstack([Xva_word, Xva_char])
        clf = LogisticRegression(solver='lbfgs', C=c, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)
        clf.fit(Xtr, y[tr])
        p = clf.predict_proba(Xva)
        oof_c[va] = p
        s = log_loss(y[va], p); scores_c.append(s)
    sc_c = float(np.mean(scores_c)); print(f'Hstack C={c} OOF: {sc_c:.4f}')
    if sc_c < best_sc:
        best_sc = sc_c; best_c = c; best_oof = oof_c
print(f'Best Hstack C: {best_c} OOF: {best_sc:.4f}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_char_hstack_lr.csv', index=False)

# Full fit with best C for test
vec_word_full = TfidfVectorizer(**word_params)
Xfull_word = vec_word_full.fit_transform(train['text']); Xtest_word = vec_word_full.transform(test['text'])
vec_char_full = TfidfVectorizer(**char_params)
Xfull_char = vec_char_full.fit_transform(train['text']); Xtest_char = vec_char_full.transform(test['text'])
Xfull = hstack([Xfull_word, Xfull_char]); Xtest = hstack([Xtest_word, Xtest_char])
clf_full = LogisticRegression(solver='lbfgs', C=best_c, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)
clf_full.fit(Xfull, y)
ptest = clf_full.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_word_char_hstack_lr.csv', index=False)

Hstack C=3.0 OOF: 0.3924


Hstack C=4.0 OOF: 0.3822


Hstack C=5.0 OOF: 0.3765


Hstack C=6.0 OOF: 0.3727
Best Hstack C: 6.0 OOF: 0.3727


In [79]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec_params=dict(analyzer='char_wb', ngram_range=(1,6), lowercase=False,
                sublinear_tf=True, min_df=1, max_df=0.98, max_features=600_000)
clf=LogisticRegression(solver='lbfgs', C=4.0, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=TfidfVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    clf.fit(Xtr, y[tr])
    p=clf.predict_proba(Xva)
    oof[va]=p
    s=log_loss(y[va], p); scores.append(s); print(f'Char_wb 1-6 LR Fold {f}: {s:.4f}')
sc=float(np.mean(scores)); print('Char_wb 1-6 LR OOF:', sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_char_wb_1_6_lr.csv', index=False)

# Full fit for test
vec_full=TfidfVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
clf.fit(Xfull, y)
ptest=clf.predict_proba(Xtest)
pd.DataFrame(ptest, columns=classes).to_csv('test_char_wb_1_6_lr.csv', index=False)

Char_wb 1-6 LR Fold 1: 0.4126


Char_wb 1-6 LR Fold 2: 0.4130


Char_wb 1-6 LR Fold 3: 0.4302


Char_wb 1-6 LR Fold 4: 0.4168


Char_wb 1-6 LR Fold 5: 0.4133
Char_wb 1-6 LR OOF: 0.41717235341322867


In [80]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 7-model stack with bagged tests where available
model_files=[
    ('oof_10fold_uncal_char_wb_lr.csv','test_10fold_uncal_char_wb_lr_bagged.csv'),
    ('oof_char_lr.csv','test_char_lr.csv'),
    ('oof_calsvc_char.csv','test_calsvc_bagged.csv'),
    ('oof_char_variant.csv','test_char_variant_bagged.csv'),
    ('oof_stylo_word_lr.csv','test_stylo_word_lr.csv'),
    ('oof_word_char_hstack_lr.csv','test_word_char_hstack_lr.csv'),
    ('oof_char_wb_1_6_lr.csv','test_char_wb_1_6_lr.csv')
]

meta_train=np.hstack([pd.read_csv(o)[classes].values for o,_ in model_files])
meta_test=np.hstack([pd.read_csv(t)[classes].values for _,t in model_files])

# Tune meta-LR C over grid with 5-fold CV
c_grid = [0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
best_c = None; best_sc = float('inf')
for c in c_grid:
    oof_cv=np.zeros((len(train),3)); scores_cv=[]
    for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
        meta=LogisticRegression(solver='lbfgs', C=c, max_iter=2000, random_state=2025)
        meta.fit(meta_train[tr], y[tr])
        p=meta.predict_proba(meta_train[va]); oof_cv[va]=p
        s=log_loss(y[va], p); scores_cv.append(s)
    sc_cv=float(np.mean(scores_cv))
    print(f'Meta-LR C={c} CV OOF: {sc_cv:.4f}')
    if sc_cv < best_sc:
        best_sc = sc_cv; best_c = c

print(f'Best meta-LR C: {best_c} with CV OOF: {best_sc:.4f}')

# Fit final meta on full with best C
meta=LogisticRegression(solver='lbfgs', C=best_c, max_iter=2000, random_state=2025)
meta.fit(meta_train, y)
ptest=meta.predict_proba(meta_test)
ptest=np.clip(ptest,1e-15,1-1e-15); ptest/=ptest.sum(axis=1, keepdims=True)
pd.DataFrame(ptest, columns=classes).to_csv('test_stacked_meta_lr.csv', index=False)

# Compute final OOF with best C
oof_final=np.zeros((len(train),3)); scores_final=[]
for f,(tr,va) in enumerate(skf.split(meta_train,y),1):
    meta_cv=LogisticRegression(solver='lbfgs', C=best_c, max_iter=2000, random_state=2025)
    meta_cv.fit(meta_train[tr], y[tr])
    p=meta_cv.predict_proba(meta_train[va]); oof_final[va]=p
    s=log_loss(y[va], p); scores_final.append(s); print(f'Final Meta-LR Fold {f}: {s:.4f}')
final_sc=float(np.mean(scores_final)); print('Final Stacked Meta-LR OOF:', final_sc)
pd.DataFrame(oof_final, columns=classes).to_csv('oof_stacked_meta_lr_final.csv', index=False)

if final_sc <= 0.34:
    print('Medal-ready! Copy test_stacked_meta_lr.csv to submission.csv and submit.')
else:
    print('Still >0.34; need more diversity or tuning.')

Meta-LR C=0.5 CV OOF: 0.3643


Meta-LR C=1.0 CV OOF: 0.3641


Meta-LR C=1.5 CV OOF: 0.3641


Meta-LR C=2.0 CV OOF: 0.3641


Meta-LR C=3.0 CV OOF: 0.3641


Meta-LR C=4.0 CV OOF: 0.3641


Meta-LR C=6.0 CV OOF: 0.3641
Best meta-LR C: 2.0 with CV OOF: 0.3641


Final Meta-LR Fold 1: 0.3607


Final Meta-LR Fold 2: 0.3618


Final Meta-LR Fold 3: 0.3809


Final Meta-LR Fold 4: 0.3602


Final Meta-LR Fold 5: 0.3569
Final Stacked Meta-LR OOF: 0.36409225532423145
Still >0.34; need more diversity or tuning.


In [81]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=0.5):
    pos=np.asarray(X[yb==1].sum(0)).ravel()+alpha
    neg=np.asarray(X[yb==0].sum(0)).ravel()+alpha
    r=np.log(pos/neg); r[~np.isfinite(r)]=0.0
    return r

def odds_norm(P, eps=1e-9):
    P=np.clip(P,eps,1-eps); odds=P/(1-P)
    return odds/(odds.sum(axis=1,keepdims=True)+eps)

vec_params=dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=2, max_df=0.95, binary=True, max_features=150000)
clf_params=dict(solver='liblinear', penalty='l2', C=4.0, max_iter=3000, tol=1e-4)

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=CountVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    Pva=np.zeros((len(va),3))
    for c in range(3):
        yb=(y[tr]==c).astype(int)
        r=log_count_ratio(Xtr, yb, alpha=0.5)
        clf=LogisticRegression(**clf_params, random_state=42+c)
        clf.fit(Xtr.multiply(csr_matrix(r)), yb)
        Pva[:,c]=clf.predict_proba(Xva.multiply(csr_matrix(r)))[:,1]
    Pva=odds_norm(Pva); oof[va]=Pva
    s=log_loss(y[va], Pva); scores.append(s); print(f'Word NB-SVM Fold {f}: {s:.4f}')
print('Word NB-SVM OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm.csv', index=False)

# Full fit -> test
vec_full=CountVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    r=log_count_ratio(Xfull, yb, alpha=0.5)
    clf=LogisticRegression(**clf_params, random_state=999+c)
    clf.fit(Xfull.multiply(csr_matrix(r)), yb)
    Ptest[:,c]=clf.predict_proba(Xtest.multiply(csr_matrix(r)))[:,1]
Ptest=odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm.csv', index=False)

  Ground truth (correct) target values.


Word NB-SVM Fold 1: 0.5583


  Ground truth (correct) target values.


Word NB-SVM Fold 2: 0.5350


  Ground truth (correct) target values.


Word NB-SVM Fold 3: 0.5693


  Ground truth (correct) target values.


Word NB-SVM Fold 4: 0.5139


  Ground truth (correct) target values.


Word NB-SVM Fold 5: 0.5623
Word NB-SVM OOF: 0.5477910850128169


In [82]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=0.5):
    pos=np.asarray(X[yb==1].sum(0)).ravel()+alpha
    neg=np.asarray(X[yb==0].sum(0)).ravel()+alpha
    r=np.log(pos/neg); r[~np.isfinite(r)]=0.0
    return r

def odds_norm(P, eps=1e-9):
    P=np.clip(P,eps,1-eps); odds=P/(1-P)
    return odds/(odds.sum(axis=1,keepdims=True)+eps)

vec_params=dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=2, max_df=0.95, binary=True, max_features=150000)
svc_params=dict(C=4.0, loss='squared_hinge', dual='auto', max_iter=3000, tol=1e-4)
inner_cv_splits=3

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=CountVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    Pva=np.zeros((len(va),3))
    for c in range(3):
        yb_tr=(y[tr]==c).astype(int)
        # Inner CV for Platt calibration
        skf_inner=StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42+c)
        F_cal=[]; z_cal=[]
        for i_tr, i_va in skf_inner.split(Xtr, yb_tr):
            svc=LinearSVC(**svc_params, random_state=42+c)
            svc.fit(Xtr[i_tr], yb_tr[i_tr])
            s=svc.decision_function(Xtr[i_va])
            if s.ndim > 1: s=s[:,0]
            F_cal.append(s.reshape(-1,1)); z_cal.append(yb_tr[i_va])
        F_cal=np.vstack(F_cal); z_cal=np.concatenate(z_cal)
        platt=LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=42+c)
        platt.fit(F_cal, z_cal)
        # Final SVC on full tr
        svc_full=LinearSVC(**svc_params, random_state=42+c)
        r=log_count_ratio(Xtr, yb_tr, alpha=0.5)
        svc_full.fit(Xtr.multiply(csr_matrix(r)), yb_tr)
        s_va=svc_full.decision_function(Xva.multiply(csr_matrix(r)))
        if s_va.ndim > 1: s_va=s_va[:,0]
        Pva[:,c]=platt.predict_proba(s_va.reshape(-1,1))[:,1]
    Pva=odds_norm(Pva); oof[va]=Pva
    s=log_loss(y[va], Pva); scores.append(s); print(f'Word NB-SVC+Platt Fold {f}: {s:.4f}')
print('Word NB-SVC+Platt OOF:', float(np.mean(scores)))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm_svc_platt.csv', index=False)

# Full fit -> test
vec_full=CountVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    # Inner CV on full
    skf_inner=StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42+c)
    F_cal=[]; z_cal=[]
    indices=np.arange(len(train))
    for i_tr, i_va in skf_inner.split(indices, yb):
        r=log_count_ratio(Xfull[i_tr], yb[i_tr], alpha=0.5)
        svc=LinearSVC(**svc_params, random_state=42+c)
        svc.fit(Xfull[i_tr].multiply(csr_matrix(r)), yb[i_tr])
        s=svc.decision_function(Xfull[i_va].multiply(csr_matrix(r)))
        if s.ndim > 1: s=s[:,0]
        F_cal.append(s.reshape(-1,1)); z_cal.append(yb[i_va])
    F_cal=np.vstack(F_cal); z_cal=np.concatenate(z_cal)
    platt=LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=42+c)
    platt.fit(F_cal, z_cal)
    # Final SVC on full
    r=log_count_ratio(Xfull, yb, alpha=0.5)
    svc_final=LinearSVC(**svc_params, random_state=42+c)
    svc_final.fit(Xfull.multiply(csr_matrix(r)), yb)
    s_test=svc_final.decision_function(Xtest.multiply(csr_matrix(r)))
    if s_test.ndim > 1: s_test=s_test[:,0]
    Ptest[:,c]=platt.predict_proba(s_test.reshape(-1,1))[:,1]
Ptest=odds_norm(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm_svc_platt.csv', index=False)

  Ground truth (correct) target values.


Word NB-SVC+Platt Fold 1: 0.5812


  Ground truth (correct) target values.


Word NB-SVC+Platt Fold 2: 0.5783


  Ground truth (correct) target values.


Word NB-SVC+Platt Fold 3: 0.6017


  Ground truth (correct) target values.


Word NB-SVC+Platt Fold 4: 0.5447


  Ground truth (correct) target values.


Word NB-SVC+Platt Fold 5: 0.5880
Word NB-SVC+Platt OOF: 0.5787782278391148


In [None]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

vec_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                sublinear_tf=True, min_df=2, max_df=0.98, max_features=400_000)
inner_cv_splits=3

oof=np.zeros((len(train),3)); scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec=TfidfVectorizer(**vec_params)
    Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va])
    # Inner CV for Platt calibration on Ridge decision_function
    Pva=np.zeros((len(va),3))
    for c in range(3):
        yb_tr=(y[tr]==c).astype(int)
        skf_inner=StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42+c)
        F_cal=[]; z_cal=[]
        for i_tr, i_va in skf_inner.split(Xtr, yb_tr):
            ridge=RidgeClassifier(alpha=0.25, random_state=42+c)
            ridge.fit(Xtr[i_tr], yb_tr[i_tr])
            s=ridge.decision_function(Xtr[i_va])
            if s.ndim > 1: s=s[:,0]
            F_cal.append(s.reshape(-1,1)); z_cal.append(yb_tr[i_va])
        F_cal=np.vstack(F_cal); z_cal=np.concatenate(z_cal)
        platt=LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=42+c)
        platt.fit(F_cal, z_cal)
        # Final Ridge on full tr
        ridge_full=RidgeClassifier(alpha=0.25, random_state=42+c)
        ridge_full.fit(Xtr, yb_tr)
        s_va=ridge_full.decision_function(Xva)
        if s_va.ndim > 1: s_va=s_va[:,0]
        Pva[:,c]=platt.predict_proba(s_va.reshape(-1,1))[:,1]
    # Odds normalize OvR probs
    Pva=np.clip(Pva,1e-9,1-1e-9); odds=Pva/(1-Pva)
    Pva=odds/(odds.sum(axis=1,keepdims=True)+1e-9)
    oof[va]=Pva
    s=log_loss(y[va], Pva); scores.append(s); print(f'Ridge Char_wb Cal Fold {f}: {s:.4f}')
sc=float(np.mean(scores)); print('Ridge Char_wb Cal OOF:', sc)
pd.DataFrame(oof, columns=classes).to_csv('oof_ridge_char_wb.csv', index=False)

# Full fit for test
vec_full=TfidfVectorizer(**vec_params)
Xfull=vec_full.fit_transform(train['text']); Xtest=vec_full.transform(test['text'])
Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    skf_inner=StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42+c)
    F_cal=[]; z_cal=[]
    indices=np.arange(len(train))
    for i_tr, i_va in skf_inner.split(indices, yb):
        ridge=RidgeClassifier(alpha=0.25, random_state=42+c)
        ridge.fit(Xfull[i_tr], yb[i_tr])
        s=ridge.decision_function(Xfull[i_va])
        if s.ndim > 1: s=s[:,0]
        F_cal.append(s.reshape(-1,1)); z_cal.append(yb[i_va])
    F_cal=np.vstack(F_cal); z_cal=np.concatenate(z_cal)
    platt=LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=42+c)
    platt.fit(F_cal, z_cal)
    ridge_full=RidgeClassifier(alpha=0.25, random_state=42+c)
    ridge_full.fit(Xfull, yb)
    s_test=ridge_full.decision_function(Xtest)
    if s_test.ndim > 1: s_test=s_test[:,0]
    Ptest[:,c]=platt.predict_proba(s_test.reshape(-1,1))[:,1]
Ptest=np.clip(Ptest,1e-9,1-1e-9); odds=Ptest/(1-Ptest)
Ptest=odds/(odds.sum(axis=1,keepdims=True)+1e-9)
pd.DataFrame(Ptest, columns=classes).to_csv('test_ridge_char_wb.csv', index=False)

In [95]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=0.5):
    pos=np.asarray(X[yb==1].sum(0)).ravel()+alpha
    neg=np.asarray(X[yb==0].sum(0)).ravel()+alpha
    r=np.log(pos/neg); r[~np.isfinite(r)]=0.0
    return r

def odds_normalize(P, eps=1e-15):
    P=np.clip(P,eps,1-eps)
    odds=P/(1-P)
    Q=odds/(odds.sum(axis=1, keepdims=True)+eps)
    return Q/Q.sum(axis=1, keepdims=True)

vec_params=dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_df=0.90, binary=False)
C_grid=[4.0, 6.0, 8.0]
best_sc=1e9; best_oof=None; best_C=None

for C in C_grid:
    oof=np.zeros((len(train),3)); scores=[]
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec=CountVectorizer(**vec_params)
        Xtr_cnt=vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt=vec.transform(train['text'].iloc[va])
        # Make binary features by copying the count matrices
        Xtr_bin=Xtr_cnt.copy(); Xtr_bin.data[:]=1
        Xva_bin=Xva_cnt.copy(); Xva_bin.data[:]=1

        Pva=np.zeros((len(va),3))
        for c in range(3):
            yb=(y[tr]==c).astype(int)
            r=log_count_ratio(Xtr_cnt, yb, alpha=0.5)
            clf=LogisticRegression(solver='liblinear', penalty='l2', C=C, max_iter=3000, tol=1e-4, random_state=42+c)
            clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
            Pva[:,c]=clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:,1]
        Pva=odds_normalize(Pva)
        oof[va]=Pva
        scores.append(log_loss(y[va], Pva))
        print(f'Word NB-SVM C={C} Fold {f}: {scores[-1]:.4f}')
    sc=float(np.mean(scores)); print(f'Word NB-SVM C={C} OOF: {sc:.4f}')
    if sc<best_sc: best_sc=sc; best_oof=oof; best_C=C

print(f'Best Word NB-SVM OOF: {best_sc:.4f} at C={best_C}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_final.csv', index=False)

# Full fit for test with best C
vec_full=CountVectorizer(**vec_params)
Xfull_cnt=vec_full.fit_transform(train['text']); Xtest_cnt=vec_full.transform(test['text'])
Xfull_bin=Xfull_cnt.copy(); Xfull_bin.data[:]=1
Xtest_bin=Xtest_cnt.copy(); Xtest_bin.data[:]=1

Ptest=np.zeros((len(test),3))
for c in range(3):
    yb=(y==c).astype(int)
    r=log_count_ratio(Xfull_cnt, yb, alpha=0.5)
    clf=LogisticRegression(solver='liblinear', penalty='l2', C=best_C, max_iter=3000, tol=1e-4, random_state=999+c)
    clf.fit(Xfull_bin.multiply(csr_matrix(r)), yb)
    Ptest[:,c]=clf.predict_proba(Xtest_bin.multiply(csr_matrix(r)))[:,1]
Ptest=odds_normalize(Ptest)
pd.DataFrame(Ptest, columns=classes).to_csv('test_word_nbsvm_final.csv', index=False)

Word NB-SVM C=4.0 Fold 1: 0.6056


Word NB-SVM C=4.0 Fold 2: 0.5798


Word NB-SVM C=4.0 Fold 3: 0.6116


Word NB-SVM C=4.0 Fold 4: 0.5545


Word NB-SVM C=6.0 Fold 1: 0.6574


Word NB-SVM C=6.0 Fold 2: 0.6294


Word NB-SVM C=6.0 Fold 3: 0.6650


Word NB-SVM C=6.0 Fold 4: 0.6011


Word NB-SVM C=6.0 Fold 5: 0.6591
Word NB-SVM C=6.0 OOF: 0.6424


Word NB-SVM C=8.0 Fold 1: 0.6973


Word NB-SVM C=8.0 Fold 2: 0.6679


Word NB-SVM C=8.0 Fold 3: 0.7065


Word NB-SVM C=8.0 Fold 4: 0.6374


Word NB-SVM C=8.0 Fold 5: 0.6996
Word NB-SVM C=8.0 OOF: 0.6817
Best Word NB-SVM OOF: 0.5915 at C=4.0


In [87]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
train['text']=train['text'].fillna(''); test['text']=test['text'].fillna('')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def run(vec_params, C, name):
    oof=np.zeros((len(train),3)); scores=[]; test_preds=[]
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec=TfidfVectorizer(**vec_params)
        Xtr=vec.fit_transform(train['text'].iloc[tr]); Xva=vec.transform(train['text'].iloc[va]); Xte=vec.transform(test['text'])
        clf=LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, random_state=42, n_jobs=1)
        clf.fit(Xtr, y[tr])
        p=clf.predict_proba(Xva); oof[va]=p; test_preds.append(clf.predict_proba(Xte))
        s=log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc=float(np.mean(scores)); print(f'{name} OOF: {sc:.4f}\n')
    test_pred=np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_{name}.csv', index=False)
    pd.DataFrame(test_pred, columns=classes).to_csv(f'test_{name}.csv', index=False)

models=[
    # char_wb
    ('char_wb_1_7', dict(analyzer='char_wb', ngram_range=(1,7), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.98, max_features=600_000), 5.0),
    ('char_wb_2_7', dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98, max_features=500_000), 8.0),
    ('char_wb_3_7', dict(analyzer='char_wb', ngram_range=(3,7), lowercase=False, sublinear_tf=True, min_df=3, max_df=0.97, max_features=400_000), 10.0),
    ('char_wb_1_8', dict(analyzer='char_wb', ngram_range=(1,8), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.97, max_features=700_000), 6.0),
    # char
    ('char_2_8', dict(analyzer='char', ngram_range=(2,8), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.99, max_features=800_000), 3.0),
    ('char_3_8', dict(analyzer='char', ngram_range=(3,8), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98, max_features=600_000), 4.0),
    ('char_2_9_mindf5', dict(analyzer='char', ngram_range=(2,9), lowercase=False, sublinear_tf=True, min_df=5, max_df=0.98, max_features=500_000), 4.0),
    ('char_2_7_mindf3', dict(analyzer='char', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=3, max_df=0.98, max_features=500_000), 6.0),
]
for name, vp, C in models: run(vp, C, name)

char_wb_1_7 Fold 1: 0.4059


char_wb_1_7 Fold 2: 0.4049


char_wb_1_7 Fold 3: 0.4213


char_wb_1_7 Fold 4: 0.4107


char_wb_1_7 Fold 5: 0.4047
char_wb_1_7 OOF: 0.4095



char_wb_2_7 Fold 1: 0.3966


char_wb_2_7 Fold 2: 0.3934


char_wb_2_7 Fold 3: 0.4114


char_wb_2_7 Fold 4: 0.4001


char_wb_2_7 Fold 5: 0.3951
char_wb_2_7 OOF: 0.3993



char_wb_3_7 Fold 1: 0.3985


char_wb_3_7 Fold 2: 0.3975


char_wb_3_7 Fold 3: 0.4141


char_wb_3_7 Fold 4: 0.4040


char_wb_3_7 Fold 5: 0.3997
char_wb_3_7 OOF: 0.4028



char_wb_1_8 Fold 1: 0.4014


char_wb_1_8 Fold 2: 0.4000


char_wb_1_8 Fold 3: 0.4176


char_wb_1_8 Fold 4: 0.4060


char_wb_1_8 Fold 5: 0.3991
char_wb_1_8 OOF: 0.4048



char_2_8 Fold 1: 0.4652


char_2_8 Fold 2: 0.4649


char_2_8 Fold 3: 0.4723


char_2_8 Fold 4: 0.4634


char_2_8 Fold 5: 0.4556
char_2_8 OOF: 0.4643



char_3_8 Fold 1: 0.4462


char_3_8 Fold 2: 0.4497


char_3_8 Fold 3: 0.4568


char_3_8 Fold 4: 0.4479


char_3_8 Fold 5: 0.4402
char_3_8 OOF: 0.4482



char_2_9_mindf5 Fold 1: 0.4405


char_2_9_mindf5 Fold 2: 0.4398


char_2_9_mindf5 Fold 3: 0.4490


char_2_9_mindf5 Fold 4: 0.4393


char_2_9_mindf5 Fold 5: 0.4295
char_2_9_mindf5 OOF: 0.4396



char_2_7_mindf3 Fold 1: 0.4156


char_2_7_mindf3 Fold 2: 0.4146


char_2_7_mindf3 Fold 3: 0.4270


char_2_7_mindf3 Fold 4: 0.4135


char_2_7_mindf3 Fold 5: 0.4060
char_2_7_mindf3 OOF: 0.4153



In [93]:
import numpy as np, pandas as pd, glob, os
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Ridge

train=pd.read_csv('train.csv'); le=LabelEncoder(); y=le.fit_transform(train['author']); classes=list(le.classes_)

# Load all OOF/test pairs with OOF <0.46, exclude stacked/meta files
cand=[]
for oof_file in glob.glob('oof_*.csv'):
    if 'stacked' in oof_file or 'meta' in oof_file:
        continue
    test_file=oof_file.replace('oof_','test_')
    if not os.path.exists(test_file): continue
    oof=pd.read_csv(oof_file)[classes].values
    sc=log_loss(y, oof)
    if sc<0.46:
        cand.append((oof_file, test_file, oof, pd.read_csv(test_file)[classes].values, sc))
cand=sorted(cand, key=lambda x: x[4])

# Greedy forward selection (simple average)
selected=[]; best=float('inf'); sel_tests=[]
while True:
    improved=False; pick=None; pick_oof=None; pick_test=None; pick_sc=None
    for (oof_f, test_f, oof, test, sc) in cand:
        if any(oof_f==s[0] for s in selected): continue
        cur=[s[2] for s in selected]+[oof]
        blend=np.mean(cur, axis=0)
        s=log_loss(y, blend)
        if s<best-1e-5:
            improved=True; best=s; pick=(oof_f,test_f); pick_oof=oof; pick_test=test; pick_sc=s
    if not improved: break
    selected.append((pick[0], pick[1], pick_oof, pick_test)); sel_tests=[s[3] for s in selected]
    print(f'Added {pick[0]} -> OOF {pick_sc:.4f}')
print('Greedy OOF:', best)

# Weight optimization on selected
oofs=[s[2] for s in selected]; tests=[s[3] for s in selected]
best_w=None; best_w_sc=float('inf')
for _ in range(2000):
    w=np.random.dirichlet(np.ones(len(oofs)))
    blend=sum(wi*o for wi,o in zip(w,oofs))
    sc=log_loss(y, blend)
    if sc<best_w_sc: best_w_sc=sc; best_w=w
print('Weighted OOF:', best_w_sc)

# Ridge meta on selected
meta_train=np.hstack(oofs); meta_test=np.hstack(tests)
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
def to_prob(P): P=np.clip(P,1e-15,1-1e-15); P/=P.sum(1,keepdims=True); return P
best_alpha=None; best_alpha_sc=float('inf')
for a in [0.1,0.3,0.5,1.0,2.0]:
    oof=np.zeros((len(train),3)); scs=[]
    for tr,va in skf.split(meta_train, y):
        Y=np.zeros((len(tr),3)); Y[np.arange(len(tr)), y[tr]]=1
        ridge=Ridge(alpha=a, random_state=42).fit(meta_train[tr], Y)
        p=to_prob(ridge.predict(meta_train[va])); oof[va]=p; scs.append(log_loss(y[va], p))
    sc=np.mean(scs)
    if sc<best_alpha_sc: best_alpha_sc=sc; best_alpha=a
print('Ridge OOF:', best_alpha_sc)

# Pick best (lowest OOF) and save final test
method=min([('greedy',best), ('weighted',best_w_sc), ('ridge',best_alpha_sc)], key=lambda x:x[1])[0]
if method=='greedy':
    final_test=np.mean(tests, axis=0)
elif method=='weighted':
    final_test=sum(wi*t for wi,t in zip(best_w, tests))
else:
    Y=np.zeros((len(train),3)); Y[np.arange(len(train)), y]=1
    ridge=Ridge(alpha=best_alpha, random_state=42).fit(meta_train, Y)
    final_test=ridge.predict(meta_test)
final_test=np.clip(final_test,1e-15,1-1e-15); final_test/=final_test.sum(1,keepdims=True)
pd.DataFrame(final_test, columns=classes).to_csv('submission.csv', index=False)
print('Saved submission.csv with method:', method)

ValueError: Found input variables with inconsistent numbers of samples: [17914, 17621]

In [92]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train=pd.read_csv('train.csv'); test=pd.read_csv('test.csv')
le=LabelEncoder(); y=le.fit_transform(train['author']); classes=['EAP','HPL','MWS']
blend_test = pd.read_csv('submission.csv')[classes].values

k = int(0.15*len(test))
m = blend_test.max(axis=1); thr = np.partition(m, -k)[-k]
mask = m >= thr
pseudo_y = blend_test[mask].argmax(1)
pseudo_text = test.loc[mask,'text'].values

X_text = np.concatenate([train['text'].values, pseudo_text])
y_all = np.concatenate([y, pseudo_y])
sw = np.concatenate([np.ones(len(y)), np.full(len(pseudo_y), 0.4)])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = [
 ('pl_char_wb_2_7', dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98, max_features=500_000), 8.0),
 ('pl_char_wb_1_7', dict(analyzer='char_wb', ngram_range=(1,7), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.98, max_features=600_000), 5.0),
 ('pl_char_3_7',   dict(analyzer='char',    ngram_range=(3,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98, max_features=300_000), 4.0),
]
for name, vp, C in models:
    oof_pl = np.zeros((len(X_text), 3)); scores_pl = []
    test_preds_pl = []
    for tr, va in skf.split(X_text, y_all):
        vec=TfidfVectorizer(**vp)
        Xtr_pl=vec.fit_transform(X_text[tr]); Xva_pl=vec.transform(X_text[va]); Xte_pl=vec.transform(test['text'])
        clf=LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=2025)
        clf.fit(Xtr_pl, y_all[tr], sample_weight=sw[tr])
        p_pl=clf.predict_proba(Xva_pl); oof_pl[va]=p_pl; test_preds_pl.append(clf.predict_proba(Xte_pl))
        s_pl=log_loss(y_all[va], p_pl); scores_pl.append(s_pl)
    sc_pl=float(np.mean(scores_pl)); print(f'{name} PL OOF: {sc_pl:.4f}')
    test_pred_pl=np.mean(test_preds_pl, axis=0)
    pd.DataFrame(oof_pl, columns=classes).to_csv(f'oof_{name}.csv', index=False)
    pd.DataFrame(test_pred_pl, columns=classes).to_csv(f'test_{name}.csv', index=False)

pl_char_wb_2_7 PL OOF: 0.3993


pl_char_wb_1_7 PL OOF: 0.4081


pl_char_3_7 PL OOF: 0.4348


In [98]:
import re, unicodedata

def normalize_text(s: str) -> str:
    if not isinstance(s, str): s = '' if s is None else str(s)
    s = unicodedata.normalize('NFKC', s)
    s = (s.replace('“','"').replace('”','"')
           .replace('‘',"'").replace('’',"'")
           .replace('—','-').replace('–','-').replace('−','-')
           .replace('…',' ... ').replace('\u00A0',' '))
    s = re.sub(r'\d+', '0', s)          # unify digit sequences
    s = re.sub(r'\s+', ' ', s).strip()  # collapse whitespace
    return s

# usage
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna('').map(normalize_text)
test['text']  = test['text'].fillna('').map(normalize_text)

le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

def run_10fold(name, vec_params, C):
    oof = np.zeros((len(train),3)); Ptest = np.zeros((len(test),3)); scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y),1):
        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(train['text'].iloc[tr]); Xva = vec.transform(train['text'].iloc[va]); Xte = vec.transform(test['text'])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, y[tr])
        p = clf.predict_proba(Xva); oof[va] = p; Ptest += clf.predict_proba(Xte)
        scores.append(log_loss(y[va], p))
    print(name, '10f OOF:', round(float(np.mean(scores)),4))
    Ptest /= skf10.n_splits
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_10f_{name}.csv', index=False)
    pd.DataFrame(Ptest, columns=classes).to_csv(f'test_10f_{name}.csv', index=False)

def run_10f_hstack(name, word_params, char_params, C):
    oof = np.zeros((len(train),3)); Ptest = np.zeros((len(test),3)); scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y),1):
        vw = TfidfVectorizer(**word_params); vc = TfidfVectorizer(**char_params)
        Xtr = hstack([vw.fit_transform(train['text'].iloc[tr]), vc.fit_transform(train['text'].iloc[tr])])
        Xva = hstack([vw.transform(train['text'].iloc[va]), vc.transform(train['text'].iloc[va])])
        Xte = hstack([vw.transform(test['text']), vc.transform(test['text'])])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, y[tr])
        p = clf.predict_proba(Xva); oof[va] = p; Ptest += clf.predict_proba(Xte)
        scores.append(log_loss(y[va], p))
    print(name, '10f OOF:', round(float(np.mean(scores)),4))
    Ptest /= skf10.n_splits
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_10f_{name}.csv', index=False)
    pd.DataFrame(Ptest, columns=classes).to_csv(f'test_10f_{name}.csv', index=False)

# Run the 5 bases (skipping buggy CalSVC for now)
run_10f_hstack('hstack_lr',
    word_params=dict(analyzer='word', ngram_range=(1,3), lowercase=True, sublinear_tf=True, min_df=2, max_df=0.95),
    char_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=6.0)
run_10fold('char_wb_2_7', dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98), C=8.0)
run_10fold('char_wb_1_7', dict(analyzer='char_wb', ngram_range=(1,7), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.98), C=5.0)
run_10fold('char_wb_3_7', dict(analyzer='char_wb', ngram_range=(3,7), lowercase=False, sublinear_tf=True, min_df=3, max_df=0.97), C=10.0)
run_10fold('char_2_7_mindf3', dict(analyzer='char', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=3, max_df=0.98), C=6.0)

# Re-ensemble on the new 10-fold OOFs (Ridge meta + greedy weighted average)
import numpy as np, pandas as pd, itertools
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge

train = pd.read_csv('train.csv'); le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)

# load pairs (exclude calsvc)
names = ['hstack_lr','char_wb_2_7','char_wb_1_7','char_wb_3_7','char_2_7_mindf3']
oofs = [pd.read_csv(f'oof_10f_{n}.csv')[classes].values for n in names]
tests = [pd.read_csv(f'test_10f_{n}.csv')[classes].values for n in names]

# greedy forward (simple mean)
selected = []; best = 1e9
while True:
    improved = False; best_idx = None; best_sc = None
    for i,(oo,_) in enumerate(zip(oofs, tests)):
        if i in selected: continue
        idxs = selected + [i]
        blend = np.mean([oofs[j] for j in idxs], axis=0)
        sc = log_loss(y, blend)
        if sc < best - 1e-6:
            improved = True; best = sc; best_idx = i; best_sc = sc
    if not improved: break
    selected.append(best_idx)
print('Greedy OOF:', round(best,4), 'selected:', [names[i] for i in selected])

# dirichlet weight search on selected
sel_oofs  = [oofs[i] for i in selected]
sel_tests = [tests[i] for i in selected]
rng = np.random.default_rng(42); best_w = None; best_w_sc = 1e9
for _ in range(4000):
    w = rng.dirichlet(np.ones(len(sel_oofs)))
    sc = log_loss(y, sum(wi*oo for wi,oo in zip(w, sel_oofs)))
    if sc < best_w_sc: best_w_sc, best_w = sc, w
print('Weighted OOF:', round(best_w_sc,4))

# ridge meta on concatenated base probs
X = np.hstack(sel_oofs); Xt = np.hstack(sel_tests)
def to_prob(P): P = np.clip(P,1e-15,1-1e-15); return P / P.sum(1, keepdims=True)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_a = None; best_ridge = 1e9
for a in [0.1,0.2,0.3,0.5,1.0,1.5,2.0]:
    oof_meta = np.zeros((len(train),3)); scs=[]
    for tr,va in skf.split(X, y):
        Y = np.zeros((len(tr),3)); Y[np.arange(len(tr)), y[tr]] = 1
        ridge = Ridge(alpha=a, random_state=42).fit(X[tr], Y)
        p = to_prob(ridge.predict(X[va])); oof_meta[va] = p; scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs))
    if sc < best_ridge: best_ridge, best_a = sc, a
print('Ridge OOF:', round(best_ridge,4), 'alpha:', best_a)

# pick best and save submission
if best_ridge < best_w_sc:
    Y = np.zeros((len(train),3)); Y[np.arange(len(train)), y] = 1
    ridge = Ridge(alpha=best_a, random_state=42).fit(X, Y)
    final = to_prob(ridge.predict(Xt))
else:
    final = sum(wi*tt for wi,tt in zip(best_w, sel_tests))
final = np.clip(final,1e-15,1-1e-15); final /= final.sum(1, keepdims=True)
pd.DataFrame(final, columns=classes).to_csv('submission.csv', index=False)

hstack_lr 10f OOF: 0.3629


char_wb_2_7 10f OOF: 0.3912


char_wb_1_7 10f OOF: 0.4001


char_wb_3_7 10f OOF: 0.3947


char_2_7_mindf3 10f OOF: 0.4032
Greedy OOF: 0.3629 selected: ['hstack_lr']


Weighted OOF: 0.3629


Ridge OOF: 0.3963 alpha: 2.0


In [99]:
# Cell 47 — Light Pseudo-Labeling of strong char_wb bases (5-fold CV), then save OOF/test
import numpy as np, pandas as pd, re, unicodedata
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

# 0) Normalization (same as Cell 46)
def normalize_text(s: str) -> str:
    if not isinstance(s, str): s = '' if s is None else str(s)
    s = unicodedata.normalize('NFKC', s)
    s = (s.replace('“','"').replace('”','"')
           .replace('‘',"'").replace('’',"'")
           .replace('—','-').replace('–','-').replace('−','-')
           .replace('…',' ... ').replace('\u00A0',' '))
    s = re.sub(r'\d+', '0', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna('').map(normalize_text)
test['text']  = test['text'].fillna('').map(normalize_text)

le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 1) Select high-confidence pseudo labels from current submission
sub = pd.read_csv('submission.csv')[classes].values  # from Cell 46
maxp = sub.max(axis=1)
# target top 10–15% with a min prob floor
target_frac = 0.15
q_thr = np.quantile(maxp, 1 - target_frac)
thr = max(q_thr, 0.90)  # use 0.90 floor; raise to 0.92–0.95 if noisy
mask = maxp >= thr
pseudo_text = test.loc[mask, 'text'].values
pseudo_y = sub[mask].argmax(axis=1)
pl_w = 0.35
print(f'Pseudo-labeled rows: {mask.sum()} ({mask.mean():.1%}) | threshold: {thr:.3f}')

# 2) Helpers: fit each fold on (train_fold + all pseudo), OOF strictly on original train
def retrain_hstack_on_pseudo(name, word_params, char_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf5.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vw = TfidfVectorizer(**word_params); vc = TfidfVectorizer(**char_params)
        Xtr = hstack([vw.fit_transform(Xtr_text), vc.fit_transform(Xtr_text)])
        Xva = hstack([vw.transform(Xva_text), vc.transform(Xva_text)])
        Xte = hstack([vw.transform(test['text']), vc.transform(test['text'])])

        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=2025+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_{name}.csv', index=False)

def retrain_single_on_pseudo(name, vec_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf5.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(Xtr_text); Xva = vec.transform(Xva_text); Xte = vec.transform(test['text'])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=2025+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_{name}.csv', index=False)

# 3) Run your requested bases
# hstack_lr (C=6)
retrain_hstack_on_pseudo(
    name='hstack_lr',
    word_params=dict(analyzer='word', ngram_range=(1,3), lowercase=True, sublinear_tf=True, min_df=2, max_df=0.95),
    char_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=6.0
)
# char_wb_2_7 (C=8)
retrain_single_on_pseudo(
    name='char_wb_2_7',
    vec_params=dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=8.0
)
# char_wb_1_7 (C=5)
retrain_single_on_pseudo(
    name='char_wb_1_7',
    vec_params=dict(analyzer='char_wb', ngram_range=(1,7), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.98),
    C=5.0
)

# Optional extra diversity (comment out if time is tight)
retrain_single_on_pseudo(
    name='char_wb_1_8',
    vec_params=dict(analyzer='char_wb', ngram_range=(1,8), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.97),
    C=6.0
)
retrain_single_on_pseudo(
    name='char_wb_2_8_mindf1',
    vec_params=dict(analyzer='char_wb', ngram_range=(2,8), lowercase=False, sublinear_tf=True, min_df=1, max_df=0.97),
    C=7.0
)

print('Pseudo-labeling complete. Now re-ensemble including oof_pl_*.csv/test_pl_*.csv.')

Pseudo-labeled rows: 294 (15.0%) | threshold: 0.987


hstack_lr Fold 1: 0.3656


hstack_lr Fold 2: 0.3676


hstack_lr Fold 3: 0.3890


hstack_lr Fold 4: 0.3723


hstack_lr Fold 5: 0.3684
hstack_lr PL OOF: 0.3726


char_wb_2_7 Fold 1: 0.3965


char_wb_2_7 Fold 2: 0.3919


char_wb_2_7 Fold 3: 0.4128


char_wb_2_7 Fold 4: 0.4000


char_wb_2_7 Fold 5: 0.3950
char_wb_2_7 PL OOF: 0.3992


char_wb_1_7 Fold 1: 0.4029


char_wb_1_7 Fold 2: 0.4040


char_wb_1_7 Fold 3: 0.4215


char_wb_1_7 Fold 4: 0.4100


char_wb_1_7 Fold 5: 0.4055
char_wb_1_7 PL OOF: 0.4088


char_wb_1_8 Fold 1: 0.4012


char_wb_1_8 Fold 2: 0.4000


char_wb_1_8 Fold 3: 0.4163


char_wb_1_8 Fold 4: 0.4054


char_wb_1_8 Fold 5: 0.3981
char_wb_1_8 PL OOF: 0.4042


char_wb_2_8_mindf1 Fold 1: 0.3997


char_wb_2_8_mindf1 Fold 2: 0.3969


char_wb_2_8_mindf1 Fold 3: 0.4141


char_wb_2_8_mindf1 Fold 4: 0.4028


char_wb_2_8_mindf1 Fold 5: 0.3940
char_wb_2_8_mindf1 PL OOF: 0.4015
Pseudo-labeling complete. Now re-ensemble including oof_pl_*.csv/test_pl_*.csv.


In [100]:
# Cell 48 — Re-ensemble originals + PL models (greedy/weighted/Ridge meta)
import numpy as np, pandas as pd, itertools
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge

train = pd.read_csv('train.csv'); le = LabelEncoder(); y = le.fit_transform(train['author'])
classes = list(le.classes_)

# Load original 10f + PL pairs (10 models total, all OOF <0.41)
names = ['hstack_lr', 'char_wb_2_7', 'char_wb_1_7', 'char_wb_3_7', 'char_2_7_mindf3',
         'pl_hstack_lr', 'pl_char_wb_2_7', 'pl_char_wb_1_7', 'pl_char_wb_1_8', 'pl_char_wb_2_8_mindf1']
oofs = []
tests = []
for n in names:
    if 'pl_' in n:
        oof_file = f'oof_pl_{n[3:]}.csv'
        test_file = f'test_pl_{n[3:]}.csv'
    else:
        oof_file = f'oof_10f_{n}.csv'
        test_file = f'test_10f_{n}.csv'
    oofs.append(pd.read_csv(oof_file)[classes].values)
    tests.append(pd.read_csv(test_file)[classes].values)

# greedy forward (simple mean)
selected = []; best = 1e9
while True:
    improved = False; best_idx = None; best_sc = None
    for i,(oo,_) in enumerate(zip(oofs, tests)):
        if i in selected: continue
        idxs = selected + [i]
        blend = np.mean([oofs[j] for j in idxs], axis=0)
        sc = log_loss(y, blend)
        if sc < best - 1e-6:
            improved = True; best = sc; best_idx = i; best_sc = sc
    if not improved: break
    selected.append(best_idx)
print('Greedy OOF:', round(best,4), 'selected:', [names[i] for i in selected])

# dirichlet weight search on selected (4000 iters)
sel_oofs  = [oofs[i] for i in selected]
sel_tests = [tests[i] for i in selected]
rng = np.random.default_rng(42); best_w = None; best_w_sc = 1e9
for _ in range(4000):
    w = rng.dirichlet(np.ones(len(sel_oofs)))
    sc = log_loss(y, sum(wi*oo for wi,oo in zip(w, sel_oofs)))
    if sc < best_w_sc: best_w_sc, best_w = sc, w
print('Weighted OOF:', round(best_w_sc,4))

# ridge meta on concatenated base probs
X = np.hstack(sel_oofs); Xt = np.hstack(sel_tests)
def to_prob(P): P = np.clip(P,1e-15,1-1e-15); return P / P.sum(1, keepdims=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_a = None; best_ridge = 1e9
for a in [0.1,0.2,0.3,0.5,1.0,1.5,2.0]:
    oof_meta = np.zeros((len(train),3)); scs=[]
    for tr,va in skf.split(X, y):
        Y = np.zeros((len(tr),3)); Y[np.arange(len(tr)), y[tr]] = 1
        ridge = Ridge(alpha=a, random_state=42).fit(X[tr], Y)
        p = to_prob(ridge.predict(X[va])); oof_meta[va] = p; scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs))
    if sc < best_ridge: best_ridge, best_a = sc, a
print('Ridge OOF:', round(best_ridge,4), 'alpha:', best_a)

# pick best and save submission
if best_ridge < best_w_sc:
    Y = np.zeros((len(train),3)); Y[np.arange(len(train)), y] = 1
    ridge = Ridge(alpha=best_a, random_state=42).fit(X, Y)
    final = to_prob(ridge.predict(Xt))
else:
    final = sum(wi*tt for wi,tt in zip(best_w, sel_tests))
final = np.clip(final,1e-15,1-1e-15); final /= final.sum(1, keepdims=True)
pd.DataFrame(final, columns=classes).to_csv('submission.csv', index=False)
print('Re-ensemble complete. Check final OOF and submission.csv')

Greedy OOF: 0.3629 selected: ['hstack_lr']


Weighted OOF: 0.3629
Ridge OOF: 0.3977 alpha: 2.0
Re-ensemble complete. Check final OOF and submission.csv


In [103]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def pr(y_i, X, alpha=1.0):
    p = X[y_i].sum(0).astype(float)
    p += alpha
    nb = X[~y_i].sum(0).astype(float)
    nb += alpha
    num = p / nb
    num = num.A1
    return np.log(num)

oof = np.zeros((len(train), len(classes)))
test_pred = np.zeros((len(test), len(classes)))
for tr, va in skf.split(train, y):
    bin_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=2, max_df=0.95, binary=True)
    bin_vec = CountVectorizer(**bin_params)
    Xtr_bin = bin_vec.fit_transform(train['text'].iloc[tr])
    Xva_bin = bin_vec.transform(train['text'].iloc[va])
    Xte_bin = bin_vec.transform(test['text'])
    count_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=2, max_df=0.95, binary=False)
    count_vec = CountVectorizer(**count_params)
    Xtr_count = count_vec.fit_transform(train['text'].iloc[tr])
    pva = np.zeros((len(va), len(classes)))
    pte = np.zeros((len(test), len(classes)))
    for i in range(len(classes)):
        y_i = (y[tr] == i)
        ratio = pr(y_i, Xtr_count, alpha=1.0)
        Xtr_r = Xtr_bin.multiply(ratio)
        lr = LogisticRegression(C=4.0, max_iter=3000, dual=False, random_state=123)
        lr.fit(Xtr_r, y_i)
        Xva_r = Xva_bin.multiply(ratio)
        pva[:,i] = lr.predict_proba(Xva_r)[:,1]
        Xte_r = Xte_bin.multiply(ratio)
        pte[:,i] = lr.predict_proba(Xte_r)[:,1]
    pva = np.clip(pva, 1e-15, 1-1e-15)
    pva /= pva.sum(axis=1, keepdims=True)
    oof[va] = pva
    pte = np.clip(pte, 1e-15, 1-1e-15)
    pte /= pte.sum(axis=1, keepdims=True)
    test_pred += pte
test_pred /= 5
score = log_loss(y, oof)
print('Word NB-SVM OOF logloss:', round(score, 4))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm.csv', index=False)
pd.DataFrame(test_pred, columns=classes).to_csv('test_word_nbsvm.csv', index=False)

Word NB-SVM OOF logloss: 0.4573


In [104]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold

train = pd.read_csv('train.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)

# Load new Word NB-SVM
oof_word = pd.read_csv('oof_word_nbsvm.csv')[classes].values
test_word = pd.read_csv('test_word_nbsvm.csv')[classes].values
print('Word NB-SVM OOF:', log_loss(y, oof_word))

# Top char models (OOF <0.41)
models = [
    ('oof_10f_hstack_lr.csv', 'test_10f_hstack_lr.csv'),
    ('oof_pl_hstack_lr.csv', 'test_pl_hstack_lr.csv'),
    ('oof_10f_char_wb_2_7.csv', 'test_10f_char_wb_2_7.csv'),
    ('oof_10f_char_wb_3_7.csv', 'test_10f_char_wb_3_7.csv'),
    ('oof_pl_char_wb_2_7.csv', 'test_pl_char_wb_2_7.csv')
]
oofs = [pd.read_csv(o)[classes].values for o, _ in models]
tests = [pd.read_csv(t)[classes].values for _, t in models]
names = [m[0].replace('oof_10f_', '').replace('oof_pl_', '').replace('.csv', '') for m in models]

# Quick check: 0.7 * hstack_lr + 0.3 * word_nbsvm
hstack_oof = oofs[0]
blend_quick = 0.7 * hstack_oof + 0.3 * oof_word
score_quick = log_loss(y, blend_quick)
print('Quick blend OOF (0.7 hstack + 0.3 word):', score_quick)
if score_quick < 0.3629:
    print('Word NB-SVM adds value; include in ensemble')
    include_word = True
else:
    print('Word NB-SVM does not improve; skip')
    include_word = False

# If include, Ridge meta on top 3-4 + word; else on top char only
if include_word:
    sel_oofs = [oofs[0], oofs[1], oofs[2], oof_word]  # hstack, pl_hstack, char_wb_2_7, word
    sel_tests = [tests[0], tests[1], tests[2], test_word]
    sel_names = ['hstack_lr', 'pl_hstack_lr', 'char_wb_2_7', 'word_nbsvm']
else:
    sel_oofs = [oofs[0], oofs[1], oofs[2]]  # top char
    sel_tests = [tests[0], tests[1], tests[2]]
    sel_names = ['hstack_lr', 'pl_hstack_lr', 'char_wb_2_7']

# Ridge meta CV
X = np.hstack(sel_oofs)
Xt = np.hstack(sel_tests)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
def to_prob(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return P / P.sum(axis=1, keepdims=True)
best_a = 1.0; best_ridge = float('inf')
for a in [0.1, 0.5, 1.0, 2.0, 5.0]:
    oof_meta = np.zeros((len(train), 3)); scs = []
    for tr, va in skf.split(X, y):
        Y = np.zeros((len(tr), 3)); Y[np.arange(len(tr)), y[tr]] = 1
        ridge = Ridge(alpha=a, random_state=42).fit(X[tr], Y)
        p = to_prob(ridge.predict(X[va])); oof_meta[va] = p
        scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs))
    print(f'Ridge alpha={a} OOF: {sc:.4f}')
    if sc < best_ridge:
        best_ridge = sc; best_a = a

# Final fit and submission
Y = np.zeros((len(train), 3)); Y[np.arange(len(train)), y] = 1
ridge = Ridge(alpha=best_a, random_state=42).fit(X, Y)
final = to_prob(ridge.predict(Xt))
pd.DataFrame(final, columns=classes).to_csv('submission.csv', index=False)
print(f'Final ensemble OOF: {best_ridge:.4f} | Models: {sel_names} | Include Word: {include_word}')
print('submission.csv updated')

Word NB-SVM OOF: 0.4572863139813886
Quick blend OOF (0.7 hstack + 0.3 word): 0.35687910180008375
Word NB-SVM adds value; include in ensemble
Ridge alpha=0.1 OOF: 0.4188
Ridge alpha=0.5 OOF: 0.4187
Ridge alpha=1.0 OOF: 0.4171
Ridge alpha=2.0 OOF: 0.4171
Ridge alpha=5.0 OOF: 0.4187
Final ensemble OOF: 0.4171 | Models: ['hstack_lr', 'pl_hstack_lr', 'char_wb_2_7', 'word_nbsvm'] | Include Word: True
submission.csv updated


In [105]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def pr(y_i, X, alpha=0.5):
    p = X[y_i].sum(0).astype(float)
    p += alpha
    nb = X[~y_i].sum(0).astype(float)
    nb += alpha
    num = p / nb
    num = num.A1
    return np.log(num)

def odds_normalize(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps)
    odds = P / (1 - P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

C_grid = [4.0, 6.0, 8.0]
best_sc = float('inf')
best_oof = None
best_test = None
best_C = None
for C in C_grid:
    oof = np.zeros((len(train), len(classes)))
    test_pred = np.zeros((len(test), len(classes)))
    scores = []
    for tr, va in skf.split(train, y):
        vec_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=3, max_df=0.90, binary=False)
        vec = CountVectorizer(**vec_params)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        # Binary copies
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1
        pva = np.zeros((len(va), len(classes)))
        pte = np.zeros((len(test), len(classes)))
        for i in range(len(classes)):
            y_i = (y[tr] == i)
            ratio = pr(y_i, Xtr_cnt, alpha=0.5)
            Xtr_r = Xtr_bin.multiply(ratio)
            lr = LogisticRegression(solver='liblinear', penalty='l2', C=C, max_iter=3000, tol=1e-4, random_state=42+i)
            lr.fit(Xtr_r, y_i)
            Xva_r = Xva_bin.multiply(ratio)
            pva[:,i] = lr.predict_proba(Xva_r)[:,1]
            Xte_r = Xte_bin.multiply(ratio)
            pte[:,i] = lr.predict_proba(Xte_r)[:,1]
        pva = odds_normalize(pva)
        oof[va] = pva
        pte = odds_normalize(pte)
        test_pred += pte
        scores.append(log_loss(y[va], pva))
    sc = float(np.mean(scores))
    print(f'Word NB-SVM C={C} OOF: {sc:.4f}')
    test_pred /= 5
    if sc < best_sc:
        best_sc = sc
        best_oof = oof
        best_test = test_pred
        best_C = C
print(f'Best Word NB-SVM OOF: {best_sc:.4f} at C={best_C}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_fixed.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_fixed.csv', index=False)

Word NB-SVM C=4.0 OOF: 0.5915


Word NB-SVM C=6.0 OOF: 0.6424


Word NB-SVM C=8.0 OOF: 0.6818
Best Word NB-SVM OOF: 0.5915 at C=4.0


In [106]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('train.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)

# Skip bad Word NB-SVM (OOF 0.5915 >0.40); focus on top char + PL (OOF <0.41)
names = [
    'hstack_lr', 'pl_hstack_lr', 'char_wb_2_7', 'char_wb_3_7',
    'pl_char_wb_2_7', 'pl_char_wb_1_7', 'pl_char_wb_1_8', 'pl_char_wb_2_8_mindf1'
]
oof_files = [f'oof_10f_{n}.csv' if 'pl_' not in n else f'oof_pl_{n[3:]}.csv' for n in names]
test_files = [f'test_10f_{n}.csv' if 'pl_' not in n else f'test_pl_{n[3:]}.csv' for n in names]

oofs = [pd.read_csv(of)[classes].values for of in oof_files]
tests = [pd.read_csv(tf)[classes].values for tf in test_files]
oof_scores = [log_loss(y, oof) for oof in oofs]
print('Model OOFs:', dict(zip(names, [round(s,4) for s in oof_scores])))

# Filter to top (OOF <0.40)
top_idx = [i for i,s in enumerate(oof_scores) if s < 0.40]
top_oofs = [oofs[i] for i in top_idx]
top_tests = [tests[i] for i in top_idx]
top_names = [names[i] for i in top_idx]
print('Top models:', top_names)

# 1. Greedy forward selection (simple mean)
selected = []; best_greedy = float('inf')
while True:
    improved = False; best_add = None; best_sc = None
    for i in range(len(top_oofs)):
        if i in selected: continue
        cur_idx = selected + [i]
        blend = np.mean([top_oofs[j] for j in cur_idx], axis=0)
        sc = log_loss(y, blend)
        if sc < best_greedy - 1e-6:
            improved = True; best_greedy = sc; best_add = i; best_sc = sc
    if not improved: break
    selected.append(best_add)
    print(f'Greedy added {top_names[best_add]} -> OOF {best_sc:.4f}')
print('Greedy final OOF:', round(best_greedy,4), 'models:', [top_names[i] for i in selected])

# 2. Dirichlet weight search on selected (5000 iters)
sel_oofs = [top_oofs[i] for i in selected]
sel_tests = [top_tests[i] for i in selected]
rng = np.random.default_rng(42)
best_w = None; best_w_sc = float('inf')
for _ in range(5000):
    w = rng.dirichlet(np.ones(len(sel_oofs)))
    blend = sum(wi * oo for wi, oo in zip(w, sel_oofs))
    sc = log_loss(y, blend)
    if sc < best_w_sc:
        best_w_sc = sc; best_w = w
print('Dirichlet weighted OOF:', round(best_w_sc,4))

# 3. Multinomial LR meta on logits of selected
def to_logits(P):
    P = np.clip(P, 1e-15, 1-1e-15)
    return np.log(P / (1 - P))
logit_oofs = [to_logits(oo) for oo in sel_oofs]
X_logit = np.hstack(logit_oofs)
Xt_logit = np.hstack([to_logits(tt) for tt in sel_tests])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_meta = np.zeros((len(train), 3)); scs = []
for tr, va in skf.split(X_logit, y):
    Y = np.zeros((len(tr), 3)); Y[np.arange(len(tr)), y[tr]] = 1
    meta_lr = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, multi_class='multinomial', random_state=42)
    meta_lr.fit(X_logit[tr], y[tr])
    p_logit = meta_lr.predict_proba(X_logit[va])
    oof_meta[va] = p_logit
    scs.append(log_loss(y[va], p_logit))
lr_sc = float(np.mean(scs)); print('LR on logits OOF:', round(lr_sc,4))

# Pick best method
methods = {
    'greedy': best_greedy,
    'weighted': best_w_sc,
    'lr_logits': lr_sc
}
best_method = min(methods, key=methods.get)
best_sc = methods[best_method]
print(f'Best method: {best_method} with OOF {best_sc:.4f}')

# Generate final test preds
if best_method == 'greedy':
    final_test = np.mean(sel_tests, axis=0)
elif best_method == 'weighted':
    final_test = sum(wi * tt for wi, tt in zip(best_w, sel_tests))
else:  # lr_logits
    meta_lr = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, multi_class='multinomial', random_state=42)
    meta_lr.fit(X_logit, y)
    final_test = meta_lr.predict_proba(Xt_logit)

final_test = np.clip(final_test, 1e-15, 1-1e-15)
final_test /= final_test.sum(axis=1, keepdims=True)
pd.DataFrame(final_test, columns=classes).to_csv('submission.csv', index=False)
print('Final submission.csv saved with OOF:', round(best_sc,4))
if best_sc <= 0.35:
    print('Medal territory! Ready for submit_final_answer.')
else:
    print('Still above 0.35; consider further tuning or diversity.')

Model OOFs: {'hstack_lr': 0.3629, 'pl_hstack_lr': 0.3726, 'char_wb_2_7': 0.3912, 'char_wb_3_7': 0.3947, 'pl_char_wb_2_7': 0.3992, 'pl_char_wb_1_7': 0.4088, 'pl_char_wb_1_8': 0.4042, 'pl_char_wb_2_8_mindf1': 0.4015}
Top models: ['hstack_lr', 'pl_hstack_lr', 'char_wb_2_7', 'char_wb_3_7', 'pl_char_wb_2_7']
Greedy added hstack_lr -> OOF 0.3629
Greedy final OOF: 0.3629 models: ['hstack_lr']


Dirichlet weighted OOF: 0.3629
LR on logits OOF: 0.3561
Best method: lr_logits with OOF 0.3561
Final submission.csv saved with OOF: 0.3561
Still above 0.35; consider further tuning or diversity.


In [107]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def pr(y_i, X, alpha=0.1):
    p = X[y_i].sum(0).astype(float)
    p += alpha
    nb = X[~y_i].sum(0).astype(float)
    nb += alpha
    num = p / nb
    num = num.A1
    return np.log(num)

def odds_normalize(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps)
    odds = P / (1 - P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

C_grid = [2.0, 4.0, 6.0, 8.0, 12.0]
best_sc = float('inf')
best_oof = None
best_test = None
best_C = None
for C in C_grid:
    oof = np.zeros((len(train), len(classes)))
    test_pred = np.zeros((len(test), len(classes)))
    scores = []
    for tr, va in skf.split(train, y):
        vec_params = dict(analyzer='word', ngram_range=(1,3), lowercase=True, min_df=2, max_df=0.90, binary=False)
        vec = CountVectorizer(**vec_params)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        # Binary copies
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1
        pva = np.zeros((len(va), len(classes)))
        pte = np.zeros((len(test), len(classes)))
        for i in range(len(classes)):
            y_i = (y[tr] == i)
            ratio = pr(y_i, Xtr_cnt, alpha=0.1)
            Xtr_r = Xtr_bin.multiply(ratio)
            lr = LogisticRegression(solver='liblinear', penalty='l2', C=C, max_iter=3000, tol=1e-4, random_state=42+i)
            lr.fit(Xtr_r, y_i)
            Xva_r = Xva_bin.multiply(ratio)
            pva[:,i] = lr.predict_proba(Xva_r)[:,1]
            Xte_r = Xte_bin.multiply(ratio)
            pte[:,i] = lr.predict_proba(Xte_r)[:,1]
        pva = odds_normalize(pva)
        oof[va] = pva
        pte = odds_normalize(pte)
        test_pred += pte
        scores.append(log_loss(y[va], pva))
    sc = float(np.mean(scores))
    print(f'Word NB-SVM C={C} OOF: {sc:.4f}')
    test_pred /= 10
    if sc < best_sc:
        best_sc = sc
        best_oof = oof
        best_test = test_pred
        best_C = C
print(f'Best Word NB-SVM OOF: {best_sc:.4f} at C={best_C}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_improved.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_improved.csv', index=False)

Word NB-SVM C=2.0 OOF: 0.4792


Word NB-SVM C=4.0 OOF: 0.5335


Word NB-SVM C=6.0 OOF: 0.5696


Word NB-SVM C=8.0 OOF: 0.5970


Word NB-SVM C=12.0 OOF: 0.6382
Best Word NB-SVM OOF: 0.4792 at C=2.0


In [108]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha):
    pos = np.asarray(X[yb==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

param_grid = [
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.5, 'C': 2.0},
    {'ngram': (1,2), 'min_df': 2, 'alpha': 0.75, 'C': 4.0},
    {'ngram': (1,2), 'min_df': 3, 'alpha': 1.0, 'C': 6.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.5, 'C': 4.0},
    {'ngram': (1,3), 'min_df': 2, 'alpha': 0.75, 'C': 6.0},
    {'ngram': (1,3), 'min_df': 3, 'alpha': 1.0, 'C': 8.0},
]

best_sc = 1e9; best_oof = None; best_test = None; best_params = None
for p in param_grid:
    oof = np.zeros((len(train), 3)); scores = []; test_preds = []
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec = CountVectorizer(analyzer='word', ngram_range=p['ngram'], lowercase=True,
                              min_df=p['min_df'], max_df=0.9, binary=False)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1

        Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
        for c in range(3):
            yb = (y[tr]==c).astype(int)
            r = log_count_ratio(Xtr_cnt, yb, alpha=p['alpha'])
            clf = LogisticRegression(solver='liblinear', penalty='l2', C=p['C'],
                                     max_iter=3000, tol=1e-4, random_state=42+c)
            clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
            Pva[:,c] = clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:,1]
            Pte[:,c] = clf.predict_proba(Xte_bin.multiply(csr_matrix(r)))[:,1]
        Pva = odds_norm(Pva); oof[va] = Pva
        test_preds.append(odds_norm(Pte))
        scores.append(log_loss(y[va], Pva))
    sc = float(np.mean(scores)); print(f'NB-SVM params {p} OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_params = p
        best_test = np.mean(test_preds, axis=0)

print('Best Word NB-SVM OOF:', round(best_sc,4), 'params:', best_params)
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_fixed.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_fixed.csv', index=False)

NB-SVM params {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.5, 'C': 2.0} OOF: 0.4481


NB-SVM params {'ngram': (1, 2), 'min_df': 2, 'alpha': 0.75, 'C': 4.0} OOF: 0.5200


NB-SVM params {'ngram': (1, 2), 'min_df': 3, 'alpha': 1.0, 'C': 6.0} OOF: 0.5930


NB-SVM params {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.5, 'C': 4.0} OOF: 0.4756


NB-SVM params {'ngram': (1, 3), 'min_df': 2, 'alpha': 0.75, 'C': 6.0} OOF: 0.5459


NB-SVM params {'ngram': (1, 3), 'min_df': 3, 'alpha': 1.0, 'C': 8.0} OOF: 0.6181
Best Word NB-SVM OOF: 0.4481 params: {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.5, 'C': 2.0}


In [109]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
    vec = CountVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True, min_df=2, max_df=0.95, binary=False)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])
    Xte = vec.transform(test['text'])
    clf = ComplementNB(alpha=0.5)
    clf.fit(Xtr, y[tr])
    p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
    s = log_loss(y[va], p); scores.append(s)
print('ComplementNB OOF:', round(float(np.mean(scores)),4))
pd.DataFrame(oof, columns=classes).to_csv('oof_word_cnb.csv', index=False)
pd.DataFrame(np.mean(test_preds,0), columns=classes).to_csv('test_word_cnb.csv', index=False)

ComplementNB OOF: 0.5063


In [134]:
import numpy as np, pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('train.csv'); le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)

# Expanded base pool per expert advice: top char + PL + diversity (CalSVC, Cal Ridge), word unconditionally
bases = [
    ('oof_10f_hstack_lr.csv', 'test_10f_hstack_lr.csv'),  # 0.3629
    ('oof_pl_refined_hstack_lr.csv', 'test_pl_refined_hstack_lr.csv'),  # 0.3628
    ('oof_10f_char_wb_2_7.csv', 'test_10f_char_wb_2_7.csv'),  # 0.3912
    ('oof_pl_refined_char_wb_2_7.csv', 'test_pl_refined_char_wb_2_7.csv'),  # 0.3911
    ('oof_10f_char_wb_1_7.csv', 'test_10f_char_wb_1_7.csv'),  # 0.4001
    ('oof_10f_char_wb_3_7.csv', 'test_10f_char_wb_3_7.csv'),  # 0.3947
    ('oof_calsvc_char.csv', 'test_calsvc_char.csv'),  # 0.4403 diversity
    ('oof_10f_cal_ridge_char_wb.csv', 'test_10f_cal_ridge_char_wb.csv'),  # 0.4116 diversity
]
# Add best word unconditionally (improved2 0.4358 for diversity, better than CNB 0.5063)
bases.append(('oof_word_nbsvm_improved2.csv', 'test_word_nbsvm_improved2.csv'))

# Load all
oofs = []; tests = []
for o,t in bases:
    oofs.append(pd.read_csv(o)[classes].values)
    tests.append(pd.read_csv(t)[classes].values)

# 1. Greedy forward selection (simple mean)
best_greedy = 1e9; sel_greedy = []
while True:
    improved = False; cand = None
    for i in range(len(oofs)):
        if i in sel_greedy: continue
        idx = sel_greedy + [i]
        sc = log_loss(y, np.mean([oofs[j] for j in idx], axis=0))
        if sc < best_greedy - 1e-6:
            best_greedy = sc; improved = True; cand = i
    if not improved: break
    sel_greedy.append(cand)
sel_oofs_greedy = [oofs[i] for i in sel_greedy]; sel_tests_greedy = [tests[i] for i in sel_greedy]
print('Greedy OOF:', round(best_greedy,4), 'selected models:', [bases[i][0] for i in sel_greedy])

# 2. Dirichlet weights on greedy selected (4000 iters for speed)
rng = np.random.default_rng(42); best_w = None; best_w_sc = 1e9
for _ in range(4000):
    w = rng.dirichlet(np.ones(len(sel_oofs_greedy)))
    sc = log_loss(y, sum(wi*oo for wi,oo in zip(w, sel_oofs_greedy)))
    if sc < best_w_sc: best_w_sc = sc; best_w = w
print('Dirichlet OOF:', round(best_w_sc,4))

# 3. LR-on-logits with C grid on greedy selected (5-fold CV)
def to_logits(P): 
    P = np.clip(P,1e-15,1-1e-15)
    return np.log(P/(1-P))
X_logit = np.hstack([to_logits(oo) for oo in sel_oofs_greedy])
Xt_logit = np.hstack([to_logits(tt) for tt in sel_tests_greedy])
c_grid = [0.5, 0.75, 1.0, 1.5, 2.0, 3.0]
best_c = None; best_lr_sc = 1e9; best_oof_lr = None
skf_meta = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for c in c_grid:
    oof_meta = np.zeros((len(train),3)); scs=[]
    for tr,va in skf_meta.split(X_logit, y):
        meta = LogisticRegression(solver='lbfgs', C=c, max_iter=1000, multi_class='multinomial', random_state=42)
        meta.fit(X_logit[tr], y[tr])
        p = meta.predict_proba(X_logit[va]); oof_meta[va]=p; scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs)); print(f'LR C={c} OOF: {sc:.4f}')
    if sc < best_lr_sc: best_lr_sc = sc; best_c = c; best_oof_lr = oof_meta
print('Best LR-on-logits OOF:', round(best_lr_sc,4), 'at C=', best_c)

# 4. Ridge meta on logits as fallback (alpha grid)
from sklearn.linear_model import Ridge
alpha_grid = [0.5, 1.0, 2.0]
best_alpha = None; best_ridge_sc = 1e9; best_oof_ridge = None
def to_onehot(y): Y = np.zeros((len(y),3)); Y[np.arange(len(y)), y] = 1; return Y
for alpha in alpha_grid:
    oof_ridge = np.zeros((len(train),3)); scs=[]
    for tr,va in skf_meta.split(X_logit, y):
        Y_tr = to_onehot(y[tr])
        ridge = Ridge(alpha=alpha, random_state=42).fit(X_logit[tr], Y_tr)
        p = ridge.predict(X_logit[va])
        p = np.clip(p, 1e-15, 1-1e-15); p /= p.sum(axis=1, keepdims=True)
        oof_ridge[va] = p; scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs)); print(f'Ridge alpha={alpha} OOF: {sc:.4f}')
    if sc < best_ridge_sc: best_ridge_sc = sc; best_alpha = alpha; best_oof_ridge = oof_ridge
print('Best Ridge meta OOF:', round(best_ridge_sc,4), 'at alpha=', best_alpha)

# Pick the best method among greedy, weighted, LR, Ridge
methods = {
    'greedy': best_greedy,
    'weighted': best_w_sc,
    'lr_logits': best_lr_sc,
    'ridge': best_ridge_sc
}
best_method = min(methods, key=methods.get)
best_sc = methods[best_method]
print(f'Best method: {best_method} with OOF {best_sc:.4f}')

# Generate final test preds with best method
if best_method == 'greedy':
    final = np.mean(sel_tests_greedy, axis=0)
elif best_method == 'weighted':
    final = sum(wi*tt for wi,tt in zip(best_w, sel_tests_greedy))
elif best_method == 'lr_logits':
    meta = LogisticRegression(solver='lbfgs', C=best_c, max_iter=1000, multi_class='multinomial', random_state=42)
    meta.fit(X_logit, y)
    final = meta.predict_proba(Xt_logit)
else:  # ridge
    Y = to_onehot(y)
    ridge = Ridge(alpha=best_alpha, random_state=42).fit(X_logit, Y)
    final = ridge.predict(Xt_logit)
    final = np.clip(final, 1e-15, 1-1e-15); final /= final.sum(axis=1, keepdims=True)

final = np.clip(final,1e-15,1-1e-15); final /= final.sum(axis=1, keepdims=True)
pd.DataFrame(final, columns=classes).to_csv('submission.csv', index=False)
print('Saved submission.csv | Best OOF:', round(best_sc,4))
if best_sc <= 0.29381:
    print('Bronze medal! Ready for submit_final_answer.')
else:
    print('Still above bronze; consider more tuning.')

Greedy OOF: 0.3287 selected models: ['oof_pl_refined_hstack_lr.csv', 'oof_word_nbsvm_improved2.csv', 'oof_10f_cal_ridge_char_wb.csv']


Dirichlet OOF: 0.3273


LR C=0.5 OOF: 0.3171


LR C=0.75 OOF: 0.3171


LR C=1.0 OOF: 0.3171


LR C=1.5 OOF: 0.3171


LR C=2.0 OOF: 0.3171


LR C=3.0 OOF: 0.3171
Best LR-on-logits OOF: 0.3171 at C= 0.5
Ridge alpha=0.5 OOF: 0.4503
Ridge alpha=1.0 OOF: 0.4503
Ridge alpha=2.0 OOF: 0.4503
Best Ridge meta OOF: 0.4503 at alpha= 0.5
Best method: lr_logits with OOF 0.3171


Saved submission.csv | Best OOF: 0.3171
Still above bronze; consider more tuning.


In [111]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha):
    pos = np.asarray(X[yb==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

param_grid = [
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.1, 'C': 1.5},
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.25, 'C': 2.0},
    {'ngram': (1,2), 'min_df': 2, 'alpha': 0.5, 'C': 3.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.25, 'C': 3.0},
    {'ngram': (1,3), 'min_df': 2, 'alpha': 0.5, 'C': 4.0},
]

best_sc = 1e9; best_oof = None; best_test = None; best_params = None
for p in param_grid:
    oof = np.zeros((len(train), 3)); scores = []; test_preds = []
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec = CountVectorizer(analyzer='word', ngram_range=p['ngram'], lowercase=True,
                              min_df=p['min_df'], max_df=0.95, binary=False)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1

        Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
        for c in range(3):
            yb = (y[tr]==c).astype(int)
            r = log_count_ratio(Xtr_cnt, yb, alpha=p['alpha'])
            clf = LogisticRegression(solver='liblinear', penalty='l2', C=p['C'],
                                     max_iter=3000, tol=1e-4, random_state=42+c)
            clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
            Pva[:,c] = clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:,1]
            Pte[:,c] = clf.predict_proba(Xte_bin.multiply(csr_matrix(r)))[:,1]
        Pva = odds_norm(Pva); oof[va] = Pva
        test_preds.append(odds_norm(Pte))
        scores.append(log_loss(y[va], Pva))
    sc = float(np.mean(scores)); print(f'NB-SVM params {p} OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_params = p
        best_test = np.mean(test_preds, axis=0)

print('Best Word NB-SVM OOF:', round(best_sc,4), 'params:', best_params)
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_improved2.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_improved2.csv', index=False)

NB-SVM params {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.1, 'C': 1.5} OOF: 0.4358


NB-SVM params {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.25, 'C': 2.0} OOF: 0.4455


NB-SVM params {'ngram': (1, 2), 'min_df': 2, 'alpha': 0.5, 'C': 3.0} OOF: 0.4981


NB-SVM params {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.1, 'C': 2.0} OOF: 0.4527


NB-SVM params {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.25, 'C': 3.0} OOF: 0.4617


NB-SVM params {'ngram': (1, 3), 'min_df': 2, 'alpha': 0.5, 'C': 4.0} OOF: 0.5125
Best Word NB-SVM OOF: 0.4358 params: {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.1, 'C': 1.5}


In [114]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

import unicodedata
import re

def normalize_text(s: str) -> str:
    if not isinstance(s, str): s = '' if s is None else str(s)
    s = unicodedata.normalize('NFKC', s)
    s = (s.replace('\u201c','\"').replace('\u201d','\"')
           .replace('\u2018',"'").replace('\u2019',"'")
           .replace('\u2014','-').replace('\u2013','-').replace('\u2212','-')
           .replace('\u2026',' ... ').replace('\\u00A0',' '))
    s = re.sub(r'\\d+', '0', s)
    s = re.sub(r'\\s+', ' ', s).strip()
    return s

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna('').map(normalize_text)
test['text'] = test['text'].fillna('').map(normalize_text)
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Refined PL: top 20% thr>=0.95, w=0.2 on top 2 bases (hstack_lr, char_wb_2_7)
sub = pd.read_csv('submission.csv')[classes].values
maxp = sub.max(axis=1)
target_frac = 0.20; q_thr = np.quantile(maxp, 1 - target_frac)
thr = max(q_thr, 0.95)
mask = maxp >= thr
pseudo_text = test.loc[mask, 'text'].values
pseudo_y = sub[mask].argmax(axis=1)
pl_w = 0.2
print(f'Refined PL rows: {mask.sum()} ({mask.mean():.1%}) | thr: {thr:.3f}')

def retrain_hstack_pl(name, word_params, char_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vw = TfidfVectorizer(**word_params); vc = TfidfVectorizer(**char_params)
        Xtr = hstack([vw.fit_transform(Xtr_text), vc.fit_transform(Xtr_text)])
        Xva = hstack([vw.transform(Xva_text), vc.transform(Xva_text)])
        Xte = hstack([vw.transform(test['text']), vc.transform(test['text'])])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_refined_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_refined_{name}.csv', index=False)

def retrain_single_pl(name, vec_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(Xtr_text)
        Xva = vec.transform(Xva_text)
        Xte = vec.transform(test['text'])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_refined_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_refined_{name}.csv', index=False)

# Run refined PL on top 2
retrain_hstack_pl(
    name='hstack_lr',
    word_params=dict(analyzer='word', ngram_range=(1,3), lowercase=True, sublinear_tf=True, min_df=2, max_df=0.95),
    char_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=6.0
)
retrain_single_pl(
    name='char_wb_2_7',
    vec_params=dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=8.0
)

print('Refined PL complete. Re-ensemble in next cell to push <0.30 OOF.')

Refined PL rows: 196 (10.0%) | thr: 1.000


hstack_lr Fold 1: 0.3625


hstack_lr Fold 2: 0.3521


hstack_lr Fold 3: 0.3480


hstack_lr Fold 4: 0.3627


hstack_lr Fold 5: 0.3845


hstack_lr Fold 6: 0.3743


hstack_lr Fold 7: 0.3720


hstack_lr Fold 8: 0.3537


hstack_lr Fold 9: 0.3646


hstack_lr Fold 10: 0.3537
hstack_lr PL OOF: 0.3628


char_wb_2_7 Fold 1: 0.3915


char_wb_2_7 Fold 2: 0.3883


char_wb_2_7 Fold 3: 0.3756


char_wb_2_7 Fold 4: 0.3955


char_wb_2_7 Fold 5: 0.4136


char_wb_2_7 Fold 6: 0.3960


char_wb_2_7 Fold 7: 0.4049


char_wb_2_7 Fold 8: 0.3781


char_wb_2_7 Fold 9: 0.3876


char_wb_2_7 Fold 10: 0.3800
char_wb_2_7 PL OOF: 0.3911
Refined PL complete. Re-ensemble in next cell to push <0.30 OOF.


In [115]:
import numpy as np, pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv('train.csv'); le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)

# Base pool: top originals + refined PL + word (use improved2 if <=0.41 else cnb)
bases = [
    ('oof_10f_hstack_lr.csv', 'test_10f_hstack_lr.csv'),
    ('oof_pl_refined_hstack_lr.csv', 'test_pl_refined_hstack_lr.csv'),
    ('oof_10f_char_wb_2_7.csv', 'test_10f_char_wb_2_7.csv'),
    ('oof_pl_refined_char_wb_2_7.csv', 'test_pl_refined_char_wb_2_7.csv'),
]
# Add word if useful
try:
    o_word = pd.read_csv('oof_word_nbsvm_improved2.csv')[classes].values
    w_score = log_loss(y, o_word)
    if w_score <= 0.41:
        bases.append(('oof_word_nbsvm_improved2.csv', 'test_word_nbsvm_improved2.csv'))
    else:
        bases.append(('oof_word_cnb.csv', 'test_word_cnb.csv'))
except:
    bases.append(('oof_word_cnb.csv', 'test_word_cnb.csv'))

# Load
oofs = []; tests = []
for o,t in bases:
    oofs.append(pd.read_csv(o)[classes].values)
    tests.append(pd.read_csv(t)[classes].values)

# Greedy forward (mean)
best = 1e9; sel = []
while True:
    improved = False; cand = None
    for i in range(len(oofs)):
        if i in sel: continue
        idx = sel + [i]
        sc = log_loss(y, np.mean([oofs[j] for j in idx], axis=0))
        if sc < best - 1e-6:
            best = sc; improved = True; cand = i
    if not improved: break
    sel.append(cand)
sel_oofs = [oofs[i] for i in sel]; sel_tests = [tests[i] for i in sel]
print('Greedy OOF:', round(best,4))

# Dirichlet weights (10000 iters)
rng = np.random.default_rng(42); best_w = None; best_w_sc = 1e9
for _ in range(10000):
    w = rng.dirichlet(np.ones(len(sel_oofs)))
    sc = log_loss(y, sum(wi*oo for wi,oo in zip(w, sel_oofs)))
    if sc < best_w_sc: best_w_sc = sc; best_w = w
print('Dirichlet OOF:', round(best_w_sc,4))

# LR-on-logits
def to_logits(P): 
    P = np.clip(P,1e-15,1-1e-15)
    return np.log(P/(1-P))
X = np.hstack([to_logits(oo) for oo in sel_oofs])
Xt = np.hstack([to_logits(tt) for tt in sel_tests])
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_meta = np.zeros((len(train),3)); scs=[]
for tr,va in skf.split(X, y):
    meta = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, multi_class='multinomial', random_state=42)
    meta.fit(X[tr], y[tr])
    p = meta.predict_proba(X[va]); oof_meta[va]=p; scs.append(log_loss(y[va], p))
lr_sc = float(np.mean(scs)); print('LR-on-logits OOF:', round(lr_sc,4))

# Pick best and save
if lr_sc < best_w_sc:
    meta = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, multi_class='multinomial', random_state=42)
    meta.fit(X, y)
    final = meta.predict_proba(Xt)
else:
    final = sum(wi*tt for wi,tt in zip(best_w, sel_tests))
final = np.clip(final,1e-15,1-1e-15); final /= final.sum(axis=1, keepdims=True)
pd.DataFrame(final, columns=classes).to_csv('submission.csv', index=False)
print('Saved submission.csv | Best OOF:', round(min(best_w_sc, lr_sc),4))
if min(best_w_sc, lr_sc) <= 0.29381:
    print('Bronze medal! Ready for submit_final_answer.')
else:
    print('Still above bronze; consider more tuning.')

Greedy OOF: 0.3354


KeyboardInterrupt: 

In [117]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import log_loss

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def softmax(S):
    S = S - S.max(axis=1, keepdims=True)
    P = np.exp(S); P /= P.sum(axis=1, keepdims=True)
    return np.clip(P, 1e-15, 1-1e-15)

vec_params = dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True,
                  min_df=2, max_df=0.98, max_features=500_000)
alpha_grid = [0.5, 1.0, 2.0]

best_sc = 1e9; best_oof = None; best_test = None; best_alpha = None
for alpha in alpha_grid:
    oof = np.zeros((len(train), 3)); scores = []; Ptest = np.zeros((len(test), 3))
    for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(train['text'].iloc[tr])
        Xva = vec.transform(train['text'].iloc[va])
        Xte = vec.transform(test['text'])
        clf = RidgeClassifier(alpha=alpha, random_state=42+f)
        clf.fit(Xtr, y[tr])
        pva = softmax(clf.decision_function(Xva)); oof[va] = pva
        Ptest += softmax(clf.decision_function(Xte))
        scores.append(log_loss(y[va], pva))
    Ptest /= skf.n_splits
    sc = float(np.mean(scores))
    print(f'Ridge char_wb alpha={alpha} 10f OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_test = Ptest; best_alpha = alpha

print(f'Best Ridge char_wb OOF: {best_sc:.4f} at alpha={best_alpha}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_10f_ridge_char_wb.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_10f_ridge_char_wb.csv', index=False)

Ridge char_wb alpha=0.5 10f OOF: 0.5758


Ridge char_wb alpha=1.0 10f OOF: 0.5926


Ridge char_wb alpha=2.0 10f OOF: 0.6223
Best Ridge char_wb OOF: 0.5758 at alpha=0.5


In [119]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha=0.1):
    pos = np.asarray(X[yb==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

vec_params = dict(analyzer='word', ngram_range=(1,2), lowercase=True,
                  min_df=1, max_df=0.95, binary=False)
C = 2.0

oof = np.zeros((len(train), 3)); scores = []; test_preds = []
for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
    vec = CountVectorizer(**vec_params)
    Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
    Xva_cnt = vec.transform(train['text'].iloc[va])
    Xte_cnt = vec.transform(test['text'])
    # Binary copies
    Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
    Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
    Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1
    Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
    for c in range(3):
        yb = (y[tr] == c).astype(int)
        r = log_count_ratio(Xtr_cnt, yb, alpha=0.1)
        clf = LogisticRegression(solver='liblinear', penalty='l2', C=C,
                                 max_iter=3000, tol=1e-4, random_state=42+c)
        clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
        Pva[:, c] = clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:, 1]
        Pte[:, c] = clf.predict_proba(Xte_bin.multiply(csr_matrix(r)))[:, 1]
    Pva = odds_norm(Pva); oof[va] = Pva
    Pte = odds_norm(Pte); test_preds.append(Pte)
    scores.append(log_loss(y[va], Pva))
    print(f'Word NB-SVM Fold {f}: {scores[-1]:.4f}')
sc = float(np.mean(scores)); print(f'Word NB-SVM 10f OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_word_nbsvm_correct.csv', index=False)
pd.DataFrame(np.mean(test_preds, axis=0), columns=classes).to_csv('test_word_nbsvm_correct.csv', index=False)

Word NB-SVM Fold 1: 0.5106


Word NB-SVM Fold 2: 0.3948


Word NB-SVM Fold 3: 0.4330


Word NB-SVM Fold 4: 0.4460


Word NB-SVM Fold 5: 0.4758


Word NB-SVM Fold 6: 0.4432


Word NB-SVM Fold 7: 0.4006


Word NB-SVM Fold 8: 0.4490


Word NB-SVM Fold 9: 0.4507


Word NB-SVM Fold 10: 0.4749
Word NB-SVM 10f OOF: 0.4479


In [121]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True,
                  min_df=2, max_df=0.98, max_features=500_000)

oof = np.zeros((len(train), 3)); scores = []; Ptest = np.zeros((len(test), 3))
for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
    vec = TfidfVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])
    Xte = vec.transform(test['text'])
    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=5e-5, max_iter=5000, tol=1e-3,
                        early_stopping=True, validation_fraction=0.1, n_iter_no_change=5,
                        random_state=42+f)
    clf.fit(Xtr, y[tr])
    pva = clf.predict_proba(Xva); oof[va] = pva
    Ptest += clf.predict_proba(Xte)
    scores.append(log_loss(y[va], pva))
    print(f'SGD char_wb Fold {f}: {scores[-1]:.4f}')
Ptest /= skf.n_splits
sc = float(np.mean(scores)); print(f'SGD char_wb 10f OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_10f_sgd_char_wb.csv', index=False)
pd.DataFrame(Ptest, columns=classes).to_csv('test_10f_sgd_char_wb.csv', index=False)

SGD char_wb Fold 1: 0.5312


SGD char_wb Fold 2: 0.5245


SGD char_wb Fold 3: 0.5254


SGD char_wb Fold 4: 0.5291


SGD char_wb Fold 5: 0.5385


SGD char_wb Fold 6: 0.5370


SGD char_wb Fold 7: 0.5328


SGD char_wb Fold 8: 0.5211


SGD char_wb Fold 9: 0.5232


SGD char_wb Fold 10: 0.5202
SGD char_wb 10f OOF: 0.5283


In [124]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha):
    pos = np.asarray(X[yb==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

param_grid = [
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.05, 'C': 1.5},
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,2), 'min_df': 2, 'alpha': 0.15, 'C': 2.5},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.05, 'C': 2.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.1, 'C': 3.0},
    {'ngram': (1,3), 'min_df': 2, 'alpha': 0.2, 'C': 3.0}
]

best_sc = 1e9; best_oof = None; best_test = None; best_params = None
for p in param_grid:
    oof = np.zeros((len(train), 3)); scores = []; test_preds = []
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec = CountVectorizer(analyzer='word', ngram_range=p['ngram'], lowercase=True,
                              min_df=p['min_df'], max_df=0.95, binary=False)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1
        Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
        for c in range(3):
            yb = (y[tr]==c).astype(int)
            r = log_count_ratio(Xtr_cnt, yb, alpha=p['alpha'])
            clf = LogisticRegression(solver='liblinear', penalty='l2', C=p['C'],
                                     max_iter=3000, tol=1e-4, random_state=42+c)
            clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
            Pva[:,c] = clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:,1]
            Pte[:,c] = clf.predict_proba(Xte_bin.multiply(csr_matrix(r)))[:,1]
        Pva = odds_norm(Pva); oof[va] = Pva
        Pte = odds_norm(Pte); test_preds.append(Pte)
        scores.append(log_loss(y[va], Pva))
    sc = float(np.mean(scores)); print(f'NB-SVM {p} OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_params = p
        best_test = np.mean(test_preds, axis=0)

print('Best Word NB-SVM OOF:', round(best_sc,4), 'params:', best_params)
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_tuned.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_tuned.csv', index=False)

NB-SVM {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.05, 'C': 1.5} OOF: 0.4409


NB-SVM {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.1, 'C': 2.0} OOF: 0.4479


NB-SVM {'ngram': (1, 2), 'min_df': 2, 'alpha': 0.15, 'C': 2.5} OOF: 0.4987


NB-SVM {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.05, 'C': 2.0} OOF: 0.4592


NB-SVM {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.1, 'C': 3.0} OOF: 0.4667


NB-SVM {'ngram': (1, 3), 'min_df': 2, 'alpha': 0.2, 'C': 3.0} OOF: 0.4984
Best Word NB-SVM OOF: 0.4409 params: {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.05, 'C': 1.5}


In [126]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P / (1 - P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

vec_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, sublinear_tf=True,
                  min_df=2, max_df=0.98, max_features=400_000)
alpha_grid = [0.1, 0.5, 1.0]
inner_cv_splits = 3

best_sc = 1e9; best_oof = None; best_test = None; best_alpha = None
for alpha in alpha_grid:
    oof = np.zeros((len(train), 3)); scores = []; test_preds = []
    for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(train['text'].iloc[tr])
        Xva = vec.transform(train['text'].iloc[va])
        Xte = vec.transform(test['text'])
        Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
        for c in range(3):
            yb_tr = (y[tr] == c).astype(int)
            skf_inner = StratifiedKFold(n_splits=inner_cv_splits, shuffle=True, random_state=42 + c)
            F_cal = []; z_cal = []
            for i_tr, i_va in skf_inner.split(Xtr, yb_tr):
                ridge = RidgeClassifier(alpha=alpha, random_state=42 + c)
                ridge.fit(Xtr[i_tr], yb_tr[i_tr])
                s = ridge.decision_function(Xtr[i_va])
                if s.ndim > 1: s = s[:, 0]
                F_cal.append(s.reshape(-1, 1)); z_cal.append(yb_tr[i_va])
            F_cal = np.vstack(F_cal); z_cal = np.concatenate(z_cal)
            platt = LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000, random_state=42 + c)
            platt.fit(F_cal, z_cal)
            ridge_full = RidgeClassifier(alpha=alpha, random_state=42 + c)
            ridge_full.fit(Xtr, yb_tr)
            s_va = ridge_full.decision_function(Xva)
            if s_va.ndim > 1: s_va = s_va[:, 0]
            Pva[:, c] = platt.predict_proba(s_va.reshape(-1, 1))[:, 1]
            s_te = ridge_full.decision_function(Xte)
            if s_te.ndim > 1: s_te = s_te[:, 0]
            Pte[:, c] = platt.predict_proba(s_te.reshape(-1, 1))[:, 1]
        Pva = odds_norm(Pva); oof[va] = Pva
        Pte = odds_norm(Pte); test_preds.append(Pte)
        scores.append(log_loss(y[va], Pva))
    sc = float(np.mean(scores)); print(f'Ridge Cal char_wb alpha={alpha} 10f OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_test = np.mean(test_preds, axis=0); best_alpha = alpha

print(f'Best Calibrated Ridge char_wb OOF: {best_sc:.4f} at alpha={best_alpha}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_10f_cal_ridge_char_wb.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_10f_cal_ridge_char_wb.csv', index=False)

Ridge Cal char_wb alpha=0.1 10f OOF: 0.4952


Ridge Cal char_wb alpha=0.5 10f OOF: 0.4188


Ridge Cal char_wb alpha=1.0 10f OOF: 0.4116
Best Calibrated Ridge char_wb OOF: 0.4116 at alpha=1.0


In [128]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Refined PL: thr=0.98 (top ~10%), w=0.15 on hstack_lr and char_wb_2_7
sub = pd.read_csv('submission.csv')[classes].values
maxp = sub.max(axis=1)
target_frac = 0.10; q_thr = np.quantile(maxp, 1 - target_frac)
thr = max(q_thr, 0.98)
mask = maxp >= thr
pseudo_text = test.loc[mask, 'text'].values
pseudo_y = sub[mask].argmax(axis=1)
pl_w = 0.15
print(f'Refined PL rows: {mask.sum()} ({mask.mean():.1%}) | thr: {thr:.3f}')

def retrain_hstack_pl(name, word_params, char_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vw = TfidfVectorizer(**word_params); vc = TfidfVectorizer(**char_params)
        Xtr = hstack([vw.fit_transform(Xtr_text), vc.fit_transform(Xtr_text)])
        Xva = hstack([vw.transform(Xva_text), vc.transform(Xva_text)])
        Xte = hstack([vw.transform(test['text']), vc.transform(test['text'])])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_refined_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_refined_{name}.csv', index=False)

def retrain_single_pl(name, vec_params, C):
    oof = np.zeros((len(train), 3)); test_preds = []; scores=[]
    for f,(tr,va) in enumerate(skf10.split(train['text'], y), 1):
        Xtr_text = pd.concat([train['text'].iloc[tr], pd.Series(pseudo_text)], ignore_index=True)
        ytr = np.concatenate([y[tr], pseudo_y])
        sw = np.concatenate([np.ones(len(tr)), np.full(len(pseudo_y), pl_w)])
        Xva_text = train['text'].iloc[va]

        vec = TfidfVectorizer(**vec_params)
        Xtr = vec.fit_transform(Xtr_text); Xva = vec.transform(Xva_text); Xte = vec.transform(test['text'])
        clf = LogisticRegression(solver='lbfgs', C=C, max_iter=3000, tol=1e-4, n_jobs=1, random_state=42+f)
        clf.fit(Xtr, ytr, sample_weight=sw)
        p = clf.predict_proba(Xva); oof[va] = p; test_preds.append(clf.predict_proba(Xte))
        s = log_loss(y[va], p); scores.append(s); print(f'{name} Fold {f}: {s:.4f}')
    sc = float(np.mean(scores)); print(f'{name} PL OOF: {sc:.4f}')
    ptest = np.mean(test_preds, axis=0)
    pd.DataFrame(oof, columns=classes).to_csv(f'oof_pl_refined_{name}.csv', index=False)
    pd.DataFrame(ptest, columns=classes).to_csv(f'test_pl_refined_{name}.csv', index=False)

# Run refined PL on top 2
retrain_hstack_pl(
    name='hstack_lr',
    word_params=dict(analyzer='word', ngram_range=(1,3), lowercase=True, sublinear_tf=True, min_df=2, max_df=0.95),
    char_params=dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=6.0
)
retrain_single_pl(
    name='char_wb_2_7',
    vec_params=dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False, sublinear_tf=True, min_df=2, max_df=0.98),
    C=8.0
)

print('Refined PL complete. Re-ensemble in Cell 56 to target <0.30 OOF.')

Refined PL rows: 196 (10.0%) | thr: 1.000


hstack_lr Fold 1: 0.3614


hstack_lr Fold 2: 0.3544


hstack_lr Fold 3: 0.3490


hstack_lr Fold 4: 0.3659


hstack_lr Fold 5: 0.3845


hstack_lr Fold 6: 0.3730


hstack_lr Fold 7: 0.3706


hstack_lr Fold 8: 0.3523


hstack_lr Fold 9: 0.3644


hstack_lr Fold 10: 0.3531
hstack_lr PL OOF: 0.3629


char_wb_2_7 Fold 1: 0.3918


char_wb_2_7 Fold 2: 0.3858


char_wb_2_7 Fold 3: 0.3744


char_wb_2_7 Fold 4: 0.3942


char_wb_2_7 Fold 5: 0.4135


char_wb_2_7 Fold 6: 0.3965


char_wb_2_7 Fold 7: 0.4058


char_wb_2_7 Fold 8: 0.3769


char_wb_2_7 Fold 9: 0.3895


char_wb_2_7 Fold 10: 0.3816
char_wb_2_7 PL OOF: 0.3910
Refined PL complete. Re-ensemble in Cell 56 to target <0.30 OOF.


In [130]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import log_loss

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(3,6), lowercase=False,
                  min_df=3, max_df=0.95, binary=False, max_features=300_000)

oof = np.zeros((len(train), 3)); scores = []; Ptest = np.zeros((len(test), 3))
for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
    vec = CountVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])
    Xte = vec.transform(test['text'])
    clf = ComplementNB(alpha=0.5, norm=True)
    clf.fit(Xtr, y[tr])
    pva = clf.predict_proba(Xva); oof[va] = pva
    Ptest += clf.predict_proba(Xte)
    scores.append(log_loss(y[va], pva))
    print(f'Char ComplementNB Fold {f}: {scores[-1]:.4f}')
Ptest /= skf.n_splits
sc = float(np.mean(scores)); print(f'Char ComplementNB 10f OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_char_cnb.csv', index=False)
pd.DataFrame(Ptest, columns=classes).to_csv('test_char_cnb.csv', index=False)

Char ComplementNB Fold 1: 1.0986


Char ComplementNB Fold 2: 1.0986


Char ComplementNB Fold 3: 1.0986


Char ComplementNB Fold 4: 1.0986


Char ComplementNB Fold 5: 1.0986


KeyboardInterrupt: 

In [131]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import log_loss

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(2,6), lowercase=False,
                  min_df=2, max_df=0.95, binary=True, max_features=300_000)

oof = np.zeros((len(train), 3)); scores = []; Ptest = np.zeros((len(test), 3))
for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
    vec = CountVectorizer(**vec_params)
    Xtr = vec.fit_transform(train['text'].iloc[tr])
    Xva = vec.transform(train['text'].iloc[va])
    Xte = vec.transform(test['text'])
    clf = BernoulliNB(alpha=0.5)
    clf.fit(Xtr, y[tr])
    pva = clf.predict_proba(Xva); oof[va] = pva
    Ptest += clf.predict_proba(Xte)
    scores.append(log_loss(y[va], pva))
    print(f'Char BernoulliNB Fold {f}: {scores[-1]:.4f}')
Ptest /= skf.n_splits
sc = float(np.mean(scores)); print(f'Char BernoulliNB 10f OOF: {sc:.4f}')
pd.DataFrame(oof, columns=classes).to_csv('oof_char_bnb.csv', index=False)
pd.DataFrame(Ptest, columns=classes).to_csv('test_char_bnb.csv', index=False)

Char BernoulliNB Fold 1: 3.6306


Char BernoulliNB Fold 2: 3.3430


Char BernoulliNB Fold 3: 3.2960


KeyboardInterrupt: 

In [133]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

vec_params = dict(analyzer='char_wb', ngram_range=(2,7), lowercase=False,
                  n_features=2_000_000, alternate_sign=False)
c_grid = [3.0, 5.0, 8.0]

best_sc = 1e9; best_oof = None; best_test = None; best_c = None
for c in c_grid:
    oof = np.zeros((len(train), 3)); scores = []; Ptest = np.zeros((len(test), 3))
    for f, (tr, va) in enumerate(skf.split(train['text'], y), 1):
        vec = HashingVectorizer(**vec_params)
        Xtr = vec.fit_transform(train['text'].iloc[tr])
        Xva = vec.transform(train['text'].iloc[va])
        Xte = vec.transform(test['text'])
        clf = LogisticRegression(solver='lbfgs', C=c, max_iter=3000, tol=1e-4, random_state=42+f)
        clf.fit(Xtr, y[tr])
        pva = clf.predict_proba(Xva); oof[va] = pva
        Ptest += clf.predict_proba(Xte)
        scores.append(log_loss(y[va], pva))
    Ptest /= skf.n_splits
    sc = float(np.mean(scores))
    print(f'Hashing char_wb C={c} 10f OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_test = Ptest; best_c = c

print(f'Best Hashing char_wb OOF: {best_sc:.4f} at C={best_c}')
pd.DataFrame(best_oof, columns=classes).to_csv('oof_hashing_char_wb.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_hashing_char_wb.csv', index=False)

In [141]:
import pandas as pd

# Fix submission.csv format: load sample_submission, replace probs, ensure sum=1
sample_sub = pd.read_csv('sample_submission.csv')
probs = pd.read_csv('submission.csv')[['EAP', 'HPL', 'MWS']].values
probs = np.clip(probs, 1e-15, 1-1e-15)
probs /= probs.sum(axis=1, keepdims=True)
sample_sub[['EAP', 'HPL', 'MWS']] = probs
sample_sub.to_csv('submission.csv', index=False)
print('Fixed submission.csv with id column and normalized probs')
print('Probs shape:', probs.shape)
print('Probs sum check:', (probs.sum(axis=1) - 1.0).max())
print('Ready for submit_final_answer')

Fixed submission.csv with id column and normalized probs
Probs shape: (1958, 3)
Probs sum check: 2.220446049250313e-16
Ready for submit_final_answer


In [136]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
train['text'] = train['text'].fillna(''); test['text'] = test['text'].fillna('')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def log_count_ratio(X, yb, alpha):
    pos = np.asarray(X[yb==1].sum(0)).ravel() + alpha
    neg = np.asarray(X[yb==0].sum(0)).ravel() + alpha
    r = np.log(pos/neg); r[~np.isfinite(r)] = 0.0
    return r

def odds_norm(P, eps=1e-15):
    P = np.clip(P, eps, 1-eps); odds = P/(1-P)
    Q = odds / (odds.sum(axis=1, keepdims=True) + eps)
    return Q / Q.sum(axis=1, keepdims=True)

param_grid = [
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.05, 'C': 1.5},
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,2), 'min_df': 2, 'alpha': 0.05, 'C': 1.5},
    {'ngram': (1,2), 'min_df': 2, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.05, 'C': 1.5},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,3), 'min_df': 2, 'alpha': 0.05, 'C': 1.5},
    {'ngram': (1,3), 'min_df': 2, 'alpha': 0.1, 'C': 2.0},
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.2, 'C': 2.5},
    {'ngram': (1,2), 'min_df': 1, 'alpha': 0.2, 'C': 3.0},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.2, 'C': 2.5},
    {'ngram': (1,3), 'min_df': 1, 'alpha': 0.2, 'C': 3.0}
]

best_sc = 1e9; best_oof = None; best_test = None; best_params = None
for p in param_grid:
    oof = np.zeros((len(train), 3)); scores = []; test_preds = []
    for f,(tr,va) in enumerate(skf.split(train['text'], y),1):
        vec = CountVectorizer(analyzer='word', ngram_range=p['ngram'], lowercase=True,
                              min_df=p['min_df'], max_df=0.95, binary=False)
        Xtr_cnt = vec.fit_transform(train['text'].iloc[tr])
        Xva_cnt = vec.transform(train['text'].iloc[va])
        Xte_cnt = vec.transform(test['text'])
        Xtr_bin = Xtr_cnt.copy(); Xtr_bin.data[:] = 1
        Xva_bin = Xva_cnt.copy(); Xva_bin.data[:] = 1
        Xte_bin = Xte_cnt.copy(); Xte_bin.data[:] = 1
        Pva = np.zeros((len(va), 3)); Pte = np.zeros((len(test), 3))
        for c in range(3):
            yb = (y[tr]==c).astype(int)
            r = log_count_ratio(Xtr_cnt, yb, alpha=p['alpha'])
            clf = LogisticRegression(solver='liblinear', penalty='l2', C=p['C'],
                                     max_iter=3000, tol=1e-4, random_state=42+c)
            clf.fit(Xtr_bin.multiply(csr_matrix(r)), yb)
            Pva[:,c] = clf.predict_proba(Xva_bin.multiply(csr_matrix(r)))[:,1]
            Pte[:,c] = clf.predict_proba(Xte_bin.multiply(csr_matrix(r)))[:,1]
        Pva = odds_norm(Pva); oof[va] = Pva
        Pte = odds_norm(Pte); test_preds.append(Pte)
        scores.append(log_loss(y[va], Pva))
    sc = float(np.mean(scores)); print(f'Word NB-SVM {p} OOF: {sc:.4f}')
    if sc < best_sc:
        best_sc = sc; best_oof = oof; best_params = p
        best_test = np.mean(test_preds, axis=0)

print('Best Word NB-SVM OOF:', round(best_sc,4), 'params:', best_params)
pd.DataFrame(best_oof, columns=classes).to_csv('oof_word_nbsvm_tuned.csv', index=False)
pd.DataFrame(best_test, columns=classes).to_csv('test_word_nbsvm_tuned.csv', index=False)

Word NB-SVM {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.05, 'C': 1.5} OOF: 0.4409


Word NB-SVM {'ngram': (1, 2), 'min_df': 1, 'alpha': 0.1, 'C': 2.0} OOF: 0.4479


Word NB-SVM {'ngram': (1, 2), 'min_df': 2, 'alpha': 0.05, 'C': 1.5} OOF: 0.4807


Word NB-SVM {'ngram': (1, 2), 'min_df': 2, 'alpha': 0.1, 'C': 2.0} OOF: 0.4885


Word NB-SVM {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.05, 'C': 1.5} OOF: 0.4505


Word NB-SVM {'ngram': (1, 3), 'min_df': 1, 'alpha': 0.1, 'C': 2.0} OOF: 0.4527


Word NB-SVM {'ngram': (1, 3), 'min_df': 2, 'alpha': 0.05, 'C': 1.5} OOF: 0.4719


Word NB-SVM {'ngram': (1, 3), 'min_df': 2, 'alpha': 0.1, 'C': 2.0} OOF: 0.4792


KeyboardInterrupt: 

In [138]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.special import softmax
from scipy.stats import entropy as ent

train = pd.read_csv('train.csv'); le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)
skf_meta = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Base pool files (OOF scores in comments)
base_files = [
    ('oof_10f_hstack_lr.csv', 'test_10f_hstack_lr.csv'),  # 0.3629
    ('oof_pl_refined_hstack_lr.csv', 'test_pl_refined_hstack_lr.csv'),  # 0.3628
    ('oof_10f_char_wb_2_7.csv', 'test_10f_char_wb_2_7.csv'),  # 0.3912
    ('oof_10f_char_wb_3_7.csv', 'test_10f_char_wb_3_7.csv'),  # 0.3947
    ('oof_10f_cal_ridge_char_wb.csv', 'test_10f_cal_ridge_char_wb.csv'),  # 0.4116
    ('oof_calsvc_char.csv', 'test_calsvc_char.csv'),  # 0.4403
    ('oof_stylo_word_lr.csv', 'test_stylo_word_lr.csv'),  # 0.4624
    ('oof_word_nbsvm_improved2.csv', 'test_word_nbsvm_improved2.csv')  # 0.4358
]

# Load OOF and test preds
oofs = []; tests = []; names = []
for o_file, t_file in base_files:
    o = pd.read_csv(o_file)[classes].values
    t = pd.read_csv(t_file)[classes].values
    oofs.append(o); tests.append(t)
    names.append(o_file.replace('.csv', ''))
print('Loaded', len(oofs), 'base models')

# Compute meta features per base: max_prob, entropy, margin (top1 - top2)
n_bases = len(oofs); n_train = len(train)
meta_feats_train = np.zeros((n_train, 3 * n_bases))
meta_feats_test = np.zeros((len(test), 3 * n_bases))
for i, (oof, tst) in enumerate(zip(oofs, tests)):
    start = i * 3
    # max_prob
    meta_feats_train[:, start] = oof.max(axis=1)
    meta_feats_test[:, start] = tst.max(axis=1)
    # entropy
    meta_feats_train[:, start+1] = ent(oof, axis=1)
    meta_feats_test[:, start+1] = ent(tst, axis=1)
    # margin
    top2 = np.partition(oof, -2, axis=1)[:, -2]
    meta_feats_train[:, start+2] = oof.max(axis=1) - top2
    top2_t = np.partition(tst, -2, axis=1)[:, -2]
    meta_feats_test[:, start+2] = tst.max(axis=1) - top2_t

# Stack base probs as logits for meta
def to_logits(P): return np.log(np.clip(P, 1e-15, 1-1e-15) / (1 - np.clip(P, 1e-15, 1-1e-15)))
logit_oofs = [to_logits(o) for o in oofs]
X_logit_train = np.hstack(logit_oofs)
X_logit_test = np.hstack([to_logits(t) for t in tests])

# Full meta input: logits + meta_feats
X_train = np.hstack([X_logit_train, meta_feats_train])
X_test = np.hstack([X_logit_test, meta_feats_test])

# 1. Greedy forward selection on base probs (simple mean, target 4-7)
best_greedy = 1e9; sel_greedy = []
while len(sel_greedy) < 7:
    improved = False; cand = None
    for i in range(len(oofs)):
        if i in sel_greedy: continue
        idx = sel_greedy + [i]
        blend = np.mean([oofs[j] for j in idx], axis=0)
        sc = log_loss(y, blend)
        if sc < best_greedy - 1e-6:
            best_greedy = sc; improved = True; cand = i
    if not improved: break
    sel_greedy.append(cand)
print('Greedy selected:', [names[i] for i in sel_greedy], 'OOF:', round(best_greedy,4))

# 2. Weighted average (Dirichlet on greedy selected)
sel_oofs_g = [oofs[i] for i in sel_greedy]; sel_tests_g = [tests[i] for i in sel_greedy]
rng = np.random.default_rng(42); best_w = None; best_w_sc = 1e9
for _ in range(5000):
    w = rng.dirichlet(np.ones(len(sel_oofs_g)))
    sc = log_loss(y, sum(wi*oo for wi,oo in zip(w, sel_oofs_g)))
    if sc < best_w_sc: best_w_sc = sc; best_w = w
print('Weighted avg OOF:', round(best_w_sc,4))

# 3. LR-on-logits 10f CV on full X_train (C grid)
best_c = None; best_lr_sc = 1e9; best_oof_lr = None
c_grid = [0.25, 0.5, 1.0, 1.5, 2.0, 3.0]
for c in c_grid:
    oof_lr = np.zeros((n_train, 3)); scs = []
    for tr,va in skf_meta.split(X_train, y):
        meta = LogisticRegression(solver='lbfgs', C=c, max_iter=2000, multi_class='multinomial', random_state=42)
        meta.fit(X_train[tr], y[tr])
        p = meta.predict_proba(X_train[va]); oof_lr[va] = p
        scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs)); print(f'LR C={c} OOF: {sc:.4f}')
    if sc < best_lr_sc: best_lr_sc = sc; best_c = c; best_oof_lr = oof_lr

# 4. L1 meta (saga solver) on full X_train (C grid)
best_c_l1 = None; best_l1_sc = 1e9; best_oof_l1 = None
c_grid_l1 = [0.5, 1.0, 2.0]
for c in c_grid_l1:
    oof_l1 = np.zeros((n_train, 3)); scs = []
    for tr,va in skf_meta.split(X_train, y):
        meta = LogisticRegression(solver='saga', penalty='l1', C=c, max_iter=2000, multi_class='multinomial', random_state=42)
        meta.fit(X_train[tr], y[tr])
        p = meta.predict_proba(X_train[va]); oof_l1[va] = p
        scs.append(log_loss(y[va], p))
    sc = float(np.mean(scs)); print(f'L1 LR C={c} OOF: {sc:.4f}')
    if sc < best_l1_sc: best_l1_sc = sc; best_c_l1 = c; best_oof_l1 = oof_l1

# Pick best method and generate submission
methods = {'greedy': best_greedy, 'weighted': best_w_sc, 'lr_l2': best_lr_sc, 'lr_l1': best_l1_sc}
best_method = min(methods, key=methods.get); best_sc = methods[best_method]
print(f'Best method: {best_method} OOF: {best_sc:.4f}')

if best_method == 'greedy':
    final_test = np.mean(sel_tests_g, axis=0)
elif best_method == 'weighted':
    final_test = sum(wi * tt for wi, tt in zip(best_w, sel_tests_g))
elif best_method == 'lr_l2':
    meta = LogisticRegression(solver='lbfgs', C=best_c, max_iter=2000, multi_class='multinomial', random_state=42)
    meta.fit(X_train, y)
    final_test = meta.predict_proba(X_test)
else:  # l1
    meta = LogisticRegression(solver='saga', penalty='l1', C=best_c_l1, max_iter=2000, multi_class='multinomial', random_state=42)
    meta.fit(X_train, y)
    final_test = meta.predict_proba(X_test)

final_test = np.clip(final_test, 1e-15, 1-1e-15); final_test /= final_test.sum(axis=1, keepdims=True)
pd.DataFrame(final_test, columns=classes).to_csv('submission.csv', index=False)
print('New submission.csv saved with OOF:', round(best_sc,4))
if best_sc <= 0.29381:
    print('Bronze achieved! Proceed to submit_final_answer.')
else:
    print('Close to bronze; consider further improvements or submit as is for potential private LB medal.')

Loaded 8 base models
Greedy selected: ['oof_pl_refined_hstack_lr', 'oof_word_nbsvm_improved2', 'oof_10f_cal_ridge_char_wb'] OOF: 0.3287


Weighted avg OOF: 0.3273


LR C=0.25 OOF: 0.3149


LR C=0.5 OOF: 0.3151


LR C=1.0 OOF: 0.3153


LR C=1.5 OOF: 0.3154


LR C=2.0 OOF: 0.3154


LR C=3.0 OOF: 0.3156


L1 LR C=0.5 OOF: 0.3145


L1 LR C=1.0 OOF: 0.3148


L1 LR C=2.0 OOF: 0.3150
Best method: lr_l1 OOF: 0.3145


New submission.csv saved with OOF: 0.3145
Close to bronze; consider further improvements or submit as is for potential private LB medal.
