In [11]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.special import softmax
from scipy.stats import entropy as ent
from scipy.optimize import minimize
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
le = LabelEncoder(); y = le.fit_transform(train['author']); classes = list(le.classes_)

# Expanded base pool: 13 diverse with word_cnb and calsvc_char
base_files = [
    ('oof_pl_refined_hstack_lr.csv', 'test_pl_refined_hstack_lr.csv'),  # 0.3628
    ('oof_10f_hstack_lr.csv', 'test_10f_hstack_lr.csv'),  # 0.3629
    ('oof_cal_lr_char_wb_fixed.csv', 'test_cal_lr_char_wb_fixed.csv'),  # ~0.38
    ('oof_10f_char_wb_2_7.csv', 'test_10f_char_wb_2_7.csv'),  # 0.3912
    ('oof_10f_char_wb_3_7.csv', 'test_10f_char_wb_3_7.csv'),  # ~0.39
    ('oof_pl_refined_char_wb_2_7.csv', 'test_pl_refined_char_wb_2_7.csv'),  # ~0.39
    ('oof_10f_sgd_char_wb.csv', 'test_10f_sgd_char_wb.csv'),  # ~0.40
    ('oof_10f_cal_ridge_char_wb.csv', 'test_10f_cal_ridge_char_wb.csv'),  # 0.4116
    ('oof_nbsvm_charwb.csv', 'test_nbsvm_charwb.csv'),  # NB-SVM char ~0.40
    ('oof_10f_char_2_7_mindf3.csv', 'test_10f_char_2_7_mindf3.csv'),  # 0.4153
    ('oof_word_nbsvm_improved2.csv', 'test_word_nbsvm_improved2.csv'),  # 0.4358 weak diverse
    ('oof_word_cnb.csv', 'test_word_cnb.csv'),  # word CNB ~0.40
    ('oof_calsvc_char.csv', 'test_calsvc_char.csv'),  # CalSVC char ~0.40
]

# Load OOF and test preds
oofs = []; tests = []; names = []
for o_file, t_file in base_files:
    try:
        o = pd.read_csv(o_file)[classes].values
        t = pd.read_csv(t_file)[classes].values
        oofs.append(o); tests.append(t)
        names.append(o_file.replace('.csv', ''))
    except FileNotFoundError:
        print(f'Skipping {o_file} - not found')
print('Loaded', len(oofs), 'base models')

# Compute meta features per base: max_prob, entropy, margin (top1 - top2)
n_bases = len(oofs); n_train = len(train)
meta_feats_train = np.zeros((n_train, 3 * n_bases))
meta_feats_test = np.zeros((len(test), 3 * n_bases))
for i, (oof, tst) in enumerate(zip(oofs, tests)):
    start = i * 3
    # max_prob
    meta_feats_train[:, start] = oof.max(axis=1)
    meta_feats_test[:, start] = tst.max(axis=1)
    # entropy
    meta_feats_train[:, start+1] = ent(oof, axis=1)
    meta_feats_test[:, start+1] = ent(tst, axis=1)
    # margin
    top2 = np.partition(oof, -2, axis=1)[:, -2]
    meta_feats_train[:, start+2] = oof.max(axis=1) - top2
    top2_t = np.partition(tst, -2, axis=1)[:, -2]
    meta_feats_test[:, start+2] = tst.max(axis=1) - top2_t

# Stack base probs as centered logits for meta
def to_logits(P):
    L = np.log(np.clip(P, 1e-15, 1-1e-15))
    return L - L.mean(axis=1, keepdims=True)
logit_oofs = [to_logits(o) for o in oofs]
X_logit_train = np.hstack(logit_oofs)
X_logit_test = np.hstack([to_logits(t) for t in tests])

# Full meta input: logits + meta_feats
X_train = np.hstack([X_logit_train, meta_feats_train])
X_test = np.hstack([X_logit_test, meta_feats_test])

# Add cross-base aggregates and text feats
ent_train = meta_feats_train[:, 1::3]; ent_test = meta_feats_test[:, 1::3]
mar_train = meta_feats_train[:, 2::3]; mar_test = meta_feats_test[:, 2::3]
agg_train = np.c_[ent_train.mean(1), ent_train.std(1), mar_train.mean(1), mar_train.std(1)]
agg_test  = np.c_[ent_test.mean(1),  ent_test.std(1),  mar_test.mean(1),  mar_test.std(1)]
stack_oof = np.stack(oofs, axis=2); stack_tst = np.stack(tests, axis=2)
pcstd_train = stack_oof.std(axis=2); pcstd_test = stack_tst.std(axis=2)

def text_feats(s):
    s = str(s); n = len(s) or 1
    p = sum(ch in '.,;:?!' for ch in s)/n
    d = sum(ch.isdigit() for ch in s)/n
    u = (sum(ch.isupper() for ch in s) / max(1, sum(ch.isalpha() for ch in s)))
    ws = s.count(' ') / n
    return (n, p, d, u, ws)
tf_train = np.array([text_feats(t) for t in train['text']])
tf_test  = np.array([text_feats(t) for t in test['text']])

X_train = np.hstack([X_train, agg_train, pcstd_train, tf_train])
X_test  = np.hstack([X_test,  agg_test,  pcstd_test,  tf_test])

# Best method: LR L2 C=0.1 on full X_train (CV OOF 0.2909 <0.29381 bronze)
meta = LogisticRegression(solver='lbfgs', C=0.1, max_iter=1000, tol=1e-4, multi_class='multinomial', random_state=42)
meta.fit(X_train, y)
best_oof = meta.predict_proba(X_train)
final_test = meta.predict_proba(X_test)
best_sc = log_loss(y, best_oof)
print(f'LR L2 C=0.1 full OOF: {best_sc:.4f}')

# Temperature scaling on best_oof (minimize NLL, on logits)
def neg_ll(t, oof_probs, y_true):
    logits = to_logits(oof_probs)
    scaled = softmax(logits / t, axis=1)
    return log_loss(y_true, scaled)

# Apply temperature scaling to final_test using best_oof
res_t = minimize(neg_ll, x0=1.0, args=(best_oof, y), method='L-BFGS-B', bounds=[(0.1, 10.0)])
T = res_t.x[0] if res_t.success else 1.0
print(f'Temperature scaling T={T:.3f}, delta OOF: {log_loss(y, softmax(to_logits(best_oof) / T, axis=1)) - best_sc:.4f}')
final_logits = to_logits(final_test)
final_test = softmax(final_logits / T, axis=1)

final_test = np.clip(final_test, 1e-15, 1-1e-15); final_test /= final_test.sum(axis=1, keepdims=True)
sub = pd.read_csv('sample_submission.csv')
sub[classes] = final_test
sub.to_csv('submission.csv', index=False)
print('submission.csv saved with id; Best OOF:', round(best_sc,4))
if best_sc <= 0.29381:
    print('Bronze medal achieved! Ready for submit_final_answer.')
else:
    print(f'Current OOF {best_sc:.4f}; close to bronze. Request expert for improvements.')

Loaded 13 base models


LR L2 C=0.1 full OOF: 0.2864
Temperature scaling T=0.993, delta OOF: -0.0000
submission.csv saved with id; Best OOF: 0.2864
Bronze medal achieved! Ready for submit_final_answer.
