In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from scipy.special import softmax
from unicodedata import normalize as uni_norm
import re
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
le = LabelEncoder()
y = le.fit_transform(train['author'])
classes = list(le.classes_)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def simple_clean(s):
    if pd.isna(s): return ""
    s = str(s)
    s = s.replace("\u2019","'") .replace("\u2018","'") .replace("\u201c","\"\"") .replace("\u201d","\"\"") .replace("\u2014","-") .replace("\u2013","-")
    s = uni_norm("NFKC", s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train['text'] = train['text'].map(simple_clean)
test['text']  = test['text'].map(simple_clean)

print('Data loaded and cleaned. Train shape:', train.shape, 'Classes:', classes)

# Function for Char TF-IDF + LR
def run_char_tfidf_lr(analyzer='char_wb', ngram=(2,6), min_df=2, sublinear_tf=True, max_features=None, C=1.0):
    vec = TfidfVectorizer(
        analyzer=analyzer,
        ngram_range=ngram,
        lowercase=True,
        sublinear_tf=sublinear_tf,
        min_df=min_df,
        max_df=1.0,
        strip_accents='unicode',
        stop_words=None,
        token_pattern=r'(?u)\S+\w*',  # Adjusted for char ngrams
        preprocessor=simple_clean,
        max_features=max_features,
        dtype=np.float32
    )
    oof = np.zeros((len(train), 3), dtype=np.float32)
    test_preds = np.zeros((len(test), 3), dtype=np.float32)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
        print(f'Fold {fold} started for {analyzer} {ngram} min_df{min_df}')
        X_tr = vec.fit_transform(train['text'].iloc[tr_idx])
        X_va = vec.transform(train['text'].iloc[va_idx])
        X_te = vec.transform(test['text'])
        clf = LogisticRegression(
            solver='saga', C=C, penalty='l2',
            max_iter=3000, tol=1e-4, multi_class='multinomial',
            random_state=42+fold
        )
        clf.fit(X_tr, y[tr_idx])
        oof[va_idx] = clf.predict_proba(X_va)
        test_preds += clf.predict_proba(X_te)
        sc = log_loss(y[va_idx], oof[va_idx]); scores.append(sc)
        print(f'Char TF-IDF LR Fold {fold}: {sc:.4f}')
    test_preds /= skf.n_splits
    oof_score = float(np.mean(scores))
    print(f'Char TF-IDF LR OOF: {oof_score:.4f} for {analyzer} {ngram} min_df{min_df} C{C}')
    return oof_score, oof, test_preds

# Function for Calibrated LinearSVC on char features
def run_char_linsvc_cal(analyzer='char_wb', ngram=(2,6), min_df=2, C=1.0):
    vec = TfidfVectorizer(
        analyzer=analyzer,
        ngram_range=ngram,
        lowercase=True,
        sublinear_tf=True,
        min_df=min_df,
        max_df=1.0,
        strip_accents='unicode',
        stop_words=None,
        token_pattern=r'(?u)\S+\w*',
        preprocessor=simple_clean,
        dtype=np.float32
    )
    oof = np.zeros((len(train), 3), dtype=np.float32)
    test_preds = np.zeros((len(test), 3), dtype=np.float32)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
        X_tr = vec.fit_transform(train['text'].iloc[tr_idx])
        X_va = vec.transform(train['text'].iloc[va_idx])
        X_te = vec.transform(test['text'])
        base = LinearSVC(C=C, max_iter=2000, random_state=42+fold)
        clf = CalibratedClassifierCV(base, method='sigmoid', cv=3)
        clf.fit(X_tr, y[tr_idx])
        oof[va_idx] = clf.predict_proba(X_va)
        test_preds += clf.predict_proba(X_te)
        sc = log_loss(y[va_idx], oof[va_idx]); scores.append(sc)
        print(f'Char LinSVC Cal Fold {fold}: {sc:.4f}')
    test_preds /= skf.n_splits
    oof_score = float(np.mean(scores))
    print(f'Char LinSVC Cal OOF: {oof_score:.4f} for {analyzer} {ngram}')
    return oof_score, oof, test_preds

# Function for Ridge on char features
def run_char_ridge(analyzer='char_wb', ngram=(2,6), min_df=2, alpha=1.0):
    vec = TfidfVectorizer(
        analyzer=analyzer,
        ngram_range=ngram,
        lowercase=True,
        sublinear_tf=True,
        min_df=min_df,
        max_df=1.0,
        strip_accents='unicode',
        stop_words=None,
        token_pattern=r'(?u)\S+\w*',
        preprocessor=simple_clean,
        dtype=np.float32
    )
    oof = np.zeros((len(train), 3), dtype=np.float32)
    test_preds = np.zeros((len(test), 3), dtype=np.float32)
    scores = []
    for fold, (tr_idx, va_idx) in enumerate(skf.split(train, y), 1):
        X_tr = vec.fit_transform(train['text'].iloc[tr_idx])
        X_va = vec.transform(train['text'].iloc[va_idx])
        X_te = vec.transform(test['text'])
        clf = Ridge(alpha=alpha, random_state=42+fold)
        clf.fit(X_tr, y[tr_idx])
        oof[va_idx] = softmax(clf.predict(X_va), axis=1)
        test_preds += softmax(clf.predict(X_te), axis=1)
        sc = log_loss(y[va_idx], oof[va_idx]); scores.append(sc)
        print(f'Char Ridge Fold {fold}: {sc:.4f}')
    test_preds /= skf.n_splits
    oof_score = float(np.mean(scores))
    print(f'Char Ridge OOF: {oof_score:.4f} for {analyzer} {ngram}')
    return oof_score, oof, test_preds

# Sweep over diverse configurations for char models (aim for 5-8 strong ones, OOF ~0.38-0.41)
configs_lr = [
    dict(analyzer='char_wb', ngram=(2,6), min_df=2, sublinear_tf=True, max_features=None, C=1.0),  # Baseline
    dict(analyzer='char_wb', ngram=(2,7), min_df=2, sublinear_tf=True, max_features=None, C=1.0),
    dict(analyzer='char', ngram=(3,7), min_df=3, sublinear_tf=True, max_features=50000, C=2.0),
    dict(analyzer='char_wb', ngram=(3,8), min_df=1, sublinear_tf=False, max_features=None, C=0.5),
    dict(analyzer='char', ngram=(2,6), min_df=2, sublinear_tf=True, max_features=None, C=5.0),  # Higher C
]

for i, cfg in enumerate(configs_lr, 1):
    tag = f'char_lr_{cfg["analyzer"]}_{cfg["ngram"][0]}_{cfg["ngram"][1]}_mindf{cfg["min_df"]}_C{cfg["C"]}'
    sc, oof, te = run_char_tfidf_lr(**cfg)
    if sc < 0.42:  # Save promising ones
        pd.DataFrame(oof, columns=classes).to_csv(f'oof_10f_{tag}.csv', index=False)
        pd.DataFrame(te, columns=classes).to_csv(f'test_10f_{tag}.csv', index=False)
        print(f'Saved {tag} with OOF {sc:.4f}')

# Additional diversity: LinSVC Cal and Ridge on best config (e.g., char_wb 2-7)
print('\n--- Running LinSVC Cal on char_wb (2,7) ---')
sc_svc, oof_svc, te_svc = run_char_linsvc_cal(analyzer='char_wb', ngram=(2,7), min_df=2, C=1.0)
if sc_svc < 0.42:
    pd.DataFrame(oof_svc, columns=classes).to_csv('oof_10f_char_wb_2_7_linsvc_cal.csv', index=False)
    pd.DataFrame(te_svc, columns=classes).to_csv('test_10f_char_wb_2_7_linsvc_cal.csv', index=False)

print('\n--- Running Ridge on char_wb (2,7) ---')
sc_ridge, oof_ridge, te_ridge = run_char_ridge(analyzer='char_wb', ngram=(2,7), min_df=2, alpha=1.0)
if sc_ridge < 0.42:
    pd.DataFrame(oof_ridge, columns=classes).to_csv('oof_10f_char_wb_2_7_ridge.csv', index=False)
    pd.DataFrame(te_ridge, columns=classes).to_csv('test_10f_char_wb_2_7_ridge.csv', index=False)

print('Char models sweep complete. Generated 5-7 diverse char-based models. Next: integrate top ones into final_ensemble.ipynb to improve from ~0.31 OOF towards medal. Request expert review if needed after execution.')

Data loaded and cleaned. Train shape: (17621, 3) Classes: ['EAP', 'HPL', 'MWS']
Fold 1 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 1: 0.4925
Fold 2 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 2: 0.4853
Fold 3 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 3: 0.4920
Fold 4 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 4: 0.4941
Fold 5 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 5: 0.5068
Fold 6 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 6: 0.4970
Fold 7 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 7: 0.4983
Fold 8 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 8: 0.4839
Fold 9 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 9: 0.4875
Fold 10 started for char_wb (2, 6) min_df2


Char TF-IDF LR Fold 10: 0.4821
Char TF-IDF LR OOF: 0.4920 for char_wb (2, 6) min_df2 C1.0
Fold 1 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 1: 0.4958
Fold 2 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 2: 0.4882
Fold 3 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 3: 0.4948
Fold 4 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 4: 0.4970
Fold 5 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 5: 0.5094
Fold 6 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 6: 0.4986
Fold 7 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 7: 0.5019
Fold 8 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 8: 0.4862
Fold 9 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 9: 0.4897
Fold 10 started for char_wb (2, 7) min_df2


Char TF-IDF LR Fold 10: 0.4856
Char TF-IDF LR OOF: 0.4947 for char_wb (2, 7) min_df2 C1.0
Fold 1 started for char (3, 7) min_df3


Char TF-IDF LR Fold 1: 0.4754
Fold 2 started for char (3, 7) min_df3


Char TF-IDF LR Fold 2: 0.4746
Fold 3 started for char (3, 7) min_df3


Char TF-IDF LR Fold 3: 0.4738
Fold 4 started for char (3, 7) min_df3


Char TF-IDF LR Fold 4: 0.4721
Fold 5 started for char (3, 7) min_df3


Char TF-IDF LR Fold 5: 0.4888
Fold 6 started for char (3, 7) min_df3


Char TF-IDF LR Fold 6: 0.4795
Fold 7 started for char (3, 7) min_df3


Char TF-IDF LR Fold 7: 0.4801
Fold 8 started for char (3, 7) min_df3


Char TF-IDF LR Fold 8: 0.4639
Fold 9 started for char (3, 7) min_df3


Char TF-IDF LR Fold 9: 0.4602
Fold 10 started for char (3, 7) min_df3


Char TF-IDF LR Fold 10: 0.4642
Char TF-IDF LR OOF: 0.4733 for char (3, 7) min_df3 C2.0
Fold 1 started for char_wb (3, 8) min_df1
