In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, r2_score
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')

In [None]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [1]:
def extract_advanced_features(text):
    """Compute 17 hand-crafted numeric features from one complaint text.

    Parameters
    ----------
    text : str or missing value
        Raw complaint text. Anything ``pd.isna`` reports as missing is
        scored as if it were the empty string.

    Returns
    -------
    list
        Always exactly 17 values, in this order:

        0.  number of characters
        1.  number of whitespace-separated words (lower-cased text)
        2.  mean word length (0 when there are no words)
        3.  number of uppercase characters
        4.  number of dollar amounts such as ``$1,200.50``
        5.  number of percentages such as ``15%``
        6.  1 if a ``$`` followed by at least four digits occurs, else 0
        7.  number of ``d/d/yy``-style dates
        8.  number of stand-alone four-digit numbers
        9.  1 if ``xxxx`` occurs (case-insensitive), else 0
        10-15. substring counts of ``called``, ``email``, ``letter``,
            ``fraud``, ``error``, ``never`` in the lower-cased text
        16. combined substring count of ``please``, ``help``, ``need``
    """
    # A missing text is scored as the empty string: every feature below then
    # evaluates to 0 and, crucially, the row has the same length as any other.
    # (The earlier hard-coded ``[0] * 20`` did not match the 17 features
    # produced for real text, which made the stacked feature matrix ragged.)
    text_str = "" if pd.isna(text) else str(text)
    text_lower = text_str.lower()
    words = text_lower.split()

    return [
        len(text_str),
        len(words),
        np.mean([len(w) for w in words]) if words else 0,
        sum(ch.isupper() for ch in text_str),
        len(re.findall(r'\$[\d,]+\.?\d*', text_str)),
        len(re.findall(r'\d+%', text_str)),
        1 if re.search(r'\$\d{4,}', text_str) else 0,
        len(re.findall(r'\d{1,2}/\d{1,2}/\d{2,4}', text_str)),
        len(re.findall(r'\b\d{4}\b', text_str)),
        1 if 'xxxx' in text_lower else 0,
        text_lower.count('called'),
        text_lower.count('email'),
        text_lower.count('letter'),
        text_lower.count('fraud'),
        text_lower.count('error'),
        text_lower.count('never'),
        sum(text_lower.count(w) for w in ('please', 'help', 'need')),
    ]

In [2]:
def clean_text_advanced(text):
    """Normalise one complaint text before vectorisation.

    Missing values become the empty string. Otherwise the text is
    lower-cased, then runs of two or more ``x`` characters, dollar amounts,
    ``d/d/yy``-style dates and percentages are each replaced by a fixed
    placeholder token, and finally all whitespace runs are collapsed to a
    single space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str or missing value
        Raw complaint text.

    Returns
    -------
    str
        The normalised text.
    """
    if pd.isna(text):
        return ""

    # (pattern, placeholder, regex flags) applied strictly in this order:
    # dates must be rewritten before percentages so their digits are gone.
    placeholder_rules = (
        (r'x{2,}', ' anonymized_info ', re.IGNORECASE),
        (r'\$[\d,]+\.?\d*', ' dollar_amount ', 0),
        (r'\d{1,2}/\d{1,2}/\d{2,4}', ' date_mention ', 0),
        (r'\d+%', ' percentage ', 0),
    )

    cleaned = str(text).lower()
    for pattern, placeholder, flags in placeholder_rules:
        cleaned = re.sub(pattern, placeholder, cleaned, flags=flags)

    # The placeholders are padded with spaces, so squeeze whitespace last.
    squeezed = re.sub(r'\s+', ' ', cleaned)
    return squeezed.strip()

In [None]:
# Load the training and test complaint tables from the competition folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "neural-craft-2026/train_complaints.csv",
        "neural-craft-2026/test_complaints.csv",
    )
)

In [None]:
# Hand-crafted numeric features: one row per complaint, computed from the raw text.
train_text_features = np.array(list(map(extract_advanced_features, train_df['complaint_text'])))
test_text_features = np.array(list(map(extract_advanced_features, test_df['complaint_text'])))

# Standardise with statistics learned from the training rows only, then
# apply that same scaling to the test rows.
scaler = StandardScaler().fit(train_text_features)
train_text_features = scaler.transform(train_text_features)
test_text_features = scaler.transform(test_text_features)

# Normalised text column consumed by the TF-IDF vectorisers.
train_df['cleaned_text'] = train_df['complaint_text'].map(clean_text_advanced)
test_df['cleaned_text'] = test_df['complaint_text'].map(clean_text_advanced)

In [None]:
# Word-level TF-IDF over unigrams and bigrams of tokens with 2+ word characters.
tfidf_word = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{2,}',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=6000,
    sublinear_tf=True,
)

# Character-level TF-IDF over 3- to 5-grams.
tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    min_df=3,
    max_features=2000,
    sublinear_tf=True,
)

# Vocabularies are fitted on the training text only; the test text is
# projected onto them afterwards.
X_train_word, X_train_char = (
    vectorizer.fit_transform(train_df['cleaned_text'])
    for vectorizer in (tfidf_word, tfidf_char)
)
X_test_word, X_test_char = (
    vectorizer.transform(test_df['cleaned_text'])
    for vectorizer in (tfidf_word, tfidf_char)
)

# Final design matrices: word TF-IDF | char TF-IDF | scaled hand-crafted features.
train_blocks = [X_train_word, X_train_char, train_text_features]
test_blocks = [X_test_word, X_test_char, test_text_features]
X_train_full = hstack(train_blocks).tocsr()
X_test_full = hstack(test_blocks).tocsr()

In [None]:
# One encoder per categorical target; they are kept so integer predictions
# can be mapped back to category names later.
primary_encoder, secondary_encoder = LabelEncoder(), LabelEncoder()

y_primary = primary_encoder.fit_transform(train_df['primary_category'])
y_secondary = secondary_encoder.fit_transform(train_df['secondary_category'])

# Severity is used as-is (it is the regression target).
y_severity = train_df['severity'].values

In [None]:
N_FOLDS = 5

# Out-of-fold predictions: one slot per training row for each target.
oof_primary, oof_secondary, oof_severity = (
    np.zeros(len(target)) for target in (y_primary, y_secondary, y_severity)
)

# Test predictions: one column per fold for each target.
test_pred_primary, test_pred_secondary, test_pred_severity = (
    np.zeros((len(test_df), N_FOLDS)) for _ in range(3)
)

# Shuffled, seeded stratified splitter shared by all three targets.
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

In [None]:
# XGBOOST_AVAILABLE is read below, but no cell visible above this one assigns
# it, so a fresh "Restart & Run All" stopped here with a NameError. xgboost is
# imported unconditionally near the top of the notebook, so default the flag
# to True while still honouring a value that was set elsewhere.
try:
    XGBOOST_AVAILABLE
except NameError:
    XGBOOST_AVAILABLE = True

# Hyper-parameters accepted by both libraries, per task.
_SHARED_PARAMS = {
    "primary": dict(n_estimators=500, max_depth=9, learning_rate=0.04,
                    subsample=0.8, colsample_bytree=0.8),
    "secondary": dict(n_estimators=600, max_depth=11, learning_rate=0.03,
                      subsample=0.8, colsample_bytree=0.8),
    "severity": dict(n_estimators=400, max_depth=7, learning_rate=0.05,
                     subsample=0.85, colsample_bytree=0.85),
}
# Extra hyper-parameters only passed to the XGBoost estimators.
_XGB_ONLY_PARAMS = {
    "primary": dict(min_child_weight=3, gamma=0.1, reg_alpha=0.1, reg_lambda=1.0),
    "secondary": dict(min_child_weight=2, gamma=0.05, reg_alpha=0.15, reg_lambda=1.2),
    "severity": dict(min_child_weight=3, gamma=0.1, reg_alpha=0.1, reg_lambda=1.0),
}
# ``num_leaves`` only passed to the LightGBM estimators.
_LGBM_NUM_LEAVES = {"primary": 64, "secondary": 96, "severity": 48}


def _build_model(task, fold, use_xgboost):
    """Return an unfitted estimator for ``task`` in the given fold.

    Parameters
    ----------
    task : str
        One of ``"primary"``, ``"secondary"`` (classifiers) or
        ``"severity"`` (regressor).
    fold : int
        Zero-based fold index; the estimator is seeded with ``42 + fold``.
    use_xgboost : bool
        True selects the XGBoost estimator, False the LightGBM one.
    """
    is_regression = task == "severity"
    params = dict(_SHARED_PARAMS[task], random_state=42 + fold, n_jobs=-1)
    if use_xgboost:
        estimator_cls = XGBRegressor if is_regression else XGBClassifier
        params.update(_XGB_ONLY_PARAMS[task], tree_method='hist')
    else:
        estimator_cls = LGBMRegressor if is_regression else LGBMClassifier
        params.update(num_leaves=_LGBM_NUM_LEAVES[task], verbose=-1)
    return estimator_cls(**params)


# Folds are stratified on the primary category; the same row split is reused
# for the secondary classifier and the severity regressor.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_full, y_primary)):

    X_tr = X_train_full[train_idx]
    X_val = X_train_full[val_idx]

    y_tr_p = y_primary[train_idx]
    y_val_p = y_primary[val_idx]
    y_tr_s = y_secondary[train_idx]
    y_val_s = y_secondary[val_idx]
    y_tr_sev = y_severity[train_idx]
    y_val_sev = y_severity[val_idx]

    # --- PRIMARY ---
    print("\nTraining primary classifier")
    model_p = _build_model("primary", fold, XGBOOST_AVAILABLE)
    model_p.fit(X_tr, y_tr_p)
    oof_primary[val_idx] = model_p.predict(X_val)
    test_pred_primary[:, fold] = model_p.predict(X_test_full)

    acc_p = accuracy_score(y_val_p, oof_primary[val_idx])
    print(f"  Primary accuracy: {acc_p:.4f}")

    # --- SECONDARY ---
    print("Training secondary classifier")
    model_s = _build_model("secondary", fold, XGBOOST_AVAILABLE)
    model_s.fit(X_tr, y_tr_s)
    oof_secondary[val_idx] = model_s.predict(X_val)
    test_pred_secondary[:, fold] = model_s.predict(X_test_full)

    acc_s = accuracy_score(y_val_s, oof_secondary[val_idx])
    print(f"  Secondary accuracy: {acc_s:.4f}")

    # --- SEVERITY ---
    print("Training severity regressor")
    model_sev = _build_model("severity", fold, XGBOOST_AVAILABLE)
    model_sev.fit(X_tr, y_tr_sev)
    oof_severity[val_idx] = model_sev.predict(X_val)
    test_pred_severity[:, fold] = model_sev.predict(X_test_full)

    r2_sev = r2_score(y_val_sev, oof_severity[val_idx])
    # The superscript two is written as an escape so the label cannot be
    # mangled again by a file re-encoding (it previously printed as mojibake).
    print(f"  Severity R\u00b2: {r2_sev:.4f}")

    # Fold score: 0.3 * primary accuracy + 0.4 * secondary accuracy + 0.3 * severity R^2
    fold_score = 0.3 * acc_p + 0.4 * acc_s + 0.3 * r2_sev
    print(f"\n  Fold {fold+1} weighted score: {fold_score:.4f}")


In [None]:
# Overall out-of-fold metrics, computed over every training row at once.
primary_acc = accuracy_score(y_primary, oof_primary)
secondary_acc = accuracy_score(y_secondary, oof_secondary)
severity_r2 = r2_score(y_severity, oof_severity)
# Same 0.3 / 0.4 / 0.3 blend that is reported per fold in the training loop.
weighted_score = 0.3 * primary_acc + 0.4 * secondary_acc + 0.3 * severity_r2

# Label every number and include the blended score, which was previously
# computed but never shown.
print(f"OOF primary accuracy:   {primary_acc:.4f}")
print(f"OOF secondary accuracy: {secondary_acc:.4f}")
print(f"OOF severity R\u00b2:        {severity_r2:.4f}")
print(f"OOF weighted score:     {weighted_score:.4f}")

In [None]:

# Combine the per-fold test predictions: most frequent label for the two
# classifiers, arithmetic mean for the severity regressor.
from scipy import stats

# Classification: row-wise majority vote over the fold columns.
primary_votes = test_pred_primary.astype(int)
secondary_votes = test_pred_secondary.astype(int)
test_primary_final = stats.mode(primary_votes, axis=1, keepdims=False).mode
test_secondary_final = stats.mode(secondary_votes, axis=1, keepdims=False).mode

# Regression: average the folds, round to the nearest integer and clamp to [1, 5].
test_severity_mean = np.mean(test_pred_severity, axis=1)
test_severity_final = np.round(test_severity_mean).clip(1, 5).astype(int)

# Map integer class ids back to the original category names.
test_primary_decoded = primary_encoder.inverse_transform(test_primary_final)
test_secondary_decoded = secondary_encoder.inverse_transform(test_secondary_final)

In [None]:
# Assemble one row per test complaint with its three predicted targets.
submission = pd.DataFrame(
    dict(
        complaint_id=test_df['complaint_id'],
        primary_category=test_primary_decoded,
        secondary_category=test_secondary_decoded,
        severity=test_severity_final,
    )
)

# Write the submission without the DataFrame index column.
submission.to_csv('submission_kfold.csv', index=False)