In [3]:
# -*- coding: utf-8 -*-
"""
Deep Models + SVM-RBF Classifier - Hardcoded Results
"""

import pandas as pd

# ---------------------------
# Data Loading
# ---------------------------
train_path = r"train_subtask1.csv"
dev_path   = r"dev_subtask1.csv"
test_path  = r"test_subtask1_text.csv"


def _file_name(path: str) -> str:
    """Return the last component of *path*.

    Both backslash and forward-slash separators are accepted, so the result
    is the same whether the path was written in Windows or POSIX style.
    """
    return path.replace("\\", "/").rsplit("/", 1)[-1]


def _describe(label: str, path: str, frame) -> str:
    """Format the one-line "<label>: <file> -> N samples, M columns" summary."""
    n_rows, n_cols = frame.shape
    return f"{label}: {_file_name(path)} -> {n_rows} samples, {n_cols} columns"


print("=== Loading Dataset ===")
train_df = pd.read_csv(train_path)
dev_df   = pd.read_csv(dev_path)
test_df  = pd.read_csv(test_path)

# The file name is extracted in a helper rather than inline: a backslash
# inside an f-string replacement field is a SyntaxError before Python 3.12.
print(_describe("Train File", train_path, train_df))
print(_describe("Dev File  ", dev_path, dev_df))
print(_describe("Test File ", test_path, test_df) + "\n")

# ---------------------------
# 3. Preprocessing
# ---------------------------
# Stopword list materialised as a set so the per-token membership test in
# preprocess_text is a hash lookup (presumably NLTK's English list; the
# import is not visible in this part of the file — TODO confirm).
_stop = set(stopwords.words("english"))
# Single shared lemmatizer instance, reused for every token.
_lem  = WordNetLemmatizer()

def preprocess_text(text: str) -> str:
    """Normalise one raw document into a space-joined string of lemmas.

    Steps: lowercase, replace every character that is neither a word
    character nor whitespace with a space, tokenize, drop stopwords and
    blank tokens, lemmatize what remains.

    Args:
        text: Raw document. Non-string values (for example ``None`` or the
            float NaN used for empty cells) are treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives or when ``text`` is not a string.
    """
    # Return an empty document instead of raising on a missing value, so the
    # output stays row-aligned with the labels.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text)
    tokens = word_tokenize(text)
    tokens = [_lem.lemmatize(w) for w in tokens if w not in _stop and w.strip()]
    return " ".join(tokens)

# Clean every raw text with preprocess_text and keep the result as a plain list.
X_proc = X_text_raw.apply(preprocess_text).tolist()


# ---------------------------
# 4. Tokenization
# ---------------------------
# Vocabulary cap handed to the tokenizer as num_words.
MAX_NUM_WORDS = 30000
# Every sequence is padded/truncated to this length (at the end: 'post').
MAX_SEQ_LEN   = 200
# Embedding width passed to the backbone builders.
EMBEDDING_DIM = 100

# NOTE(review): the tokenizer is fitted on all of X_proc before the
# cross-validation split, so each fold's vocabulary also reflects its
# validation texts — confirm this is acceptable.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_proc)
X_seq = tokenizer.texts_to_sequences(X_proc)
X_pad = pad_sequences(X_seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
# Embedding input size, capped at MAX_NUM_WORDS; the +1 presumably accounts
# for index 0 being reserved for padding — TODO confirm.
VOCAB_SIZE = min(MAX_NUM_WORDS, len(tokenizer.word_index) + 1)


# ---------------------------
# 5. Deep feature extractors
# ---------------------------
# Default width of the 'feat' Dense layer that each backbone exposes as its
# feature vector.
FEATURE_DIM = 128
# Learning rate handed to Adam in compile_model.
LR = 1e-3

def compile_model(inp, feat):
    """Turn a feature tensor into a trainable classifier plus a feature extractor.

    Args:
        inp: Input tensor of the network.
        feat: Tensor computed from ``inp`` that serves as the deep feature.

    Returns:
        ``(model, feat_extractor)``. ``model`` appends a softmax layer with
        NUM_CLASSES units to ``feat`` and is compiled with categorical
        cross-entropy, ``Adam(LR)`` and the accuracy metric.
        ``feat_extractor`` maps ``inp`` to ``feat`` and is built from the
        same tensors, so it reuses the layers trained through ``model``.
    """
    class_probs = Dense(NUM_CLASSES, activation='softmax')(feat)
    classifier = Model(inputs=inp, outputs=class_probs)
    classifier.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(LR),
        metrics=['accuracy'],
    )
    extractor = Model(inputs=inp, outputs=feat)
    return classifier, extractor

def build_cnn(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the CNN backbone: embedding, Conv1D, global max pooling, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    conv_out = Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')(embedded)
    pooled = GlobalMaxPooling1D()(conv_out)
    feature = Dense(feature_dim, name='feat', activation='relu')(pooled)
    return compile_model(token_ids, feature)

def build_lstm(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the LSTM backbone: embedding, a 128-unit LSTM, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    # NOTE(review): the embedding does not set mask_zero, so the LSTM also
    # reads any padded positions — confirm this is intended.
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    encoded = LSTM(units=128)(embedded)
    feature = Dense(feature_dim, name='feat', activation='relu')(encoded)
    return compile_model(token_ids, feature)

def build_bilstm(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the BiLSTM backbone: embedding, bidirectional 128-unit LSTM, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    # NOTE(review): the embedding does not set mask_zero, so both directions
    # also read any padded positions — confirm this is intended.
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    encoded = Bidirectional(LSTM(units=128))(embedded)
    feature = Dense(feature_dim, name='feat', activation='relu')(encoded)
    return compile_model(token_ids, feature)

def build_gru(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the GRU backbone: embedding, a 128-unit GRU, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    # NOTE(review): the embedding does not set mask_zero, so the GRU also
    # reads any padded positions — confirm this is intended.
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    encoded = GRU(units=128)(embedded)
    feature = Dense(feature_dim, name='feat', activation='relu')(encoded)
    return compile_model(token_ids, feature)

def build_cnn_gru(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the CNN-GRU backbone: embedding, Conv1D, a 128-unit GRU, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    # 'same' padding keeps the sequence length, so the GRU sees max_len steps.
    conv_out = Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')(embedded)
    encoded = GRU(units=128)(conv_out)
    feature = Dense(feature_dim, name='feat', activation='relu')(encoded)
    return compile_model(token_ids, feature)

def build_cnn_lstm(max_len, vocab_size, emb_dim, feature_dim=FEATURE_DIM):
    """Build the CNN-LSTM backbone: embedding, Conv1D, a 128-unit LSTM, 'feat' layer.

    Args:
        max_len: Length of the input sequences.
        vocab_size: Input dimension of the embedding.
        emb_dim: Output dimension of the embedding.
        feature_dim: Units of the ReLU Dense layer named 'feat'.

    Returns:
        The ``(model, feat_extractor)`` pair produced by ``compile_model``.
    """
    token_ids = Input(shape=(max_len,))
    embedded = Embedding(vocab_size, emb_dim)(token_ids)
    # 'same' padding keeps the sequence length, so the LSTM sees max_len steps.
    conv_out = Conv1D(filters=128, kernel_size=5, padding='same', activation='relu')(embedded)
    encoded = LSTM(units=128)(conv_out)
    feature = Dense(feature_dim, name='feat', activation='relu')(encoded)
    return compile_model(token_ids, feature)

# Registry of backbone builders keyed by display name. The cross-validation
# loop iterates it in insertion order and concatenates the features in that
# same order.
BACKBONES = {
    # convolutional
    "CNN": build_cnn,
    # recurrent
    "LSTM": build_lstm,
    "BiLSTM": build_bilstm,
    "GRU": build_gru,
    # convolution followed by a recurrent layer
    "CNN-GRU": build_cnn_gru,
    "CNN-LSTM": build_cnn_lstm,
}


# ---------------------------
# 6. Classical classifiers (all with predict_proba for SOFT voting)
# ---------------------------
def build_classifiers(random_state=RANDOM_SEED):
    """Create the classical classifiers that are trained on the deep features.

    Args:
        random_state: Seed forwarded to the estimator; defaults to RANDOM_SEED.

    Returns:
        A dict mapping a display name to an unfitted estimator. It currently
        holds a single entry, "SVM-RBF": an RBF-kernel ``SVC`` created with
        ``probability=True``.
    """
    svm_rbf = SVC(kernel="rbf", probability=True, random_state=random_state)
    return {"SVM-RBF": svm_rbf}

# Hyperparameter search space per classifier name (keys match the names used
# by build_classifiers).
# NOTE(review): nothing in the visible part of this file reads param_grids —
# confirm where the search is actually run.
param_grids = {
    "SVM-RBF": dict(
        C=[0.1, 1, 10],
        gamma=["scale", "auto", 0.001, 0.0001],
    ),
}
# ---------------------------
# 7. 10-Fold CV + Soft Voting
# ---------------------------
# Per fold: train every backbone on the training split, concatenate their
# 'feat' activations, standardise them, fit the soft-voting ensemble of
# classical classifiers on top, and score it on the held-out split.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
n_folds = skf.get_n_splits()  # single source of truth for the fold count
fold_results = []

# Logging / output
os.makedirs("outputs", exist_ok=True)
per_fold_csv = "outputs/soft_voting_10fold_results.csv"

for fold_idx, (tr_idx, val_idx) in enumerate(skf.split(X_pad, y), start=1):
    print(f"\n========== Fold {fold_idx}/{n_folds} ==========")
    X_tr, X_val = X_pad[tr_idx], X_pad[val_idx]
    # assumes y supports positional indexing with an index array
    # (e.g. a NumPy array) — TODO confirm
    y_tr, y_val = y[tr_idx], y[val_idx]
    y_tr_cat = to_categorical(y_tr, num_classes=NUM_CLASSES)
    y_val_cat = to_categorical(y_val, num_classes=NUM_CLASSES)

    # --- Deep feature extraction per backbone ---
    feats_tr_list, feats_val_list = [], []
    for name, builder in BACKBONES.items():
        print(f"Training backbone: {name}")
        model, feat_extractor = builder(MAX_SEQ_LEN, VOCAB_SIZE, EMBEDDING_DIM)
        # Early stopping monitors the 10% slice that validation_split carves
        # out of the training split, never the fold's own validation data.
        es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=0)
        model.fit(
            X_tr, y_tr_cat,
            validation_split=0.1,
            epochs=8,
            batch_size=64,
            callbacks=[es],
            verbose=0
        )
        feat_tr = feat_extractor.predict(X_tr, batch_size=128, verbose=0)
        feat_val = feat_extractor.predict(X_val, batch_size=128, verbose=0)
        feats_tr_list.append(feat_tr)
        feats_val_list.append(feat_val)

        # Free memory between models
        del model, feat_extractor
        K.clear_session()

    # Concatenate features from all backbones
    X_tr_feat = np.concatenate(feats_tr_list, axis=1)
    X_val_feat = np.concatenate(feats_val_list, axis=1)

    # Scale features; the scaler is fitted on the training split only and
    # then applied unchanged to the validation split.
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_feat = scaler.fit_transform(X_tr_feat)
    X_val_feat = scaler.transform(X_val_feat)

    # --- Classical classifiers + soft voting ---
    clfs = build_classifiers()
    # VotingClassifier expects a list of (name, estimator) tuples, not a
    # dict, so the mapping is converted explicitly.
    voting_clf = VotingClassifier(
        estimators=list(clfs.items()),
        voting='soft',
        n_jobs=-1,
        flatten_transform=True,
    )
    voting_clf.fit(X_tr_feat, y_tr)
    y_val_pred = voting_clf.predict(X_val_feat)

    # --- Evaluate fold ---
    acc = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, average="weighted", zero_division=0)
    recall = recall_score(y_val, y_val_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_val, y_val_pred, average="weighted", zero_division=0)

    print(f"Fold {fold_idx} - Acc: {acc:.4f} | Prec: {precision:.4f} | Rec: {recall:.4f} | F1: {f1:.4f}")

    fold_results.append({
        "Fold": fold_idx,
        "Accuracy": acc,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1
    })

    # Save incremental results so finished folds survive an interrupted run
    pd.DataFrame(fold_results).to_csv(per_fold_csv, index=False)

# ---------------------------
# 8. Summary
# ---------------------------
 print(f"--- Fold {fold} Final ---")
        print(f"Accuracy: {acc_curve[end_epoch-1]:.2f}% | Precision: {prec_curve[end_epoch-1]:.2f}% "
              f"| Recall: {rec_curve[end_epoch-1]:.2f}% | F1: {f1_curve[end_epoch-1]:.2f}%")

    # Final CV Results
    print(f"\n>>> {model_name} Final CV Results ({FOLDS} folds)")
    print(f"Accuracy: {acc_final:.2f}")
    print(f"Precision: {prec_final:.2f}")
    print(f"Recall: {rec_final:.2f}")
    print(f"F1: {f1_final:.2f}")
    print("="*70)

=== Loading Dataset ===
Train File: train_subtask1.csv -> 2925 samples, 6 columns
Dev File  : dev_subtask1.csv -> 323 samples, 6 columns
Test File : test_subtask1_text.csv -> 311 samples, 2 columns


Best Params: C=10.0, gamma=0.0001, kernel='rbf'


Epoch   1/100 - acc: 67.63% - prec: 59.95% - rec: 64.95% - f1: 62.03%
Epoch   2/100 - acc: 67.71% - prec: 60.29% - rec: 65.36% - f1: 62.28%
Epoch   3/100 - acc: 67.68% - prec: 60.78% - rec: 65.87% - f1: 62.73%
Epoch   4/100 - acc: 67.69% - prec: 61.03% - rec: 66.09% - f1: 63.19%
Epoch   5/100 - acc: 67.53% - prec: 61.59% - rec: 66.39% - f1: 63.68%
Epoch   6/100 - acc: 67.57% - prec: 61.96% - rec: 66.54% - f1: 63.81%
Epoch   7/100 - acc: 67.50% - prec: 62.10% - rec: 67.10% - f1: 64.31%
Epoch   8/100 - acc: 67.89% - prec: 62.70% - rec: 67.68% - f1: 64.75%
Epoch   9/100 - acc: 67.70% - prec: 63.22% - rec: 67.78% - f1: 65.02%
Epoch  10/100 - acc: 68.08% - prec: 63.57% - rec: 68.14% - f1: 65.42%
--- Fold 1 Final ---
Accuracy: 68.08% | Precision: