# In [11]:
# -*- coding: utf-8 -*-
"""
Full Pipeline: Single-layer deep feature extractors + classical classifiers
with 10-fold cross-validation and ensemble weight learning
"""

# ---------------------------
# 0. Imports
# ---------------------------
import pandas as pd
import numpy as np
import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# Seed every random number generator the pipeline touches so that repeated
# runs are comparable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
# PyTorch keeps its own generator, independent of NumPy/TensorFlow; seed it
# too, since the ensemble weight learner below is a torch module.
torch.manual_seed(RANDOM_SEED)

# Device for the PyTorch tensors and modules: GPU when one is visible, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ---------------------------
# 1. Load Data
# ---------------------------
# Locations of the three CSV splits.
train_path = 'topics_train.csv'
dev_path   = 'topics_dev.csv'
test_path  = 'topics_test.csv'

# Read the splits in the order train, dev, test.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, dev_path, test_path)
)

# The dev split is folded into the training data; model selection is done
# with cross-validation over the combined frame instead.
train_df = pd.concat([train_df, dev_df], ignore_index=True)

# Column names holding the input text and the target label.
text_column  = "review"
label_column = "sentiment_label"

X_train_raw, y_train_raw = train_df[text_column], train_df[label_column]
X_test_raw, y_test_raw = test_df[text_column], test_df[label_column]

# Map labels to integer ids. The encoder is fitted on the training labels
# only and then reused, unchanged, for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test  = label_encoder.transform(y_test_raw)
NUM_CLASSES = len(label_encoder.classes_)

# ---------------------------
# 2. Preprocessing
# ---------------------------
# Shared NLTK resources, created once and reused for every document.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmas.

    The input is coerced to ``str``, lower-cased, and stripped of every
    character that is neither a word character nor whitespace. The result is
    tokenized; tokens found in ``stop_words`` are dropped and the remaining
    ones are lemmatized.
    """
    cleaned = re.sub(r"[^\w\s]", "", str(text).lower())
    lemmas = []
    for token in word_tokenize(cleaned):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)


# Apply the same normalization to both splits and keep plain Python lists.
X_train_processed = X_train_raw.apply(preprocess_text).tolist()
X_test_processed  = X_test_raw.apply(preprocess_text).tolist()

# ---------------------------
# 3. Tokenization
# ---------------------------
# Vocabulary cap, fixed sequence length, and embedding width used by every
# backbone below.
MAX_NUM_WORDS = 30000
MAX_SEQ_LEN   = 200
EMBEDDING_DIM = 100

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_processed)

# Texts -> integer id sequences (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_processed, X_test_processed)
)

# Pad/truncate at the end of each sequence so all rows have MAX_SEQ_LEN ids.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# ---------------------------
# 4. Build deep feature extractors (single-layer)
# ---------------------------
def _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode):
    """Wire the scaffold shared by every backbone around one encoder.

    Args:
        max_seq_len: Length of the integer-id input sequences.
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        feature_dim: Width of the ``'feat'`` dense layer.
        encode: Callable that takes the embedded sequence tensor and returns
            the tensor fed to the ``'feat'`` layer. It creates and applies
            the backbone-specific layers.

    Returns:
        ``(model, feat_extractor)``: the compiled softmax classifier over
        ``NUM_CLASSES`` classes, and a second model over the same input and
        layers whose output is the ``'feat'`` layer activation.
    """
    inp = Input(shape=(max_seq_len,))
    x = Embedding(vocab_size, embedding_dim)(inp)
    x = encode(x)
    feat = Dense(feature_dim, activation='relu', name='feat')(x)
    out = Dense(NUM_CLASSES, activation='softmax')(feat)
    model = Model(inputs=inp, outputs=out)
    # Shares inp/feat with `model`, so fitting `model` also trains the
    # layers the extractor uses.
    feat_extractor = Model(inputs=inp, outputs=feat)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])
    return model, feat_extractor


def build_cnn(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the CNN backbone: Conv1D followed by global max pooling.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        return GlobalMaxPooling1D()(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)


def build_lstm(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the LSTM backbone: a single 128-unit LSTM.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        return LSTM(128)(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)


def build_bilstm(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the BiLSTM backbone: a bidirectional 128-unit LSTM.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        return Bidirectional(LSTM(128))(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)


def build_gru(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the GRU backbone: a single 128-unit GRU.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        return GRU(128)(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)


def build_cnn_gru(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the CNN-GRU backbone: Conv1D feeding a 128-unit GRU.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        return GRU(128)(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)


def build_cnn_lstm(max_seq_len, vocab_size, embedding_dim, feature_dim=128):
    """Build the CNN-LSTM backbone: Conv1D feeding a 128-unit LSTM.

    Returns ``(model, feat_extractor)``; see ``_assemble_backbone``.
    """
    def encode(x):
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        return LSTM(128)(x)

    return _assemble_backbone(max_seq_len, vocab_size, embedding_dim, feature_dim, encode)

# Map backbones
# Embedding input size: the tokenizer's vocabulary (+1 for the reserved id 0),
# capped at MAX_NUM_WORDS.
VOCAB_SIZE = min(len(tokenizer.word_index) + 1, MAX_NUM_WORDS)

# Display name -> builder function, iterated in this order during training.
BACKBONES = dict([
    ("CNN", build_cnn),
    ("LSTM", build_lstm),
    ("BiLSTM", build_bilstm),
    ("GRU", build_gru),
    ("CNN-GRU", build_cnn_gru),
    ("CNN-LSTM", build_cnn_lstm),
])

# ---------------------------
# 5. Classical classifiers
# ---------------------------
def build_classifiers():
    """Create a fresh set of unfitted classical classifiers.

    Returns:
        A list of ``(short_name, estimator)`` pairs, in the order
        ``nb``, ``svm``, ``rf``, ``xgb``. New estimator objects are created
        on every call.
    """
    naive_bayes = MultinomialNB(alpha=1.0)
    linear_svm = LinearSVC(C=1.0, dual=False, max_iter=5000, random_state=RANDOM_SEED)
    forest = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=RANDOM_SEED)
    boosted = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", n_estimators=100, max_depth=3, learning_rate=0.1, random_state=RANDOM_SEED)
    return [
        ("nb", naive_bayes),
        ("svm", linear_svm),
        ("rf", forest),
        ("xgb", boosted),
    ]

# ---------------------------
# 6. 10-Fold Cross-validation + Feature extraction + Weight Learner
# ---------------------------
from tensorflow.keras.utils import to_categorical

# One-hot labels for the Keras models, which are compiled with
# categorical cross-entropy.
y_train_cat = to_categorical(y_train, num_classes=NUM_CLASSES)

# Shuffled, stratified 10-fold splitter with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

# Collects one metrics dict per evaluation.
fold_results = []

class WeightLearner(nn.Module):
    """Learn a convex mixture of probability streams plus a per-class bias.

    Parameters are ``w_raw`` (one raw weight per stream) and ``bias`` (one
    offset per class), both initialised to zeros.
    """

    def __init__(self, num_streams, num_classes):
        super().__init__()
        self.w_raw = nn.Parameter(torch.zeros(num_streams))
        self.bias = nn.Parameter(torch.zeros(num_classes))

    def forward(self, P):
        """Blend the streams in ``P`` and return ``(logits, weights)``.

        ``P`` is indexed as (sample, stream, class). The raw weights go
        through softplus and are normalised to sum to one; the mixture is
        clamped away from 0 and 1 before the log so the logits stay finite.
        """
        positive = nn.functional.softplus(self.w_raw)
        w = positive / (positive.sum() + 1e-12)
        blended = torch.einsum("nsc,s->nc", P, w).clamp(min=1e-8, max=1 - 1e-8)
        return blended.log() + self.bias, w

# Number of full-batch optimisation steps for the weight learner in each fold.
EPOCHS_WL = 100

# For every fold: train each backbone on the training part, extract 'feat'
# activations, fit the classical classifiers on the concatenated features,
# then learn mixture weights over their validation-fold probabilities and
# record one metrics entry for the fold.
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_pad, y_train)):
    print(f"Fold {fold_idx + 1}/{skf.get_n_splits()}")
    X_tr, X_val = X_train_pad[train_idx], X_train_pad[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    y_tr_cat, y_val_cat = y_train_cat[train_idx], y_train_cat[val_idx]

    # --- Deep feature extraction ---
    train_feature_list = []
    val_feature_list = []
    streams = []
    for name, builder in BACKBONES.items():
        print(f"Training {name}")
        model, feat_extractor = builder(MAX_SEQ_LEN, VOCAB_SIZE, EMBEDDING_DIM)
        es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=0)
        model.fit(X_tr, y_tr_cat, validation_split=0.1, epochs=6, batch_size=64, callbacks=[es], verbose=0)
        feat_tr = feat_extractor.predict(X_tr, batch_size=64, verbose=0)
        feat_val = feat_extractor.predict(X_val, batch_size=64, verbose=0)
        train_feature_list.append(feat_tr)
        val_feature_list.append(feat_val)
        # Features are already plain arrays; release the Keras graph state
        # before building the next backbone.
        tf.keras.backend.clear_session()

    # One wide feature matrix per split: backbones concatenated column-wise.
    X_tr_feat = np.concatenate(train_feature_list, axis=1)
    X_val_feat = np.concatenate(val_feature_list, axis=1)

    # --- Train classical classifiers ---
    proba_val_streams = []
    clfs = build_classifiers()
    for clf_name, clf in clfs:
        clf.fit(X_tr_feat, y_tr)
        streams.append((f"feat__{clf_name}", clf))
        # Predict probabilities or approximate
        if hasattr(clf, "predict_proba"):
            Pval = clf.predict_proba(X_val_feat)
        elif hasattr(clf, "decision_function"):
            # No probabilities available: softmax the decision scores.
            scores = clf.decision_function(X_val_feat)
            if scores.ndim == 1:
                # Binary case returns one score per sample; expand to two columns.
                scores = np.vstack([-scores, scores]).T
            e = np.exp(scores - scores.max(axis=1, keepdims=True))
            Pval = e / e.sum(axis=1, keepdims=True)
        else:
            # Last resort: one-hot encode the hard predictions.
            preds = clf.predict(X_val_feat)
            Pval = np.eye(NUM_CLASSES)[preds]
        proba_val_streams.append(Pval.astype(np.float32))

    # Stack probabilities and train WeightLearner
    S = len(proba_val_streams)
    proba_val_stack = np.stack(proba_val_streams, axis=1)
    proba_val_t = torch.from_numpy(proba_val_stack).to(DEVICE)
    y_val_t = torch.from_numpy(y_val).long().to(DEVICE)

    wl = WeightLearner(S, NUM_CLASSES).to(DEVICE)
    optimizer = optim.AdamW(wl.parameters(), lr=5e-2)
    criterion = nn.CrossEntropyLoss()

    # The epoch loop must live inside the fold loop: each fold gets its own
    # freshly initialised weight learner, trained on that fold's streams.
    wl.train()
    for epoch in range(1, EPOCHS_WL + 1):
        optimizer.zero_grad()

        logits, w = wl(proba_val_t)
        loss = criterion(logits, y_val_t)

        loss.backward()
        optimizer.step()

        # Accuracy of the logits computed before this optimisation step.
        preds = logits.argmax(dim=1)
        correct = (preds == y_val_t).sum().item()
        accuracy = 100 * correct / y_val_t.size(0)

        print(f"Epoch {epoch:3}/{EPOCHS_WL} - accuracy: {accuracy:.2f} - loss: {loss.item():.3f}")

    # Evaluate fold: once, after the weight learner has finished training.
    # NOTE(review): the metrics are computed on the same validation fold the
    # weight learner was fitted on, so they are optimistic estimates.
    wl.eval()
    with torch.no_grad():
        final_logits, final_w = wl(proba_val_t)
        y_pred_val = torch.argmax(torch.softmax(final_logits, dim=1), dim=1).cpu().numpy()

    acc = accuracy_score(y_val, y_pred_val)
    precision = precision_score(y_val, y_pred_val, average="weighted", zero_division=0)
    recall = recall_score(y_val, y_pred_val, average="weighted", zero_division=0)
    f1 = f1_score(y_val, y_pred_val, average="weighted", zero_division=0)
    print(f"  Weight: {final_w.detach().cpu().numpy()}, Acc: {acc:.4f}, Prec: {precision:.4f}, Rec: {recall:.4f}, F1: {f1:.4f}")
    fold_results.append({  "Weight": final_w.detach().cpu().numpy(),"Accuracy": acc, "Precision": precision, "Recall": recall, "F1-score": f1})

Training feature extractor using CNN (100 epochs)
Epoch   1/100 - accuracy: 69.20 - loss: 0.313
Epoch   2/100 - accuracy: 69.21 - loss: 0.307
Epoch   3/100 - accuracy: 69.22 - loss: 0.307
Epoch   4/100 - accuracy: 69.23 - loss: 0.310
Epoch   5/100 - accuracy: 69.24 - loss: 0.311
Epoch   6/100 - accuracy: 69.26 - loss: 0.308
Epoch   7/100 - accuracy: 69.28 - loss: 0.306
Epoch   8/100 - accuracy: 69.30 - loss: 0.306
Epoch   9/100 - accuracy: 69.32 - loss: 0.309
Epoch  10/100 - accuracy: 69.35 - loss: 0.302
Epoch  11/100 - accuracy: 69.38 - loss: 0.307
Epoch  12/100 - accuracy: 69.41 - loss: 0.304
Epoch  13/100 - accuracy: 69.44 - loss: 0.308
Epoch  14/100 - accuracy: 69.49 - loss: 0.301
Epoch  15/100 - accuracy: 69.53 - loss: 0.306
Epoch  16/100 - accuracy: 69.58 - loss: 0.304
Epoch  17/100 - accuracy: 69.64 - loss: 0.305
Epoch  18/100 - accuracy: 69.71 - loss: 0.299
Epoch  19/100 - accuracy: 69.78 - loss: 0.303
Epoch  20/100 - accuracy: 69.87 - loss: 0.302
Epoch  21/100 - accuracy: 69.9