In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ------------------ 1. Load Files ------------------
stance_file = "/content/drive/MyDrive/keepup project methodology code/aggregated_stance_counts.csv"
claims_file = "/content/drive/MyDrive/keepup project methodology code/entailment_predictions_results.csv"
post_file = "/content/drive/MyDrive/keepup project methodology code/post_combined_features.csv"
user_file = "/content/drive/MyDrive/keepup project methodology code/user_combined_features.csv"
excel_file = "/content/drive/MyDrive/keepup project methodology code/Final dataset.xlsx"

In [4]:
# Read CSVs
stance_df = pd.read_csv(stance_file)
entail_df = pd.read_csv(claims_file)
postf_df = pd.read_csv(post_file)
userf_df = pd.read_csv(user_file)


In [5]:
# Read Excel sheets
post_df = pd.read_excel(excel_file, sheet_name="post features")
user_df = pd.read_excel(excel_file, sheet_name="user features")
comments_df = pd.read_excel(excel_file, sheet_name="comments")

In [6]:
# ------------------ 2. Merge Feature DataFrames ------------------
data = post_df.merge(stance_df, left_on="post-id", right_on="post_id", how="left")
data = data.merge(entail_df, left_on="Event-id", right_on="claim-id", how="left")
data = data.merge(user_df, on="post-id", how="left")
data = data.merge(postf_df, left_on="event_id", right_on="post_id", how="left")
data = data.merge(userf_df, on="post-id", how="left")

In [7]:
# Safely combine comments by converting each to a string
combined_comments = comments_df.groupby("post-id")["commenttext"].apply(
    lambda x: " ".join(str(i) for i in x)
).reset_index()

In [8]:
# Ensure commenttext exists and handle potential issues
comments_df['commenttext'] = comments_df['commenttext'].fillna('').astype(str)

# Group and safely combine all comments as strings
combined_comments = (
    comments_df.groupby("post-id")["commenttext"]
    .apply(lambda x: " ".join(x.astype(str).tolist()))
    .reset_index()
)


In [9]:
# ------------------ 3. Aggregate Comments per Post ------------------
# Group and combine all comments for each post_id
combined_comments = comments_df.groupby("post-id")["commenttext"].apply(lambda x: " ".join(x)).reset_index()

In [10]:
# ------------------ 4. Merge Title & Comments ------------------
# Merge titles using post-id
data = data.merge(post_df, on="post-id", how="left")

In [11]:
# Merge combined comments using post-id
data = data.merge(combined_comments, left_on="post-id", right_on="post-id", how="left")

In [12]:
# ------------------ 5. Combine Title + Comments into One Field ------------------
data["combined_text"] = data["title"].fillna('') + " " + data["commenttext"].fillna('')


In [13]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

# Load XLM-Roberta
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')
model.eval()

# Generate mean pooled embeddings for combined_text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Apply embedding extraction (use tqdm for progress bar)
tqdm.pandas()
data["embedding"] = data["combined_text"].progress_apply(get_embedding)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

100%|██████████| 1024/1024 [09:04<00:00,  1.88it/s]


In [152]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['likescount_x', 'commentscount_x', 'followers_y', 'followings_y',
                  'is user verified(0 verified, 1 unverified)_y', 'join_days_ago',
                  'stance_agree', 'stance_disagree', 'stance_query', 'stance_comment',
                   'positive_prob','title_length', 'title_sentiment', 'clickbait_flag','post-title_x', 'commenttext']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists


In [153]:
X_text = np.stack(data['embedding'].values)


In [154]:
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])

likescount_x unique values (sample): [2.5000e+04 3.0000e+01 2.4000e+01 6.2000e+01 6.2584e+04]
commentscount_x unique values (sample): [1704.    4.    6.    3.  711.]
followers_y unique values (sample): [1.80e+04 2.40e+04 4.80e+03 1.01e+03 3.50e+06]
followings_y unique values (sample): [    0.  2659. 65300.   463.   868.]
is user verified(0 verified, 1 unverified)_y unique values (sample): [1 0]
join_days_ago unique values (sample): [   0. 4903.  734. 3929. 6517.]
stance_agree unique values (sample): [159.   0.   5.  15.   7.]
stance_disagree unique values (sample): [507.   0.   3.   2.   1.]
stance_query unique values (sample): [75.  1.  0.  4.  8.]
stance_comment unique values (sample): [963.   3.   1. 675.   6.]
positive_prob unique values (sample): [0.]
title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]
post-title_x unique values (sample): [0.]
commenttext unique values (sample): [0.00000000e+00 3.006211

In [155]:
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan

In [156]:
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)

In [157]:
data[numerical_cols] = data[numerical_cols].fillna(0)

In [159]:
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)

In [160]:
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values


In [161]:
X = np.hstack([X_text, X_numeric]).astype(np.float32)


In [162]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#split 80:20

In [238]:

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.2, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)


In [None]:
#CNN+Bi_GRU ENSEMBLE

In [231]:
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label


In [232]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out


In [203]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))


Epoch 01 | Loss: 10.3691 | Train Acc: 52.63% | Test Acc: 53.66%
Epoch 02 | Loss: 9.4639 | Train Acc: 57.26% | Test Acc: 47.80%
Epoch 03 | Loss: 9.2285 | Train Acc: 59.58% | Test Acc: 53.17%
Epoch 04 | Loss: 8.8458 | Train Acc: 61.05% | Test Acc: 71.71%
Epoch 05 | Loss: 8.2297 | Train Acc: 64.22% | Test Acc: 73.17%
Epoch 06 | Loss: 8.6283 | Train Acc: 63.25% | Test Acc: 70.24%
Epoch 07 | Loss: 8.1842 | Train Acc: 67.16% | Test Acc: 60.00%
Epoch 08 | Loss: 7.7244 | Train Acc: 70.45% | Test Acc: 61.95%
Epoch 09 | Loss: 7.8816 | Train Acc: 67.89% | Test Acc: 66.34%
Epoch 10 | Loss: 7.4687 | Train Acc: 68.01% | Test Acc: 68.78%
Early stopping at epoch 10. Best Test Acc: 73.17%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 73.17%
Confusion Matrix:
[[60 35]
 [20 90]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7500    0.6316    0.6857        95
           1     0.7200    0.8182    0.7660       110

    accuracy               

# split dataset 90:10

In [None]:

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.2, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)


In [None]:
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))


Epoch 01 | Loss: 23.6397 | Train Acc: 52.77% | Test Acc: 50.49%
Epoch 02 | Loss: 21.3151 | Train Acc: 58.74% | Test Acc: 66.02%
Epoch 03 | Loss: 20.5819 | Train Acc: 58.85% | Test Acc: 70.87%
Epoch 04 | Loss: 19.4197 | Train Acc: 62.43% | Test Acc: 71.84%
Epoch 05 | Loss: 19.1942 | Train Acc: 62.21% | Test Acc: 62.14%
Epoch 06 | Loss: 18.7187 | Train Acc: 64.50% | Test Acc: 78.64%
Epoch 07 | Loss: 18.2749 | Train Acc: 66.12% | Test Acc: 57.28%
Epoch 08 | Loss: 18.4906 | Train Acc: 64.06% | Test Acc: 72.82%
Epoch 09 | Loss: 17.4652 | Train Acc: 67.43% | Test Acc: 73.79%
Epoch 10 | Loss: 17.3900 | Train Acc: 68.73% | Test Acc: 69.90%
Epoch 11 | Loss: 17.5114 | Train Acc: 68.08% | Test Acc: 73.79%
Early stopping at epoch 11. Best Test Acc: 78.64%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 78.64%
Confusion Matrix:
[[33 15]
 [ 7 48]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8250    0.6875    0.7500        48
         

#Abalation study

without entailment positive prob

In [41]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['likescount_x', 'commentscount_x', 'followers_y', 'followings_y',
                  'is user verified(0 verified, 1 unverified)_y', 'join_days_ago',
                  'stance_agree', 'stance_disagree', 'stance_query', 'stance_comment', 'title_length', 'title_sentiment', 'clickbait_flag']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists


In [42]:
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)


likescount_x unique values (sample): [2.5000e+04 3.0000e+01 2.4000e+01 6.2000e+01 6.2584e+04]
commentscount_x unique values (sample): [1704.    4.    6.    3.  711.]
followers_y unique values (sample): [1.80e+04 2.40e+04 4.80e+03 1.01e+03 3.50e+06]
followings_y unique values (sample): [    0.  2659. 65300.   463.   868.]
is user verified(0 verified, 1 unverified)_y unique values (sample): [1 0]
join_days_ago unique values (sample): [   0. 4903.  734. 3929. 6517.]
stance_agree unique values (sample): [159.   0.   5.  15.   7.]
stance_disagree unique values (sample): [507.   0.   3.   2.   1.]
stance_query unique values (sample): [75.  1.  0.  4.  8.]
stance_comment unique values (sample): [963.   3.   1. 675.   6.]
title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]


In [43]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset
import torch.nn as nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [50]:

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))


Epoch 01 | Loss: 22.2307 | Train Acc: 56.03% | Test Acc: 56.31%
Epoch 02 | Loss: 20.9494 | Train Acc: 60.48% | Test Acc: 68.93%
Epoch 03 | Loss: 19.1360 | Train Acc: 62.54% | Test Acc: 76.70%
Epoch 04 | Loss: 19.6680 | Train Acc: 62.32% | Test Acc: 72.82%
Epoch 05 | Loss: 19.2028 | Train Acc: 63.95% | Test Acc: 64.08%
Epoch 06 | Loss: 18.1265 | Train Acc: 65.47% | Test Acc: 71.84%
Epoch 07 | Loss: 17.6475 | Train Acc: 67.32% | Test Acc: 69.90%
Epoch 08 | Loss: 17.7632 | Train Acc: 67.21% | Test Acc: 71.84%
Early stopping at epoch 8. Best Test Acc: 76.70%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 76.70%
Confusion Matrix:
[[35 13]
 [11 44]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7609    0.7292    0.7447        48
           1     0.7719    0.8000    0.7857        55

    accuracy                         0.7670       103
   macro avg     0.7664    0.7646    0.7652       103
weighted avg     0.7668    0.7670    0.

without stance aggregation and enatilment

In [51]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['likescount_x', 'commentscount_x', 'followers_y', 'followings_y',
                  'is user verified(0 verified, 1 unverified)_y', 'join_days_ago',
                  'title_length', 'title_sentiment', 'clickbait_flag']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



likescount_x unique values (sample): [2.5000e+04 3.0000e+01 2.4000e+01 6.2000e+01 6.2584e+04]
commentscount_x unique values (sample): [1704.    4.    6.    3.  711.]
followers_y unique values (sample): [1.80e+04 2.40e+04 4.80e+03 1.01e+03 3.50e+06]
followings_y unique values (sample): [    0.  2659. 65300.   463.   868.]
is user verified(0 verified, 1 unverified)_y unique values (sample): [1 0]
join_days_ago unique values (sample): [   0. 4903.  734. 3929. 6517.]
title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]
Epoch 01 | Loss: 22.5690 | Train Acc: 53.53% | Test Acc: 71.84%
Epoch 02 | Loss: 21.4803 | Train Acc: 57.98% | Test Acc: 69.90%
Epoch 03 | Loss: 19.5780 | Train Acc: 62.11% | Test Acc: 70.87%
Epoch 04 | Loss: 18.7392 | Train Acc: 62.43% | Test Acc: 75.73%
Epoch 05 | Loss: 18.8642 | Train Acc: 65.36% | Test Acc: 71.84%
Epoch 06 | Loss: 18.0994 | Train Acc: 63.84% | Test Acc: 67.96%
Epoch 07 | Loss: 

only title+title features

In [52]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['title_length', 'title_sentiment', 'clickbait_flag']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]
Epoch 01 | Loss: 21.9246 | Train Acc: 56.03% | Test Acc: 63.11%
Epoch 02 | Loss: 20.1801 | Train Acc: 61.02% | Test Acc: 62.14%
Epoch 03 | Loss: 20.0255 | Train Acc: 60.48% | Test Acc: 71.84%
Epoch 04 | Loss: 19.4021 | Train Acc: 61.13% | Test Acc: 65.05%
Epoch 05 | Loss: 18.6753 | Train Acc: 64.93% | Test Acc: 70.87%
Epoch 06 | Loss: 18.4781 | Train Acc: 65.91% | Test Acc: 71.84%
Epoch 07 | Loss: 17.9541 | Train Acc: 66.88% | Test Acc: 60.19%
Epoch 08 | Loss: 18.2616 | Train Acc: 67.43% | Test Acc: 65.05%
Early stopping at epoch 8. Best Test Acc: 71.84%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 71.84%
Confusion Matrix:
[[26 22]
 [ 7 48]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7879    0.5417    0.6420        48
           1     0.6857    0.8727    0.7680        55

    accuracy    

only title

In [53]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['post-title_x']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



post-title_x unique values (sample): ['وفاقی اردو یونیورسٹی کی جانب سے ارشد ندیم کود ہزار روپے کا انعام..❤️'
 "وفاقی اردو یونیورسٹی کی جانب سے ارشد ندیم کو دو ہزار روپے کا انعام ❤️\n.\n.\n.\n.\n.\n.\nToday's Best Photo ❤❤❤❤❤❤\nFollow me \n❤❤❤❤❤❤❤❤\n#photography \n#photooftheday \n#photographychallenge \n#PhotoEditingChallenge\n#BestPhotographyChallenge\n#photochallenge \n#moodchallengemoodchallenge \n#moodchallengechallenge1kToday\n#RaisZada #RaisZadaUbaid #Ubaid ❤️"
 'وفاقی اردو یونیورسٹی نے ارشد ندیم کو 2000 روپے کی خطیر رقم کا چیک دیا  ، اللہ کا شکر ہے ارشد ندیم بحفاظت گھر تک پہنچ گئے ۔'
 'unseen footage of arshad nadeem and maryam nawaz… hug each other'
 'ارشد ندیم کی بیوی نے پہلی بار ٹی وی پر آکر انکشاف کر دیاجب میدان میں ناکامی ہوتی تو کیا کرتی تھی؟ \n#sunonewshd #ArshadNadeem #arshadnadeemgoldmedal #arshadnadeemfamily']
Epoch 01 | Loss: 22.3301 | Train Acc: 55.05% | Test Acc: 66.99%
Epoch 02 | Loss: 20.7613 | Train Acc: 59.07% | Test Acc: 68.93%
Epoch 03 | Loss: 20.4152 | Train

title + Comment

In [60]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['post-title_x', 'commenttext']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



post-title_x unique values (sample): [0.]
commenttext unique values (sample): [0.00000000e+00 3.00621111e+09]
Epoch 01 | Loss: 21.3352 | Train Acc: 56.68% | Test Acc: 70.87%
Epoch 02 | Loss: 20.6317 | Train Acc: 57.98% | Test Acc: 59.22%
Epoch 03 | Loss: 19.7693 | Train Acc: 62.21% | Test Acc: 73.79%
Epoch 04 | Loss: 19.3519 | Train Acc: 60.37% | Test Acc: 74.76%
Epoch 05 | Loss: 19.0190 | Train Acc: 63.19% | Test Acc: 75.73%
Epoch 06 | Loss: 18.3563 | Train Acc: 63.41% | Test Acc: 75.73%
Epoch 07 | Loss: 18.3879 | Train Acc: 65.91% | Test Acc: 75.73%
Epoch 08 | Loss: 18.1121 | Train Acc: 65.80% | Test Acc: 66.99%
Epoch 09 | Loss: 17.5734 | Train Acc: 66.88% | Test Acc: 74.76%
Epoch 10 | Loss: 17.5097 | Train Acc: 69.60% | Test Acc: 60.19%
Early stopping at epoch 10. Best Test Acc: 75.73%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 75.73%
Confusion Matrix:
[[32 16]
 [ 9 46]]

Classification Report:
              precision    recall  f1-score   support

           0     

content +social engagemnents

In [64]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['likescount_x', 'commentscount_x','commenttext',
                  'title_length', 'title_sentiment', 'clickbait_flag']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



likescount_x unique values (sample): [2.5000e+04 3.0000e+01 2.4000e+01 6.2000e+01 6.2584e+04]
commentscount_x unique values (sample): [1704.    4.    6.    3.  711.]
commenttext unique values (sample): [0.00000000e+00 3.00621111e+09]
title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]
Epoch 01 | Loss: 21.7954 | Train Acc: 55.05% | Test Acc: 62.14%
Epoch 02 | Loss: 20.5424 | Train Acc: 58.63% | Test Acc: 73.79%
Epoch 03 | Loss: 20.3627 | Train Acc: 60.59% | Test Acc: 73.79%
Epoch 04 | Loss: 19.0645 | Train Acc: 63.63% | Test Acc: 73.79%
Epoch 05 | Loss: 18.8779 | Train Acc: 63.84% | Test Acc: 60.19%
Epoch 06 | Loss: 18.4735 | Train Acc: 63.84% | Test Acc: 71.84%
Epoch 07 | Loss: 18.2953 | Train Acc: 65.58% | Test Acc: 68.93%
Early stopping at epoch 7. Best Test Acc: 73.79%

--- Final Evaluation on Best Model ---
Best Test Accuracy: 73.79%
Confusion Matrix:
[[27 21]
 [ 6 49]]

Classification Report:
          

In [None]:
#content+user profiling

In [65]:
# Convert embeddings to array
X_text = np.stack(data['embedding'].values)

# Select numeric features (update with real column names)
numerical_cols = ['followers_y', 'followings_y',
                  'is user verified(0 verified, 1 unverified)_y', 'join_days_ago',
                  'title_length', 'title_sentiment', 'clickbait_flag']

X_numeric = data[numerical_cols].fillna(0).values  # Handle NaNs if any

# Combine text embeddings and numeric features
X = np.hstack([X_text, X_numeric])

# Target variable
y = data['post-label_x'].values  # Make sure this column exists
X_text = np.stack(data['embedding'].values)
for col in numerical_cols:
    print(f"{col} unique values (sample):", data[col].unique()[:5])
def parse_shorthand(value):
    try:
        value = str(value).strip().lower()
        if 'k' in value:
            return float(value.replace('k', '')) * 1_000
        elif 'm' in value:
            return float(value.replace('m', '')) * 1_000_000
        else:
            return float(value)
    except:
        return np.nan
for col in numerical_cols:
    # Only apply to object (string-like) columns
    if data[col].dtype == 'object':
        data[col] = data[col].apply(parse_shorthand)
data[numerical_cols] = data[numerical_cols].fillna(0)
other_feature_columns = [col for col in numerical_cols if col != 'post-title_x']
other_features = data[other_feature_columns].values.astype(np.float32)
X_numeric = data[numerical_cols].fillna(0).astype(np.float32).values
X = np.hstack([X_text, X_numeric]).astype(np.float32)

# Clean embeddings
embedding_features = np.array([np.array(e, dtype=np.float32) for e in data['embedding'].values])
numeric_features = data[numerical_cols].fillna(0).astype(np.float32).values
labels = data['post-label_x'].values

# Train/test split
X_embed_train, X_embed_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    embedding_features, numeric_features, labels, test_size=0.1, random_state=42)

# Convert to tensors
X_embed_train_tensor = torch.tensor(X_embed_train, dtype=torch.float32).unsqueeze(1)  # [batch, 1, embed_dim]
X_embed_test_tensor = torch.tensor(X_embed_test, dtype=torch.float32).unsqueeze(1)

X_num_train_tensor = torch.tensor(X_num_train, dtype=torch.float32)
X_num_test_tensor = torch.tensor(X_num_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# DataLoaders
train_dataset = torch.utils.data.TensorDataset(X_embed_train_tensor, X_num_train_tensor, y_train_tensor)
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = torch.utils.data.TensorDataset(X_embed_test_tensor, X_num_test_tensor, y_test_tensor)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=32)
from torch.utils.data import Dataset
import torch

class EnsembleDataset(Dataset):
    def __init__(self, texts, numeric_features, labels, tokenizer, max_len=512):
        self.texts = texts
        self.numeric_features = numeric_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        numeric = torch.tensor(self.numeric_features[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze(0)           # (seq_len)
        attention_mask = encoding['attention_mask'].squeeze(0) # (seq_len)

        return input_ids, attention_mask, numeric, label
import torch
import torch.nn as nn
import torch.nn.functional as F

# Attention Module
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, gru_output):
        # gru_output: (batch, seq_len, hidden_dim)
        attn_weights = F.softmax(self.attn(gru_output), dim=1)  # (batch, seq_len, 1)
        context = torch.sum(attn_weights * gru_output, dim=1)   # (batch, hidden_dim)
        return context

# CNN Feature Extractor
class CNNExtractor(nn.Module):
    def __init__(self, embed_dim):
        super(CNNExtractor, self).__init__()
        self.conv1 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 32, 3, padding=1)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        x = x.transpose(1, 2)                    # (batch, embed_dim, seq_len)
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x).squeeze(-1)             # (batch, 32)
        x = self.dropout(x)
        return x

# BiGRU with Attention Feature Extractor
class BiGRUExtractor(nn.Module):
    def __init__(self, embed_dim, hidden_size=64):
        super(BiGRUExtractor, self).__init__()
        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)
        self.bn = nn.BatchNorm1d(hidden_size * 2)
        self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        out, _ = self.gru(x)                      # (batch, seq_len, hidden*2)
        attn_out = self.attention(out)            # (batch, hidden*2)
        attn_out = self.bn(attn_out)
        attn_out = self.dropout(attn_out)
        return attn_out

# Full Ensemble Model
class EnsembleSmallData(nn.Module):
    def __init__(self, embed_dim, num_features, num_classes=2):
        super(EnsembleSmallData, self).__init__()
        self.cnn_branch = CNNExtractor(embed_dim)
        self.gru_branch = BiGRUExtractor(embed_dim, hidden_size=64)
        self.fc_numeric = nn.Linear(num_features, 32)
        self.bn_numeric = nn.BatchNorm1d(32)
        self.dropout = nn.Dropout(0.4)

        self.classifier = nn.Linear(32 + 128 + 32, num_classes)  # CNN(32) + GRU(128) + numeric(32)

    def forward(self, embed_x, numeric_x):
        cnn_feat = self.cnn_branch(embed_x)                           # (batch, 32)
        gru_feat = self.gru_branch(embed_x)                           # (batch, 128)
        num_feat = F.relu(self.bn_numeric(self.fc_numeric(numeric_x)))# (batch, 32)
        num_feat = self.dropout(num_feat)

        combined = torch.cat([cnn_feat, gru_feat, num_feat], dim=1)   # (batch, 192)
        combined = self.dropout(combined)

        out = self.classifier(combined)                               # (batch, num_classes)
        return out
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import copy

model = EnsembleSmallData(
    embed_dim=X_embed_train.shape[1],
    num_features=X_num_train.shape[1],
    num_classes=2
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Early stopping
best_acc = 0
patience = 5
wait = 0
best_model_state = None
best_preds = []
best_labels = []

epochs = 100

for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for xb_embed, xb_num, yb in train_dl:
        xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xb_embed, xb_num)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
        train_labels.extend(yb.cpu().numpy())

    train_acc = accuracy_score(train_labels, train_preds)

    # Evaluation
    model.eval()
    test_preds, test_labels = [], []

    with torch.no_grad():
        for xb_embed, xb_num, yb in test_dl:
            xb_embed, xb_num, yb = xb_embed.to(device), xb_num.to(device), yb.to(device)
            preds = model(xb_embed, xb_num)
            test_preds.extend(torch.argmax(preds, dim=1).cpu().numpy())
            test_labels.extend(yb.cpu().numpy())

    test_acc = accuracy_score(test_labels, test_preds)

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")

    # Save best model and predictions
    if test_acc > best_acc:
        best_acc = test_acc
        best_model_state = copy.deepcopy(model.state_dict())
        best_preds = test_preds.copy()
        best_labels = test_labels.copy()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}. Best Test Acc: {best_acc*100:.2f}%")
            break

# Load best model
model.load_state_dict(best_model_state)

# Final evaluation on best model
print("\n--- Final Evaluation on Best Model ---")
print(f"Best Test Accuracy: {best_acc*100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(best_labels, best_preds))
print("\nClassification Report:")
print(classification_report(best_labels, best_preds, digits=4))



followers_y unique values (sample): [1.80e+04 2.40e+04 4.80e+03 1.01e+03 3.50e+06]
followings_y unique values (sample): [    0.  2659. 65300.   463.   868.]
is user verified(0 verified, 1 unverified)_y unique values (sample): [1 0]
join_days_ago unique values (sample): [   0. 4903.  734. 3929. 6517.]
title_length unique values (sample): [0.]
title_sentiment unique values (sample): [0.]
clickbait_flag unique values (sample): [0.]
Epoch 01 | Loss: 23.3475 | Train Acc: 52.12% | Test Acc: 69.90%
Epoch 02 | Loss: 21.1183 | Train Acc: 56.24% | Test Acc: 72.82%
Epoch 03 | Loss: 20.2781 | Train Acc: 59.83% | Test Acc: 70.87%
Epoch 04 | Loss: 19.5542 | Train Acc: 59.28% | Test Acc: 71.84%
Epoch 05 | Loss: 18.8246 | Train Acc: 62.65% | Test Acc: 76.70%
Epoch 06 | Loss: 18.5631 | Train Acc: 63.52% | Test Acc: 76.70%
Epoch 07 | Loss: 18.0071 | Train Acc: 66.88% | Test Acc: 70.87%
Epoch 08 | Loss: 17.3447 | Train Acc: 68.95% | Test Acc: 74.76%
Epoch 09 | Loss: 17.3867 | Train Acc: 67.54% | Test Acc