In [None]:

!pip install -q scikit-learn seaborn contractions nltk gensim

import os, re, time, random, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
nltk.download('wordnet', quiet=True)
from nltk.corpus import wordnet
import warnings
warnings.filterwarnings("ignore")

# Pick the compute device once; later cells move the model and every batch to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

In [None]:

# ---- Reproducibility: seed every RNG this notebook draws from ----
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---- Encoder / model ----
MODEL_NAME = "bert-base-uncased"   # checkpoint loaded by AutoTokenizer / AutoModel; keep architecture same
DROPOUT = 0.4                      # default dropout of the classifier head
MAX_LEN = 256                      # tokenizer pads/truncates to this length; reduce to 128 if OOM

# ---- Optimisation ----
BATCH_SIZE = 8                     # adjust if OOM
GRAD_ACCUM = 1                     # set >1 for gradient accumulation
EPOCHS = 10
LR = 2e-5
WEIGHT_DECAY = 0.01

# ---- Cross-validation / early stopping ----
N_SPLITS = 5
EARLYSTOP_PATIENCE = 3


In [None]:

# Locate the dataset: collect every CSV file below the Kaggle input mount.
input_root = '/kaggle/input'
csvs = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(input_root)
    for f in files
    if f.lower().endswith('.csv')
]
if len(csvs) == 0:
    raise FileNotFoundError("No CSV found in /kaggle/input. Upload your dataset.")

# os.walk yields entries in arbitrary filesystem order, so "the first CSV" was
# not guaranteed to be the same file on every run. Sort to make the choice
# deterministic, and say so when there was more than one candidate.
csvs.sort()
if len(csvs) > 1:
    print("Multiple CSVs found; using the first in sorted order. Candidates:", csvs)
csv_path = csvs[0]
print("Using CSV:", csv_path)

raw_df = pd.read_csv(csv_path)
print("Raw shape:", raw_df.shape)
display(raw_df.head())

In [None]:

# Six-way truthfulness scale. The position of each group is the integer class id
# (0 = pants-fire ... 5 = true); every entry in a group is an accepted spelling
# of that class after lower-casing and stripping.
_label_spellings = [
    ('pants-fire', 'pants on fire', 'pants on fire.'),
    ('false',),
    ('barely-true', 'barely true'),
    ('half-true', 'half true'),
    ('mostly-true', 'mostly true'),
    ('true',),
]
label_map = {
    spelling: class_id
    for class_id, spellings in enumerate(_label_spellings)
    for spelling in spellings
}


def find_col(df, names):
    """Return the first column of ``df`` whose name matches one of ``names``.

    Matching is case-insensitive and ignores surrounding whitespace. Column
    labels that are not strings (e.g. integer headers) are compared via their
    string form instead of raising ``AttributeError``.

    Args:
        df: object exposing an iterable ``columns`` attribute (a DataFrame).
        names: iterable of acceptable column names.

    Returns:
        The matching column label exactly as it appears in ``df.columns`` (so
        it can be used to index ``df``), or ``None`` when nothing matches.
        Columns are scanned in ``df.columns`` order; the first hit wins.
    """
    wanted = {str(n).strip().lower() for n in names}
    for c in df.columns:
        if str(c).strip().lower() in wanted:
            return c
    return None


In [None]:
# Resolve each logical field to the actual CSV column name (None when absent).
stmt_col = find_col(raw_df, ['statement','statements','claim','text','statement_text'])
label_col = find_col(raw_df, ['label','labels','label_mapped','truth','veracity','class'])
speaker_col = find_col(raw_df, ['speaker','speaker_name','speakers','speaker_id'])
party_col = find_col(raw_df, ['party','political_party'])
subject_col = find_col(raw_df, ['subject','topic'])
context_col = find_col(raw_df, ['context','source','venue'])

# Statement and label are mandatory; everything else is optional metadata.
if stmt_col is None or label_col is None:
    raise ValueError("Couldn't auto-detect 'statement' or 'label' columns. Rename CSV columns accordingly.")
print("Detected columns -> statement:", stmt_col, ", label:", label_col)

# canonical field name -> actual column name, only for the fields that were found
meta_cols = {
    field: column
    for field, column in (
        ('speaker', speaker_col),
        ('party', party_col),
        ('subject', subject_col),
        ('context', context_col),
    )
    if column
}
print("Detected metadata columns:", meta_cols)

# Normalise the raw label text, then translate it to an integer class id.
# Labels that are not in label_map become NaN and are reported here.
raw_df['label_raw'] = raw_df[label_col].astype(str).str.lower().str.strip()
raw_df['label_mapped'] = raw_df['label_raw'].map(label_map)
unmapped_rows = raw_df['label_mapped'].isna()
if unmapped_rows.any():
    print("Unmapped labels examples (first 20):", raw_df.loc[unmapped_rows, label_col].unique()[:20])

In [None]:

# Keep the statement, the numeric label and whatever metadata columns were detected.
keep_cols = [stmt_col, 'label_mapped'] + list(meta_cols.values())

# Rename everything to canonical names. meta_cols maps canonical field name ->
# actual CSV column; the cells that build the model input look the metadata up
# as 'speaker' / 'party' / 'subject' / 'context', so without this rename a CSV
# whose headers are e.g. 'Speaker' or 'speaker_name' would silently lose its
# metadata.
rename_map = {stmt_col: 'statement', 'label_mapped': 'label'}
rename_map.update({actual: canonical for canonical, actual in meta_cols.items()})

DF = (
    raw_df[keep_cols]
    .dropna(subset=[stmt_col, 'label_mapped'])   # drop rows without text or with an unmapped label
    .rename(columns=rename_map)
    .reset_index(drop=True)
)
DF['label'] = DF['label'].astype(int)
print("Prepared DF shape:", DF.shape)
print(DF['label'].value_counts())

In [None]:
def mask_person_names(text):
    """Replace each run of capitalised, purely alphabetic tokens with ``<person>``.

    The text is split on whitespace. Consecutive tokens that start with an
    upper-case letter and consist only of letters are collapsed into a single
    ``<person>`` placeholder; all other tokens are kept as-is. Tokens are
    re-joined with single spaces.

    Note that any capitalised word qualifies (not only names), and that a token
    with attached punctuation such as ``"Smith,"`` does not.
    """
    masked = []
    inside_run = False          # True while we are inside a capitalised run
    for tok in text.split():
        capitalised = tok[:1].isupper() and tok.isalpha()
        if not capitalised:
            masked.append(tok)
            inside_run = False
            continue
        if not inside_run:
            # first token of a run emits the placeholder; the rest are swallowed
            masked.append("<person>")
        inside_run = True
    return " ".join(masked)

def clean_text(text):
    """Normalise a raw statement for tokenisation.

    Steps, in order: strip and flatten newlines, expand contractions,
    lower-case, drop URLs, drop bracketed/parenthesised/angle-bracketed spans,
    replace every character outside ``a-z 0-9 whitespace ' . , ! ? ; -`` with a
    space, collapse whitespace, squeeze runs of 3+ identical non-digit
    characters down to two, and glue a negation word (``not``/``no``/``never``)
    to the token that follows it (``not good`` -> ``not_good``).

    Args:
        text: the raw statement; anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        return ""
    text = text.strip().replace('\n', ' ').replace('\r', ' ')
    text = contractions.fix(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    text = re.sub(r'\[.*?\]|\(.*?\)|\<.*?\>', ' ', text)
    text = re.sub(r"[^a-z0-9\s'\.,!?;-]", ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Squeeze elongations ("sooooo" -> "soo", "!!!!" -> "!!") but leave digits
    # alone: the previous pattern `(.)\1{2,}` also rewrote numbers, turning
    # "1000" into "100" and so changing the facts stated in the claim.
    text = re.sub(r'([^0-9\s])\1{2,}', r'\1\1', text)

    # Attach each negation word to its successor so the pair survives as one unit.
    toks = text.split()
    for i in range(len(toks) - 1):
        if toks[i] in ('not', 'no', 'never'):
            toks[i] = toks[i] + "_" + toks[i + 1]
            toks[i + 1] = ''          # consumed; filtered out below
    text = " ".join([t for t in toks if t])

    # NOTE(review): the text is already lower-cased at this point, and
    # mask_person_names only reacts to upper-case initials, so this call
    # currently leaves the text unchanged. Kept to preserve the pipeline shape;
    # decide whether masking should run before lower-casing or be removed.
    text = mask_person_names(text)
    return text

# Clean every statement, then discard rows whose cleaned text is 5 characters or shorter.
DF = DF.assign(clean_statement=DF['statement'].astype(str).map(clean_text))
long_enough = DF['clean_statement'].str.len() > 5
DF = DF.loc[long_enough].reset_index(drop=True)
print("After cleaning:", DF.shape)

In [None]:
# ============ Topic modelling (LDA) - add topic_id metadata ============
n_topics = 10

# Bag-of-words counts over the cleaned statements: English stop words removed,
# terms must occur in at least 5 documents and in at most 90% of them.
vectorizer = CountVectorizer(stop_words='english', min_df=5, max_df=0.9)
X_counts = vectorizer.fit_transform(DF['clean_statement'].values)

# Fit LDA and get one topic-probability row per statement.
lda = LatentDirichletAllocation(n_components=n_topics, n_jobs=-1, random_state=SEED)
topic_dist = lda.fit_transform(X_counts)

# Hard-assign every statement to its highest-probability topic.
DF['topic_id'] = np.argmax(topic_dist, axis=1)
print("Topic distribution:", DF['topic_id'].value_counts().to_dict())

In [None]:
# ============ Inject metadata and speaker history into model input ============
def make_model_input(row):
    """Build the text fed to the tokenizer for one example.

    The result is the cleaned statement followed by ``key=value`` segments for
    each metadata field that is present and not missing on ``row`` (in the
    order speaker, party, subject, context) and finally ``topic=<id>``; the
    segments are joined with ``" [SEP] "``.

    The function only looks at ``row`` itself (it previously consulted the
    global ``DF`` to decide which fields exist), so it works on any
    Series/dict-like row and does not depend on notebook state.

    Args:
        row: mapping-like with ``clean_statement`` and ``topic_id`` and,
            optionally, ``speaker`` / ``party`` / ``subject`` / ``context``.

    Returns:
        The concatenated model input string.
    """
    parts = [row['clean_statement']]
    for field in ('speaker', 'party', 'subject', 'context'):
        value = row.get(field)          # None when the field does not exist on this row
        if pd.notna(value):
            parts.append(f"{field}={value}")
    parts.append(f"topic={int(row['topic_id'])}")
    return " [SEP] ".join(parts)

DF['model_input'] = DF.apply(make_model_input, axis=1)

# Speaker history: fraction of each speaker's statements labelled 5 ("true"),
# appended to the model input as one more [SEP]-separated segment.
# NOTE(review): the rate is computed from the labels of ALL rows in DF,
# including the row's own label, so the input text carries label information.
# Anything evaluated on a held-out split of DF should recompute this from the
# training rows only.
if 'speaker' not in DF.columns:
    DF['speaker_true_rate'] = 0.0
else:
    is_true = DF['label'].eq(5)
    speaker_true_rate = is_true.groupby(DF['speaker']).mean().to_dict()
    # rows whose speaker is missing get a rate of 0.0
    DF['speaker_true_rate'] = DF['speaker'].map(speaker_true_rate).fillna(0.0)
    rate_segment = DF['speaker_true_rate'].map(lambda rate: f" [SEP] speaker_true_rate={rate:.3f}")
    DF['model_input'] = DF['model_input'] + rate_segment

print("Sample model_input:")
display(DF[['model_input','label']].head())


In [None]:
# ============ Tokenizer & Dataset class (with augmentation) ============
# Tokenizer for the MODEL_NAME checkpoint (fast implementation requested);
# created once here and shared by the dataset objects built later.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def synonym_replace(text, p=0.05):
    """Randomly swap whitespace-separated tokens for a WordNet lemma.

    Each token is considered independently with probability ``p``. For a chosen
    token, all lemma names of all its synsets are gathered (underscores turned
    into spaces); names equal to the token (case-insensitively) or containing
    non-letters are discarded, and one of the remaining names is picked at
    random. A token with no usable candidate is kept unchanged.

    Args:
        text: input string.
        p: per-token replacement probability.

    Returns:
        The tokens re-joined with single spaces.
    """
    result = []
    for tok in text.split():
        replacement = None
        if random.random() < p:
            synsets = wordnet.synsets(tok)
            if synsets:
                candidates = []
                for syn in synsets:
                    for lemma in syn.lemmas():
                        name = lemma.name().replace('_', ' ')
                        if name.lower() != tok.lower() and name.isalpha():
                            candidates.append(name)
                if candidates:
                    replacement = random.choice(candidates)
        result.append(tok if replacement is None else replacement)
    return " ".join(result)

In [None]:
class LIARDataset(Dataset):
    """Text-classification dataset that tokenizes on the fly.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable with the Hugging Face tokenizer call signature; it
            must return ``input_ids`` and ``attention_mask`` tensors with a
            leading batch dimension of 1.
        max_len: length every example is padded/truncated to.
        augment: when True, apply light random augmentation per access.
    """

    def __init__(self, texts, labels, tokenizer, max_len, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'ids', 'mask', 'label'}`` tensors for example ``idx``."""
        text = str(self.texts[idx])
        if self.augment:
            # 10% of accesses: WordNet synonym replacement on ~5% of tokens.
            if random.random() < 0.1:
                text = synonym_replace(text, p=0.05)
            # 5% of accesses: overwrite one random token with [MASK]
            # (only for texts longer than three tokens).
            if random.random() < 0.05:
                toks = text.split()
                if len(toks) > 3:
                    pos = random.randint(0, len(toks) - 1)
                    toks[pos] = '[MASK]'
                    text = ' '.join(toks)
        # Use the tokenizer handed to the constructor. The previous code called
        # the notebook-global `tokenizer`, silently ignoring the argument.
        enc = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            'ids': enc['input_ids'].squeeze(0),          # drop the batch dim of 1
            'mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }


In [None]:


# ============ Model: BERT + BiLSTM + Attention ============
class BertBiLSTMAttn(nn.Module):
    """Pretrained encoder -> BiLSTM -> additive attention pooling -> linear classifier.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        lstm_hidden: hidden size of each LSTM direction (the pooled vector has
            ``2 * lstm_hidden`` features).
        num_classes: number of output logits.
        dropout: dropout probability applied to the pooled vector.
        freeze_bert: when True, every encoder parameter is frozen.
    """

    def __init__(self, model_name, lstm_hidden=128, num_classes=6, dropout=DROPOUT, freeze_bert=False):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )
        self.attn = nn.Linear(lstm_hidden * 2, 1)      # one attention score per time step
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(lstm_hidden * 2, num_classes)

    def forward(self, ids, mask):
        """Return class logits for a batch.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask passed to the encoder (non-zero = real token);
                assumed to be shaped like ``ids`` - TODO confirm for other callers.
                May be ``None``, in which case no positions are masked out.
        """
        bert_out = self.bert(ids, attention_mask=mask)
        seq_output = bert_out.last_hidden_state
        lstm_out, _ = self.lstm(seq_output)

        scores = self.attn(lstm_out)                   # last dim is 1 (see self.attn)
        if mask is not None:
            # Inputs are padded to a fixed length, so without this the softmax
            # spreads attention over padding positions. Push their scores to
            # the most negative finite value (finite rather than -inf so a
            # fully masked row cannot produce NaN).
            padding = (mask == 0).unsqueeze(-1)
            scores = scores.masked_fill(padding, torch.finfo(scores.dtype).min)
        attn_weights = torch.softmax(scores, dim=1)    # normalise across the sequence axis

        context = torch.sum(attn_weights * lstm_out, dim=1)
        context = self.dropout(context)
        out = self.classifier(context)
        return out

In [None]:
# ============ Training / Eval / EarlyStopping utilities ============
def train_epoch(model, loader, optimizer, scheduler, criterion, grad_accum_steps=1):
    """Run one training pass over ``loader``.

    Gradients are accumulated over ``grad_accum_steps`` batches (each batch
    loss is divided by that number), clipped to a global norm of 1.0, and then
    applied; the scheduler, if given, is stepped once per optimizer step. An
    incomplete accumulation group at the end of the epoch is also applied
    instead of being thrown away by the next ``zero_grad``.

    Args:
        model: module called as ``model(ids, mask)`` returning logits.
        loader: iterable of batches with ``'ids'``, ``'mask'`` and ``'label'``.
        optimizer: optimizer over the model's trainable parameters.
        scheduler: LR scheduler or ``None``.
        criterion: loss taking ``(logits, labels)``.
        grad_accum_steps: batches per optimizer step (values < 1 are treated as 1).

    Returns:
        ``(avg_loss, avg_acc)``: mean of the per-batch losses and the fraction
        of correctly classified training samples. ``(0.0, 0)`` for an empty loader.
    """
    model.train()
    grad_accum_steps = max(1, int(grad_accum_steps))
    n_batches = len(loader)
    if n_batches == 0:
        # nothing to train on; avoid the division by zero below
        return 0.0, 0

    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    optimizer.zero_grad()
    for step, batch in enumerate(loader, 1):
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(ids, mask)
        loss = criterion(outputs, labels) / grad_accum_steps
        loss.backward()

        # Step on every full accumulation group, and on the last batch so the
        # gradients of a trailing partial group are not discarded.
        if step % grad_accum_steps == 0 or step == n_batches:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * grad_accum_steps   # undo the scaling for reporting
        preds = outputs.argmax(1)
        total_correct += (preds == labels).sum().item()
        total_samples += labels.size(0)

    avg_loss = total_loss / n_batches
    avg_acc = total_correct / total_samples if total_samples > 0 else 0
    return avg_loss, avg_acc

In [None]:
def eval_model(model, loader, criterion=None):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: module called as ``model(ids, mask)`` returning logits.
        loader: iterable of batches with ``'ids'``, ``'mask'`` and ``'label'``.
        criterion: optional loss taking ``(logits, labels)``.

    Returns:
        ``(avg_loss, acc, labels, preds)``. ``avg_loss`` is ``None`` when no
        criterion is given or no sample was seen; otherwise it is the average
        of the per-batch losses weighted by batch size, so a short final batch
        does not count as much as a full one. ``labels`` and ``preds`` are
        NumPy arrays in loader order.
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for batch in loader:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(ids, mask)
            preds = outputs.argmax(1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            if criterion is not None:
                batch_size = labels.size(0)
                loss = criterion(outputs, labels)
                total_loss += loss.item() * batch_size
                total_samples += batch_size

    if criterion is not None and total_samples > 0:
        avg_loss = total_loss / total_samples
    else:
        avg_loss = None
    acc = accuracy_score(all_labels, all_preds) if len(all_labels) > 0 else 0
    return avg_loss, acc, np.array(all_labels), np.array(all_preds)

In [None]:
class EarlyStopping:
    """Stop training once validation loss has stopped improving.

    Call the instance once per epoch with the validation loss and the model.
    The score is the negated loss. A call counts as an improvement unless the
    score is below ``best_score + delta``; on improvement the model's
    ``state_dict`` is written to ``path``. After ``patience`` consecutive
    non-improving calls ``early_stop`` becomes True.
    """

    def __init__(self, patience=3, verbose=False, delta=0.0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.path = path
        self.best_score = None      # negated best validation loss seen so far
        self.counter = 0            # consecutive epochs without improvement
        self.early_stop = False

    def __call__(self, val_loss, model):
        score = -val_loss
        has_baseline = self.best_score is not None

        if has_baseline and score < self.best_score + self.delta:
            # no improvement this epoch
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # first call, or an improvement: record it and checkpoint the model
        self.best_score = score
        self.save_checkpoint(val_loss, model)
        if has_baseline:
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write ``model.state_dict()`` to ``self.path`` (``val_loss`` is unused)."""
        torch.save(model.state_dict(), self.path)


In [None]:
# ============ Prepare class weights & global criterion ============
# 'balanced' weights: classes with fewer examples get proportionally larger weights.
# NOTE(review): `classes` holds only the labels actually present in DF. Code that
# indexes class_weights_np by label id, or pairs class_weights with a fixed number
# of output classes, presumably relies on every class id 0..5 being present - verify.
label_values = DF['label'].values
classes = np.unique(label_values)
class_weights_np = compute_class_weight(class_weight='balanced', classes=classes, y=label_values)
class_weights = torch.as_tensor(class_weights_np, dtype=torch.float32, device=device)
print("Class weights:", class_weights_np)
criterion_global = nn.CrossEntropyLoss(weight=class_weights)

In [None]:


# ============ K-Fold training (with metrics) ============
# ---- Helpers used by the fold loop ----
# freeze_bert_layers / unfreeze_all_bert are called below but were not defined in
# any earlier cell, so a fresh Restart & Run All stopped here with a NameError.

def freeze_bert_layers(model, freeze_until):
    """Freeze the encoder embeddings and encoder layers with index < ``freeze_until``.

    Layers with index >= ``freeze_until`` are made trainable, so the call sets
    the complete freeze state rather than only adding to it.
    """
    for p in model.bert.embeddings.parameters():
        p.requires_grad = False
    for layer_idx, layer in enumerate(model.bert.encoder.layer):
        trainable = layer_idx >= freeze_until
        for p in layer.parameters():
            p.requires_grad = trainable


def unfreeze_all_bert(model):
    """Make every encoder parameter trainable."""
    for p in model.bert.parameters():
        p.requires_grad = True


def make_optimizer_and_scheduler(model, lr, n_epochs, steps_per_epoch):
    """AdamW over the currently trainable parameters + linear schedule with 10% warm-up."""
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=WEIGHT_DECAY)
    total_steps = max(1, steps_per_epoch * n_epochs // GRAD_ACCUM)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )
    return optimizer, scheduler


# Matches the " [SEP] speaker_true_rate=0.123" segment at the end of model_input.
_SPEAKER_RATE_SUFFIX = re.compile(r" \[SEP\] speaker_true_rate=\d+\.\d{3}$")


def fold_model_inputs(train_df, val_df):
    """Return ``(train_texts, val_texts)`` with a leakage-free speaker rate.

    The ``speaker_true_rate`` segment stored in ``model_input`` was computed
    from the labels of every row, i.e. it encodes validation labels in the
    validation inputs. Here that segment is stripped and rebuilt from the
    training rows of the current fold only; speakers unseen in training (or
    missing) get 0.0. Without a ``speaker`` column the inputs are returned
    untouched.
    """
    if 'speaker' not in train_df.columns:
        return train_df['model_input'].values, val_df['model_input'].values

    train_rates = train_df['label'].eq(5).groupby(train_df['speaker']).mean()

    def rebuild(part):
        base = part['model_input'].map(lambda s: _SPEAKER_RATE_SUFFIX.sub("", s))
        rate = part['speaker'].map(train_rates).fillna(0.0)
        return (base + rate.map(lambda r: f" [SEP] speaker_true_rate={r:.3f}")).values

    return rebuild(train_df), rebuild(val_df)


# ---- K-fold training ----
all_train_acc = []; all_val_acc = []; all_train_loss = []; all_val_loss = []
fold_accuracies = []
class_names = ["pants-fire", "false", "barely-true", "half-true", "mostly-true", "true"]
# Fixed label order for the reports, so they do not fail or shift when a class
# happens to be absent from both y_true and y_pred.
label_ids = list(range(len(class_names)))

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for fold, (train_idx, val_idx) in enumerate(skf.split(DF, DF['label']), 1):
    print(f"\n\n===== FOLD {fold}/{N_SPLITS} =====")
    train_df = DF.iloc[train_idx].reset_index(drop=True)
    val_df = DF.iloc[val_idx].reset_index(drop=True)

    train_texts, val_texts = fold_model_inputs(train_df, val_df)
    train_data = LIARDataset(train_texts, train_df['label'].values, tokenizer, MAX_LEN, augment=True)
    val_data = LIARDataset(val_texts, val_df['label'].values, tokenizer, MAX_LEN, augment=False)

    # NOTE(review): class imbalance is compensated twice - by this weighted
    # sampler and by the class-weighted loss below. Consider keeping only one.
    train_labels = train_df['label'].values
    sample_weights = np.array([class_weights_np[label] for label in train_labels], dtype=np.double)
    sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=sampler, drop_last=False)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

    model = BertBiLSTMAttn(MODEL_NAME, lstm_hidden=128, num_classes=6, dropout=DROPOUT, freeze_bert=False).to(device)

    # freeze lower bert layers initially (layers 0..7)
    freeze_bert_layers(model, freeze_until=8)
    print("Initially frozen BERT layers 0..7 and embeddings")

    optimizer, scheduler = make_optimizer_and_scheduler(model, LR, EPOCHS, len(train_loader))
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    early_stopper = EarlyStopping(patience=EARLYSTOP_PATIENCE, verbose=True, path=f'/kaggle/working/best_fold{fold}.pt')
    best_val_acc = 0.0

    for epoch in range(1, EPOCHS+1):
        # schedule: epoch 1-3 freeze lower 0..7; epoch4 freeze_until=10; epoch7 unfreeze all.
        # The set of trainable parameters changes, so the optimizer and the
        # schedule are rebuilt for the remaining epochs.
        if epoch == 4:
            freeze_bert_layers(model, freeze_until=10)
            print("At epoch 4: now frozen layers 0..9 (so training layers 10..11)")
            optimizer, scheduler = make_optimizer_and_scheduler(model, LR, EPOCHS - epoch + 1, len(train_loader))
        if epoch == 7:
            unfreeze_all_bert(model)
            print("At epoch 7: fully unfreezed BERT")
            optimizer, scheduler = make_optimizer_and_scheduler(model, LR/2, EPOCHS - epoch + 1, len(train_loader))

        print(f"\n--- Fold {fold} | Epoch {epoch}/{EPOCHS} ---")
        t0 = time.time()

        train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, criterion, grad_accum_steps=GRAD_ACCUM)
        val_loss, val_acc, y_true, y_pred = eval_model(model, val_loader, criterion)

        all_train_acc.append(train_acc); all_val_acc.append(val_acc)
        all_train_loss.append(train_loss); all_val_loss.append(val_loss if val_loss is not None else (1-val_acc))

        print(f"Train Loss {train_loss:.4f} | Train Acc {train_acc:.4f}")
        print(f"Val   Loss {val_loss:.4f} | Val   Acc {val_acc:.4f} | Time {time.time()-t0:.1f}s")

        # === Overall metrics (already val_acc given) ===
        overall_precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        overall_recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        overall_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
        overall_accuracy = accuracy_score(y_true, y_pred)
        print("\nOverall metrics:")
        print(f"Accuracy: {overall_accuracy:.4f}  Precision: {overall_precision:.4f}  Recall: {overall_recall:.4f}  F1: {overall_f1:.4f}")

        # === Class-wise metrics ===
        print("\nClass-wise metrics (precision / recall / f1 / support):")
        print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names, digits=4, zero_division=0))

        # === Confusion matrix ===
        cm = confusion_matrix(y_true, y_pred, labels=label_ids)
        plt.figure(figsize=(7,5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.title(f"Fold {fold} Confusion Matrix - Epoch {epoch}")
        plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.show()

        # save best model by val_acc
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f'/kaggle/working/bert_bilstm_attn_fold{fold}.pt')
            print(f"Saved best model for fold {fold} (val_acc={best_val_acc:.4f})")

        # early stopping tracks validation loss (separate checkpoint file)
        early_stopper(val_loss if val_loss is not None else (1 - val_acc), model)
        if early_stopper.early_stop:
            print("Early stopping triggered for this fold.")
            break

    fold_accuracies.append(best_val_acc)
    print(f"Best val acc fold {fold}: {best_val_acc:.4f}")

    # Release this fold's model and optimizer state before the next fold
    # allocates a fresh model, to keep peak GPU memory down.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\nCross-validation fold accuracies:", fold_accuracies)
print("Mean accuracy:", np.mean(fold_accuracies))




In [None]:

# Training curves. Each list holds one value per trained epoch, with all folds
# concatenated in order, so the x axis runs across folds.
for short_name, axis_name, train_series, val_series in (
    ("Acc", "Accuracy", all_train_acc, all_val_acc),
    ("Loss", "Loss", all_train_loss, all_val_loss),
):
    plt.figure(figsize=(10,4))
    plt.plot(train_series, label=f'Train {short_name}')
    plt.plot(val_series, label=f'Val {short_name}')
    plt.xlabel("Epoch steps")
    plt.ylabel(axis_name)
    plt.legend()
    plt.title(f"Train & Val {axis_name}")
    plt.show()

# Reload the checkpoint written for fold 1 (its best epoch by validation
# accuracy). Note this is fold 1 specifically, not the best-scoring fold.
best_path = '/kaggle/working/bert_bilstm_attn_fold1.pt'
if os.path.exists(best_path):
    print("Loading saved best model from:", best_path)
    model = BertBiLSTMAttn(MODEL_NAME, lstm_hidden=128, num_classes=6, dropout=DROPOUT, freeze_bert=False).to(device)
    # map_location remaps the saved tensors onto the current device, so a
    # checkpoint written on GPU still loads in a CPU-only session.
    model.load_state_dict(torch.load(best_path, map_location=device))
    model.eval()
else:
    print("No saved model for fold1 found.")