In [1]:
# Sneha Dubey
# Dr. Fang
# CSEN 346
# 13 Jun 2025

In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# === Load and preprocess LIAR dataset ===
def load_liar_split(fname):
    df = pd.read_csv(fname, sep='\t', header=None, names=[
        "id","label","statement","subject","speaker","job","state","party",
        "barely_true_cnt","false_cnt","half_true_cnt","mostly_true_cnt","pants_fire_cnt","context"
    ])
    df["party"] = df["party"].fillna("unknown")
    df["label3"] = df["label"].map({
        "pants-fire": 0, "false": 0,
        "barely-true": 1, "half-true": 1,
        "mostly-true": 2, "true": 2
    })
    return df

def get_party_vector(party):
    if isinstance(party, str):
        return {
            'democrat': [1, 0, 0],
            'republican': [0, 1, 0],
            'independent': [0, 0, 1]
        }.get(party.lower(), [0, 0, 0])
    else:
        return [0, 0, 0]

def build_vocab(texts, min_freq=2):
    cnt = Counter(w for s in texts for w in s.lower().split())
    vocab = {"<pad>":0, "<unk>":1}
    for w,f in cnt.items():
        if f >= min_freq:
            vocab[w] = len(vocab)
    return vocab

class LiarFancyDataset(Dataset):
    def __init__(self, df, vocab, maxlen=50):
        self.vocab = vocab
        self.texts = df["statement"].tolist()
        self.labels = df["label3"].tolist()
        self.maxlen = maxlen
        self.parties = df["party"].tolist()

    def __len__(self): return len(self.labels)

    def __getitem__(self, i):
        tokens = self.texts[i].lower().split()[:self.maxlen]
        idxs = [self.vocab.get(t, self.vocab["<unk>"]) for t in tokens]
        length = len(idxs)
        idxs += [self.vocab["<pad>"]] * (self.maxlen - length)
        meta = get_party_vector(self.parties[i])
        return torch.tensor(idxs), length, torch.tensor(meta, dtype=torch.float32), self.labels[i]


In [4]:
# === Attention Layer ===
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim*2, 1)

    def forward(self, lstm_out):
        weights = torch.softmax(self.attn(lstm_out).squeeze(-1), dim=1)
        context = torch.sum(weights.unsqueeze(-1) * lstm_out, dim=1)
        return context

In [5]:
# === Fancy Liar Model ===
class FancyLiarModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, lstm_hidden=128, cnn_channels=64, num_labels=3, meta_size=3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.emb_dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden, batch_first=True, bidirectional=True)
        self.attn = Attention(lstm_hidden)
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=cnn_channels, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Sequential(
            nn.Linear(lstm_hidden*2 + cnn_channels + meta_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, x, lengths, meta):
        emb = self.emb(x)
        emb = self.emb_dropout(emb)
        cnn_feat = self.pool(self.conv(emb.transpose(1,2))).squeeze(-1)
        packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
        out_packed, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(out_packed, batch_first=True)
        attn_feat = self.attn(out)
        combined = torch.cat([attn_feat, cnn_feat, meta], dim=1)
        return self.fc(combined)

In [6]:
# === Training Setup ===
def train_epoch(model, train_dl, optimizer, criterion, device):
    model.train()
    total, correct = 0, 0
    for x, lengths, meta, y in train_dl:
        x = torch.stack(x).to(device)
        lengths = list(lengths)
        meta = torch.stack(meta).to(device)
        y = torch.tensor(y).to(device)

        optimizer.zero_grad()
        outputs = model(x, lengths, meta)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y).sum().item()
        total += y.size(0)
    return correct / total



def eval_epoch(model, val_dl, criterion, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for x, lengths, meta, y in val_dl:
            x = torch.stack(x).to(device)
            lengths = list(lengths)
            meta = torch.stack(meta).to(device)
            y = torch.tensor(y).to(device)

            outputs = model(x, lengths, meta)
            loss = criterion(outputs, y)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    avg_loss = total_loss / len(val_dl)
    acc = np.mean(np.array(all_preds) == np.array(all_labels))

    print("\n=== Classification Report ===")
    print(classification_report(all_labels, all_preds, digits=4))
    
    print("=== Confusion Matrix ===")
    print(confusion_matrix(all_labels, all_preds))

    return acc


In [7]:
# Load data
df_train = load_liar_split("train.tsv")
df_val = load_liar_split("valid.tsv")

# Build vocab
vocab = build_vocab(df_train["statement"])

# Datasets & Dataloaders
train_ds = LiarFancyDataset(df_train, vocab)
val_ds = LiarFancyDataset(df_val, vocab)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=lambda b: list(zip(*b)))
val_dl = DataLoader(val_ds, batch_size=32, collate_fn=lambda b: list(zip(*b)))


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = FancyLiarModel(vocab_size=len(vocab)).to(device)

# Compute class weights to fight imbalance
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(df_train["label3"]),
                                     y=df_train["label3"])
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)


In [9]:
for epoch in range(10):
    train_acc = train_epoch(model, train_dl, optimizer, criterion, device)
    val_acc = eval_epoch(model, val_dl, criterion, device)
    print(f"Epoch {epoch+1}: Train Acc = {train_acc:.4f}, Val Acc = {val_acc:.4f}")



=== Classification Report ===
              precision    recall  f1-score   support

           0     0.3471    0.7520    0.4750       379
           1     0.4286    0.0124    0.0240       485
           2     0.4388    0.4690    0.4534       420

    accuracy                         0.3801      1284
   macro avg     0.4048    0.4111    0.3175      1284
weighted avg     0.4079    0.3801    0.2976      1284

=== Confusion Matrix ===
[[285   2  92]
 [319   6 160]
 [217   6 197]]
Epoch 1: Train Acc = 0.3719, Val Acc = 0.3801

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.4142    0.5989    0.4898       379
           1     0.4729    0.1979    0.2791       485
           2     0.4447    0.5643    0.4974       420

    accuracy                         0.4361      1284
   macro avg     0.4439    0.4537    0.4221      1284
weighted avg     0.4463    0.4361    0.4127      1284

=== Confusion Matrix ===
[[227  46 106]
 [199  96 190]
 [12

In [10]:
final_val_acc = eval_epoch(model, val_dl, criterion, device)


=== Classification Report ===
              precision    recall  f1-score   support

           0     0.4654    0.3193    0.3787       379
           1     0.4279    0.5505    0.4815       485
           2     0.4500    0.4286    0.4390       420

    accuracy                         0.4424      1284
   macro avg     0.4478    0.4328    0.4331      1284
weighted avg     0.4462    0.4424    0.4373      1284

=== Confusion Matrix ===
[[121 175  83]
 [ 81 267 137]
 [ 58 182 180]]
