In [7]:
!pip install datasets
from datasets import load_dataset

# Load the FeTaQA dataset
dataset = load_dataset("DongfuJiang/FeTaQA")

# Access the train, validation, and test splits
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# Display the first example in the training set
print(train_data[0])
# Function to flatten a table 
def flatten_table(table):
    flattened = []
    headers = table[0]  # First row is the header
    for row in table[1:]:
        for col_header, col_value in zip(headers, row):
            flattened.append((col_header, col_value))
    return flattened

{'feta_id': 18162, 'table_source_json': 'totto_source/train_json/example-10461.json', 'page_wikipedia_url': 'http://en.wikipedia.org/wiki/1982_Illinois_gubernatorial_election', 'table_page_title': '1982 Illinois gubernatorial election', 'table_section_title': 'Results', 'table_array': [['Party', 'Party', 'Candidate', 'Votes', '%', '±'], ['-', 'Republican', 'James R. Thompson (incumbent)', '1,816,101', '49.44', '-'], ['-', 'Democratic', 'Adlai Stevenson III', '1,811,027', '49.30', '-'], ['-', 'Libertarian', 'Bea Armstrong', '24,417', '0.66', '-'], ['-', 'Taxpayers', 'John E. Roche', '22,001', '0.60', '-'], ['-', 'N/A', 'write-ins', '161', '0.00', 'n-a'], ['Majority', 'Majority', 'Majority', '5,074', '0.14', '-'], ['Turnout', 'Turnout', 'Turnout', '3,673,707', '-', '-'], ['-', 'Republican hold', 'Republican hold', 'Swing', '-', '-']], 'highlighted_cell_ids': [[1, 2], [6, 3]], 'question': 'Who won the 1982 Illinois gubernatorial election, and how many votes was the margin?', 'answer': 'Th

In [8]:
def prepare_data_for_model(dataset, mode='tabular'):
    prepared_data = []

    for example in dataset:
        question = example['question']
        answer = example['answer']

        if mode == 'tabular':
            table = example['table_array']
            flattened_table = flatten_table(table)
            input_text = f"Question: {question} Context: {example['table_page_title']} Table: {flattened_table}"
            adapter = "tabular_adapter"

        elif mode == 'textual':
            context = example.get("context", "No context available")
            input_text = f"Question: {question} Context: {context}"
            adapter = "textual_adapter"

        prepared_data.append({
            'input': input_text,
            'output': answer,
            'adapter': adapter
        })

    return prepared_data

# Prepare data for training set
train_data = prepare_data_for_model(dataset['train'])
print(train_data[0])


{'input': "Question: Who won the 1982 Illinois gubernatorial election, and how many votes was the margin? Context: 1982 Illinois gubernatorial election Table: [('Party', '-'), ('Party', 'Republican'), ('Candidate', 'James R. Thompson (incumbent)'), ('Votes', '1,816,101'), ('%', '49.44'), ('±', '-'), ('Party', '-'), ('Party', 'Democratic'), ('Candidate', 'Adlai Stevenson III'), ('Votes', '1,811,027'), ('%', '49.30'), ('±', '-'), ('Party', '-'), ('Party', 'Libertarian'), ('Candidate', 'Bea Armstrong'), ('Votes', '24,417'), ('%', '0.66'), ('±', '-'), ('Party', '-'), ('Party', 'Taxpayers'), ('Candidate', 'John E. Roche'), ('Votes', '22,001'), ('%', '0.60'), ('±', '-'), ('Party', '-'), ('Party', 'N/A'), ('Candidate', 'write-ins'), ('Votes', '161'), ('%', '0.00'), ('±', 'n-a'), ('Party', 'Majority'), ('Party', 'Majority'), ('Candidate', 'Majority'), ('Votes', '5,074'), ('%', '0.14'), ('±', '-'), ('Party', 'Turnout'), ('Party', 'Turnout'), ('Candidate', 'Turnout'), ('Votes', '3,673,707'), ('%

In [9]:
!pip install transformers



In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from peft import get_peft_model, LoraConfig
from torch.optim import AdamW
from tqdm import tqdm

# Load pre-trained BART model and tokenizer
model_name = 'facebook/bart-large'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=8,  # rank of the low-rank matrices
    lora_alpha=32,  # scaling factor for LoRA weights
    lora_dropout=0.1,  # dropout rate for LoRA layers
    bias="none",  # LoRA bias setting (none, all, or simple)
    task_type="CAUSAL_LM"  # Causal language modeling task type
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Freeze all layers except LoRA layers
for name, param in model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False

# Dataset class
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(
            item['input'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        labels = self.tokenizer(
            item['output'],
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # Mask padding tokens for the loss calculation
        labels_ids = labels["input_ids"].squeeze()
        labels_ids[labels_ids == self.tokenizer.pad_token_id] = -100
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels_ids,
        }

# # Dummy data
# train_data = [
#     {'input': 'What is the capital of France?', 'output': 'Paris'},
#     {'input': 'What is the largest planet in our solar system?', 'output': 'Jupiter'}
# ]

# Create DataLoader
train_dataset = QADataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Optimizer for LoRA parameters
optimizer = AdamW(model.parameters(), lr=1e-4)

# Gradient accumulation
accumulation_steps = 4  # Accumulate gradients over 4 mini-batches

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Adapter Layer Pruning Function
def prune_adapter_layers(model, layers_to_prune):
    """
    Freeze (prune) specific adapter layers.
    layers_to_prune: List of layer names to prune, e.g., ["encoder.block.0.adapter"]
    """
    for name, param in model.named_parameters():
        for layer_name in layers_to_prune:
            if layer_name in name:
                param.requires_grad = False
                print(f"Pruning layer: {name}")

# Example of pruning layers from the encoder
layers_to_prune = ["encoder.block.0.adapter", "encoder.block.1.adapter"]
prune_adapter_layers(model, layers_to_prune)

# Perform grid search over different layer combinations to prune
def grid_search_pruning(model, encoder_layer_ranges, decoder_layer_ranges):
    for enc_layers in encoder_layer_ranges:
        for dec_layers in decoder_layer_ranges:
            # Construct layer names to prune based on the ranges provided
            encoder_layers_to_prune = [f"encoder.block.{i}.adapter" for i in range(enc_layers[0], enc_layers[1] + 1)]
            decoder_layers_to_prune = [f"decoder.block.{i}.adapter" for i in range(dec_layers[0], dec_layers[1] + 1)]
            
            # Combine encoder and decoder layers to prune
            layers_to_prune = encoder_layers_to_prune + decoder_layers_to_prune
            
            # Prune the layers
            print(f"\nPruning the following layers: {layers_to_prune}")
            prune_adapter_layers(model, layers_to_prune)
            
            # Perform your evaluation after pruning the layers (e.g., train and evaluate)
            # Here, you can add code to evaluate the model's performance after pruning the layers
            # For example: evaluate_model(model)

# Define grid ranges for encoder and decoder layers to prune
encoder_layer_ranges = [(0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11)]  # Prune from encoder layers 0 to 6, 0 to 7, etc.
decoder_layer_ranges = [(12, 18), (12, 19), (12, 20), (12, 21), (12, 22), (12, 23)]  # Prune from decoder layers 12 to 18, 12 to 19, etc.

# Run grid search over layer pruning combinations
# grid_search_pruning(model, encoder_layer_ranges, decoder_layer_ranges)

# Training loop
model.train()
for epoch in range(1,2):  # 1 epochs
    print(f"Epoch {epoch}")
    total_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Loss calculation
        loss = outputs.loss
        total_loss += loss.item()

        # Backpropagation (gradient accumulation)
        loss.backward()

        # Perform optimizer step after 'accumulation_steps' mini-batches
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")


Epoch 1


100%|██████████| 1832/1832 [50:00<00:00,  1.64s/it]

Average loss: 1.6785





In [11]:
model.save_pretrained("./fetaqa_bart_lora")
tokenizer.save_pretrained("./fetaqa_bart_lora")


('./fetaqa_bart_lora/tokenizer_config.json',
 './fetaqa_bart_lora/special_tokens_map.json',
 './fetaqa_bart_lora/vocab.json',
 './fetaqa_bart_lora/merges.txt',
 './fetaqa_bart_lora/added_tokens.json')

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
from peft import PeftModel

tokenizer = BartTokenizer.from_pretrained("./fetaqa_bart_lora")
base_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
model = PeftModel.from_pretrained(base_model, "./fetaqa_bart_lora")


In [20]:
!pip install sacrebleu bert-score rouge_score datasets evaluate


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=74e40ac2c5d06458f6e6973b9539c228f54d4997dae112ceaf33aa97ef738ce6
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [21]:
import torch
from evaluate import load as load_metric
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration
from bert_score import score as bertscore
from sacrebleu import corpus_bleu
from torch.utils.data import DataLoader
import os

# Load validation data
val_prepared = prepare_data_for_model(dataset["validation"])
val_dataset = QADataset(val_prepared, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Evaluation mode
model.eval()

# Prediction and reference containers
all_preds = []
all_refs = []

# Run inference
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5,
            early_stopping=True
        )

        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Fix label decoding (replace -100 with pad_token_id)
        labels = batch["labels"].clone()
        labels[labels == -100] = tokenizer.pad_token_id
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        all_preds.extend(decoded_preds)
        all_refs.extend(decoded_labels)

# ---------------------------
# Evaluation Metrics
# ---------------------------

# SacreBLEU
sacrebleu_score = corpus_bleu(all_preds, [all_refs])
print(f"\n🔵 SacreBLEU Score: {sacrebleu_score.score:.2f}")

# ROUGE
rouge = load_metric("rouge")
rouge_output = rouge.compute(predictions=all_preds, references=all_refs)
print("\n🟢 ROUGE Scores:")
for key in ["rouge1", "rouge2", "rougeL"]:
    score = rouge_output[key]
    print(f"{key.upper()}: F1={score:.4f}")

# BERTScore
P, R, F1 = bertscore(all_preds, all_refs, lang="en", rescale_with_baseline=True)
print(f"\n🔴 BERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1:        {F1.mean().item():.4f}")

# ---------------------------
# Save Predictions and References
# ---------------------------

os.makedirs("eval_outputs", exist_ok=True)

with open("eval_outputs/predictions_and_refs.txt", "w", encoding="utf-8") as f:
    for pred, ref in zip(all_preds, all_refs):
        f.write(f"Prediction: {pred.strip()}\n")
        f.write(f"Reference:  {ref.strip()}\n")
        f.write("-" * 80 + "\n")

with open("eval_outputs/predictions.txt", "w", encoding="utf-8") as f:
    for pred in all_preds:
        f.write(pred.strip() + "\n")

with open("eval_outputs/references.txt", "w", encoding="utf-8") as f:
    for ref in all_refs:
        f.write(ref.strip() + "\n")


100%|██████████| 251/251 [10:26<00:00,  2.50s/it]



🔵 SacreBLEU Score: 22.44

🟢 ROUGE Scores:


AttributeError: 'numpy.float64' object has no attribute 'mid'

In [22]:
import torch
from evaluate import load as load_metric
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration
from bert_score import score as bertscore
from sacrebleu import corpus_bleu
from torch.utils.data import DataLoader
import os

# ---------------------------
# Evaluation Metrics
# ---------------------------

# SacreBLEU
sacrebleu_score = corpus_bleu(all_preds, [all_refs])
print(f"\n🔵 SacreBLEU Score: {sacrebleu_score.score:.2f}")

# ROUGE
rouge = load_metric("rouge")
rouge_output = rouge.compute(predictions=all_preds, references=all_refs)
print("\n🟢 ROUGE Scores:")
for key in ["rouge1", "rouge2", "rougeL"]:
    score = rouge_output[key]
    print(f"{key.upper()}: F1={score:.4f}")

# BERTScore
P, R, F1 = bertscore(all_preds, all_refs, lang="en", rescale_with_baseline=True)
print(f"\n🔴 BERTScore:")
print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1:        {F1.mean().item():.4f}")

# ---------------------------
# Save Predictions and References
# ---------------------------

os.makedirs("eval_outputs", exist_ok=True)

with open("eval_outputs/predictions_and_refs.txt", "w", encoding="utf-8") as f:
    for pred, ref in zip(all_preds, all_refs):
        f.write(f"Prediction: {pred.strip()}\n")
        f.write(f"Reference:  {ref.strip()}\n")
        f.write("-" * 80 + "\n")

with open("eval_outputs/predictions.txt", "w", encoding="utf-8") as f:
    for pred in all_preds:
        f.write(pred.strip() + "\n")

with open("eval_outputs/references.txt", "w", encoding="utf-8") as f:
    for ref in all_refs:
        f.write(ref.strip() + "\n")



🔵 SacreBLEU Score: 22.44

🟢 ROUGE Scores:
ROUGE1: F1=0.5623
ROUGE2: F1=0.3439
ROUGEL: F1=0.4666


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔴 BERTScore:
Precision: 0.5289
Recall:    0.4606
F1:        0.4935


In [23]:
def generate_answer(question, table_text, model, tokenizer, device, max_input_len=512, max_output_len=128):
    model.eval()
    
    input_text = f"question: {question} table: {table_text}"
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_input_len
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_output_len,
            num_beams=5,
            early_stopping=True
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

question = "What is the total sales in 2023?"
table_text = "Year | Sales\n2022 | 200\n2023 | 300\n2024 | 250"

answer = generate_answer(question, table_text, model, tokenizer, device)
print("Answer:", answer)


Answer: In 2023, the total sales are expected to reach 300,000.
