In [1]:
!pip install pandas numpy scikit-learn
!pip install rouge-score
!pip install bert-score
!pip install python-Levenshtein
!pip install sentence-transformers

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4a8407f5bf07cef9293506ccf75c06941193043eb213e18b1fed9dd2b5683651
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-man

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from rouge_score import rouge_scorer
from bert_score import BERTScorer
import Levenshtein
from sentence_transformers import SentenceTransformer, util
import torch
# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

2025-05-18 10:12:06.440020: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747563126.906351      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747563127.028206      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
# Load all batch result files.
result_dir = "/kaggle/input/outputs/results-blip/results-blip-1"
# os.listdir returns entries in arbitrary order; sort so the concatenated row
# order (and therefore the saved per-row CSV) is the same on every run.
result_files = sorted(
    os.path.join(result_dir, f)
    for f in os.listdir(result_dir)
    if f.startswith("blip_vqa_results_") and f.endswith(".csv")
)
# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# fail early with a message that names the directory and pattern searched.
if not result_files:
    raise FileNotFoundError(
        f"No 'blip_vqa_results_*.csv' files found in {result_dir!r}"
    )

all_batches = pd.concat([pd.read_csv(f) for f in result_files], ignore_index=True)

# Normalize text and keep only last word of predictions
def get_last_word(text):
    """Return the last whitespace-separated token of ``text``, lower-cased.

    Args:
        text: Raw model output. Non-string values are converted with ``str()``.

    Returns:
        The final token in lower case, or ``""`` when ``text`` is missing
        (``None`` or float NaN) or contains no non-whitespace characters.
    """
    # A missing CSV cell arrives from pandas as float NaN; str() would turn it
    # into the literal word "nan" and it would then be scored as a real answer.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    words = str(text).split()
    return words[-1].lower() if words else ""

# Snapshot the raw model outputs first: the 'blip_answer' column is overwritten
# with its last word below, and the saved CSV reports the untouched text as
# "Original_Prediction".
raw_predictions = all_batches['blip_answer'].tolist()

# Normalize references the same way predictions end up: lower-cased with no
# surrounding whitespace (get_last_word already drops whitespace on its side).
all_batches['answer'] = all_batches['answer'].astype(str).str.strip().str.lower()
all_batches['blip_answer'] = all_batches['blip_answer'].apply(get_last_word)

# Extract processed predictions and references
predictions = all_batches['blip_answer'].tolist()
references = all_batches['answer'].tolist()

# Compute exact-match metrics.
# Every row's true label is 1, and a prediction is labelled 1 only when it is
# string-equal to its reference. With no negative true labels there can be no
# false positives, so precision is 1.0 whenever anything matches and recall
# equals accuracy.
y_pred_bin = [int(p == r) for p, r in zip(predictions, references)]
y_true_bin = [1] * len(references)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Save raw and processed predictions next to the ground truth.
df_pred_ref = pd.DataFrame({
    "Original_Prediction": raw_predictions,
    "Processed_Prediction": predictions,
    "Ground_Truth": references,
    "Exact_Match_Correct": y_pred_bin
})
df_pred_ref.to_csv('predictions_vs_references_blip.csv', index=False)
print("Saved: predictions_vs_references_blip.csv")

# Seed the summary with the exact-match figures; later sections add to it.
final_metrics = dict(
    exact_match_accuracy=acc,
    exact_match_precision=prec,
    exact_match_recall=rec,
    exact_match_f1=f1,
)

# --- ROUGE Scores ---
print("\n--- ROUGE Scores ---")
rouge_types = ['rouge1', 'rougeL']
rouge_eval_scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

for hypothesis, target in zip(predictions, references):
    if hypothesis and target:
        # score() is called with the reference first, then the prediction.
        pair_result = rouge_eval_scorer.score(target, hypothesis)
        for rouge_type, values in rouge_scores.items():
            values.append(pair_result[rouge_type].fmeasure)
    else:
        # An empty string on either side counts as 0.0 for every ROUGE type.
        for values in rouge_scores.values():
            values.append(0.0)

for rouge_type, values in rouge_scores.items():
    metric_name = f"{rouge_type}_f1"
    final_metrics[metric_name] = np.mean(values) if values else 0.0
    print(f"Average {rouge_type.upper()} F1: {final_metrics[metric_name]:.3f}")

# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    # Only pairs in which both strings are non-empty are scored.
    valid_pairs = [
        (cand, gold) for cand, gold in zip(predictions, references) if cand and gold
    ]

    if not valid_pairs:
        print("No valid pairs for BERTScore")
    else:
        candidates = [pair[0] for pair in valid_pairs]
        golds = [pair[1] for pair in valid_pairs]
        P, R, F1 = bert_eval_scorer.score(candidates, golds)
        # All three means are evaluated before the summary dict is touched.
        final_metrics.update(
            bertscore_precision=P.mean().item(),
            bertscore_recall=R.mean().item(),
            bertscore_f1=F1.mean().item(),
        )
        print(f"BERTScore F1: {final_metrics['bertscore_f1']:.3f}")
except Exception as err:
    # Best-effort: report the failure and carry on without BERTScore entries.
    print(f"BERTScore Error: {err!s}")

# --- Levenshtein Similarity ---
print("\n--- Levenshtein Similarity ---")
lev_scores = []
for pred, ref in zip(predictions, references):
    if not pred and not ref:
        # Two empty strings are treated as identical.
        lev_scores.append(1.0)
    elif not pred or not ref:
        # Exactly one side is empty: no similarity.
        lev_scores.append(0.0)
    else:
        lev_scores.append(Levenshtein.ratio(pred, ref))
# np.mean([]) yields NaN plus a RuntimeWarning; fall back to 0.0 on an empty
# dataset, matching the guard used for the ROUGE averages.
final_metrics["levenshtein"] = np.mean(lev_scores) if lev_scores else 0.0
print(f"Levenshtein: {final_metrics['levenshtein']:.3f}")

# Persist the summary as a one-row CSV, then echo each metric to stdout.
metrics_frame = pd.DataFrame([final_metrics])
metrics_frame.to_csv("metrics_summary_blip.csv", index=False)

print("\nFinal Metrics for BLIP:")
for metric_name in final_metrics:
    metric_value = final_metrics[metric_name]
    print(f"{metric_name:20}: {metric_value:.3f}")

print("\nEvaluation complete for BLIP results!")

Saved: predictions_vs_references_blip.csv

--- ROUGE Scores ---
Average ROUGE1 F1: 0.403
Average ROUGEL F1: 0.403

--- BERTScore ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.812

--- Levenshtein Similarity ---
Levenshtein: 0.549

Final Metrics for BLIP:
exact_match_accuracy: 0.392
exact_match_precision: 1.000
exact_match_recall  : 0.392
exact_match_f1      : 0.563
rouge1_f1           : 0.403
rougeL_f1           : 0.403
bertscore_precision : 0.842
bertscore_recall    : 0.788
bertscore_f1        : 0.812
levenshtein         : 0.549

Evaluation complete for BLIP results!


In [4]:
# Load all batch result files.
result_dir = "/kaggle/input/outputs/results-blip2"
# os.listdir returns entries in arbitrary order; sort so the concatenated row
# order (and therefore the saved per-row CSV) is the same on every run.
result_files = sorted(
    os.path.join(result_dir, f)
    for f in os.listdir(result_dir)
    if f.startswith("blip_2_vqa_results_batch_") and f.endswith(".csv")
)
# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# fail early with a message that names the directory and pattern searched.
if not result_files:
    raise FileNotFoundError(
        f"No 'blip_2_vqa_results_batch_*.csv' files found in {result_dir!r}"
    )

all_batches = pd.concat([pd.read_csv(f) for f in result_files], ignore_index=True)

# Normalize text and keep only last word of predictions
def get_last_word(text):
    """Return the last whitespace-separated token of ``text``, lower-cased.

    Args:
        text: Raw model output. Non-string values are converted with ``str()``.

    Returns:
        The final token in lower case, or ``""`` when ``text`` is missing
        (``None`` or float NaN) or contains no non-whitespace characters.
    """
    # A missing CSV cell arrives from pandas as float NaN; str() would turn it
    # into the literal word "nan" and it would then be scored as a real answer.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    words = str(text).split()
    return words[-1].lower() if words else ""

# Snapshot the raw model outputs first: the 'blip_answer' column is overwritten
# with its last word below, and the saved CSV reports the untouched text as
# "Original_Prediction".
raw_predictions = all_batches['blip_answer'].tolist()

# Normalize references the same way predictions end up: lower-cased with no
# surrounding whitespace (get_last_word already drops whitespace on its side).
all_batches['answer'] = all_batches['answer'].astype(str).str.strip().str.lower()
all_batches['blip_answer'] = all_batches['blip_answer'].apply(get_last_word)

# Extract processed predictions and references
predictions = all_batches['blip_answer'].tolist()
references = all_batches['answer'].tolist()

# Compute exact-match metrics.
# Every row's true label is 1, and a prediction is labelled 1 only when it is
# string-equal to its reference. With no negative true labels there can be no
# false positives, so precision is 1.0 whenever anything matches and recall
# equals accuracy.
y_pred_bin = [int(p == r) for p, r in zip(predictions, references)]
y_true_bin = [1] * len(references)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Save raw and processed predictions next to the ground truth.
df_pred_ref = pd.DataFrame({
    "Original_Prediction": raw_predictions,
    "Processed_Prediction": predictions,
    "Ground_Truth": references,
    "Exact_Match_Correct": y_pred_bin
})
df_pred_ref.to_csv('predictions_vs_references_blip2.csv', index=False)
print("Saved: predictions_vs_references_blip2.csv")

# Seed the summary with the exact-match figures; later sections add to it.
final_metrics = dict(
    exact_match_accuracy=acc,
    exact_match_precision=prec,
    exact_match_recall=rec,
    exact_match_f1=f1,
)

# --- ROUGE Scores ---
print("\n--- ROUGE Scores ---")
rouge_types = ['rouge1', 'rougeL']
rouge_eval_scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

for hypothesis, target in zip(predictions, references):
    if hypothesis and target:
        # score() is called with the reference first, then the prediction.
        pair_result = rouge_eval_scorer.score(target, hypothesis)
        for rouge_type, values in rouge_scores.items():
            values.append(pair_result[rouge_type].fmeasure)
    else:
        # An empty string on either side counts as 0.0 for every ROUGE type.
        for values in rouge_scores.values():
            values.append(0.0)

for rouge_type, values in rouge_scores.items():
    metric_name = f"{rouge_type}_f1"
    final_metrics[metric_name] = np.mean(values) if values else 0.0
    print(f"Average {rouge_type.upper()} F1: {final_metrics[metric_name]:.3f}")

# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    # Only pairs in which both strings are non-empty are scored.
    valid_pairs = [
        (cand, gold) for cand, gold in zip(predictions, references) if cand and gold
    ]

    if not valid_pairs:
        print("No valid pairs for BERTScore")
    else:
        candidates = [pair[0] for pair in valid_pairs]
        golds = [pair[1] for pair in valid_pairs]
        P, R, F1 = bert_eval_scorer.score(candidates, golds)
        # All three means are evaluated before the summary dict is touched.
        final_metrics.update(
            bertscore_precision=P.mean().item(),
            bertscore_recall=R.mean().item(),
            bertscore_f1=F1.mean().item(),
        )
        print(f"BERTScore F1: {final_metrics['bertscore_f1']:.3f}")
except Exception as err:
    # Best-effort: report the failure and carry on without BERTScore entries.
    print(f"BERTScore Error: {err!s}")

# --- Levenshtein Similarity ---
print("\n--- Levenshtein Similarity ---")
lev_scores = []
for pred, ref in zip(predictions, references):
    if not pred and not ref:
        # Two empty strings are treated as identical.
        lev_scores.append(1.0)
    elif not pred or not ref:
        # Exactly one side is empty: no similarity.
        lev_scores.append(0.0)
    else:
        lev_scores.append(Levenshtein.ratio(pred, ref))
# np.mean([]) yields NaN plus a RuntimeWarning; fall back to 0.0 on an empty
# dataset, matching the guard used for the ROUGE averages.
final_metrics["levenshtein"] = np.mean(lev_scores) if lev_scores else 0.0
print(f"Levenshtein: {final_metrics['levenshtein']:.3f}")

# Persist the summary as a one-row CSV, then echo each metric to stdout.
metrics_frame = pd.DataFrame([final_metrics])
metrics_frame.to_csv("metrics_summary_blip2.csv", index=False)

print("\nFinal Metrics for BLIP-2:")
for metric_name in final_metrics:
    metric_value = final_metrics[metric_name]
    print(f"{metric_name:20}: {metric_value:.3f}")

print("\nEvaluation complete for BLIP-2 results!")

Saved: predictions_vs_references_blip2.csv

--- ROUGE Scores ---
Average ROUGE1 F1: 0.422
Average ROUGEL F1: 0.422

--- BERTScore ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.822

--- Levenshtein Similarity ---
Levenshtein: 0.565

Final Metrics for BLIP-2:
exact_match_accuracy: 0.408
exact_match_precision: 1.000
exact_match_recall  : 0.408
exact_match_f1      : 0.579
rouge1_f1           : 0.422
rougeL_f1           : 0.422
bertscore_precision : 0.858
bertscore_recall    : 0.792
bertscore_f1        : 0.822
levenshtein         : 0.565

Evaluation complete for BLIP-2 results!


In [6]:
# Load all batch result files.
result_dir = "/kaggle/input/outputs/results-vilt/results-vilt"
# os.listdir returns entries in arbitrary order; sort so the concatenated row
# order (and therefore the saved per-row CSV) is the same on every run.
result_files = sorted(
    os.path.join(result_dir, f)
    for f in os.listdir(result_dir)
    if f.startswith("vilt_vqa_results_") and f.endswith(".csv")
)
# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# fail early with a message that names the directory and pattern searched.
if not result_files:
    raise FileNotFoundError(
        f"No 'vilt_vqa_results_*.csv' files found in {result_dir!r}"
    )

all_batches = pd.concat([pd.read_csv(f) for f in result_files], ignore_index=True)

# Normalize text and keep only last word of predictions
def get_last_word(text):
    """Return the last whitespace-separated token of ``text``, lower-cased.

    Args:
        text: Raw model output. Non-string values are converted with ``str()``.

    Returns:
        The final token in lower case, or ``""`` when ``text`` is missing
        (``None`` or float NaN) or contains no non-whitespace characters.
    """
    # A missing CSV cell arrives from pandas as float NaN; str() would turn it
    # into the literal word "nan" and it would then be scored as a real answer.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    words = str(text).split()
    return words[-1].lower() if words else ""

# Snapshot the raw model outputs first: the 'vilt_answer' column is overwritten
# with its last word below, and the saved CSV reports the untouched text as
# "Original_Prediction".
raw_predictions = all_batches['vilt_answer'].tolist()

# Normalize references the same way predictions end up: lower-cased with no
# surrounding whitespace (get_last_word already drops whitespace on its side).
all_batches['answer'] = all_batches['answer'].astype(str).str.strip().str.lower()
all_batches['vilt_answer'] = all_batches['vilt_answer'].apply(get_last_word)

# Extract processed predictions and references
predictions = all_batches['vilt_answer'].tolist()
references = all_batches['answer'].tolist()

# Compute exact-match metrics.
# Every row's true label is 1, and a prediction is labelled 1 only when it is
# string-equal to its reference. With no negative true labels there can be no
# false positives, so precision is 1.0 whenever anything matches and recall
# equals accuracy.
y_pred_bin = [int(p == r) for p, r in zip(predictions, references)]
y_true_bin = [1] * len(references)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Save raw and processed predictions next to the ground truth.
df_pred_ref = pd.DataFrame({
    "Original_Prediction": raw_predictions,
    "Processed_Prediction": predictions,
    "Ground_Truth": references,
    "Exact_Match_Correct": y_pred_bin
})
df_pred_ref.to_csv('predictions_vs_references_vilt.csv', index=False)
print("Saved: predictions_vs_references_vilt.csv")

# Seed the summary with the exact-match figures; later sections add to it.
final_metrics = dict(
    exact_match_accuracy=acc,
    exact_match_precision=prec,
    exact_match_recall=rec,
    exact_match_f1=f1,
)

# --- ROUGE Scores ---
print("\n--- ROUGE Scores ---")
rouge_types = ['rouge1', 'rougeL']
rouge_eval_scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
rouge_scores = {rouge_type: [] for rouge_type in rouge_types}

for hypothesis, target in zip(predictions, references):
    if hypothesis and target:
        # score() is called with the reference first, then the prediction.
        pair_result = rouge_eval_scorer.score(target, hypothesis)
        for rouge_type, values in rouge_scores.items():
            values.append(pair_result[rouge_type].fmeasure)
    else:
        # An empty string on either side counts as 0.0 for every ROUGE type.
        for values in rouge_scores.values():
            values.append(0.0)

for rouge_type, values in rouge_scores.items():
    metric_name = f"{rouge_type}_f1"
    final_metrics[metric_name] = np.mean(values) if values else 0.0
    print(f"Average {rouge_type.upper()} F1: {final_metrics[metric_name]:.3f}")

# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=device)
    # Only pairs in which both strings are non-empty are scored.
    valid_pairs = [
        (cand, gold) for cand, gold in zip(predictions, references) if cand and gold
    ]

    if not valid_pairs:
        print("No valid pairs for BERTScore")
    else:
        candidates = [pair[0] for pair in valid_pairs]
        golds = [pair[1] for pair in valid_pairs]
        P, R, F1 = bert_eval_scorer.score(candidates, golds)
        # All three means are evaluated before the summary dict is touched.
        final_metrics.update(
            bertscore_precision=P.mean().item(),
            bertscore_recall=R.mean().item(),
            bertscore_f1=F1.mean().item(),
        )
        print(f"BERTScore F1: {final_metrics['bertscore_f1']:.3f}")
except Exception as err:
    # Best-effort: report the failure and carry on without BERTScore entries.
    print(f"BERTScore Error: {err!s}")

# --- Levenshtein Similarity ---
print("\n--- Levenshtein Similarity ---")
lev_scores = []
for pred, ref in zip(predictions, references):
    if not pred and not ref:
        # Two empty strings are treated as identical.
        lev_scores.append(1.0)
    elif not pred or not ref:
        # Exactly one side is empty: no similarity.
        lev_scores.append(0.0)
    else:
        lev_scores.append(Levenshtein.ratio(pred, ref))
# np.mean([]) yields NaN plus a RuntimeWarning; fall back to 0.0 on an empty
# dataset, matching the guard used for the ROUGE averages.
final_metrics["levenshtein"] = np.mean(lev_scores) if lev_scores else 0.0
print(f"Levenshtein: {final_metrics['levenshtein']:.3f}")

# Persist the summary as a one-row CSV, then echo each metric to stdout.
metrics_frame = pd.DataFrame([final_metrics])
metrics_frame.to_csv("metrics_summary_vilt.csv", index=False)

print("\nFinal Metrics for ViLT:")
for metric_name in final_metrics:
    metric_value = final_metrics[metric_name]
    print(f"{metric_name:20}: {metric_value:.3f}")

print("\nEvaluation complete for ViLT results!")

Saved: predictions_vs_references_vilt.csv

--- ROUGE Scores ---
Average ROUGE1 F1: 0.278
Average ROUGEL F1: 0.278

--- BERTScore ---


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1: 0.777

--- Levenshtein Similarity ---
Levenshtein: 0.432

Final Metrics for ViLT:
exact_match_accuracy: 0.272
exact_match_precision: 1.000
exact_match_recall  : 0.272
exact_match_f1      : 0.428
rouge1_f1           : 0.278
rougeL_f1           : 0.278
bertscore_precision : 0.804
bertscore_recall    : 0.758
bertscore_f1        : 0.777
levenshtein         : 0.432

Evaluation complete for ViLT results!
