In [1]:
import difflib
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score, single_meteor_score
from nltk.tokenize import word_tokenize

In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Error loading wordnet: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


False

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


False

In [4]:
#Open data files
msvd = open("data/msvd.txt", 'r') #MSVD dataset reference file
greedy = open("data/greedy.txt", 'r') #Greedy CNN ran with MSVD videos
a_greedy = open("data/greedy_adversarial.txt", 'r') #Greedy CNN ran with adversarial examples of MSVD videos
llava = open("data/llava.txt", 'r') #LLaVA UI ran with MSVD videos
a_llava = open("data/llava_adversarial.txt", 'r') #LLaVA UI ran with adversarial examples of MSVD videos

In [5]:
msvd_captions = {}
for line in msvd:
    video_id, caption = line.strip().split('\t')
    msvd_captions.setdefault(video_id, []).append(caption)

In [6]:
greedy_captions = {}
for line in greedy:
    video_id, caption = line.strip().split('\t')
    greedy_captions.setdefault(video_id, []).append(caption)

In [7]:
a_greedy_captions = {}
for line in a_greedy:
    video_id, caption = line.strip().split('\t')
    a_greedy_captions.setdefault(video_id, []).append(caption)

In [8]:
llava_captions = {}
for line in llava:
    video_id, caption = line.strip().split('\t')
    llava_captions.setdefault(video_id, []).append(caption)

In [9]:
a_llava_captions = {}
for line in a_llava:
    video_id, caption = line.strip().split('\t')
    a_llava_captions.setdefault(video_id, []).append(caption)

In [10]:
def format_captions_file(file_path):
    video_captions = {}
    with open(file_path, 'r') as file:
        for line in file:
            video_name, caption = line.strip().split('\t')
            caption = caption.replace('\n', '')  # Remove any newlines
            if video_name in video_captions:
                video_captions[video_name].append(caption.split())
            else:
                video_captions[video_name] = [caption.split()]
    return video_captions

In [11]:
msvd_data = format_captions_file("data/msvd.txt")
greedy_data = format_captions_file("data/greedy.txt")
a_greedy_data = format_captions_file("data/greedy_adversarial.txt")
llava_data = format_captions_file("data/llava.txt")
a_llava_data = format_captions_file("data/llava_adversarial.txt")

In [12]:
def calculate_rouge_l_score(reference_data, hypothesis_data):
    rouge = Rouge()
    scores = []
    
    for video_id, hypothesis_caption in hypothesis_data.items():
        reference_captions = reference_data.get(video_id, [])
        
        reference_scores = []
        for reference_caption in reference_captions:
            score = rouge.get_scores(hypothesis_caption[0], reference_caption)
            rouge_l_score = score[0]['rouge-l']['f']
            reference_scores.append(rouge_l_score)
        
        video_score = sum(reference_scores) / len(reference_scores) #avg rouge score of different reference captions for same video id
        scores.append(video_score)

    return sum(scores) / len(scores) #average of rouge-l scores for each video_id


In [13]:
#ROUGE-L Score for Greedy CNN 
r_greedy = calculate_rouge_l_score(msvd_captions, greedy_captions)
r_greedy

0.3031198773294219

In [14]:
#ROUGE-L Score for Greedy CNN (Adversarial)
r_a_greedy = calculate_rouge_l_score(msvd_captions, a_greedy_captions)
r_a_greedy

0.3023355286547355

In [15]:
#ROUGE-L Score for LLaVA
r_llava = calculate_rouge_l_score(msvd_captions, llava_captions)
r_llava

0.2426077790913993

In [16]:
#ROUGE-L Score for LLaVA
r_a_llava = calculate_rouge_l_score(msvd_captions, a_llava_captions)
r_a_llava

0.23943844852991245

In [17]:
def calculate_meteor_score(reference_data, hypothesis_data):
    score = 0
    video_count = 0
    for video_name, hypothesis_caption in hypothesis_data.items():
        reference_captions = reference_data.get(video_name, [])  # Get reference captions for the current video
        score += round(meteor_score(reference_captions, hypothesis_caption[0]), 4)
        video_count+=1

    score = round(score/video_count, 4)
    return score

In [18]:
#METEOR Score for Greedy CNN
m_greedy = calculate_meteor_score(msvd_data, greedy_data)
m_greedy

0.4803

In [19]:
#METEOR Score for Greedy CNN (Adversarial)
m_a_greedy = calculate_meteor_score(msvd_data, a_greedy_data)
m_a_greedy

0.4619

In [20]:
#METEOR Score for LLaVA
m_llava = calculate_meteor_score(msvd_data, llava_data)
m_llava

0.6001

In [21]:
#METEOR Score for LLaVA (Adversarial)
m_a_llava = calculate_meteor_score(msvd_data, a_llava_data)
m_a_llava

0.6029

In [22]:
def calculate_bleu_score(reference_captions, hypothesis_captions):
        
    hypotheses = []
    list_of_references = []

    for video_name, hypothesis_caption in hypothesis_captions.items():
        list_of_references.append(reference_captions.get(video_name))   # Get reference captions for the current video
        hypotheses.append(hypothesis_caption[0])

    bleu_score = corpus_bleu(list_of_references, hypotheses, weights = [(1,0,0,0), (0,1,0,0), (0,0,1,0), (0,0,0,1)]) 
    print(f"BLEU-1: {bleu_score[0]}\nBLEU-2: {bleu_score[1]}\nBLEU-3: {bleu_score[2]}\nBLEU-4: {bleu_score[3]}\n")


In [23]:
#BLEU-4 Score for Greedy CNN 
b_greedy = calculate_bleu_score(msvd_data, greedy_data)
b_greedy

BLEU-1: 0.7508650519031141
BLEU-2: 0.3556485355648535
BLEU-3: 0.19576719576719576
BLEU-4: 0.06474820143884892



In [24]:
#BLEU-4 Score for Greedy CNN (Adversarial Examples) 
b_a_greedy = calculate_bleu_score(msvd_data, a_greedy_data)
b_a_greedy

BLEU-1: 0.7460035523978685
BLEU-2: 0.3304535637149028
BLEU-3: 0.15977961432506887
BLEU-4: 0.03802281368821293



In [25]:
#BLEU-4 Score for LLaVA 
b_llava = calculate_bleu_score(msvd_data, llava_data)
b_llava

BLEU-1: 0.5592982456140351
BLEU-2: 0.2573584905660377
BLEU-3: 0.11020408163265306
BLEU-4: 0.050666666666666665



In [26]:
#BLEU-4 Score for LLaVA (Adversarial Examples) 
b_a_llava = calculate_bleu_score(msvd_data, a_llava_data)
b_a_llava

BLEU-1: 0.5717344753747323
BLEU-2: 0.26518063028439665
BLEU-3: 0.11157368859283927
BLEU-4: 0.04268846503178928



$\textbf{References:}$

$\text{ROUGE-L Score Code:}$
[1] https://pypi.org/project/rouge/

$\text{METEOR Score Code:}$
[2] https://www.nltk.org/api/nltk.translate.meteor_score.html

$\text{BLEU Scores Code:}$
[3] https://www.nltk.org/_modules/nltk/translate/bleu_score.html
