In [1]:
import together
import pandas as pd
import numpy as np
from tqdm import tqdm


In [2]:
## These explanations are generated by llama 3.2 [Using together.ai api call]

df_explanation = pd.read_csv('explanations.csv')

def remove_jpg_extension(image_name):
    final_image_name = image_name.split('.')[0]
    return final_image_name

df_explanation['image'] = df_explanation['image'].apply(remove_jpg_extension)

df_explanation.drop('Unnamed: 2',axis=1,inplace=True)

In [3]:
df_test = pd.read_csv('test_df.tsv',sep='\t')
df_test.columns = ["image",'captions','sarcasm_explanations']
df_test

Unnamed: 0,image,captions,sarcasm_explanations
0,700183392969756672,'oh i so love working late from home # work #...,the author hates working late from home.
1,928753954745475072,'yeaah ! buddy o miracle worker # infj emoji_...,"your anxiety is not cured when someone says ""d..."
2,935133439011049473,'rt <user> : something different ..... a delay...,the author is pissed to watch a full train lea...
3,933466049697198080,'oh really linkedin ? thanks for the super use...,the author doesn't find such notifications fro...
4,699788403940978689,'our beautiful view along pch ! # pch # highw...,the author can't really see the view because o...
...,...,...,...
346,1011850445043480900_1580447253,âï¸âï¸âï¸âï¸âï¸âï¸âï¸âï...,it isn't a cool week if it's 100 degrees.
347,636237012294327831_256939246,She's nothing like her dad... ;) #lovethem #fa...,"she's exactly like her dad, both are making si..."
348,899685897251069952,'this eclipse is even cooler than i thought it...,the author is disappointed with this eclipse s...
349,878368201221914624,'<user> app radar is definitely right on targe...,<user> app radar isn't right on target.


In [4]:
# We need to extract only test images from explanation dataset

test_image_list = df_test['image'].to_list()

df_explanation = df_explanation[df_explanation['image'].isin(test_image_list)]

In [5]:
df_explanation

Unnamed: 0,image,llm_explanation
0,899095756979576832,The image depicts a large outdoor screen broad...
2,704299596550434816,The image shows a message from an unknown poli...
3,1030084668871077796_1449452234,The image showcases a humorous quote about a p...
17,858271535584468992,The image presents a concise yet informative v...
28,725109055434289152,The image shows a weather forecast for Pittsbu...
...,...,...
3457,899685897251069952,The image depicts a photograph of a blue sky w...
3470,566063947261064369_324307476,The image shows a white car parked in a handic...
3482,869958145790980097,The image is a notice announcing the permanent...
3494,1000900004679480792_43891756,The image presents a close-up view of a broken...


In [6]:
final_df = pd.merge(df_explanation,df_test,how='inner',on='image')
final_df

Unnamed: 0,image,llm_explanation,captions,sarcasm_explanations
0,899095756979576832,The image depicts a large outdoor screen broad...,'seriously why would you ever live here ... #...,"who wouldn't live here, it's fun."
1,704299596550434816,The image shows a message from an unknown poli...,'awww poor cruz is broke and desperately needs...,this mail is fraud and one shouldn’t really be...
2,1030084668871077796_1449452234,The image showcases a humorous quote about a p...,HAPPENS EVERY TIME @rebelcircus #rebelcircus #...,it's awful to have to pee after settling into ...
3,858271535584468992,The image presents a concise yet informative v...,'# brexit explained see how the eu keeps all t...,the eu doesn't keep the money for itself.
4,725109055434289152,The image shows a weather forecast for Pittsbu...,forecast for pittsburgh is looking great ! can...,forecast for pittsburgh is rainy and not great...
...,...,...,...,...
346,899685897251069952,The image depicts a photograph of a blue sky w...,'this eclipse is even cooler than i thought it...,the author is disappointed with this eclipse s...
347,566063947261064369_324307476,The image shows a white car parked in a handic...,This guy gets a gold star for excellent parking.,this guy has parked his car partially covering...
348,869958145790980097,The image is a notice announcing the permanent...,'thanks to our elected mla and mp for raising ...,the author is pissed at their elected mla and ...
349,1000900004679480792_43891756,The image presents a close-up view of a broken...,Wow thanks Apple! I've never had a cord last t...,one hates when apple cords don't last longer t...


In [7]:
final_df['captions'][0]

"'seriously why would you ever live here ...  # lastbestplace # gocatsgo'"

In [8]:
context1 = f"This is the caption of image {final_df['captions'][0]}. The literal explanations of image is {final_df['llm_explanation'][0]}. The sarcasm explanation of the image given the caption is {final_df['sarcasm_explanations'][0]}."
context2 = f"This is the caption of image {final_df['captions'][1]}. The literal explanations of image is {final_df['llm_explanation'][1]}. The sarcasm explanation of the image given the caption is {final_df['sarcasm_explanations'][1]}."


In [9]:
# We'll use llm to get explanations by passing llm_commonsense_reasonsing and captions


import os
from together import Together

os.environ['TOGETHER_API_KEY'] = 'dc2960d78c59037c21bfc5f40e79b2597d9a37d856e504bdbb393a99702984aa'

api_key = os.environ.get('TOGETHER_API_KEY')

if not api_key:
    raise ValueError("API key not found. Please set the TOGETHER_API_KEY environment variable.")

client = Together(api_key=api_key)



In [10]:
generated_explanations = []

for index, row in tqdm(final_df.iterrows(), total=len(final_df), dynamic_ncols=True):
    caption = row['captions']
    exp = row['llm_explanation']
    
    # Few-shot learning prompt construction with examples
    prompt = f"""
    Here are a few examples:
    
    {context1}
    {context2}
    
    Now, given this new image caption: {caption}, and this explanation: {exp}, explain the hidden sarcasm in just one sentence.
    """
    
    try:
        response = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
            messages=[
                {
                    "role" : "system",
                    "content" : "You are a helpful assistant who explains hidden sarcasm in images based on image explanations and captions."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            max_tokens=512,
            temperature=0.7,
            top_p=0.7,
            top_k=50,
            repetition_penalty=1,
            stop=["<|eot_id|>", "<|eom_id|>"],
            stream=False  
        )
        
        if hasattr(response, 'choices') and response.choices:
            choice = response.choices[0]
            if hasattr(choice, 'message') and choice.message:
                content = choice.message.content
                generated_explanations.append(content)
            else:
                generated_explanations.append("No message content found.")
        else:
            generated_explanations.append("No choices found in the response.")
    
    except Exception as e:
        generated_explanations.append(f"Error: {e}")

100%|█████████████████████████████████████████| 351/351 [40:59<00:00,  7.01s/it]


In [11]:
!pip install nltk bert-score rouge-score 

Defaulting to user installation because normal site-packages is not writeable


In [12]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import bert_score

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
final_df

Unnamed: 0,image,llm_explanation,captions,sarcasm_explanations
0,899095756979576832,The image depicts a large outdoor screen broad...,'seriously why would you ever live here ... #...,"who wouldn't live here, it's fun."
1,704299596550434816,The image shows a message from an unknown poli...,'awww poor cruz is broke and desperately needs...,this mail is fraud and one shouldn’t really be...
2,1030084668871077796_1449452234,The image showcases a humorous quote about a p...,HAPPENS EVERY TIME @rebelcircus #rebelcircus #...,it's awful to have to pee after settling into ...
3,858271535584468992,The image presents a concise yet informative v...,'# brexit explained see how the eu keeps all t...,the eu doesn't keep the money for itself.
4,725109055434289152,The image shows a weather forecast for Pittsbu...,forecast for pittsburgh is looking great ! can...,forecast for pittsburgh is rainy and not great...
...,...,...,...,...
346,899685897251069952,The image depicts a photograph of a blue sky w...,'this eclipse is even cooler than i thought it...,the author is disappointed with this eclipse s...
347,566063947261064369_324307476,The image shows a white car parked in a handic...,This guy gets a gold star for excellent parking.,this guy has parked his car partially covering...
348,869958145790980097,The image is a notice announcing the permanent...,'thanks to our elected mla and mp for raising ...,the author is pissed at their elected mla and ...
349,1000900004679480792_43891756,The image presents a close-up view of a broken...,Wow thanks Apple! I've never had a cord last t...,one hates when apple cords don't last longer t...


In [15]:
y_true = final_df['sarcasm_explanations'].values.tolist()
y_pred = generated_explanations

bleu_1 = 0
bleu_2 = 0
bleu_3 = 0
bleu_4 = 0
count = 0
weights_1 = (1./1.,)
weights_2 = (1./2. , 1./2.)
weights_3 = (1./3., 1./3., 1./3.)
weights_4 = (1./4., 1./4., 1./4., 1./4.)
met = 0

for reference, hypothesis in zip(y_true, y_pred):
    # met += nltk.translate.meteor_score.meteor_score([reference], hypothesis)
    reference = reference.split()
    hypothesis = hypothesis.split()
    bleu_1 += sentence_bleu([reference], hypothesis, weights_1) 
    bleu_2 += sentence_bleu([reference], hypothesis, weights_2)
    bleu_3 += sentence_bleu([reference], hypothesis, weights_3)
    bleu_4 += sentence_bleu([reference], hypothesis, weights_4)
    count += 1

bleu_1 = bleu_1/count
bleu_2 = bleu_2/count
bleu_3 = bleu_3/count
bleu_4 = bleu_4/count
# met = met/count
print("BLEU-1:", bleu_1)
print("BLEU-2:", bleu_2)
print("BLEU-3:", bleu_3)
print("BLEU-4:", bleu_4)

BLEU-1: 0.12338279100616056
BLEU-2: 0.05105599091408132
BLEU-3: 0.022603889230684685
BLEU-4: 0.010072774776937167
METEOR: 0


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [15]:
nltk.download('wordnet')

bleu_scores = []

references = final_df['sarcasm_explanations'].values.tolist()
e_new = generated_explanations

for ref, gen in zip(references, e_new):
    reference_tokens = [ref.split()]  
    candidate_tokens = gen.split()  
    score = sentence_bleu(reference_tokens, candidate_tokens)
    bleu_scores.append(score)
    print(f"BLEU score for reference: {ref} \nGenerated: {gen}\nScore: {score:.4f}\n")

average_score = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU score: {average_score:.4f}")


BLEU score for reference: who wouldn't live here, it's fun. 
Generated: The hidden sarcasm in the image is that the speaker is actually suggesting the opposite, implying that the location is desirable and fun, rather than a place to be avoided.
Score: 0.0000

BLEU score for reference: this mail is fraud and one shouldn’t really be worried about it. 
Generated: The hidden sarcasm in the image is that the speaker is being facetious and doesn't actually feel sorry for the politician, implying that the plea for donations is insincere or manipulative.
Score: 0.0000

BLEU score for reference: it's awful to have to pee after settling into bed with blankets, laptop, headphones, a drink and everything else. 
Generated: The hidden sarcasm in the image is that it humorously highlights the frustrating and relatable experience of finally getting comfortable, only to immediately realize the need to use the restroom, poking fun at the inconvenience.
Score: 0.0000

BLEU score for reference: the eu doe

[nltk_data] Downloading package wordnet to /home/priyash7/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [16]:
from rouge_score import rouge_scorer
import numpy as np

# Initialize the scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Assume `references` is a list of reference texts and `e_new` is the list of generated texts.
      # Your list of generated texts

# Initialize accumulators for ROUGE scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Iterate over reference and generated text pairs
for ref, gen in zip(references, e_new):
    scores = scorer.score(ref, gen)
    
    # Append scores to the accumulators
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)
    
    # Print individual scores (optional)
    print(f"Reference: {ref}\nGenerated: {gen}\n")
    print(f"ROUGE-1: {scores['rouge1']}")
    print(f"ROUGE-2: {scores['rouge2']}")
    print(f"ROUGE-L: {scores['rougeL']}\n")

# Calculate the average scores for each metric
avg_rouge1 = np.mean(rouge1_scores)
avg_rouge2 = np.mean(rouge2_scores)
avg_rougeL = np.mean(rougeL_scores)

# Print the average ROUGE scores
print(f"Average ROUGE-1 F1: {avg_rouge1}")
print(f"Average ROUGE-2 F1: {avg_rouge2}")
print(f"Average ROUGE-L F1: {avg_rougeL}")


Reference: who wouldn't live here, it's fun.
Generated: The hidden sarcasm in the image is that the speaker is actually suggesting the opposite, implying that the location is desirable and fun, rather than a place to be avoided.

ROUGE-1: Score(precision=0.03333333333333333, recall=0.125, fmeasure=0.052631578947368425)
ROUGE-2: Score(precision=0.0, recall=0.0, fmeasure=0.0)
ROUGE-L: Score(precision=0.03333333333333333, recall=0.125, fmeasure=0.052631578947368425)

Reference: this mail is fraud and one shouldn’t really be worried about it.
Generated: The hidden sarcasm in the image is that the speaker is being facetious and doesn't actually feel sorry for the politician, implying that the plea for donations is insincere or manipulative.

ROUGE-1: Score(precision=0.125, recall=0.3076923076923077, fmeasure=0.17777777777777778)
ROUGE-2: Score(precision=0.0, recall=0.0, fmeasure=0.0)
ROUGE-L: Score(precision=0.09375, recall=0.23076923076923078, fmeasure=0.13333333333333333)

Reference: it's

In [17]:
from collections import Counter

def simple_meteor_score(reference, e_new):
    def word_matches(ref, hyp):
        ref_counter = Counter(ref.split())
        hyp_counter = Counter(hyp.split())
        
        matches = sum(min(ref_counter[word], hyp_counter[word]) for word in hyp_counter)
        
        return matches
    
    def precision_recall(matches, reference_len, e_new_len):
        precision = matches / e_new_len if e_new_len > 0 else 0
        
        recall = matches / reference_len if reference_len > 0 else 0
        
        return precision, recall
    
    all_scores = []
    
    for hyp in e_new:
        hypothesis_scores = []
        for ref in reference:
            matches = word_matches(ref, hyp)
            precision, recall = precision_recall(matches, len(ref.split()), len(hyp.split()))

            if precision + recall == 0:
                f_mean = 0
            else:
                f_mean = (10 * precision * recall) / (9 * precision + recall)
            
            hypothesis_scores.append(f_mean)
        all_scores.append(max(hypothesis_scores))
    return np.mean(all_scores)


meteor_score = simple_meteor_score(references, e_new)
print(f"Simplified METEOR score: {meteor_score:.4f}")

Simplified METEOR score: 0.4199
