# Project Extension
## From Video Segments to Textual Answers Using NLQ and Video-LLaVA

## Setting Up AWS and Ego4D CLI for Data Access and Video Processing

In [1]:
import os
# AWS credentials exported as environment variables; the shell cell that runs
# `aws configure set ...` reads them back as $AWS_ACCESS_KEY_ID and
# $AWS_SECRET_ACCESS_KEY.
# The values are empty in this copy of the notebook (presumably redacted) --
# supply your own before running, and do not commit real keys.
# NOTE: these assignments overwrite any value already present in the environment.
os.environ['AWS_ACCESS_KEY_ID'] = ""
os.environ['AWS_SECRET_ACCESS_KEY'] = ""

In [2]:
%%bash
# Install the AWS CLI v2 and register the credentials exported by the previous
# cell. `%%bash` must stay on the first line of the cell: IPython only
# recognises a cell magic there.

# Set up the AWS CLI
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip -o awscliv2.zip >/dev/null
# Installer output (including any error) is discarded on purpose to keep the cell quiet.
sudo ./aws/install >/dev/null 2>&1
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" && aws configure set aws_secret_access_key "$AWS_SECRET_ACCESS_KEY"
rm "awscliv2.zip"


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 58.0M  100 58.0M    0     0   205M      0 --:--:-- --:--:-- --:--:--  205M


In [3]:
# Set up the Ego4D CLI
!pip install ego4d

Collecting ego4d
  Downloading ego4d-1.7.3.tar.gz (94 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/94.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.5/94.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from ego4d)
  Downloading boto3-1.35.17-py3-none-any.whl.metadata (6.6 kB)
Collecting dataclasses-json (from ego4d)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting iopath (from ego4d)
  Downloading iopath-0.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting botocore<1.36.0,>=1.35.17 (from boto3->ego4d)
  Downloading bo

## **From video interval to a textual answer**
### We aim to select 50 queries and transition from video intervals to textual answers. To ensure the best results, we will focus on queries where the model achieved the highest IoU (Intersection over Union) scores, indicating the most accurate alignment between predicted and actual video segments.


**Data Loading:**
We load ground truth (nlq_val.json) and predictions (vslnet_19_3240_preds.json), containing actual and predicted video intervals for specific queries.

**IoU Calculation:**
IoU measures how well predicted segments match actual ones, calculated as the overlap divided by the union of the intervals.

**Extracting Data:**
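For each query, we compute the IoU between its ground-truth interval and the first predicted interval returned for it, record the key details (clip and annotation IDs, predicted start/end times, query text), and sort the results by IoU to prioritize the best predictions.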

**Selecting Top 50:**
The top 50 predictions with the highest IoU are saved in a JSON file (selected_best_50.json) for further analysis.

In [7]:
import json

# Load data from ground truth and prediction JSON files
def load_data(ground_truth_path, predicted_path):
    """Parse the ground-truth and prediction JSON files.

    Args:
        ground_truth_path: Path of the ground-truth annotation JSON file.
        predicted_path: Path of the model prediction JSON file.

    Returns:
        A ``(ground_truth_data, predicted_data)`` tuple with the parsed
        contents of the two files, in that order.
    """
    parsed = []
    # The ground-truth file is read (and closed) before the predictions file.
    for json_path in (ground_truth_path, predicted_path):
        with open(json_path, 'r') as handle:
            parsed.append(json.load(handle))
    return parsed[0], parsed[1]

# Calculate Intersection over Union (IoU)
def calculate_iou(gt_start, gt_end, pred_start, pred_end):
    """Return the temporal IoU of a ground-truth and a predicted interval.

    Args:
        gt_start: Start of the ground-truth interval.
        gt_end: End of the ground-truth interval.
        pred_start: Start of the predicted interval.
        pred_end: End of the predicted interval.

    Returns:
        Overlap length divided by the combined length of the two intervals
        (sum of both lengths minus the overlap), or 0 when that combined
        length is not positive.
    """
    raw_overlap = min(gt_end, pred_end) - max(gt_start, pred_start)
    # Disjoint intervals give a negative difference; clamp it to zero.
    overlap = raw_overlap if raw_overlap > 0 else 0
    combined = (gt_end - gt_start) + (pred_end - pred_start) - overlap
    if combined > 0:
        return overlap / combined
    return 0

# Process each clip, match queries with predictions, and calculate IoU
def process_clip(clip, predictions, seen_set):
    """Score the first predicted interval of every language query in one clip.

    For each query of each annotation in ``clip``, the first record in
    ``predictions`` that has the same ``clip_uid``, ``annotation_uid`` and
    ``query_idx`` and a non-empty ``predicted_times`` list is selected. The
    IoU is computed between the query's ``clip_start_sec``/``clip_end_sec``
    and the first interval of that record; the record's remaining intervals
    are not used.

    Args:
        clip: Ground-truth clip dict with ``clip_uid`` and ``annotations``
            (each holding ``annotation_uid`` and ``language_queries``).
        predictions: Re-iterable collection of prediction dicts with
            ``clip_uid``, ``annotation_uid``, ``query_idx`` and
            ``predicted_times`` (a list of ``[start, end]`` pairs).
        seen_set: Set of ``(clip_uid, annotation_uid, query_idx)`` keys that
            were already scored. It is updated in place so that a query is
            scored at most once across calls.

    Returns:
        A list with one dict per newly scored query, holding ``clip_uid``,
        ``annotation_uid``, ``iou``, ``predicted_start``, ``predicted_end``
        and ``query``.
    """
    clip_uid = clip.get('clip_uid')

    # Single pass over the predictions: remember, per (annotation, query index),
    # the first interval of the first record that has any interval at all.
    # This replaces a rescan of every prediction for every query.
    first_interval = {}
    for result in predictions:
        if result.get('clip_uid') != clip_uid:
            continue
        predicted_times = result.get('predicted_times') or []
        if not predicted_times:
            continue
        lookup = (result.get('annotation_uid'), result.get('query_idx'))
        first_interval.setdefault(lookup, predicted_times[0])

    results = []
    for annotation in clip.get('annotations', []):
        annotation_uid = annotation.get('annotation_uid')
        for query_index, query in enumerate(annotation.get('language_queries', [])):
            pred_time = first_interval.get((annotation_uid, query_index))
            key = (clip_uid, annotation_uid, query_index)
            if pred_time is None or key in seen_set:
                continue
            seen_set.add(key)

            pred_start, pred_end = pred_time
            iou = calculate_iou(query.get('clip_start_sec'), query.get('clip_end_sec'), pred_start, pred_end)
            results.append({
                'clip_uid': clip_uid,
                'annotation_uid': annotation_uid,
                'iou': iou,
                'predicted_start': pred_start,
                'predicted_end': pred_end,
                'query': query.get('query')
            })
    return results

# Extract and sort results by IoU, returning the top N entries
def extract_and_sort_data(ground_truth_data, predicted_data, top_n=50):
    """Score every clip of every video and keep the ``top_n`` best entries.

    Args:
        ground_truth_data: Parsed ground-truth JSON; its ``videos`` list is
            walked, and within each video its ``clips`` list.
        predicted_data: Parsed prediction JSON; its ``results`` list is
            handed to ``process_clip``.
        top_n: Maximum number of entries to return.

    Returns:
        The entries produced by ``process_clip``, ordered by descending
        ``iou`` and truncated to ``top_n`` items.
    """
    already_scored = set()
    collected = []
    for video in ground_truth_data.get('videos', []):
        for clip in video.get('clips', []):
            collected += process_clip(clip, predicted_data.get('results', []), already_scored)

    ranked = sorted(collected, key=lambda item: item['iou'], reverse=True)
    return ranked[:top_n]

# Save the extracted data to a JSON file
def save_data(extracted_data, output_path):
    """Write ``extracted_data`` to ``output_path`` as JSON indented by 4 spaces.

    Args:
        extracted_data: JSON-serialisable object to store.
        output_path: Destination file; it is created or overwritten.
    """
    with open(output_path, mode='w') as sink:
        json.dump(extracted_data, sink, indent=4)

# Main process: load data, extract top 50 results, and save them
# Both input files are expected under /content; the paths are hard-coded.
ground_truth_data, predicted_data = load_data('/content/nlq_val.json', '/content/vslnet_19_3240_preds.json')
# top_n is left at its default of 50.
extracted_data = extract_and_sort_data(ground_truth_data, predicted_data)
# Later cells read this file back to download clips and cut segments.
save_data(extracted_data, '/content/selected_best_50.json')

print("Data saved to 'selected_best_50.json'")

Data saved to 'selected_best_50.json'


### Downloading video clips using the Ego4D CLI based on the clip_uids stored in a JSON file

In [10]:
import json
import os
import subprocess

# Function to read a JSON file and return its content
def read_json(file_path):
    """Return the parsed content of the JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path_is_present = os.path.exists(file_path)
    if path_is_present:
        with open(file_path, 'r') as source:
            content = json.load(source)
        return content
    raise FileNotFoundError(f"File not found: {file_path}")

# Function to handle downloading of a video clip via Ego4D CLI
def fetch_clip(clip_uid, destination_dir, dataset_version='v1'):
    """Download one clip by invoking the ``ego4d`` command-line tool.

    Args:
        clip_uid: Identifier passed to the CLI through ``--video_uids``.
        destination_dir: Directory passed through ``--output_directory``.
        dataset_version: Value passed through ``--version``.

    Returns:
        True when the command exits successfully (the clip uid is printed),
        False when it exits with a non-zero status (its captured stderr is
        printed).
    """
    cli_args = ['ego4d']
    cli_args += ['--output_directory', destination_dir]
    cli_args += ['--datasets', 'clips']
    cli_args += ['--video_uids', clip_uid]
    cli_args += ['--version', dataset_version]
    cli_args.append('-y')

    try:
        subprocess.run(cli_args, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as failure:
        print(f"Error while fetching clip {clip_uid}: {failure.stderr}")
        return False

    print(clip_uid)
    return True

# Function to download all clips from the specified JSON file
def process_clips_from_json(json_path, output_directory):
    """Download every distinct clip referenced in a JSON list of entries.

    Args:
        json_path: JSON file holding a list of dicts; each dict's
            ``clip_uid`` names a clip to download.
        output_directory: Directory handed to ``fetch_clip``.

    Entries without a ``clip_uid`` are ignored. A clip is attempted again
    on a later entry only if every earlier attempt for it failed.
    """
    downloaded = set()
    for record in read_json(json_path):
        uid = record.get('clip_uid')
        if not uid or uid in downloaded:
            continue
        if fetch_clip(uid, output_directory):
            downloaded.add(uid)

# File paths for input JSON and output directory for clips
# input_json is the file written by the top-50 selection cell.
input_json = '/content/selected_best_50.json'
output_folder = '/content/ego4d_videos'

# Ensure the output directory exists before fetching clips
os.makedirs(output_folder, exist_ok=True)

# Begin processing and downloading clips
# One clip uid is printed per successful download; failures print the CLI's stderr.
process_clips_from_json(input_json, output_folder)


5d531ac1-010a-4e67-ba1a-96e485b14968
eaf8d34a-0e20-45d0-a288-569df047461e
ab094ea2-9251-4f10-945b-c2ab00c5282e
3d688dd9-8a21-43bd-9d1b-9be73b0b26bb
28c1f367-b80b-4072-a54c-fa17b207bf7b
f0dc57b0-e2a0-4b3b-8509-790fc888a36f
43db99a3-61ce-4548-ba5a-faf4c91c72f1
b810fff6-0df0-479a-b58c-012e42e4f7b3
39ec61c9-8725-47dc-8a18-f00e27b8ab2c
b704e90e-d433-4b13-9f78-f2194c5f3f57
1fb25bf7-09bf-4c4f-a050-c898f3362d36
f3e4cdf4-73fa-489a-8be3-c9265364da52
ec4a3ba3-eb00-4aa8-9b41-36043ece98f7
4ba774a8-cd2a-4889-9971-cc91f5c1afd4
e776ca99-4a92-4444-8a1f-fa55bfa381e0
3a1a5a27-7ac3-4323-8345-6717c175b09b
93231c7e-1cf4-4a20-b1f8-9cc9428915b2
74abeed9-a323-42ba-ae71-d5455219118e
5726971c-b3cc-43ed-8071-f6ee143e417d
88dcb32f-a537-47de-b3bf-f9149352bbb9
2276090d-3aab-4a4f-afbe-dcc083604160
633638e7-51a3-4901-af34-08161bb6578d
61de3e3f-8862-4d68-ab4a-2a26e81916d1
e4cf448f-e442-4e19-bf87-1eee8fbf59d8
c664f078-9b34-4a58-b949-180ac4bc0980
99db2a20-ae39-4df7-8a9c-908f282cd8d6
992cee60-a8af-423f-8a40-5b14208fcc1a
5

### This script extracts video segments from videos based on predicted start and end times in a JSON file. It uses **FFmpeg** to clip the segments and saves them to an output directory.

In [13]:
import json
import os
import subprocess

# Load JSON data from a file
def load_json_data(file_path):
    """Parse the JSON document stored at ``file_path`` and return it.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.
    """
    with open(file_path, mode='r') as stream:
        parsed = json.load(stream)
    return parsed

# Extract a video segment using FFmpeg based on start and end times
def extract_video_segment(video_file, start_time, end_time, output_file):
    """Cut the span ``start_time``..``end_time`` out of a video with FFmpeg.

    Streams are copied (``-c copy``), not re-encoded.
    NOTE(review): with stream copy the cut presumably snaps to packet/keyframe
    boundaries rather than the exact requested times -- confirm if frame
    accuracy matters.

    Args:
        video_file: Path of the source video.
        start_time: Segment start, passed to ``-ss`` via ``str()``.
        end_time: Segment end, passed to ``-to`` via ``str()``.
        output_file: Path of the segment to create. An existing file is
            never overwritten; the call fails instead.

    Returns:
        True if FFmpeg exited successfully (``output_file`` is printed),
        False if it exited with a non-zero status (its stderr is printed).
    """
    command = [
        'ffmpeg',
        # Never overwrite and never ask: the process output is captured, so an
        # interactive "Overwrite? [y/N]" prompt would be invisible and could block.
        '-n',
        '-i', video_file,
        '-ss', str(start_time),
        '-to', str(end_time),
        '-c', 'copy',
        '-copyts',
        '-avoid_negative_ts', 'make_zero',
        output_file
    ]
    try:
        subprocess.run(
            command,
            check=True,
            stdin=subprocess.DEVNULL,  # FFmpeg must not read from the caller's stdin
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # FFmpeg diagnostics may contain bytes that are not valid UTF-8.
        stderr_text = e.stderr.decode(errors='replace') if e.stderr else ''
        print(f"Failed to extract segment from {video_file}: {stderr_text}")
        return False

    print(output_file)
    return True

# Extract video segments from videos based on JSON predictions
def extract_segments(data_file, videos_dir, output_dir):
    """Cut one segment per JSON entry out of the downloaded clips.

    Args:
        data_file: JSON file holding a list of entries with ``clip_uid``,
            ``annotation_uid``, ``predicted_start`` and ``predicted_end``.
        videos_dir: Directory containing the source clips as
            ``<clip_uid>.mp4``.
        output_dir: Directory receiving
            ``<clip_uid>_annotation_<annotation_uid>.mp4``; created if absent.

    Entries whose source clip is missing, or whose output file already
    exists, are reported and skipped.

    NOTE(review): the output name has no per-query component, so entries
    that share ``clip_uid`` and ``annotation_uid`` map to the same file and
    only the first of them is extracted. Adding a query identifier to the
    name would also require updating whatever parses these file names.
    """
    video_data = load_json_data(data_file)
    os.makedirs(output_dir, exist_ok=True)

    for entry in video_data:
        clip_uid = entry['clip_uid']
        annotation_uid = entry['annotation_uid']
        video_file = os.path.join(videos_dir, f"{clip_uid}.mp4")

        if not os.path.exists(video_file):
            print(f"Video not found: {video_file}")
            continue

        output_file = os.path.join(output_dir, f"{clip_uid}_annotation_{annotation_uid}.mp4")

        # Existing outputs are kept as they are (re-runs, or a second entry
        # for the same clip/annotation) instead of being handed to FFmpeg.
        if os.path.exists(output_file):
            print(f"Segment already exists, skipping: {output_file}")
            continue

        extract_video_segment(video_file, entry['predicted_start'], entry['predicted_end'], output_file)

# Define paths and create output directory if not exists
data_file = '/content/selected_best_50.json'
# Presumably where the Ego4D CLI stored the clips (download directory + version + dataset) -- confirm.
videos_dir = '/content/ego4d_videos/v1/clips'
output_dir = '/content/extracted_segments'

os.makedirs(output_dir, exist_ok=True)

# Begin segment extraction
# Prints the path of each segment written, or a message for clips that are missing / fail.
extract_segments(data_file, videos_dir, output_dir)


/content/extracted_segments/5d531ac1-010a-4e67-ba1a-96e485b14968_annotation_3eb83361-fc41-4f9c-bb0c-978bebbf1175.mp4
/content/extracted_segments/eaf8d34a-0e20-45d0-a288-569df047461e_annotation_450c2f2b-5172-4d6f-91fa-495c7f1fb986.mp4
/content/extracted_segments/ab094ea2-9251-4f10-945b-c2ab00c5282e_annotation_d2ba7cbc-d85e-43e2-8078-193aa9e99c24.mp4
/content/extracted_segments/3d688dd9-8a21-43bd-9d1b-9be73b0b26bb_annotation_fa6199c4-090e-437f-b7fe-6cfb158cacb3.mp4
/content/extracted_segments/28c1f367-b80b-4072-a54c-fa17b207bf7b_annotation_12525c2c-1f9d-41ce-86fc-35942d49645e.mp4
/content/extracted_segments/f0dc57b0-e2a0-4b3b-8509-790fc888a36f_annotation_a6cd5d8f-1de0-4c51-bc5a-1d2c2d7fb086.mp4
/content/extracted_segments/43db99a3-61ce-4548-ba5a-faf4c91c72f1_annotation_46778482-a34b-4717-afef-5f86cf255587.mp4
/content/extracted_segments/b810fff6-0df0-479a-b58c-012e42e4f7b3_annotation_91e86f79-bde0-4a07-bec6-9d7c3bb4aa82.mp4
/content/extracted_segments/39ec61c9-8725-47dc-8a18-f00e27b8ab2c

### Zip the "extracted_segments" folder and download it so that the manual reference responses can be added.

In [7]:
import shutil
from google.colab import files

# Define the source folder and the output zip file name
source_folder = '/content/extracted_segments'
output_zip = '/content/extracted_segments.zip'

# Create a zip file of the folder
# make_archive appends the ".zip" extension itself, so it is stripped from the base name here.
shutil.make_archive(output_zip.replace('.zip', ''), 'zip', source_folder)

# Download the zip file
# Colab-only helper: triggers a browser download of the archive.
files.download(output_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install necessary libraries
!pip install transformers decord opencv-python av

# Import required libraries
import av
import numpy as np
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration  # For video processing with transformers

# Load the pre-trained VideoLLaVA model and processor
# Both are module-level globals that the inference cell's functions use directly.
# No dtype or device is specified, so the library defaults apply.
# NOTE(review): for a 7B checkpoint this presumably means full-precision weights
# on CPU -- confirm the runtime has enough memory.
model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")


Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Collecting av
  Downloading av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading av-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: decord, av
Successfully installed av-13.0.0 decord-0.6.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/112k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.81G [00:00<?, ?B/s]

## Video Query Response Generation Using the Video-LLaVA Model

In this section, we use the Video-LLaVA model to process video segments and generate responses based on queries. For each segment we sample 8 frames and pass them to the model together with a text prompt built from the query; the model then produces an answer for that clip.

In [25]:
import os
import json
import numpy as np
import av

# Load JSON data from a file
def load_json_data(file_path):
    """Read ``file_path`` and return the JSON document it contains.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialised JSON document.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
        return json.loads(raw_text)

# Save JSON data to a file
def save_json_data(data, file_path):
    """Write ``data`` to ``file_path`` as JSON indented by 4 spaces.

    Args:
        data: JSON-serialisable object to store.
        file_path: Destination file; it is created or overwritten.
    """
    with open(file_path, mode='w') as sink:
        json.dump(data, sink, indent=4)

# Extract frames uniformly across a video using PyAV
def extract_video_frames(container, num_samples=8):
    """Sample up to ``num_samples`` evenly spaced frames from a video.

    Frames at indices ``0, step, 2*step, ...`` are taken, where ``step`` is
    the frame count divided by ``num_samples`` (at least 1). If the first
    video stream reports no frame count, the frames are counted by decoding
    the stream once before sampling.

    Args:
        container: An opened PyAV container; its first video stream is used.
        num_samples: Number of frames to sample.

    Returns:
        The sampled frames converted with ``to_ndarray(format='rgb24')`` and
        stacked along a new leading axis. Fewer than ``num_samples`` frames
        are returned when the video is shorter than that.

    Raises:
        ValueError: If ``num_samples`` is less than 1 or no frame could be
            decoded.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    total_frames = container.streams.video[0].frames
    if not total_frames or total_frames < 0:
        # No usable frame count in the stream metadata: count by decoding.
        # Without this, the step would collapse to 1 and only the first
        # frames of the video would be sampled.
        container.seek(0)
        total_frames = sum(1 for _ in container.decode(video=0))

    step = max(1, total_frames // num_samples)
    wanted = {i * step for i in range(num_samples)}  # set: O(1) membership per frame

    frames = []
    container.seek(0)
    for frame_index, frame in enumerate(container.decode(video=0)):
        if frame_index in wanted:
            frames.append(frame.to_ndarray(format='rgb24'))
            if len(frames) == num_samples:
                break  # nothing left to collect; stop decoding early

    if not frames:
        raise ValueError("no video frames could be decoded from the container")
    return np.stack(frames)

# Process a single video segment based on a given prompt
def handle_video_segment(video_path, prompt):
    """Generate the model's text for one video segment and prompt.

    Uses the module-level ``processor`` and ``model`` objects.

    Args:
        video_path: Path of the video segment to open with PyAV.
        prompt: Full text prompt handed to the processor.

    Returns:
        The first decoded sequence returned by ``processor.batch_decode``
        (special tokens skipped).
    """
    # Open the file in a context manager so the container is closed once the
    # frames have been copied into a NumPy array.
    with av.open(video_path) as container:
        clip = extract_video_frames(container)

    inputs = processor(text=prompt, videos=clip, return_tensors="pt")
    # NOTE(review): in Hugging Face `generate`, max_length bounds prompt plus
    # generated tokens; confirm 100 leaves enough room for the answer.
    generated_ids = model.generate(**inputs, max_length=100)
    return processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

# Process all video segments in the directory and update JSON with responses
def handle_videos_from_directory(json_file, videos_dir):
    """Run the model on every segment in a directory and store the answers.

    Each ``*.mp4`` file in ``videos_dir`` is expected to be named
    ``<clip_uid>_annotation_<annotation_uid>.mp4``. Every JSON entry with
    that ``clip_uid`` and ``annotation_uid`` is prompted with its ``query``
    on that file, and the generated text is stored under ``response``.

    Args:
        json_file: JSON file holding the list of entries. It is rewritten in
            place with the added ``response`` fields.
        videos_dir: Directory containing the video segments.

    Files with an unexpected name, files without a matching entry, and
    entries without a query are reported and skipped. Errors raised while
    processing a segment are printed and the loop continues.

    NOTE(review): entries that share ``clip_uid`` and ``annotation_uid`` are
    all answered from the same segment file, even though each entry carries
    its own predicted interval -- confirm this is intended.
    """
    data = load_json_data(json_file)

    for video_filename in os.listdir(videos_dir):
        if not video_filename.endswith(".mp4"):
            continue
        video_path = os.path.join(videos_dir, video_filename)

        uids = video_filename.split("_annotation_")
        if len(uids) < 2:
            # Not a segment file produced with the expected naming scheme.
            print(f"Unexpected file name {video_filename}. Skipping...")
            continue
        clip_uid = uids[0]
        annotation_uid = uids[1].split('.mp4')[0]

        video_entries = [entry for entry in data if entry['clip_uid'] == clip_uid and entry['annotation_uid'] == annotation_uid]

        if not video_entries:
            print(f"clip_uid {clip_uid} and annotation_uid {annotation_uid} not found in JSON.")
            continue

        for entry in video_entries:
            # Test the query itself: the assembled prompt is never empty, and
            # a missing (None) query cannot be concatenated into it.
            query = entry.get('query')
            if not query:
                print(f"No query for {clip_uid}. Skipping...")
                continue
            prompt = "USER: <video>" + query + " ASSISTANT:"

            try:
                response = handle_video_segment(video_path, prompt)
                print(f"{video_filename}: {response}")
                entry['response'] = response
            except Exception as error:
                # Best effort: one failing segment must not abort the whole run.
                print(f"Error {video_path}: {str(error)}")

    save_json_data(data, json_file)

# File paths for data and videos
json_file_path = '/content/selected_best_50.json'
videos_directory = '/content/extracted_segments/'

# Start processing all videos and update the JSON file
# The JSON file is rewritten in place: entries gain a 'response' field.
handle_videos_from_directory(json_file_path, videos_directory)

28c1f367-b80b-4072-a54c-fa17b207bf7b_annotation_12525c2c-1f9d-41ce-86fc-35942d49645e.mp4: USER: How many phones did I hold in my hand? ASSISTANT: There are two phones in the man's hands.Ъ
b810fff6-0df0-479a-b58c-012e42e4f7b3_annotation_91e86f79-bde0-4a07-bec6-9d7c3bb4aa82.mp4: USER: What part of the lawn mower did I pull? ASSISTANT: You pulled the seat of the lawn mower.Ъ
e9becd62-c5ea-4104-9d81-781a78bb7dbe_annotation_63a8864c-5267-4c58-a21e-b7b46eb69353.mp4: USER: what food did I scoop? ASSISTANT: The person is scooping food from a bowl with a spoon.Ъ
4ba774a8-cd2a-4889-9971-cc91f5c1afd4_annotation_c0d973f1-9c48-4268-bc4a-db8062199894.mp4: USER: Where was the egg before I picked it? ASSISTANT: The egg was in the refrigerator before you picked it.Ъ
0ca4506c-962d-4cf1-aa6d-f8222f53dee6_annotation_d8c3d311-7798-4d65-87ae-b6eb3385e2cc.mp4: USER: Where did I put the board after I washed  it? ASSISTANT: After washing the board, you put it on the counter.Ъ
e4cf448f-e442-4e19-bf87-1eee8fbf59

### Cleaning the 'response' field in the selected_best_50.json file

In [26]:
import json

# Function to load JSON, clean up response fields, and save to a new file
def update_responses_and_save(input_file, output_file):
    """Strip the prompt echo from every ``response`` and save the result.

    For each entry that has a ``response``: if the text contains ``USER:``,
    only the part after the first ``USER:`` (up to the next one) is kept; the
    same is then done for ``ASSISTANT:``. Each kept part is stripped of
    surrounding whitespace. Finally every ``\\u042a`` character is removed.

    Args:
        input_file: JSON file holding a list of entry dicts.
        output_file: Destination JSON file (4-space indentation); created or
            overwritten.
    """
    with open(input_file, 'r') as source:
        records = json.load(source)

    for record in records:
        if 'response' not in record:
            continue
        text = record['response']
        # Order matters: the user turn is cut first, then the assistant marker.
        for marker in ('USER:', 'ASSISTANT:'):
            if marker in text:
                text = text.split(marker)[1].strip()
        # Clean unwanted characters like '\u042a'
        record['response'] = text.replace("\u042a", "")

    with open(output_file, 'w') as sink:
        json.dump(records, sink, indent=4)

# Define paths for input and output JSON files
input_file = '/content/selected_best_50.json'
output_file = '/content/clips_with_responses.json'

# Execute the function to update responses and save the output
# NOTE(review): the evaluation cell reads
# '/content/selected_best_50_all_responses.json', not this output file --
# presumably a copy with the manual responses added by hand; confirm.
update_responses_and_save(input_file, output_file)

## Multi-Metric Evaluation of Text Generation Using NLP Techniques

This script evaluates the performance of a language model by comparing its generated responses to human-provided reference answers using multiple natural language processing (NLP) metrics. Each metric focuses on different aspects of text quality, from exact word overlap to semantic meaning.

**BLEU Score:** Measures n-gram precision by comparing the word sequences in the generated text to the reference. The model achieved a Mean BLEU Score of **0.36**, indicating a moderate level of fluency and adequacy, with some alignment between the generated text and the reference.

**ROUGE-L Score:** Evaluates the longest common subsequence between the generated text and the reference; the value reported here is the ROUGE-L F-measure, which balances precision and recall. The Mean ROUGE Score of **0.64** reflects that the model captures a substantial portion of the relevant content from the reference.

**BERTScore (F1):** Uses contextual embeddings to compare the semantic similarity between the generated text and the reference. A Mean BERTScore F1 of **0.69** indicates that the model is fairly effective at preserving the meaning and intent of the reference text, showing a strong alignment in terms of semantics.

**SPICE Score:** Focuses on the semantic content and the high-level meaning of the text. The Mean SPICE Score of **0.51** suggests that the model captures around half of the semantic content, indicating room for improvement in generating detailed, accurate descriptions.

**Word Mover's Distance (WMD):** Measures the semantic distance between the generated and reference texts using word embeddings. Unlike the other metrics, this is a distance, so lower values mean the texts are more similar. A Mean WMD Score of **0.34** suggests that the generated responses are somewhat similar to the reference in terms of meaning, but there is noticeable semantic divergence.



In [46]:
!pip install nltk
!pip install rouge
!pip install -U bert-score
# Install Gensim for Word Mover's Distance (WMD)
!pip install gensim
# Install pycocoevalcap for CIDEr and SPICE
!pip install git+https://github.com/salaniz/pycocoevalcap
!pip install pot

import json
import nltk
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from bert_score import score as bert_score, BERTScorer

nltk.download('wordnet')
nltk.download('omw-1.4')

from pycocoevalcap.spice.spice import Spice
from gensim.models import KeyedVectors
from gensim.similarities import WmdSimilarity
from google.colab import drive
import tarfile
drive.mount('/content/drive')

Collecting git+https://github.com/salaniz/pycocoevalcap
  Cloning https://github.com/salaniz/pycocoevalcap to /tmp/pip-req-build-glqznwkc
  Running command git clone --filter=blob:none --quiet https://github.com/salaniz/pycocoevalcap /tmp/pip-req-build-glqznwkc
  Resolved https://github.com/salaniz/pycocoevalcap to commit a24f74c408c918f1f4ec34e9514bc8a76ce41ffd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pot
  Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading POT-0.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (835 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m835.4/835.4 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pot
Successfully installed pot-0.9.4


In [51]:
# Load necessary models for the metrics
# Word2vec vectors (binary format) are read from Google Drive, so the drive must be mounted first.
w2v_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz', binary=True)
spice_scorer = Spice()
# BERTScore backed by roberta-large, with scores rescaled against the library's baseline.
bert_scorer = BERTScorer(model_type='roberta-large', lang='en', rescale_with_baseline=True)

# Function to calculate BLEU score
def compute_bleu(model_output, reference):
    """Sentence-level BLEU of ``model_output`` against one reference.

    Both texts are lower-cased and split on whitespace; 1- to 4-gram
    precisions are weighted equally.

    NOTE(review): no smoothing function is passed, so NLTK warns when a
    higher-order n-gram has no overlap, which is common for short answers.

    Args:
        model_output: Generated text (the hypothesis).
        reference: Reference text.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    reference_tokens = reference.lower().split()
    candidate_tokens = model_output.lower().split()
    uniform_weights = (0.25,) * 4
    return sentence_bleu([reference_tokens], candidate_tokens, weights=uniform_weights)

# Function to calculate ROUGE-L score
def compute_rouge(model_output, reference):
    """ROUGE-L F-measure of ``model_output`` against ``reference``.

    Args:
        model_output: Generated text (the hypothesis).
        reference: Reference text.

    Returns:
        The ``'f'`` field of the averaged ``'rouge-l'`` scores.
    """
    scorer = Rouge()
    averaged = scorer.get_scores(model_output, reference, avg=True)
    rouge_l = averaged['rouge-l']
    return rouge_l['f']

# Function to calculate BERTScore F1
def compute_bertscore(model_output, reference):
    """BERTScore F1 of ``model_output`` against ``reference``.

    Uses the module-level ``bert_scorer``. The candidate is passed first and
    the reference second, which is the order ``BERTScorer.score`` expects.
    Passing them the other way round swaps precision and recall; F1, being
    their harmonic mean, should be unaffected by that swap.

    Args:
        model_output: Generated text (the candidate).
        reference: Reference text.

    Returns:
        The F1 score as a Python float.
    """
    _, _, f1 = bert_scorer.score([model_output], [reference])
    return f1.mean().item()

# Function to calculate SPICE score
def compute_spice(model_output, reference):
    """SPICE score of ``model_output`` against ``reference``.

    Uses the module-level ``spice_scorer``; both texts are wrapped as
    single-item lists under the same sample id.

    Args:
        model_output: Generated text.
        reference: Reference text.

    Returns:
        The first element of the pair returned by ``compute_score``.
    """
    sample_id = 0
    ground_truths = {sample_id: [reference]}
    candidates = {sample_id: [model_output]}
    average_score, _per_sample = spice_scorer.compute_score(ground_truths, candidates)
    return average_score

# Function to calculate Word Mover's Distance
def compute_wmd(model_output, reference):
    """Word Mover's Distance between ``model_output`` and ``reference``.

    Both texts are split on whitespace (case and punctuation are kept) and
    compared with the module-level ``w2v_model``.

    Args:
        model_output: Generated text.
        reference: Reference text.

    Returns:
        The value returned by ``w2v_model.wmdistance``.
    """
    output_tokens = model_output.split()
    reference_tokens = reference.split()
    return w2v_model.wmdistance(output_tokens, reference_tokens)

# Read data from JSON
with open('/content/selected_best_50_all_responses.json', 'r') as file:
    dataset = json.load(file)

# Initialize accumulators for the metrics
total_bleu, total_rouge, total_bert_f1, total_spice, total_wmd = 0, 0, 0, 0, 0
total_entries = len(dataset)
# Number of entries that actually contribute to the totals; entries missing a
# response are skipped and must not be counted in the denominators.
scored_entries = 0

# Process each entry and compute the metrics
for entry in dataset:
    manual_response = entry.get('manual_response')
    response = entry.get('response')

    # Skip if either response is missing
    if not manual_response or not response:
        continue

    # Calculate individual metrics for each entry
    bleu = compute_bleu(response, manual_response)
    rouge = compute_rouge(response, manual_response)
    bert_f1 = compute_bertscore(response, manual_response)
    spice = compute_spice(response, manual_response)
    wmd = compute_wmd(response, manual_response)

    # Accumulate the scores
    total_bleu += bleu
    total_rouge += rouge
    total_bert_f1 += bert_f1
    total_spice += spice
    total_wmd += wmd
    scored_entries += 1

if scored_entries == 0:
    raise ValueError("No entry has both 'response' and 'manual_response'; nothing to evaluate.")

# Calculate the mean for each metric over the entries that were scored
mean_bleu = total_bleu / scored_entries
mean_rouge = total_rouge / scored_entries
mean_bert_f1 = total_bert_f1 / scored_entries
mean_spice = total_spice / scored_entries
mean_wmd = total_wmd / scored_entries

# Print the final average scores
print(f"Scored {scored_entries} of {total_entries} entries")
print("Mean BLEU Score:", mean_bleu)
print("Mean ROUGE Score:", mean_rouge)
print("Mean BERTScore F1:", mean_bert_f1)
print("Mean SPICE Score:", mean_spice)
print("Mean WMD Score:", mean_wmd)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts 

Mean BLEU Score: 0.3615657801932653
Mean ROUGE Score: 0.6411511177074641
Mean BERTScore F1: 0.6868100455403328
Mean SPICE Score: 0.5139753579753579
Mean WMD Score: 0.34181011871695705
