# Gemini Experiments


---

In [None]:
# Mount Google Drive; later cells read and write results under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Installation

In [None]:
!pip install git+https://github.com/shahadMAlshalawi/Modular-Arabic-VQA.git --no-warn-conflicts --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.6/51.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━

In [None]:
import warnings
warnings.filterwarnings("ignore")

import textwrap
import tqdm
from PIL import Image
import requests
import torch
import numpy as np
import pandas as pd
from google.colab import userdata
import evaluate
import json
import datasets
import aravqa

### Configuration

In [None]:
'''
# This is to divide the dataset into batches in order to generate the answers via Gemini
dataset_size, num_segments = 214354, 215
segment_size, remainder = divmod(dataset_size, num_segments)
size_segments = [
    range(i * segment_size + min(i, remainder), (i + 1) * segment_size + min(i + 1, remainder))
    for i in range(num_segments)
]
size_segments
'''

'\ndataset_size, num_segments = 214354, 215\nsegment_size, remainder = divmod(dataset_size, num_segments)\nsize_segments = [\n    range(i * segment_size + min(i, remainder), (i + 1) * segment_size + min(i + 1, remainder))\n    for i in range(num_segments)\n]\nsize_segments\n'

In [None]:
from aravqa.core import Config, CaptionSelection
import textwrap
from google.colab import userdata
import torch

# Initialize General Constants
# OpenAI key is read from the Colab secret 'GPT_API_KEY'; it is passed to FuzzEvaluator in the evaluation cell.
OPENAI_API_KEY = userdata.get('GPT_API_KEY')

# Initialize the configuration object
config = Config()

# -------------------- Dataset Configuration --------------------
config.VDS_PATH = "ShahadMAlshalawi/VQAv2-Violet-Captions-with-bert-similarity-Final"  # Violet captions dataset path
config.BDS_PATH = "ShahadMAlshalawi/VQAv2-BiT-Captions-with-bert-similarity-Final"  # BiT captions dataset path
config.GPT4oDS_PATH = "ShahadMAlshalawi/VQAv2-GPT-4o-Captions-with-bert-similarity-Final"  # GPT-4o captions dataset path

config.LANGUAGE = "ar"  # Target language for questions and answers
config.SPLIT = "validation"  # Dataset split (e.g., train, validation, test)
config.USERNAME = "ShahadMAlshalawi"  # HF User's name or identifier

# -------------------- Device Configuration --------------------
config.DEVICE = "cpu"  # Execution device (e.g., 'cpu' or 'cuda'); answers come from Gemini API responses, so CUDA is not needed here

# -------------------- Processing Settings --------------------
config.BATCH_SIZE = 40  # Batch size for data processing
config.CAPTIONS = ["bit","violet","GPT4o"]  # Captioning models to include (e.g. "bit","violet","GPT4o","None"). "bit" is the short name for the Bidirectional transformer model (AraBERT32-Flickr8k)
config.CAPTIONS_SEPARATOR = "\n"  # Default separator between captions (e.g., new line)
config.NUM_CAPTIONS = 6  # Number of captions to select (-1 for all captions)
config.CAPTION_SELECTION = CaptionSelection.RANDOM  # Caption selection strategy
config.RANDOM_SEED = 42 # Random seed for reproducibility

# Result CSV path. It is relative to the current working directory, unlike
# predictions_output_filename below, which is absolute.
# NOTE(review): the evaluation cell later overwrites config.PATH_RESULT_FILE with a directory.
config.PATH_RESULT_FILE = f"drive/MyDrive/ColabData/Modular_Arabic_VQA_System_Experiments_Results/VQAv2-ar_experiments_results/gemini/Final_Results/{config.CAPTION_SELECTION}-{'-'.join(config.CAPTIONS)}.csv"

# Despite the name, this is a directory prefix (it must end with "/"): the generation loop
# appends "<segment index>_outputs.json" to it. The "random_bit_violet_GPT4o" folder name is
# hard-coded, so it has to be kept in sync with CAPTION_SELECTION / CAPTIONS by hand.
config.predictions_output_filename = f"/content/drive/MyDrive/ColabData/Modular_Arabic_VQA_System_Experiments_Results/VQAv2-ar_experiments_results/gemini/Final_Results/predictions_output_file/random_bit_violet_GPT4o/"

# -------------------- API and Model Configuration --------------------
config.API_KEY = userdata.get('GEMINI_API_KEY')  # Gemini API key, read from the Colab secret 'GEMINI_API_KEY'
config.MODEL_NAME = "models/gemini-1.5-flash"  # Model name for text generation

# -------------------- Gemini Text Generation Settings --------------------
config.GENERATION_CONFIG = {
    "temperature": 0.0,  # Controls randomness in text generation
    "top_p": 0.95,  # Nucleus sampling threshold
    "top_k": 40,  # Limits sampling to the top-k tokens
    "max_output_tokens": 20,  # Maximum number of tokens in the output (the answers need to be short)
    "response_mime_type": "text/plain",  # Format of the generated text
}

# -------------------- Prompt and Instruction Settings -------------------

# Optional System Instruction: General guideline for the LLM. [str,None]
# The instruction embeds five Arabic few-shot examples (captions, question, answer).
# textwrap.dedent(...).strip() removes the common indentation and the leading/trailing blank lines.
# NOTE(review): the examples are not formatted uniformly: Example 1 wraps the question and
# answer in quotes inside the braces, Examples 2-5 do not, and the Example 5 question line ends
# with a stray "." after the closing brace. The text is left untouched here so that new
# generations stay comparable with segments that were already produced; confirm before changing it.
config.SYSTEM_INSTRUCTION = textwrap.dedent(
    """
    You are a highly capable language model specialized in answering questions based on provided image captions.
    Here are five examples of Question-Captions pairs to guide you:
    Example 1:
    Captions:{"طفل يحمل مضرب بيسبول في الحديقة", "الصورة بالأبيض والأسود", "الطفل يبدو في وضع الاستعداد لضرب الكرة", "الشمس تضيء المنطقة المحيطة", "السياج الخشبي يظهر في الخلفية", "شجرة كبيرة تضيف ظلاً للمشهد"}
    Question:{"ما هو لون العشب في هذه الصورة؟"}
    Answer : {"أبيض وأسود"}

    Example 2:
    Captions:{"غرفة نوم بمكتب دراسي في الزاوية", "كرسي مكتب جلد مريح أمام الطاولة", "سرير بجانب المكتب مغطى ببطانية", "صور مؤطرة معلقة على الجدار", "نافذة كبيرة توفر إضاءة طبيعية" ,"طابعة وأوراق مرتبة على رف جانبي"}
    Question:{مما يتكون الكرسي}
    Answer : {الجلد والمعادن والخشب والبلاستك}

    Example 3:
    Captions:{"مبنى من الطوب بلون أحادي مع نافذة خشبية." ,"لافتة شارع معلقة مكتوب عليها \"ROMA 201\".", "باب خشبي كبير في منتصف المبنى." ,"عمود إنارة يقف بجانب رصيف الشارع." ,"صورة بالأبيض والأسود لشارع في مدينة أوروبية." ,"لافتة منطقة محظورة تشير إلى قيود المرور."}
    Question:{هل هذه الصورة ملونة}
    Answer : {لا}

    Example 4:
    Captions:{"لافتة زرقاء مكتوب عليها \"SALISBURY RD\".", "لافتة صفراء متهالكة مكتوب عليها \"LANDS LANE\" وتحمل اسم \"CITY OF SYDNEY\".", "لافتة خضراء مكتوب عليها \"CRUIKSHANK ST\".", "لافتة بيضاء عليها \"ROSS ST\" مكتوبًا بخط كبير وواضح.", "اللافتة بها رمز لجمعية تنظيف في اليسار.", "الاسم \"CAMPERDOWN\" مذكور فوق اسم الشارع."}
    Question:{كم عدد اللافتات؟}
    Answer : {أربع}

    Example 5:
    Captions:{"مبنى ضخم مكون من طوب بني قاتم." ,"برجان متساويان في الارتفاع على جانبي المبنى." ,"ساعة كبيرة ظاهرة على أحد الأبراج." ,"أشجار خضراء بجوار المبنى تضيف جمالية للمشهد." ,"ساحة كبيرة أمام المبنى حيث يسير الناس." ,"سماء زرقاء صافية في الخلفية تعكس يومًا مشمسًا."}
    Question:{كم عدد ناطحات السحاب هناك؟}.
    Answer : {اثنتان}
    Now, for each new pair your task is to analyze the captions and generate accurate, concise answers in the same language as the question.
    Ensure your response is relevant, clear, and avoids unnecessary details.
    """
).strip()

# Prompt Template: Structured format for generating questions and answers
# {context} and {question} are placeholders; presumably they are filled with the selected
# captions and the question text when prompts are built (verify against the aravqa data loader).
config.PROMPT_TEMPLATE = textwrap.dedent(
    """
    Analyze the following image captions and answer the given question in the same language:
    Captions:{context}
    Question:{question}
    Answer concisely:
    """
).strip()

# Echo both strings so the exact prompt used for this run is visible in the cell output.
print(f"System Instruction:\n{config.SYSTEM_INSTRUCTION}")
print(f"\nPrompt Template:\n{config.PROMPT_TEMPLATE}")

# Segmentation: split the validation split into contiguous, near-equal index ranges so that
# answers can be generated (and saved) one segment at a time.
# divmod(214354, 215) == (996, 214), so the first 214 segments hold 997 rows each and the last
# one holds 996: range(0, 997), range(997, 1994), ..., range(213358, 214354).
DATASET_SIZE = 214354  # number of rows in the validation split
NUM_SEGMENTS = 215  # number of segments to process independently
_segment_size, _remainder = divmod(DATASET_SIZE, NUM_SEGMENTS)
config.SIZE_SEGMENTS = [
    range(
        i * _segment_size + min(i, _remainder),
        (i + 1) * _segment_size + min(i + 1, _remainder),
    )
    for i in range(NUM_SEGMENTS)
]

# Sanity checks: the segments must tile [0, DATASET_SIZE) with no gaps and no overlaps.
assert config.SIZE_SEGMENTS[0].start == 0
assert config.SIZE_SEGMENTS[-1].stop == DATASET_SIZE
assert all(
    left.stop == right.start
    for left, right in zip(config.SIZE_SEGMENTS, config.SIZE_SEGMENTS[1:])
)

System Instruction:
You are a highly capable language model specialized in answering questions based on provided image captions.
Here are five examples of Question-Captions pairs to guide you:
Example 1:
Captions:{"طفل يحمل مضرب بيسبول في الحديقة", "الصورة بالأبيض والأسود", "الطفل يبدو في وضع الاستعداد لضرب الكرة", "الشمس تضيء المنطقة المحيطة", "السياج الخشبي يظهر في الخلفية", "شجرة كبيرة تضيف ظلاً للمشهد"}
Question:"ما هو لون العشب في هذه الصورة؟"
Answer : "أبيض وأسود"

Example 2:
Captions:{"غرفة نوم بمكتب دراسي في الزاوية", "كرسي مكتب جلد مريح أمام الطاولة", "سرير بجانب المكتب مغطى ببطانية", "صور مؤطرة معلقة على الجدار", "نافذة كبيرة توفر إضاءة طبيعية" ,"طابعة وأوراق مرتبة على رف جانبي"}
Question:{مما يتكون الكرسي}
Answer : {الجلد والمعادن والخشب والبلاستك}

Example 3:
Captions:{"مبنى من الطوب بلون أحادي مع نافذة خشبية." ,"لافتة شارع معلقة مكتوب عليها "ROMA 201".", "باب خشبي كبير في منتصف المبنى." ,"عمود إنارة يقف بجانب رصيف الشارع." ,"صورة بالأبيض والأسود لشارع في مدينة أوروبية." ,"

### Login Hugging Face

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate with the Hugging Face Hub using the token stored under the Colab
# secret 'HF_TOKEN', so the token itself never appears in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

### DataSet & DataLoader

In [None]:
from datasets import load_dataset
from aravqa.datasets.utils import prepare_dataset
from aravqa.datasets.utils import compute_similarity_captions
from aravqa.datasets import OKVQADataset
from aravqa.datasets import OKVQADataLoader
from aravqa.datasets import VQAv2Dataset
from aravqa.datasets import VQAv2DataLoader

In [None]:
# Load the three caption datasets (Violet, BiT, GPT-4o) for the configured split.
# The split is read from the `config` instance, not the `Config` class, so that the value
# assigned in the configuration cell (config.SPLIT) is the one that actually takes effect.
VDS = load_dataset(config.VDS_PATH, split=config.SPLIT)

BDS = load_dataset(config.BDS_PATH, split=config.SPLIT)

GPT4oDS = load_dataset(config.GPT4oDS_PATH, split=config.SPLIT)


README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

(…)-00000-of-00022-cc683a104c517975.parquet:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

(…)-00001-of-00022-7d4a209a53a2f93c.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00002-of-00022-c95510ebd8a04711.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00003-of-00022-3d29e5ad421dba20.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

(…)-00004-of-00022-05209a649f471e9c.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00005-of-00022-7eadcd41a3f9c40c.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00006-of-00022-84b97a22fb1048c8.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

(…)-00007-of-00022-734353e9672378c5.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00008-of-00022-cea42df2ccf3581c.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00009-of-00022-3d015d14ac7bdcf1.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00010-of-00022-51b14e7748687f89.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00011-of-00022-a7ef4e6fb38abe7b.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00012-of-00022-cc1d3a2c9fbfdeb8.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

(…)-00013-of-00022-9debd09ce39ae280.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00014-of-00022-9ec683a88c418219.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00015-of-00022-b288aea1097b1ea4.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00016-of-00022-300ee2e6cbcdb7d0.parquet:   0%|          | 0.00/98.1M [00:00<?, ?B/s]

(…)-00017-of-00022-98370e130124e0cc.parquet:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

(…)-00018-of-00022-bfd2d8e7aab1f8bb.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00019-of-00022-06ef5e056bff8157.parquet:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

(…)-00020-of-00022-4693440cbb401edd.parquet:   0%|          | 0.00/97.6M [00:00<?, ?B/s]

(…)-00021-of-00022-3211eb6a6c697cb5.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214354 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

(…)-00000-of-00022-2abb049b7de7f0a1.parquet:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

(…)-00001-of-00022-37b116006bb5fae8.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00002-of-00022-ed76da480ef61e63.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00003-of-00022-c392cb3b3b215e71.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

(…)-00004-of-00022-4c4129d3b7c46d83.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00005-of-00022-00c5b72839e71261.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00006-of-00022-dc7e8b4dbebd872c.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

(…)-00007-of-00022-c7c6b79aae77f60d.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00008-of-00022-f7066369c4eeb7e6.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00009-of-00022-9558b9d994f37194.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

(…)-00010-of-00022-7c661a71cf7ddbe0.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00011-of-00022-02662c2ab345a39d.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

(…)-00012-of-00022-7f54ceb8b00d23c8.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

(…)-00013-of-00022-6a08cd49cf1d6b20.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00014-of-00022-7060e110db9a25a4.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00015-of-00022-ccfb49ba962250c8.parquet:   0%|          | 0.00/99.7M [00:00<?, ?B/s]

(…)-00016-of-00022-33dfd0395a720625.parquet:   0%|          | 0.00/98.1M [00:00<?, ?B/s]

(…)-00017-of-00022-716251921b6fb1cf.parquet:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

(…)-00018-of-00022-9587aaac6f2cc2bd.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

(…)-00019-of-00022-445ed011e4880f54.parquet:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

(…)-00020-of-00022-ff5549ea992a4953.parquet:   0%|          | 0.00/97.6M [00:00<?, ?B/s]

(…)-00021-of-00022-e96841220483c907.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214354 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/958 [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

validation-00000-of-00022.parquet:   0%|          | 0.00/99.0M [00:00<?, ?B/s]

validation-00001-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

validation-00002-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

validation-00003-of-00022.parquet:   0%|          | 0.00/108M [00:00<?, ?B/s]

validation-00004-of-00022.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

validation-00005-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

validation-00006-of-00022.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

validation-00007-of-00022.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

validation-00008-of-00022.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

validation-00009-of-00022.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

validation-00010-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

validation-00011-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

validation-00012-of-00022.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

validation-00013-of-00022.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

validation-00014-of-00022.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

validation-00015-of-00022.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

validation-00016-of-00022.parquet:   0%|          | 0.00/99.8M [00:00<?, ?B/s]

validation-00017-of-00022.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

validation-00018-of-00022.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

validation-00019-of-00022.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

validation-00020-of-00022.parquet:   0%|          | 0.00/99.2M [00:00<?, ?B/s]

validation-00021-of-00022.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/214354 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

#### **Compute BERTScore Similarity for Captions**
* This section only needs to be executed once, to compute the similarity score (BERTScore) between the captions and the corresponding questions and answers.
* After the similarity scores have been saved, there is no need to run it again during experiments.

In [None]:
'''
!pip install evaluate
!pip install bert_score
'''

In [None]:
'''
from typing import List, Dict, Callable, Optional
from tqdm import tqdm
import evaluate
import numpy as np

def compute_bertscore(predictions: List[str], references: List[List[str]]) -> float:
    """
    Computes the BERTScore for a set of predictions and references.

    Args:
        predictions (List[str]): List of predicted sentences.
        references (List[List[str]]): List of lists of reference sentences.

    Returns:
        float: The computed BERTScore. Returns float("-inf") if computation fails.

    Raises:
        ValueError: If predictions or references are empty, or if their lengths do not match.
    """
    if not predictions or not references:
        raise ValueError("Predictions and references must not be empty.")
    if len(predictions) != len(references):
        raise ValueError("The number of predictions must match the number of reference sets.")

    bertscore_scorer = evaluate.load("bertscore")
    try:
        result = bertscore_scorer.compute(predictions=predictions, references=references, lang="ar", model_type="distilbert-base-multilingual-cased")
        return result["f1"]
    except Exception as e:
        print(f"Error computing bertscore: {e}")
        return float("-inf")  # Indicate computation failure

# ..................................................
# Test

predictions = ["دراجة نارية بيضاء وسوداء متوقفة في موقف للسيارات"]
references = [
            [
              "سباق",
              "يركب",
              "موتوكروس",
              "دراجة نارية",
            ],

             ]

bert_score = compute_bertscore(predictions=predictions,
                                references=references
                                )
print(f"Bert Score: {bert_score}")
'''

**VDS**

In [None]:
'''
from aravqa.datasets.utils import compute_similarity_captions
import os

# Define the number of segments
num_segments = 100

# Calculate segment size
segment_size = len(VDS) // num_segments

# Create a directory to store segments
save_dir = '/content/drive/MyDrive/ColabData/VDS_bert-similarity_Segments_Final'  # Update with your desired path
os.makedirs(save_dir, exist_ok=True)

# Process and save segments
for i in range(num_segments):
    start_idx = i * segment_size
    end_idx = (i + 1) * segment_size if i < num_segments - 1 else len(VDS)
    segment = VDS.select(range(start_idx, end_idx))

    segment = compute_similarity_captions(
        segment,
        question_similarity_scorer=compute_bertscore,
        answer_similarity_scorer=compute_bertscore
    )

    segment.save_to_disk(os.path.join(save_dir, f'segment_{i}.arrow'))
'''

In [None]:
'''
# Load segments from Drive
loaded_segments = []
for i in range(num_segments):
    segment_path = os.path.join(save_dir, f'segment_{i}.arrow')
    loaded_segments.append(datasets.load_from_disk(segment_path))

# Concatenate segments
VDS = datasets.concatenate_datasets(loaded_segments)
'''

In [None]:
#VDS.push_to_hub(f"{config.USERNAME}/VQAv2-Violet-Captions-with-bert-similarity-Final")

**BDS**

In [None]:
'''
from aravqa.datasets.utils import compute_similarity_captions
import os

# Define the number of segments
num_segments = 100

# Calculate segment size
segment_size = len(BDS) // num_segments

# Create a directory to store segments
save_dir = '/content/drive/MyDrive/ColabData/BDS_bert-similarity_Segments_Final'  # Update with your desired path
os.makedirs(save_dir, exist_ok=True)

# Process and save segments
for i in range(num_segments):
    start_idx = i * segment_size
    end_idx = (i + 1) * segment_size if i < num_segments - 1 else len(BDS)
    segment = BDS.select(range(start_idx, end_idx))

    segment = compute_similarity_captions(
        segment,
        question_similarity_scorer=compute_bertscore,
        answer_similarity_scorer=compute_bertscore
    )

    segment.save_to_disk(os.path.join(save_dir, f'segment_{i}.arrow'))
'''

In [None]:
'''
# Load segments from Drive
loaded_segments = []
for i in range(num_segments):
    segment_path = os.path.join(save_dir, f'segment_{i}.arrow')
    loaded_segments.append(datasets.load_from_disk(segment_path))

# Concatenate segments
BDS = datasets.concatenate_datasets(loaded_segments)
'''

In [None]:
#BDS.push_to_hub(f"{config.USERNAME}/VQAv2-BiT-Captions-with-bert-similarity-Final")

**GPT4oDS**

In [None]:
'''
from aravqa.datasets.utils import compute_similarity_captions
import os

# Define the number of segments
num_segments = 100

# Calculate segment size
segment_size = len(GPT4oDS) // num_segments

# Create a directory to store segments
save_dir = '/content/drive/MyDrive/ColabData/GPT4oDS_bert-similarity_Segments_Final/'  # Update with your desired path
os.makedirs(save_dir, exist_ok=True)

# Process and save segments
for i in range(83,num_segments):
    start_idx = i * segment_size
    end_idx = (i + 1) * segment_size if i < num_segments - 1 else len(GPT4oDS)
    segment = GPT4oDS.select(range(start_idx, end_idx))

    segment = compute_similarity_captions(
        segment,
        question_similarity_scorer=compute_bertscore,
        answer_similarity_scorer=compute_bertscore
    )
    checkpoint_dir = f"{save_dir}segment_{i}"
    segment.save_to_disk(checkpoint_dir)
'''

In [None]:
# Disabled cell: the code is kept inside a string literal, so nothing here executes.
# When enabled, it reloads the per-segment GPT4oDS checkpoints from Drive and concatenates them.
# NOTE(review): the disabled GPT4oDS save loop writes to f"{save_dir}segment_{i}" (no ".arrow"
# suffix), while this loader reads f'segment_{i}.arrow'. Confirm the directory names on disk
# before re-enabling this cell.
'''
# Load segments from Drive
loaded_segments = []
for i in range(num_segments):
    segment_path = os.path.join(save_dir, f'segment_{i}.arrow')
    loaded_segments.append(datasets.load_from_disk(segment_path))

# Concatenate segments
GPT4oDS = datasets.concatenate_datasets(loaded_segments)
'''

In [None]:
#GPT4oDS.push_to_hub(f"{config.USERNAME}/VQAv2-GPT-4o-Captions-with-bert-similarity-Final")

### Question Answering

In [None]:
import pandas as pd
from aravqa.modules.question_answering import GeminiAnswerer

# Build the Gemini-backed answer generator from the shared configuration object
# and print it for a quick visual check.
llm = GeminiAnswerer(config)
print(llm)

In [None]:
# Data loading, prompt preparation and answer generation are done one segment at a time,
# and each segment's predictions are written to its own JSON file on Drive.
# Adjust SEGMENT_START / SEGMENT_STOP to choose which segments this run processes.
import json
import os

SEGMENT_START, SEGMENT_STOP = 137, 150  # processes segments 137..149 (stop is exclusive)

# Make sure the target directory exists before the first (slow) generation call,
# so a missing folder cannot cause a finished segment to be lost at save time.
os.makedirs(config.predictions_output_filename, exist_ok=True)

for current_index_segment in range(SEGMENT_START, SEGMENT_STOP):
    print(f"Processing segment {current_index_segment}...")
    rng = config.SIZE_SEGMENTS[current_index_segment]
    # The same row range is selected from all three caption datasets.
    dataset = VQAv2Dataset(BDS.select(rng), VDS.select(rng), GPT4oDS.select(rng))
    dataloader = VQAv2DataLoader(dataset, config).get_dataloader()
    outputs = llm.generate_from_dataloader(dataloader)

    # Save the predictions of this segment to a JSON file (ensure_ascii=False keeps Arabic text readable).
    output_path = f"{config.predictions_output_filename}{current_index_segment}_outputs.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)

    print(f"Outputs saved to {output_path}")

Processing segment 137...


Generating predictions from dataloader: 100%|██████████| 25/25 [19:35<00:00, 47.04s/it]


Outputs saved to /content/drive/MyDrive/ColabData/Modular_Arabic_VQA_System_Experiments_Results/VQAv2-ar_experiments_results/gemini/Final_Results/predictions_output_file/random_bit_violet_GPT4o/
Processing segment 138...


Generating predictions from dataloader: 100%|██████████| 25/25 [19:20<00:00, 46.40s/it]


Outputs saved to /content/drive/MyDrive/ColabData/Modular_Arabic_VQA_System_Experiments_Results/VQAv2-ar_experiments_results/gemini/Final_Results/predictions_output_file/random_bit_violet_GPT4o/
Processing segment 139...


Generating predictions from dataloader:  56%|█████▌    | 14/25 [11:14<08:36, 47.00s/it]

### Evaluation
After finishing answer generation for all the dataset segments, we can proceed with the evaluation.

In [None]:
from aravqa.modules.evaluation import BLEUEvaluator
from aravqa.modules.evaluation import BERTScoreEvaluator
from aravqa.modules.evaluation import FuzzEvaluator
import json
import os

# Evaluate the saved predictions one segment at a time.
# Each segment is read back from the JSON file written by the generation loop
# ("<predictions_output_filename><segment index>_outputs.json"). The in-memory `outputs`
# variable is deliberately NOT used: it only holds the most recently generated segment and is
# indexed from 0, so slicing it with the global rng.start:rng.stop offsets selects wrong or empty data.

# Define the directory to save segmented results
segmented_results_dir = f"drive/MyDrive/ColabData/Modular_Arabic_VQA_System_Experiments_Results/VQAv2-ar_experiments_results/gemini/Final_Results/segmented_eval_results/{config.CAPTION_SELECTION}-{'-'.join(config.CAPTIONS)}/"
os.makedirs(segmented_results_dir, exist_ok=True)

# Initialize evaluators
bleu_evaluator = BLEUEvaluator(max_order=1)
BertScore_evaluator = BERTScoreEvaluator()
fuzzy_evaluator = FuzzEvaluator(OPENAI_API_KEY)

# Columns copied from each saved predictions file into the per-segment results CSV.
OUTPUT_KEYS = ("question_id", "questions", "image_id", "answers", "predictions")

evaluated_segments = []
skipped_segments = []

# Iterate through segments starting from segment 0
for i, rng in enumerate(config.SIZE_SEGMENTS):
    predictions_path = f"{config.predictions_output_filename}{i}_outputs.json"
    if not os.path.exists(predictions_path):
        # Generation for this segment has not been run (or saved) yet.
        print(f"Skipping segment {i}: predictions file not found at {predictions_path}")
        skipped_segments.append(i)
        continue

    print(f"Evaluating segment {i}...")

    # Load the data for the current segment
    with open(predictions_path, encoding='utf-8') as f:
        saved_outputs = json.load(f)
    segment_outputs = {key: saved_outputs[key] for key in OUTPUT_KEYS}

    num_rows = len(segment_outputs['question_id'])
    if num_rows == 0:
        print(f"Skipping segment {i}: predictions file is empty")
        skipped_segments.append(i)
        continue
    if num_rows != len(rng):
        # A size mismatch usually means the file was produced with a different segmentation.
        print(f"Warning: segment {i} has {num_rows} rows but {len(rng)} were expected")

    # Perform evaluations for the current segment
    bleu_results = bleu_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
    )

    bertScore_results = BertScore_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
    )

    fuzzy_results = fuzzy_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
        questions=segment_outputs['questions'],
    )

    # Overall (segment-level) metrics are stored in the first row only; the rest is padded with None.
    padding = [None] * (num_rows - 1)

    # Prepare segment results dictionary
    segment_eval_results = {
        'question_id': segment_outputs['question_id'],
        'questions': segment_outputs['questions'],
        'image_id': segment_outputs['image_id'],
        'answers': segment_outputs['answers'],
        'predictions': segment_outputs['predictions'],
        'bleu': bleu_results['bleu'],
        'f1_bertscore': bertScore_results['f1_bertscore'],
        'fuzz_accuracy': fuzzy_results['fuzz_accuracy'],
        'overall_bleu': [bleu_results['overall_bleu']] + padding,
        'overall_f1_bertscore': [bertScore_results['overall_f1_bertscore']] + padding,
        'fuzz_overall_accuracy': [fuzzy_results['fuzz_overall_accuracy']] + padding,
    }

    # Create a DataFrame and save the segment results to a CSV file
    segment_df = pd.DataFrame.from_dict(segment_eval_results)
    segment_filename = f"segment_{i}_results.csv"
    segment_file_path = os.path.join(segmented_results_dir, segment_filename)
    segment_df.to_csv(segment_file_path, index=False)
    evaluated_segments.append(i)

    print(f"Segment {i} results saved to {segment_file_path}")

print("\nEvaluation of all segments completed.")
print(f"Evaluated {len(evaluated_segments)} segment(s); skipped {len(skipped_segments)} segment(s).")

# Update config.PATH_RESULT_FILE to reflect the segmented saving (optional, depends on how you want to use this config later)
config.PATH_RESULT_FILE = segmented_results_dir
print(f"\nUpdated config.PATH_RESULT_FILE to: {config.PATH_RESULT_FILE}")