<a href="https://colab.research.google.com/github/shahadMAlshalawi/Modular-Arabic-VQA/blob/main/notebooks/gemini_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

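# Gemini Experiments

This notebook answers Arabic OK-VQA questions with Gemini, using only image captions (BiT and Violet) as context, and then scores the predictions with BLEU and accuracy.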


---

In [1]:
# Mount Google Drive at /content/drive; the results CSV written at the end of
# the notebook goes to config.PATH_RESULT_FILE, which points under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Installation

In [2]:
!pip install git+https://github.com/shahadMAlshalawi/Modular-Arabic-VQA.git --no-warn-conflicts --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [3]:
import warnings
warnings.filterwarnings("ignore")

import textwrap
import tqdm
from PIL import Image
import requests
import torch
import numpy as np
import pandas as pd
from google.colab import userdata
import evaluate
import json
import datasets
import aravqa

### Configuration

In [4]:
from aravqa.core import Config, CaptionSelection
import textwrap
from google.colab import userdata
import torch


# One Config instance holds every setting; it is later handed to the
# dataloader and to the Gemini answerer.
config = Config()


def _clean_block(text):
    """Dedent a triple-quoted literal and strip its leading/trailing whitespace."""
    return textwrap.dedent(text).strip()


# ==================== Dataset ====================
config.VDS_PATH = "ShahadMAlshalawi/OKVQA-Encoder-Violet-Captions"  # Violet captions dataset path
config.BDS_PATH = "ShahadMAlshalawi/OKVQA-VinVL-BiT-Captions"  # BiT captions dataset path
config.LANGUAGE = "ar"  # language of the questions and answers
config.SPLIT = "validation"  # which split to load (train / validation / test)
config.USERNAME = "ShahadMAlshalawi"  # Hugging Face user name

# ==================== Device ====================
config.DEVICE = "cpu"  # 'cpu' or 'cuda'

# ==================== Batching and caption selection ====================
config.BATCH_SIZE = 5  # samples per batch
config.CAPTIONS = ["bit", "violet"]  # caption sources to include (e.g. "bit", "violet", None)
config.CAPTIONS_SEPARATOR = "\n"  # text placed between captions
config.NUM_CAPTIONS = 3  # how many captions to keep (-1 keeps all)
config.CAPTION_SELECTION = CaptionSelection.RANDOM  # strategy used to pick captions
config.RANDOM_SEED = 42  # seed for reproducibility
# The result file name encodes the selection strategy and the caption sources.
config.PATH_RESULT_FILE = f"drive/MyDrive/{config.CAPTION_SELECTION}-{'-'.join(config.CAPTIONS)}.csv"

# ==================== Gemini API ====================
config.API_KEY = userdata.get('GEMINI_API_KEY')  # API key read from Colab userdata
config.MODEL_NAME = "models/gemini-1.5-flash"  # model used for text generation

# ==================== Generation parameters ====================
config.GENERATION_CONFIG = dict(
    temperature=0.0,  # randomness of generation
    top_p=0.95,  # nucleus sampling threshold
    top_k=40,  # restrict sampling to the top-k tokens
    max_output_tokens=20,  # cap on generated tokens
    response_mime_type="text/plain",  # output format
)

# ==================== Prompts ====================

# Optional system instruction (str or None): general guideline for the LLM.
config.SYSTEM_INSTRUCTION = _clean_block(
    """
    You are a highly capable language model specialized in answering questions based on provided image captions.
    Your task is to analyze the captions and generate accurate, concise answers in the same language as the question.
    Ensure your response is relevant, clear, and avoids unnecessary details.
    """
)

# Per-question prompt; the {context} and {question} placeholders are filled in later.
config.PROMPT_TEMPLATE = _clean_block(
    """
    Analyze the following image captions and answer the given question in the same language:
    Captions:{context}
    Question:{question}
    Answer concisely:
    """
)

# Echo both prompt strings so the exact text sent to the model is visible.
print(f"System Instruction:\n{config.SYSTEM_INSTRUCTION}")
print(f"\nPrompt Template:\n{config.PROMPT_TEMPLATE}")

System Instruction:
You are a highly capable language model specialized in answering questions based on provided image captions.
Your task is to analyze the captions and generate accurate, concise answers in the same language as the question.
Ensure your response is relevant, clear, and avoids unnecessary details.

Prompt Template:
Analyze the following image captions and answer the given question in the same language:
Captions:{context}
Question:{question}
Answer concisely:


### Log in to Hugging Face

In [5]:
from huggingface_hub import login
from google.colab import userdata

# Fetch the 'HF_TOKEN' entry from Colab userdata and pass it to huggingface_hub.login.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

### Dataset & DataLoader

In [6]:
from datasets import load_dataset
from aravqa.datasets.utils import prepare_dataset
from aravqa.datasets.utils import compute_similarity_captions
from aravqa.datasets import OKVQADataset
from aravqa.datasets import OKVQADataLoader
from aravqa.datasets import VQAv2Dataset
from aravqa.datasets import VQAv2DataLoader

In [None]:
# Load both caption datasets for the configured split and drop the "features" column.
# The split is read from the `config` instance (not the `Config` class) so that the
# value assigned in the configuration cell is the one actually used.
BDS = load_dataset(config.BDS_PATH, split=config.SPLIT).remove_columns("features")
VDS = load_dataset(config.VDS_PATH, split=config.SPLIT).remove_columns("features")

In [8]:
# Keep only the first 10 rows of each dataset so the experiment runs quickly.
subset_rows = range(10)
BDS = BDS.select(subset_rows)
VDS = VDS.select(subset_rows)

In [None]:
# Prepare Datasets
# The language is read from the `config` instance (not the `Config` class) so that
# the value assigned in the configuration cell is the one actually used.
BDS = prepare_dataset(BDS, language=config.LANGUAGE)
VDS = prepare_dataset(VDS, language=config.LANGUAGE)

print(f"BDS:{BDS} \n\n VDS:{VDS}")

**Compute caption similarity scores**

In [None]:
import evaluate
from typing import List, Dict, Callable, Optional

# BLEU metric shared by every call to compute_bleu_score; created on first use
# so the metric is not re-loaded each time the scorer is invoked.
_BLEU_SCORER = None


def _get_bleu_scorer():
    """Return the shared BLEU metric, loading it with `evaluate.load` on first use."""
    global _BLEU_SCORER
    if _BLEU_SCORER is None:
        _BLEU_SCORER = evaluate.load("bleu")
    return _BLEU_SCORER


def compute_bleu_score(predictions: List[str], references: List[List[str]], max_order: int = 2) -> float:
    """
    Computes the BLEU score for a set of predictions and references.

    The underlying metric object is loaded once and reused across calls.

    Args:
        predictions (List[str]): List of predicted sentences.
        references (List[List[str]]): List of lists of reference sentences,
            one list of references per prediction.
        max_order (int): Maximum n-gram order for the BLEU score (default: 2).

    Returns:
        float: The computed BLEU score. Returns float("-inf") if the metric's
        `compute` call fails (the error is printed, not raised).

    Raises:
        ValueError: If predictions or references are empty, or if their lengths do not match.
    """
    if not predictions or not references:
        raise ValueError("Predictions and references must not be empty.")
    if len(predictions) != len(references):
        raise ValueError("The number of predictions must match the number of reference sets.")

    # Loading stays outside the try block: a failure to load the metric should
    # surface as an error rather than be reported as a -inf score.
    bleu_scorer = _get_bleu_scorer()
    try:
        result = bleu_scorer.compute(predictions=predictions, references=references, max_order=max_order)
        return result["bleu"]
    except Exception as e:
        print(f"Error computing BLEU score: {e}")
        return float("-inf")  # Indicate computation failure

# ..................................................
# Smoke test: score a single caption against one set of reference answers.

predictions = ["دراجة نارية بيضاء وسوداء متوقفة في موقف للسيارات"]
references = [["سباق", "يركب", "موتوكروس", "دراجة نارية"]]

bleu_score = compute_bleu_score(predictions=predictions, references=references, max_order=2)
print(f"BLEU Score: {bleu_score}")

In [None]:
from aravqa.datasets.utils import compute_similarity_captions

# The same BLEU scorer is used for both the question and the answer similarity.
bleu_scorers = {
    "question_similarity_scorer": compute_bleu_score,
    "answer_similarity_scorer": compute_bleu_score,
}

BDS = compute_similarity_captions(BDS, **bleu_scorers)
VDS = compute_similarity_captions(VDS, **bleu_scorers)

# Show the schema of the "captions" column after scoring.
print(f"BDS captions features: {BDS.features['captions']}")
print(f"VDS captions features: {VDS.features['captions']}")

In [12]:
# BDS.push_to_hub(f"{config.USERNAME}/OKVQA-VinVL-BiT-Captions-blue-score")
# VDS.push_to_hub(f"{config.USERNAME}/OKVQA-Encoder-Violet-Captions-blue-score")

In [13]:
# Dataset
# Wrap the BiT (BDS) and Violet (VDS) caption datasets in a single OKVQADataset,
# then print its length and the fields of the first sample as a sanity check.
dataset = OKVQADataset(BDS,VDS)
print(f"Dataset length: {len(dataset)}")
print(f"Dataset sample keys: {dataset[0].keys()}")
print(f"Dataset sample metadata: {dataset[0]['metadata']}")
print(f"Dataset sample question: {dataset[0]['question']}")
print(f"Dataset sample answers: {dataset[0]['answers']}")
print(f"Dataset sample bit captions: {dataset[0]['bit']}")
print(f"Dataset sample violet captions: {dataset[0]['violet']}")

Dataset length: 10
Dataset sample keys: dict_keys(['metadata', 'image', 'question', 'answers', 'bit', 'violet'])
Dataset sample metadata: {'image_id': 297147, 'question_id': 2971475, 'question_type': 'one', 'answer_type': 'other', 'confidence': 3}
Dataset sample question: في أي رياضة يمكنك استخدام هذا؟
Dataset sample answers: [{'answer': 'سباق', 'confidence': 'yes', 'id': 1, 'raw': 'سباق'}, {'answer': 'سباق', 'confidence': 'yes', 'id': 2, 'raw': 'سباق'}, {'answer': 'سباق', 'confidence': 'yes', 'id': 3, 'raw': 'سباق'}, {'answer': 'سباق', 'confidence': 'yes', 'id': 4, 'raw': 'سباق'}, {'answer': 'سباق', 'confidence': 'yes', 'id': 5, 'raw': 'سباق'}, {'answer': 'سباق', 'confidence': 'yes', 'id': 6, 'raw': 'سباق'}, {'answer': 'موتوكروس', 'confidence': 'yes', 'id': 7, 'raw': 'موتوكروس'}, {'answer': 'موتوكروس', 'confidence': 'yes', 'id': 8, 'raw': 'موتوكروس'}, {'answer': 'يركب', 'confidence': 'yes', 'id': 9, 'raw': 'يركب'}, {'answer': 'يركب', 'confidence': 'yes', 'id': 10, 'raw': 'يركب'}]
Data

In [14]:
# DataLoader
# Build the dataloader from the dataset and config, then pull one batch and
# print its keys and its first three prompts.
# NOTE(review): with CaptionSelection.RANDOM the captions shown here presumably
# depend on config.RANDOM_SEED; confirm in OKVQADataLoader.
dataloader = OKVQADataLoader(dataset, config).get_dataloader()
batch = next(iter(dataloader))
print(f"Batch keys: {batch.keys()}")
print(f"\nBatch prompts:\n")
print("\n\n".join(batch['prompts'][:3]))

Batch keys: dict_keys(['question_id', 'image_id', 'prompts', 'answers'])

Batch prompts:

Analyze the following image captions and answer the given question in the same language:
Captions:
امراة ترتدي بدلة سوداء وبيضاء تركب دراجتها في الشارع
امراة ترتدي بدلة سوداء وبيضاء تركب دراجتها في موقف للسيارات
 دراجة نارية بيضاء وسوداء متوقفة في موقف للسيارات
Question:في أي رياضة يمكنك استخدام هذا؟
Answer concisely:

Analyze the following image captions and answer the given question in the same language:
Captions:
امراة في صالون تجميل تنظر الى شيء ما
امراة تجلس على طاولة في مطعم
 حمام مع حوض غسيل ومرايا كبيرة
Question:اذكر نوع النبات هذا؟
Answer concisely:

Analyze the following image captions and answer the given question in the same language:
Captions:
شخصان يجلسان في صف من الكتب
شخصان يجلسان في صفوف من الكتب
 رجل يحمل دبّة دمية بينما يجلس على مقعد
Question:أي لعبة هذه؟
Answer concisely:


### Question Answering

In [15]:
import pandas as pd
from aravqa.modules.question_answering import GeminiAnswerer

# Build the Gemini answerer from the shared config and print its representation
# to confirm the model name, generation settings and system instruction.
llm = GeminiAnswerer(config)
print(llm)

genai.GenerativeModel(
    model_name='models/gemini-1.5-flash',
    generation_config={'temperature': 0.0, 'top_p': 0.95, 'top_k': 40, 'max_output_tokens': 20, 'response_mime_type': 'text/plain'},
    safety_settings={},
    tools=None,
    system_instruction='You are a highly capable language model specialized in answering questions based on provided image captions.\nYour task is to analyze the captions and generate accurate, concise answers in the same language as the question.\nEnsure your response is relevant, clear, and avoids unnecessary details.',
    cached_content=None
)


In [16]:
# Generate predictions for every batch in the dataloader, then display the
# returned mapping as a table (the bare last expression is the cell's output).
outputs = llm.generate_from_dataloader(dataloader)
pd.DataFrame.from_dict(outputs)

Generating predictions from dataloader: 100%|██████████| 2/2 [00:21<00:00, 10.67s/it]


Unnamed: 0,question_id,image_id,answers,predictions
0,2971475,297147,"[سباق, سباق, سباق, سباق, سباق, سباق, موتوكروس,...",ركوب الدراجات
1,3397615,339761,"[كرمة, كرمة, كرمة, كرمة, تسلق, تسلق, تبدو وكأن...",لا توجد معلومات عن النباتات في التعليقات المقدمة.
2,3575865,357586,"[حيوان محشو, حيوان محشو, حيوان محشو, حيوان محش...",دبّة.
3,949225,94922,"[فم, فم, فم, فم, فم, فم, فم, فم, فم, فم]",فمه.
4,2076115,207611,"[قماش, قماش, قماش, قماش, طعام, طعام, غداء, غدا...",لا يُمكن معرفة ذلك من النصوص.
5,5723996,572399,"[رجل, رجل, رجل, رجل, رجل, رجل, رجال, رجال, رجا...",لا يُظهر أي من التعليقات شخصًا في المرحاض.
6,5759705,575970,"[جزيرة, جزيرة, جزيرة, جزيرة, جزيرة, جزيرة, جزي...",ثلاجة
7,3045575,304557,"[محل, محل, محل, محل, مدينة نيويورك, مدينة نيوي...",للتسوق أو للدراسة.
8,2183655,218365,"[أرضي, أرضي, أرضي, أرضي, نبات, نبات, جذع نبات ...",زهرة برتقالية.
9,2863135,286313,"[يتأرجح, يتأرجح, يتأرجح, يتأرجح, يتأرجح, يتأرج...",يلوح به.


### Evaluation

In [17]:
from aravqa.modules.evaluation import BLEUEvaluator
from aravqa.modules.evaluation import AccuracyEvaluator

def _as_padded_column(overall_score, n_rows):
    """Return a list holding `overall_score` followed by `n_rows - 1` None entries.

    This lets a single aggregate value sit in a table column next to the
    per-question scores: the value lands in the first row, the rest stay empty.
    """
    return [overall_score] + [None] * (n_rows - 1)


# ---- BLEU (n-grams up to order 2) ----
bleu_evaluator = BLEUEvaluator(max_order=2)
bleu_results = bleu_evaluator.evaluate(
    predictions=outputs['predictions'],
    references=outputs['answers'],
)
bleu_results['overall_bleu'] = _as_padded_column(
    bleu_results['overall_bleu'], len(bleu_results['bleu'])
)

# ---- Accuracy ----
accuracy_evaluator = AccuracyEvaluator()
accuracy_results = accuracy_evaluator.evaluate(
    predictions=outputs['predictions'],
    references=outputs['answers'],
)
accuracy_results['overall_accuracy'] = _as_padded_column(
    accuracy_results['overall_accuracy'], len(accuracy_results['accuracy'])
)

# ---- Attach the scores to the generation outputs and build the results table ----
# Per-question columns first, then the two aggregate columns.
outputs['bleu'] = bleu_results['bleu']
outputs['accuracy'] = accuracy_results['accuracy']
outputs['overall_accuracy'] = accuracy_results['overall_accuracy']
outputs['overall_bleu'] = bleu_results['overall_bleu']

outputs_df = pd.DataFrame.from_dict(outputs)

Evaluating BLEU scores: 100%|██████████| 10/10 [00:00<00:00, 103.49it/s]
Evaluating Accuracy scores: 100%|██████████| 10/10 [00:00<00:00, 16545.58it/s]


In [18]:
# Display the per-question results; the overall_* columns hold the aggregate
# score in the first row only and are empty elsewhere.
outputs_df

Unnamed: 0,question_id,image_id,answers,predictions,bleu,accuracy,overall_accuracy,overall_bleu
0,2971475,297147,"[سباق, سباق, سباق, سباق, سباق, سباق, موتوكروس,...",ركوب الدراجات,0.0,0.0,0.0,0.0
1,3397615,339761,"[كرمة, كرمة, كرمة, كرمة, تسلق, تسلق, تبدو وكأن...",لا توجد معلومات عن النباتات في التعليقات المقدمة.,0.0,0.0,,
2,3575865,357586,"[حيوان محشو, حيوان محشو, حيوان محشو, حيوان محش...",دبّة.,0.0,0.0,,
3,949225,94922,"[فم, فم, فم, فم, فم, فم, فم, فم, فم, فم]",فمه.,0.0,0.0,,
4,2076115,207611,"[قماش, قماش, قماش, قماش, طعام, طعام, غداء, غدا...",لا يُمكن معرفة ذلك من النصوص.,0.0,0.0,,
5,5723996,572399,"[رجل, رجل, رجل, رجل, رجل, رجل, رجال, رجال, رجا...",لا يُظهر أي من التعليقات شخصًا في المرحاض.,0.0,0.0,,
6,5759705,575970,"[جزيرة, جزيرة, جزيرة, جزيرة, جزيرة, جزيرة, جزي...",ثلاجة,0.0,0.0,,
7,3045575,304557,"[محل, محل, محل, محل, مدينة نيويورك, مدينة نيوي...",للتسوق أو للدراسة.,0.0,0.0,,
8,2183655,218365,"[أرضي, أرضي, أرضي, أرضي, نبات, نبات, جذع نبات ...",زهرة برتقالية.,0.0,0.0,,
9,2863135,286313,"[يتأرجح, يتأرجح, يتأرجح, يتأرجح, يتأرجح, يتأرج...",يلوح به.,0.0,0.0,,


In [19]:
# Write the per-question results and scores to config.PATH_RESULT_FILE as CSV,
# omitting the DataFrame index column.
outputs_df.to_csv(config.PATH_RESULT_FILE,index=False)