In [None]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Configuration

In [None]:
# Split the dataset indices into contiguous, near-equal segments.
dataset_size = 5046
num_segments = 5
segment_size, remainder = divmod(dataset_size, num_segments)

# Boundary k is where segment k starts (and segment k-1 stops). The first
# `remainder` segments absorb one extra item each, so the ranges tile
# [0, dataset_size) with no gaps or overlap.
segment_bounds = [k * segment_size + min(k, remainder) for k in range(num_segments + 1)]
size_segments = [range(lo, hi) for lo, hi in zip(segment_bounds, segment_bounds[1:])]
size_segments

[range(0, 1010),
 range(1010, 2019),
 range(2019, 3028),
 range(3028, 4037),
 range(4037, 5046)]

In [None]:
import torch

class GeminiConfig:
    """
    Central, class-level settings for the Gemini VQA run.

    Every value is a plain class attribute, so it is read as
    ``GeminiConfig.NAME`` without instantiating the class.
    """

    # --- Model ---
    MODEL_NAME = "models/gemini-1.5-flash"  # Gemini model identifier

    # --- Generation ---
    MAX_LENGTH = 20  # Maximum length of generated sequences

    # --- Dataset ---
    DATASET_PATH = "ShahadMAlshalawi/okvqa-ar"  # Dataset path or name
    LANGUAGE = "ar"  # Question/answer language: "ar" (Arabic) or "en" (English)
    SPLIT = "validation"  # Dataset split: "train", "validation" or "test"

    # Directory where the generated answers of each segment are saved
    SAVE_SEGMENT_DIR = "/content/drive/MyDrive/ColabData/Gemini_VQA_Results/OKVQA-ar/VQA_generation_checkpoint"

    # --- Segmentation ---
    # Contiguous index ranges, one per segment, given as (start, stop) pairs.
    SIZE_SEGMENTS = [
        range(start, stop)
        for start, stop in (
            (0, 1010),
            (1010, 2019),
            (2019, 3028),
            (3028, 4037),
            (4037, 5046),
        )
    ]
    CURRENT_INDEX_SEGMENT = 0  # Index of the segment currently being processed

    # --- Account ---
    USERNAME = "ShahadMAlshalawi"  # Hugging Face username


# Gemini VQA Class

In [None]:
import torch
from typing import List, Union, Iterable, Dict
from PIL import Image
import numpy as np
import requests
from io import BytesIO
import base64
from google.colab import userdata

import google.generativeai as genai

class GeminiVQA:
    """
    Implementation of Visual Question Answering using Gemini via the Google AI Gemini API.

    An instance is callable: ``vqa(image, question)`` is the same as
    ``vqa.answer_question(image, question)``.
    """

    # Seconds to wait for an image download before giving up. Without a
    # timeout a stalled server would block the whole run indefinitely.
    IMAGE_DOWNLOAD_TIMEOUT = 30

    # Timeout (seconds) passed to the Gemini API through `request_options`.
    GENERATION_TIMEOUT = 1200

    def __init__(self, api_key: str, model_name: str, max_tokens: int = 1024):
        """
        Initializes the Gemini VQA model.

        Args:
            api_key (str): Google AI Gemini API key.
            model_name (str): The name of the Gemini model to use.
            max_tokens (int): The maximum number of tokens to generate in the answer.
        """
        self.api_key = api_key
        genai.configure(api_key=self.api_key)
        self.model = genai.GenerativeModel(model_name)
        self.model_name = model_name
        self.max_tokens = max_tokens
        # Not used by the methods of this class; kept as a public attribute.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _prepare_image(self, image: Union[str, np.ndarray, torch.Tensor, Image.Image]) -> Image.Image:
        """
        Prepares a single image for VQA, converting it to RGB format. Handles various input types.

        Args:
            image: The input image. Can be a file path, an http(s) URL, a NumPy array,
                a PyTorch tensor, or a PIL Image. A tensor is permuted from
                (C, H, W) to (H, W, C) and cast to uint8 without any rescaling.

        Returns:
            A PIL Image object in RGB format.

        Raises:
            ValueError: If the input image type is unsupported.
            requests.exceptions.RequestException: If there's an error downloading the image from a URL.
        """
        try:
            if isinstance(image, Image.Image):
                return image.convert("RGB")

            if isinstance(image, str):
                # Match the full scheme so that a local path which merely
                # begins with "http" is not mistaken for a URL.
                if image.startswith(("http://", "https://")):
                    response = requests.get(image, timeout=self.IMAGE_DOWNLOAD_TIMEOUT)
                    response.raise_for_status()
                    return Image.open(BytesIO(response.content)).convert("RGB")
                return Image.open(image).convert("RGB")

            if isinstance(image, np.ndarray):
                return Image.fromarray(image).convert("RGB")

            if torch.is_tensor(image):
                # detach() lets tensors that track gradients be converted to NumPy.
                pixels = image.detach().permute(1, 2, 0).cpu().numpy().astype(np.uint8)
                return Image.fromarray(pixels).convert("RGB")

            raise ValueError(f"Unsupported image input type: {type(image)}")

        except requests.exceptions.RequestException as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise requests.exceptions.RequestException(f"Error downloading image: {e}") from e

    def prepare_inputs(self, image: Union[str, np.ndarray, torch.Tensor, Image.Image], question: str):
        """
        Prepares the image and question for the Gemini model, requesting the answer in Arabic.

        Args:
            image: The input image (any type accepted by `_prepare_image`).
            question (str): The question about the image.

        Returns:
            List: A list containing the prepared image and the question with an Arabic instruction.
        """
        prepared_image = self._prepare_image(image)
        # The instruction asks the model for an accurate answer in Arabic.
        prompt = f"You are an AI assistant that generates accurate answer about image in Arabic.\nQuestion: {question}"
        return [prepared_image, prompt]

    def answer_question(self, image: Union[str, np.ndarray, torch.Tensor, Image.Image], question: str) -> str:
        """
        Answers a question about an image using the Gemini API.

        Errors raised while preparing the image propagate to the caller.
        Errors raised while generating (or reading) the response are printed
        and replaced by the sentinel string "Error generating answer".

        Args:
            image: The input image.
            question (str): The question about the image.

        Returns:
            str: The generated answer, or "Error generating answer" on failure.
        """
        prompt_parts = self.prepare_inputs(image, question)

        try:
            response = self.model.generate_content(
                prompt_parts,
                request_options={'timeout': self.GENERATION_TIMEOUT},
                generation_config={"max_output_tokens": self.max_tokens},
            )
            return response.text

        except Exception as e:
            print(f"Error generating answer: {e}")
            return "Error generating answer"

    def __call__(self, image: Union[str, np.ndarray, torch.Tensor, Image.Image], question: str) -> str:
        """
        Answers a question about an image using the Gemini API.

        Args:
            image: The input image.
            question (str): The question about the image.

        Returns:
            str: The generated answer.
        """
        return self.answer_question(image, question)

## Dataset

In [None]:
!pip install datasets --quiet

In [None]:
from huggingface_hub import login
from google.colab import userdata
from datasets import load_dataset

In [None]:
# Login to Hugging Face.
# The token is read from the 'HF_TOKEN' entry of google.colab `userdata`
# rather than being written into the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN)

In [None]:
# Load the split named by GeminiConfig.SPLIT from the dataset at
# GeminiConfig.DATASET_PATH.
dataset = load_dataset(GeminiConfig.DATASET_PATH,split=GeminiConfig.SPLIT)
# Bare expression so the notebook displays the loaded dataset object.
dataset

In [None]:
# Print the first record to inspect which fields each item carries.
print(dataset[0])

{'metadata': {'image_id': 297147, 'question_id': 2971475, 'question_type': 'one', 'answer_type': 'other', 'confidence': 3}, 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x799D36CE8E90>, 'question': {'en': 'What sport can you use this for?', 'ar': 'في أي رياضة يمكنك استخدام هذا؟'}, 'answers': {'en': ['race', 'race', 'race', 'race', 'race', 'race', 'motocross', 'motocross', 'ride', 'ride'], 'ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'raw_en': ['racing', 'racing', 'racing', 'racing', 'racing', 'racing', 'motocross', 'motocross', 'riding', 'riding'], 'raw_ar': ['سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'سباق', 'موتوكروس', 'موتوكروس', 'يركب', 'يركب'], 'confidence': ['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes'], 'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}}


# Generate Answers

In [None]:
# Read the Gemini API key from the "GEMINI_API_KEY" entry of google.colab
# `userdata`; the next cell checks whether the result is None.
api_key = userdata.get("GEMINI_API_KEY")

In [None]:

# Build the model and smoke-test it on one image before the full run.
if api_key is not None:
    vqa_model = GeminiVQA(api_key=api_key, model_name=GeminiConfig.MODEL_NAME)

    # Sample input; despite its name, `image_url` holds a local Drive path here.
    image_url = "/content/drive/MyDrive/Colab Notebooks/E1.png"
    question = "ما نوع السيارة التي تستعمل الشيء الظاهر في الصورة؟"

    try:
        answer = vqa_model(image_url, question)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
    except Exception as err:
        print(f"An error occurred: {err}")
else:
    # No key available: report it and leave `vqa_model` undefined.
    print("Error: GEMINI_API_KEY not found. Please set the GEMINI_API_KEY environment variable or pass the api key as a parameter to the constructor.")

Question: ما نوع السيارة التي تستعمل الشيء الظاهر في الصورة؟
Answer: لا يمكن الإجابة على هذا السؤال بناءً على الصورة المقدمة. الصورة تعرض صنبور إطفاء حريق، ولا توجد أي سيارات ظاهرة فيه.


In [None]:
import json
import os
import pandas as pd
from tqdm.auto import tqdm

# Relies on `vqa_model`, `dataset` and `GeminiConfig` defined in earlier cells.

# Create the checkpoint directory up front; otherwise the save at the end of
# the first (long-running) segment would fail if the directory is missing.
os.makedirs(GeminiConfig.SAVE_SEGMENT_DIR, exist_ok=True)

# Field of the 'question' / 'answers' dicts to read: Arabic when configured,
# English for any other LANGUAGE value.
language_key = 'ar' if GeminiConfig.LANGUAGE == 'ar' else 'en'

# Iterate over the configured segments themselves instead of a hard-coded
# index range, so the loop can never index past the end of SIZE_SEGMENTS.
for current_index_segment, rng in enumerate(GeminiConfig.SIZE_SEGMENTS):
    print(f"Processing segment {current_index_segment}...")
    data_segment = dataset.select(rng)

    # Column-oriented results; every list gets exactly one entry per item so
    # the columns stay aligned even when an item fails.
    outputs = {
        "question_id": [],
        "questions": [],
        "image_id": [],
        "answers": [],
        "predictions": []
    }

    for item in tqdm(data_segment, desc=f"Generating answers for segment {current_index_segment}"):
        try:
            question = item['question'][language_key]
            ground_truth_answers = item['answers'][language_key]
            image = item['image']
            question_id = str(item['metadata']['question_id'])
            image_id = str(item['metadata']['image_id'])

            prediction = vqa_model(image, question)

            outputs["question_id"].append(question_id)
            outputs["questions"].append(question)
            outputs["image_id"].append(image_id)
            outputs["answers"].append(ground_truth_answers)
            outputs["predictions"].append(prediction)

        except Exception as e:
            # Record a placeholder row so this item still occupies its
            # position in every column.
            metadata = item.get('metadata', {})
            print(f"Error processing item {metadata.get('question_id', 'N/A')}: {e}")
            outputs["question_id"].append(str(metadata.get('question_id', 'N/A')))
            outputs["questions"].append("N/A")
            outputs["image_id"].append(str(metadata.get('image_id', 'N/A')))
            outputs["answers"].append([])
            outputs["predictions"].append("Error generating answer")

    # Save this segment's predictions to its own JSON checkpoint file.
    save_path = f"{GeminiConfig.SAVE_SEGMENT_DIR}/segment_{current_index_segment}_outputs.json"
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)

    print(f"Outputs saved to {save_path}")



---



# Evaluation

### Installation

In [None]:
!pip install git+https://github.com/shahadMAlshalawi/Modular-Arabic-VQA.git --no-warn-conflicts --quiet

In [None]:
import warnings
warnings.filterwarnings("ignore")

import textwrap
import tqdm
from PIL import Image
import requests
import torch
import numpy as np
import pandas as pd
from google.colab import userdata
import evaluate
import json
import datasets
import aravqa

### Loading the data from drive

In [None]:
import os
import re
import json
import pandas as pd

# Directory containing the saved JSON output files
output_dir = GeminiConfig.SAVE_SEGMENT_DIR

# Combined, column-oriented outputs of all segments
outputs = {
    "question_id": [],
    "questions": [],
    "image_id": [],
    "answers": [],
    "predictions": []
}


def _segment_sort_key(filename):
    """Order checkpoint files by their numeric segment index.

    A plain string sort puts 'segment_10' before 'segment_2'. Since the
    evaluation slices the combined lists by position, the files must be
    concatenated in true segment order. Files that do not follow the
    'segment_<n>_outputs.json' pattern are placed last, ordered by name.
    """
    match = re.search(r"segment_(\d+)_outputs\.json$", filename)
    if match:
        return (0, int(match.group(1)), filename)
    return (1, 0, filename)


# Load every JSON file in the directory, in segment order
for filename in sorted(os.listdir(output_dir), key=_segment_sort_key):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(output_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Append the data from each file to the combined outputs dictionary
    for key in outputs:
        if key in data:
            outputs[key].extend(data[key])
        else:
            # A missing key leaves the combined columns with different lengths.
            print(f"Warning: Key '{key}' not found in file '{filename}'. Skipping.")


# Display the first few entries of the combined outputs for verification
print("Combined outputs loaded successfully. First 5 entries:")
print(outputs['predictions'][:5])
print(outputs['questions'][:5])
print(outputs['answers'][:5])

In [None]:
# Print the last 5 entries of the combined outputs for verification.
# The slices use -5 so the output matches the "Last 5 entries" message.
print("Combined outputs loaded successfully. Last 5 entries:")
print(outputs['predictions'][-5:])
print(outputs['questions'][-5:])
print(outputs['answers'][-5:])

## Evaluate in segments

In [None]:
from aravqa.modules.evaluation import BLEUEvaluator
from aravqa.modules.evaluation import BERTScoreEvaluator
from aravqa.modules.evaluation import FuzzEvaluator
import os

# Directory for the per-segment evaluation CSVs. It is derived from the
# checkpoint directory so the results land next to the checkpoints of the same
# dataset (".../OKVQA-ar/Evaluation_Results").
# NOTE(review): the previous hard-coded path pointed into ".../VQAv2-ar/",
# which looks like a copy-paste leftover and could overwrite the results of a
# different experiment - confirm before merging old and new result folders.
segmented_results_dir = os.path.join(
    os.path.dirname(GeminiConfig.SAVE_SEGMENT_DIR.rstrip("/")),
    "Evaluation_Results",
)
# to_csv does not create missing parent directories.
os.makedirs(segmented_results_dir, exist_ok=True)

OPENAI_API_KEY = userdata.get('GPT_API_KEY')

# Initialize evaluators
bleu_evaluator = BLEUEvaluator(max_order=1)
BertScore_evaluator = BERTScoreEvaluator()
fuzzy_evaluator = FuzzEvaluator(OPENAI_API_KEY)

# Index of the first segment to evaluate.
# NOTE(review): kept at 1, so segment 0 is skipped exactly as before -
# presumably it was evaluated in an earlier run; set to 0 to evaluate all.
FIRST_SEGMENT_TO_EVALUATE = 1

# Columns copied from the combined outputs into each segment
OUTPUT_KEYS = ("question_id", "questions", "image_id", "answers", "predictions")

for i, rng in enumerate(
    GeminiConfig.SIZE_SEGMENTS[FIRST_SEGMENT_TO_EVALUATE:],
    start=FIRST_SEGMENT_TO_EVALUATE,
):
    print(f"Evaluating segment {i}...")

    # Slice the combined lists by position. This relies on `outputs` holding
    # all segments concatenated in segment order.
    segment_outputs = {key: outputs[key][rng.start:rng.stop] for key in OUTPUT_KEYS}

    # Perform evaluations for the current segment
    bleu_results = bleu_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
    )

    bertScore_results = BertScore_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
    )

    fuzzy_results = fuzzy_evaluator.evaluate(
        predictions=segment_outputs['predictions'],
        references=segment_outputs['answers'],
        questions=segment_outputs['questions'],
    )

    # Per-item columns: the copied inputs plus one score per metric
    segment_eval_results = dict(segment_outputs)
    segment_eval_results['bleu'] = bleu_results['bleu']
    segment_eval_results['f1_bertscore'] = bertScore_results['f1_bertscore']
    segment_eval_results['fuzz_accuracy'] = fuzzy_results['fuzz_accuracy']

    # Overall metrics are stored in the first row only; the remaining rows are
    # padded with None so each column has the same length as the others.
    num_rows = len(segment_outputs['question_id'])
    overall_metrics = (
        ('overall_bleu', bleu_results['overall_bleu']),
        ('overall_f1_bertscore', bertScore_results['overall_f1_bertscore']),
        ('fuzz_overall_accuracy', fuzzy_results['fuzz_overall_accuracy']),
    )
    for column, value in overall_metrics:
        if num_rows > 0:
            segment_eval_results[column] = [value] + [None] * (num_rows - 1)
        else:
            segment_eval_results[column] = []

    # Create a DataFrame and save the segment results to a CSV file
    segment_df = pd.DataFrame.from_dict(segment_eval_results)
    segment_filename = f"segment_{i}_results.csv"
    segment_file_path = os.path.join(segmented_results_dir, segment_filename)
    segment_df.to_csv(segment_file_path, index=False)

    print(f"Segment {i} results saved to {segment_file_path}")

print("\nEvaluation of all segments completed.")
