train a small vision model for Q&A based on palp images?

In [None]:
#%%capture
#!python3 -m pip install git+https://github.com/p-lod/plodlib
#!pip install requests_cache
#!pip install rdflib

In [1]:
import plodlib
import json
import pandas as pd
from string import Template
import rdflib as rdf
import requests_cache
import requests

def search_images(search_term):
  """
    Searches the P-LOD triplestore for resources whose labels
    contain a specified term. Returns a list of image URLs
    and associated information for those resources.

    Args:
        search_term (str): The term to search for within labels.

    Returns:
        list: A list of dictionaries, each containing information about an image
        and its associated concept. Returns an empty list if no matching
        results are found.
    """

  # Connect to the remote triplestore with read-only connection
  store = rdf.plugins.stores.sparqlstore.SPARQLStore(query_endpoint = "http://52.170.134.25:3030/plod_endpoint/query",
                                       context_aware = False,
                                       returnFormat = 'json')
  g = rdf.Graph(store)

  qt = Template("""
      PREFIX p-lod: <urn:p-lod:id:>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      SELECT DISTINCT ?concept ?concept_label ?img_urn ?l_record ?l_media ?l_batch ?l_description
      WHERE {
        ?concept rdfs:label ?concept_label .
        FILTER regex(?concept_label, "$search_term", "i")

        { ?component p-lod:depicts ?concept .
          ?component p-lod:best-image ?img_urn .
         }
         UNION
         {
           ?concept p-lod:best-image ?img_urn .
         }

        ?img_urn p-lod:x-luna-record-id ?l_record .
        ?img_urn p-lod:x-luna-media-id  ?l_media .
        ?img_urn p-lod:x-luna-batch-id  ?l_batch .
        ?img_urn p-lod:x-luna-description ?l_description .
      }
  """)

  results = g.query(qt.substitute(search_term = search_term))
  results_df = pd.DataFrame(results, columns = results.json['head']['vars'])

  if len(results_df) == 0:
      return []

  def add_luna_info_search(row):

      img_src = None #default if no URLs present (probably means LUNA doesn't have image though triplestore thinks it does)
      img_description = None
      tilde_val = plodlib.luna_tilde_val(row['img_urn'])

      luna_json = json.loads(requests.get(f'https://umassamherst.lunaimaging.com/luna/servlet/as/fetchMediaSearch?mid=umass~{tilde_val}~{tilde_val}~{row["l_record"]}~{row["l_media"]}&fullData=true').text)

      if len(luna_json):
          img_attributes = json.loads(luna_json[0]['attributes'])
          if 'image_description_english' in img_attributes.keys():
              img_description = img_attributes['image_description_english']
          else:
            try:
                if   tilde_val == '14':
                  img_description = json.loads(luna_json[0]['fieldValues'])[2]['value']
                elif tilde_val == '16':
                  img_description = json.loads(luna_json[0]['fieldValues'])[1]['value']
                else:
                  img_description = f"unrecognized collection {tilde_val}"
            except:
                img_description = "Trying to get description failed"

          if 'urlSize4' in img_attributes.keys(): # use size 4, sure, but only if there's nothing else
            img_src = img_attributes['urlSize4']
          if 'urlSize2' in img_attributes.keys(): # preferred
            img_src = img_attributes['urlSize2']
          elif 'urlSize3' in img_attributes.keys():
            img_src = img_attributes['urlSize3']
          else:
            img_src = img_attributes['urlSize1']

      row['img_url'] = img_src
      row['img_current_description'] = img_description
      return row

  results_df = results_df.apply(add_luna_info_search, axis = 1)

  return results_df.to_dict(orient='records')




In [2]:
search_term = "artemis"
image_results = search_images(search_term)

if image_results:
    for item in image_results:
        print(f"Concept: {item['concept_label']}")
        print(f"Image URL: {item['img_url']}")
        print(f"Image Description: {item['img_current_description']}")
        print("---")
else:
    print(f"No images found for '{search_term}'")

Concept: Artemis
Image URL: http://umassamherst.lunaimaging.com/MediaManager/srvr?mediafile=/Size2/umass~14~14/4249/image61808.jpg
Image Description: VI.5.3 Pompeii. December 2007. Room 20, detail of faded painting from west wall showing Apollo & Artemis. See Helbig, W., 1868. Wandgemälde der vom Vesuv verschütteten Städte Campaniens. Leipzig: Breitkopf und Härtel. (200). .
---
Concept: Artemis
Image URL: http://umassamherst.lunaimaging.com/MediaManager/srvr?mediafile=/Size2/umass~16~16/4276/PPM4_252.jpg
Image Description: 76. Viridarium (32): Tino, practically intact until the Second World War, during which it was bombed, the S wall, with a red background and black plinth with yellow panel and sphinxes facing either side of a gorgonion, was imagined as preceded by two statues of Nymphs-fountains on small pillars, each supporting a shell-shaped basin, with birds in flight and tambourines suspended from ribbons, broken through by a large arched window through which the scene of Ancona's

In [3]:
import pandas as pd
import json

def flatten_json_to_csv(json_data, output_csv, explode_list_column=None):
  """
  Flattens JSON data and writes it to a CSV file.

  Args:
    json_data: A list of JSON objects (loaded from a file or a variable).
    output_csv: The path to the output CSV file.
    explode_list_column: (Optional) The name of a column that contains a list that you
                       want to "explode" into multiple rows. If None, lists are
                       converted to string representation.
  """
  df = pd.json_normalize(json_data)

  if explode_list_column:
      df = df.explode(explode_list_column)

  df.to_csv(output_csv, index=False)
  print(f"Flattened data written to: {output_csv}")

output_csv_file1 = search_term +"_output_flattened_default.csv"
flatten_json_to_csv(image_results, output_csv_file1)

Flattened data written to: artemis_output_flattened_default.csv


In [4]:
import pandas as pd
import requests
import os
from urllib.parse import urlparse, parse_qs
from tqdm import tqdm


def extract_filename_from_url(url):
    """Extracts the filename from a URL, handling cases with query parameters."""
    try:
      parsed_url = urlparse(url)
      if 'mediafile' in parsed_url.query:
          query_params = parse_qs(parsed_url.query)
          mediafile_value = query_params.get('mediafile', [None])[0]
          if mediafile_value:
              return os.path.basename(mediafile_value)

      return os.path.basename(parsed_url.path)
    except Exception as e:
        print(f"Could not parse url: {url}. Error: {e}")
        return None

def download_images_from_csv(csv_file, image_column="img_url", output_dir="downloaded_images"):
    """Downloads images from URLs in a CSV file's specified column."""

    df = pd.read_csv(csv_file)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    local_image_paths = []
    for url in tqdm(df[image_column], desc="Downloading Images"):
        if not isinstance(url, str):
            local_image_paths.append(None)
            continue
        try:
            response = requests.get(url, stream=True, timeout=10)

            if response.status_code == 200:
                filename = extract_filename_from_url(url)
                if not filename:
                  print(f"Could not extract a filename for {url}.")
                  local_image_paths.append(None)
                  continue

                filepath = os.path.join(output_dir, filename)
                with open(filepath, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                local_image_paths.append(filepath)
            else:
                print(f"Failed to download {url}. Status code: {response.status_code}")
                local_image_paths.append(None)
        except Exception as e:
           print(f"Error downloading {url}: {e}")
           local_image_paths.append(None)

    df['local_image_path'] = local_image_paths
    df.to_csv(csv_file, index=False)
    return df
 #Usage

# Replace 'your_data.csv' with the actual path to your CSV file
csv_file = search_term + "_output_flattened_default.csv"
downloaded_df = download_images_from_csv(csv_file)
print(f"Images downloaded and csv updated. Saved paths to the column local_image_path.")

Downloading Images: 100%|████████████████████████████| 17/17 [00:06<00:00,  2.83it/s]

Images downloaded and csv updated. Saved paths to the column local_image_path.





now i would need to make sure that the csv file has a column that points to the actual image file name. Then i would want to pass each image one at a time to gemini, along with the description, and ask it to generate question and answer pairs that capture something of that knowledge as visible in the image, as well as other more generic things from the image ("what kinds of objects are visible") etc.

In [6]:
# training set generator, not tested yet

#set key
#from google.colab import userdata
GOOGLE_API_KEY = '' #userdata.get('GOOGLE_API_KEY')

In [7]:
import pandas as pd
import google.generativeai as genai
import os
from tqdm import tqdm
import json
import time
import traceback  # For detailed error messages
import re  # Import the regular expression module



genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")  # Using Gemini Pro Vision


def upload_image(image_path):
    """Uploads the image to Gemini and returns the file object."""
    try:
        print(f"  Uploading image: {image_path}")
        file_object = genai.upload_file(image_path)
        print(f"  Image uploaded successfully: {image_path}")
        return file_object
    except Exception as e:
        print(f"  Error uploading image {image_path}: {e}")
        traceback.print_exc()
        return None


def generate_qa_pairs(image_path, text_description):
    image_file = upload_image(image_path)
    if image_file is None:
      return []


    prompt = (
        "Given the following image and text description, generate a set of "
        "question-answer pairs related to the COMPOSITION and CONTENT and MEANING of the image. Draw on information contained in the "
        "text description for content and meaning when appropriate. DO NOT return answers related to publishing, authorship, or physical location. DO NOT return phrases like 'according to the description', or 'according to the text'.\n"
        "Provide multiple answers where appropriate.  Format your output "
        "as a JSON list of dictionaries (e.g. [{'question': '...', 'answers': [...]}])\n"
        "If no questions can be generated, return an empty JSON array: [].\n"
        f"Image Description:\n{text_description}\n\n"
        "Image:\n"
    )

    try:
        print(f"  Sending prompt to Gemini for {image_path}")
        response = model.generate_content([image_file, "\n\n", prompt])
        print(f"  Gemini response received for {image_path}")

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            print(f"  Safety blocked. Reason: {response.prompt_feedback.block_reason}, for image {image_path}")
            return []

        if response.text:
            try:
                # Remove markdown code block and "json" if present
                cleaned_response = re.sub(r'^\s*```(?:json)?\s*|(?:\s*```\s*)$', '', response.text, flags=re.MULTILINE)
                cleaned_response = cleaned_response.strip()

                qa_pairs = json.loads(cleaned_response)
                if not isinstance(qa_pairs, list):
                    print(f"  Invalid JSON response structure for {image_path}: {response.text}")
                    return []
                print(f"   JSON parsed successfully for {image_path}")
                return qa_pairs
            except json.JSONDecodeError as e:
                print(f"  Error decoding JSON: {e}, for image {image_path}. Response was: {response.text}")
                traceback.print_exc()
                return []
        else:
            print(f"  Empty response from Gemini for image {image_path}.")
            return []
    except Exception as e:
        print(f"  Error generating QA pairs: {e} for image {image_path}")
        traceback.print_exc()
        return []


def process_csv_and_generate_qa(input_csv, output_csv):
    qa_data = []  # Initialize here so it's available in the try block below
    try:
        df = pd.read_csv(input_csv)
    except FileNotFoundError:
        print(f"Input CSV file not found: {input_csv}")
        return

    try:
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Generating QA Pairs", position=0, leave=True):
            image_path = row['local_image_path']
            text_description = row['img_current_description']

            # Explicitly handle NaN
            if pd.isna(image_path):
              print(f"Skipping row {index} because 'local_image_path' is NaN.")
              continue

            if not (image_path and text_description):
                print(f"Image or description missing at row {index}. Skipping row.")
                continue

            if not isinstance(image_path, str):
                print(f"Image path is not a string (type is {type(image_path)}) at row {index}. Skipping.")
                continue

            if not os.path.exists(image_path):
                print(f"Image not found at {image_path} for row {index}. Skipping.")
                continue

            print(f"Processing row {index}, Image: {image_path}")

            qa_pairs = generate_qa_pairs(image_path, text_description)

            for qa in qa_pairs or []:
                qa_data.append({
                    'id': os.path.basename(image_path),
                    'question': qa.get('question', None),
                    'answers': str(qa.get('answers', None)) if qa.get('answers') is not None else '[]'
                })

            time.sleep(1)  # crucial delay


        qa_df = pd.DataFrame(qa_data)
        qa_df.to_csv(output_csv, index=False)
        print(f"QA pairs generated and saved to {output_csv}")

    except Exception as e:
      print(f"An error occurred during processing: {e}")
      traceback.print_exc()

      if qa_data:
        temp_output_csv = f"{output_csv}.partial.csv"
        temp_qa_df = pd.DataFrame(qa_data)
        temp_qa_df.to_csv(temp_output_csv, index=False)
        print(f"Partial QA data saved to {temp_output_csv}")
      else:
        print("No QA data to save.")



# Example Usage (replace with your file paths)
# process_csv_and_generate_qa("your_input.csv", "your_output.csv")

In [8]:

# do the thing!

input_csv_file = search_term + "_output_flattened_default.csv"
output_csv_file = search_term + '_qa_pairs.csv'
process_csv_and_generate_qa(input_csv_file, output_csv_file)

Generating QA Pairs:   0%|                                    | 0/17 [00:00<?, ?it/s]

Processing row 0, Image: downloaded_images/image61808.jpg
  Uploading image: downloaded_images/image61808.jpg
  Image uploaded successfully: downloaded_images/image61808.jpg
  Sending prompt to Gemini for downloaded_images/image61808.jpg
  Gemini response received for downloaded_images/image61808.jpg
   JSON parsed successfully for downloaded_images/image61808.jpg


Generating QA Pairs:   6%|█▋                          | 1/17 [00:07<01:58,  7.38s/it]

Processing row 1, Image: downloaded_images/PPM4_252.jpg
  Uploading image: downloaded_images/PPM4_252.jpg
  Image uploaded successfully: downloaded_images/PPM4_252.jpg
  Sending prompt to Gemini for downloaded_images/PPM4_252.jpg
  Gemini response received for downloaded_images/PPM4_252.jpg
   JSON parsed successfully for downloaded_images/PPM4_252.jpg


Generating QA Pairs:  12%|███▎                        | 2/17 [00:14<01:52,  7.47s/it]

Processing row 2, Image: downloaded_images/image47991.jpg
  Uploading image: downloaded_images/image47991.jpg
  Image uploaded successfully: downloaded_images/image47991.jpg
  Sending prompt to Gemini for downloaded_images/image47991.jpg
  Gemini response received for downloaded_images/image47991.jpg
   JSON parsed successfully for downloaded_images/image47991.jpg


Generating QA Pairs:  18%|████▉                       | 3/17 [00:22<01:46,  7.62s/it]

Processing row 3, Image: downloaded_images/PPM9_1433.jpg
  Uploading image: downloaded_images/PPM9_1433.jpg
  Image uploaded successfully: downloaded_images/PPM9_1433.jpg
  Sending prompt to Gemini for downloaded_images/PPM9_1433.jpg
  Gemini response received for downloaded_images/PPM9_1433.jpg
   JSON parsed successfully for downloaded_images/PPM9_1433.jpg


Generating QA Pairs:  24%|██████▌                     | 4/17 [00:29<01:34,  7.27s/it]

Processing row 4, Image: downloaded_images/image66705.jpg
  Uploading image: downloaded_images/image66705.jpg
  Image uploaded successfully: downloaded_images/image66705.jpg
  Sending prompt to Gemini for downloaded_images/image66705.jpg
  Gemini response received for downloaded_images/image66705.jpg
   JSON parsed successfully for downloaded_images/image66705.jpg


Generating QA Pairs:  29%|████████▏                   | 5/17 [00:37<01:28,  7.40s/it]

Processing row 5, Image: downloaded_images/PPM2_1005.jpg
  Uploading image: downloaded_images/PPM2_1005.jpg
  Image uploaded successfully: downloaded_images/PPM2_1005.jpg
  Sending prompt to Gemini for downloaded_images/PPM2_1005.jpg
  Gemini response received for downloaded_images/PPM2_1005.jpg
   JSON parsed successfully for downloaded_images/PPM2_1005.jpg


Generating QA Pairs:  35%|█████████▉                  | 6/17 [00:44<01:22,  7.46s/it]

Processing row 6, Image: downloaded_images/PPM1_110.jpg
  Uploading image: downloaded_images/PPM1_110.jpg
  Image uploaded successfully: downloaded_images/PPM1_110.jpg
  Sending prompt to Gemini for downloaded_images/PPM1_110.jpg
  Gemini response received for downloaded_images/PPM1_110.jpg
   JSON parsed successfully for downloaded_images/PPM1_110.jpg


Generating QA Pairs:  41%|███████████▌                | 7/17 [00:51<01:12,  7.27s/it]

Processing row 7, Image: downloaded_images/image61813.jpg
  Uploading image: downloaded_images/image61813.jpg
  Image uploaded successfully: downloaded_images/image61813.jpg
  Sending prompt to Gemini for downloaded_images/image61813.jpg
  Gemini response received for downloaded_images/image61813.jpg
   JSON parsed successfully for downloaded_images/image61813.jpg


Generating QA Pairs:  47%|█████████████▏              | 8/17 [00:58<01:05,  7.32s/it]

Processing row 8, Image: downloaded_images/image66619.jpg
  Uploading image: downloaded_images/image66619.jpg
  Image uploaded successfully: downloaded_images/image66619.jpg
  Sending prompt to Gemini for downloaded_images/image66619.jpg
  Gemini response received for downloaded_images/image66619.jpg
   JSON parsed successfully for downloaded_images/image66619.jpg


Generating QA Pairs:  53%|██████████████▊             | 9/17 [01:06<00:57,  7.25s/it]

Processing row 9, Image: downloaded_images/image40563.jpg
  Uploading image: downloaded_images/image40563.jpg
  Image uploaded successfully: downloaded_images/image40563.jpg
  Sending prompt to Gemini for downloaded_images/image40563.jpg
  Gemini response received for downloaded_images/image40563.jpg
   JSON parsed successfully for downloaded_images/image40563.jpg


Generating QA Pairs:  59%|███████████████▉           | 10/17 [01:13<00:50,  7.18s/it]

Processing row 10, Image: downloaded_images/PPM6_622.jpg
  Uploading image: downloaded_images/PPM6_622.jpg
  Image uploaded successfully: downloaded_images/PPM6_622.jpg
  Sending prompt to Gemini for downloaded_images/PPM6_622.jpg
  Gemini response received for downloaded_images/PPM6_622.jpg
   JSON parsed successfully for downloaded_images/PPM6_622.jpg


Generating QA Pairs:  65%|█████████████████▍         | 11/17 [01:19<00:42,  7.02s/it]

Processing row 11, Image: downloaded_images/image36179.jpg
  Uploading image: downloaded_images/image36179.jpg
  Image uploaded successfully: downloaded_images/image36179.jpg
  Sending prompt to Gemini for downloaded_images/image36179.jpg
  Gemini response received for downloaded_images/image36179.jpg
   JSON parsed successfully for downloaded_images/image36179.jpg


Generating QA Pairs:  71%|███████████████████        | 12/17 [01:27<00:35,  7.15s/it]

Processing row 12, Image: downloaded_images/image3669.jpg
  Uploading image: downloaded_images/image3669.jpg
  Image uploaded successfully: downloaded_images/image3669.jpg
  Sending prompt to Gemini for downloaded_images/image3669.jpg
  Gemini response received for downloaded_images/image3669.jpg
   JSON parsed successfully for downloaded_images/image3669.jpg


Generating QA Pairs:  76%|████████████████████▋      | 13/17 [01:33<00:27,  6.84s/it]

Processing row 13, Image: downloaded_images/PPM5_1073.jpg
  Uploading image: downloaded_images/PPM5_1073.jpg
  Image uploaded successfully: downloaded_images/PPM5_1073.jpg
  Sending prompt to Gemini for downloaded_images/PPM5_1073.jpg
  Gemini response received for downloaded_images/PPM5_1073.jpg
   JSON parsed successfully for downloaded_images/PPM5_1073.jpg


Generating QA Pairs:  82%|██████████████████████▏    | 14/17 [01:40<00:20,  6.99s/it]

Processing row 14, Image: downloaded_images/PPM7_1080.jpg
  Uploading image: downloaded_images/PPM7_1080.jpg
  Image uploaded successfully: downloaded_images/PPM7_1080.jpg
  Sending prompt to Gemini for downloaded_images/PPM7_1080.jpg
  Gemini response received for downloaded_images/PPM7_1080.jpg
   JSON parsed successfully for downloaded_images/PPM7_1080.jpg


Generating QA Pairs:  88%|███████████████████████▊   | 15/17 [01:47<00:13,  6.87s/it]

Processing row 15, Image: downloaded_images/image39397.jpg
  Uploading image: downloaded_images/image39397.jpg
  Image uploaded successfully: downloaded_images/image39397.jpg
  Sending prompt to Gemini for downloaded_images/image39397.jpg
  Gemini response received for downloaded_images/image39397.jpg
   JSON parsed successfully for downloaded_images/image39397.jpg


Generating QA Pairs:  94%|█████████████████████████▍ | 16/17 [01:54<00:06,  6.97s/it]

Processing row 16, Image: downloaded_images/PPM6_1557.jpg
  Uploading image: downloaded_images/PPM6_1557.jpg
  Image uploaded successfully: downloaded_images/PPM6_1557.jpg
  Sending prompt to Gemini for downloaded_images/PPM6_1557.jpg
  Gemini response received for downloaded_images/PPM6_1557.jpg
   JSON parsed successfully for downloaded_images/PPM6_1557.jpg


Generating QA Pairs: 100%|███████████████████████████| 17/17 [02:01<00:00,  7.13s/it]

QA pairs generated and saved to artemis_qa_pairs.csv





#-----

In [None]:
# original search query, annotated
import plodlib  # Import the plodlib package for interacting with P-LOD data
import json  # Import the json package for working with JSON data
import pandas as pd  # Import the pandas package for data manipulation
from string import Template  # Import Template from the string package for templating
import rdflib as rdf  # Import the rdflib package for working with RDF data
import requests_cache # Not used
import requests  # Import the requests package for making HTTP requests

def search_images(search_term):
  """
    Searches the P-LOD triplestore for resources whose labels
    contain a specified term. Returns a list of image URLs
    and associated information for those resources.

    Args:
        search_term (str): The term to search for within labels.

    Returns:
        list: A list of dictionaries, each containing information about an image
        and its associated concept. Returns an empty list if no matching
        results are found.
    """

  # 1. Connect to the remote triplestore (database) with read-only connection
  #    The triplestore stores data as RDF triples (subject, predicate, object)
  #    This block creates a connection to the database using rdflib
  store = rdf.plugins.stores.sparqlstore.SPARQLStore(
      query_endpoint = "http://52.170.134.25:3030/plod_endpoint/query", # URL of the SPARQL endpoint
      context_aware = False, # Don't worry about contexts
      returnFormat = 'json' # We want results in JSON format
      )
  g = rdf.Graph(store)  # Create an RDF graph object using our connection

  # 2. Define a SPARQL query to retrieve data from the triplestore
  #    SPARQL is a query language for RDF data
  #    This query retrieves concepts, their labels, image URNs, and associated LUNA IDs
  qt = Template("""
      PREFIX p-lod: <urn:p-lod:id:>  # Define a prefix for P-LOD identifiers
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>  # Define a prefix for RDF Schema

      SELECT DISTINCT ?concept ?concept_label ?img_urn ?l_record ?l_media ?l_batch ?l_description
      # Select distinct concepts, concept labels, image URNs, and LUNA IDs

      WHERE { # Start the query
        ?concept rdfs:label ?concept_label . # Find concepts and their labels
        FILTER regex(?concept_label, "$search_term", "i") # Filter by the search term (case-insensitive)

        # There are two ways that a concept can be associated with an image
        { ?component p-lod:depicts ?concept .  # Find components that depict a concept
          ?component p-lod:best-image ?img_urn . # Find the best image of that component
         }
         UNION # Or
         {
           ?concept p-lod:best-image ?img_urn . # Find if the concept itself has an image
         }

        ?img_urn p-lod:x-luna-record-id ?l_record . # Find the LUNA record ID for the image
        ?img_urn p-lod:x-luna-media-id  ?l_media . # Find the LUNA media ID for the image
        ?img_urn p-lod:x-luna-batch-id  ?l_batch . # Find the LUNA batch ID for the image
        ?img_urn p-lod:x-luna-description ?l_description . # Find the LUNA description for the image
      }
  """)

  # 3. Execute the SPARQL query
  #    The query is formatted using the Template and then sent to the triplestore.
  results = g.query(qt.substitute(search_term = search_term))

  # 4. Convert the results into a Pandas DataFrame
  #    A DataFrame is like a table that is easy to work with
  results_df = pd.DataFrame(results, columns = results.json['head']['vars'])

  # 5. If no results were found return an empty list
  if len(results_df) == 0:
      return []

  # 6. Define a helper function to get additional image info from LUNA
  def add_luna_info_search(row):
      """
      Retrieves image URLs and descriptions from LUNA based on
      the image information from the triplestore.

      Args:
          row (pandas.Series): A row from the DataFrame containing
              image info from the triplestore.

      Returns:
          pandas.Series: The row with additional img_url and img_current_description added
      """

      img_src = None #default if no URLs present (probably means LUNA doesn't have image though triplestore thinks it does)
      img_description = None #default if no image description
      tilde_val = plodlib.luna_tilde_val(row['img_urn']) # Get the tilde value from the image URN using plodlib

      # 7. Make a request to the LUNA API to fetch image data
      #    The URL is generated dynamically using the information retrieved from triplestore and plodlib
      luna_json = json.loads(requests.get(f'https://umassamherst.lunaimaging.com/luna/servlet/as/fetchMediaSearch?mid=umass~{tilde_val}~{tilde_val}~{row["l_record"]}~{row["l_media"]}&fullData=true').text)

      # 8. Extract relevant information from the JSON response
      if len(luna_json): # if the request returns information
          img_attributes = json.loads(luna_json[0]['attributes']) # Load all the attributes as json
          if 'image_description_english' in img_attributes.keys(): # If an english description is available, use it
              img_description = img_attributes['image_description_english']
          else: # if no english description is available, try other methods based on collection type
            try:
                if   tilde_val == '14': # collection 14 has this pattern
                  img_description = json.loads(luna_json[0]['fieldValues'])[2]['value']
                elif tilde_val == '16': # collection 16 has this pattern
                  img_description = json.loads(luna_json[0]['fieldValues'])[1]['value']
                else: # if the collection type is unknown, return this
                  img_description = f"unrecognized collection {tilde_val}"
            except:
                img_description = "Trying to get description failed" # catch exceptions when getting descriptions

          if 'urlSize4' in img_attributes.keys(): # use size 4, sure, but only if there's nothing else
            img_src = img_attributes['urlSize4']
          if 'urlSize2' in img_attributes.keys(): # preferred image size
            img_src = img_attributes['urlSize2']
          elif 'urlSize3' in img_attributes.keys(): # then size 3
            img_src = img_attributes['urlSize3']
          else: # then size 1 if no others exist
            img_src = img_attributes['urlSize1']

      row['img_url'] = img_src # Add the image url to the row
      row['img_current_description'] = img_description # Add the image description to the row
      return row # Return the modified row

  # 9. Apply the add_luna_info_search function to each row of the DataFrame
  results_df = results_df.apply(add_luna_info_search, axis = 1)

  # 10. Convert the DataFrame back into a list of dictionaries
  return results_df.to_dict(orient='records')

# Example usage:
search_term = "theseus" # The concept to search
image_results = search_images(search_term) # Run the search using the search term and store the results

# 11. Print the results to the console
if image_results: # If there are results
    for item in image_results: # iterate through each result
        print(f"Concept: {item['concept_label']}") # print the concept
        print(f"Image URL: {item['img_url']}") # print the image url
        print(f"Image Description: {item['img_current_description']}") # print the description of the image
        print("---") # print a visual separator
else: # if no results
    print(f"No images found for '{search_term}'") # print a message that no results were found

Concept: Theseus
Image URL: http://umassamherst.lunaimaging.com/MediaManager/srvr?mediafile=/Size2/umass~16~16/4278/PPM9_20.jpg
Image Description: 12. Cubicle (b), wall E: the small residue of lunette-shaped plaster is an indication of the type of roofing of this room with a false ceiling vault, of which no other trace remains. Perhaps in the lunette there was \"the painted decoration of some weapon, vase and mask\" noted by Kekulé (cited in the bibl., p. 168). The decoration of the median area featured the central tympanum aedicula within which was the picture of Theseus who is about to board the ship, leaving Ariadne who is beyond the stern of the boat on the right; the presence of a spearhead, another element recognizable at the time, refers to Athena assisting the hero. The panels were red, surmounted by a purple frieze with scenes from the childhood of Dionysus and the young Dionysus and the Nymphs; in the upper area we still seem to glimpse the traces of a caryatid, with its left

In [None]:
import pandas as pd
from typing import List, Dict

def export_image_results_to_csv(image_results: List[Dict], filename: str):
    """
    Converts a list of image result dictionaries into a Pandas DataFrame
    and writes it to a CSV file.

    Args:
        image_results (List[Dict]): A list of dictionaries, where each dictionary
            contains information about an image and its associated concept.
        filename (str): The name of the CSV file to be created.
    """
    if not image_results:
        print("No image results to export.")
        return

    try:
        # 1. Convert the list of dictionaries into a Pandas DataFrame
        df = pd.DataFrame(image_results)

        # 2. Write the DataFrame to a CSV file
        df.to_csv(filename, index=False, encoding='utf-8') #index = False prevents the row index from being written.
        print(f"Successfully exported image results to '{filename}'")

    except Exception as e:
        print(f"Error exporting image results to CSV: {e}")


export_image_results_to_csv(image_results, "image_results.csv")

## doing something new now

something new, downloading by a list of arc identifiers

In [57]:
import requests
import csv
import json
import os
import re

# API base URL
API_BASE_URL = "https://api.p-lod.org/"

# Output CSV file
OUTPUT_CSV_FILE = "plod_data_enriched.csv"

# Input CSV file - change this to your existing CSV file
INPUT_CSV_FILE = "PPP_ALL_Region_1_test.csv"  # <-- REPLACE THIS WITH YOUR INPUT CSV FILE

def flatten_json(json_obj, key_prefix="", sep="."):
    items = []
    for k, v in json_obj.items():
        new_key = key_prefix + sep + k if key_prefix else k
        if isinstance(v, dict):
            items.extend(flatten_json(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for i, item in enumerate(v):
              if isinstance(item, dict):
                items.extend(flatten_json(item, f"{new_key}.{i}", sep=sep).items())
              else:
                 items.append((f"{new_key}.{i}", item))
        else:
            items.append((new_key, v))
    return dict(items)


def get_image_data(arc_identifier):
    """
    Retrieves image data from the /images/{identifier} endpoint.

    Args:
        arc_identifier: The ARC identifier (e.g., 'ARC_026824').

    Returns:
        The JSON response as a dictionary, or None if there is an error.
    """
    image_data = None
    try:
        api_image_url = f"{API_BASE_URL}images/{arc_identifier}"
        api_image_response = requests.get(api_image_url)
        api_image_response.raise_for_status()
        image_data = api_image_response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image data for {arc_identifier}: {e}")
    return image_data

def get_concepts_data(arc_identifier):
    """
    Retrieves concept data from the /depicts-concepts/{identifier} endpoint.

    Args:
        arc_identifier: The ARC identifier (e.g., 'ARC_026824').

    Returns:
        A list of concepts (strings), or None if there is an error.
    """
    concepts_data = None
    try:
        api_concepts_url = f"{API_BASE_URL}depicts-concepts/{arc_identifier}"
        api_concepts_response = requests.get(api_concepts_url)
        api_concepts_response.raise_for_status()
        concepts_data = api_concepts_response.json()
    except requests.exceptions.RequestException as e:
       print(f"Error fetching concept data for {arc_identifier}: {e}")
    
    if concepts_data: # Extract concept URNs after 'urn:p-lod:id:'
        concepts = [concept['urn'].replace("urn:p-lod:id:", "") for concept in concepts_data if isinstance(concept, dict) and 'urn' in concept]
        return concepts
    else:
       return None

def get_geojson_data(arc_identifier):
    """
    Retrieves GeoJSON data from the /geojson/{identifier} endpoint.

    Args:
        arc_identifier: The ARC identifier (e.g., 'ARC_026824').

    Returns:
        The GeoJSON response as a dictionary, or None if there is an error.
    """
    geojson_data = None
    try:
        api_geojson_url = f"{API_BASE_URL}geojson/{arc_identifier}"
        api_geojson_response = requests.get(api_geojson_url)
        api_geojson_response.raise_for_status()
        geojson_data = api_geojson_response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GeoJSON data for {arc_identifier}: {e}")
    return geojson_data


def write_enriched_data_to_csv(all_data, csv_file):
    """
    Flattens JSON data and writes it to a CSV file, including concept and geojson information.

    Args:
        all_data: A list of dictionaries containing the original row data, image, concept, and geojson data.
        csv_file: The path to the CSV file.
    """
    with open(csv_file, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = None  # Initialize writer

        if all_data:
            for data in all_data:
                original_row = data.get('original_row') # unpack original csv row
                image_data = data.get('image_data') # unpacking data dictionary
                concepts = data.get('concepts') # unpacking data dictionary
                geojson = data.get('geojson')
                if image_data:
                    if isinstance(image_data, list):
                        for item in image_data:
                            flattened_data = flatten_json(item)
                            flattened_data['concepts'] = json.dumps(concepts) #Adding concept
                            flattened_data['geojson'] = json.dumps(geojson)  # Add geojson
                            merged_data = {**original_row, **flattened_data} # merge the original row, with the new data
                            if not writer:
                                fieldnames = list(merged_data.keys())
                                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                                writer.writeheader()
                            writer.writerow(merged_data)
                    elif isinstance(image_data,dict):
                         flattened_data = flatten_json(image_data)
                         flattened_data['concepts'] = json.dumps(concepts)
                         flattened_data['geojson'] = json.dumps(geojson) # Add geojson
                         merged_data = {**original_row, **flattened_data} # merge the original row, with the new data
                         if not writer:
                             fieldnames = list(merged_data.keys())
                             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                             writer.writeheader()
                         writer.writerow(merged_data)


if __name__ == '__main__':
    # Read the existing csv file, and extract the ARC ids
    all_data = []
    with open(INPUT_CSV_FILE, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
           arc_id = row.get("ARCs") # change this to the name of the column with ARC id
           if arc_id:
              image_data = get_image_data(arc_id)
              concepts_data = get_concepts_data(arc_id)
              geojson_data = get_geojson_data(arc_id) # Get geojson data
              all_data.append({'original_row':row, 'image_data':image_data, 'concepts': concepts_data, 'geojson':geojson_data}) # add geojson to all_data
           else:
               print(f"Skipping row, no ARC id: {row}")
    write_enriched_data_to_csv(all_data, OUTPUT_CSV_FILE)
    print(f"All data processed. Output CSV file: {OUTPUT_CSV_FILE}")

All data processed. Output CSV file: plod_data_enriched.csv


In [56]:
def download_images_from_csv(csv_file, image_dir):
    """Downloads images from URLs found in a CSV file."""
    with open(csv_file, mode='r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            for key, value in row.items():
                if key.startswith("l_img_url") and value: # If the field starts with `l_img_url` then it is an image.
                    download_image(value, image_dir)

def download_image(image_url, image_dir):
    """Downloads an image from a URL and saves it to the specified directory."""
    if not image_url:
        print("Skipping empty image url")
        return None

    try:
        response = requests.get(image_url)
        response.raise_for_status()  # Raise an exception for bad status codes

        filename = os.path.basename(image_url)  # get filename from URL

        image_path = os.path.join(image_dir, filename)
        with open(image_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded image to: {image_path}")
        return image_path


    except requests.exceptions.RequestException as e:
        print(f"Error downloading image from {image_url}: {e}")
        return None

download_images_from_csv('plod_data_enriched.csv', 'plod_images')

Downloaded image to: plod_images/PPM1_2.jpg
Downloaded image to: plod_images/image13871.jpg
Downloaded image to: plod_images/image32566.jpg
Downloaded image to: plod_images/PPM1_11.jpg
Downloaded image to: plod_images/image32568.jpg
Downloaded image to: plod_images/image13871.jpg
Downloaded image to: plod_images/image13878.jpg
Downloaded image to: plod_images/image32550.jpg
Downloaded image to: plod_images/image32559.jpg
Downloaded image to: plod_images/image13875.jpg
Downloaded image to: plod_images/image13869.jpg
Downloaded image to: plod_images/PPM1_16.jpg
Downloaded image to: plod_images/PPM1_20.jpg
Downloaded image to: plod_images/PPM1_14.jpg
Downloaded image to: plod_images/PPM1_15.jpg
