### THIS PROCESS WORKS WELL FOR MMD FIGURES, OR OTHER FIGURES WHICH HAVE MANY REFERENCES THROUGHOUT THE FILE

In [None]:
!pip install boto3
!pip install openai
!pip install chromadb
!pip install langchain
!pip install langchain-community
!pip install tiktoken
!pip install pytesseract pdf2image
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.

# Part 1: Reading in images, putting metadata into DF

Let's read in our image metadata from our s3 bucket, and put it into a dataframe.

In [None]:
import boto3
import re
import pandas as pd
from google.colab import userdata

def get_objects(bucket_name = 'ccber-tester-bucket', folder_name = 'MMD-Figures/', region_name = 'us-east-1'):
  """
  Lists the figure images stored under a prefix of an S3 bucket and returns their metadata.

  Args:
      bucket_name (str): Name of the S3 bucket to list.
      folder_name (str): Key prefix ("folder") that holds the figure images.
      region_name (str): AWS region of the bucket; used for the client and for the image URLs.

  Returns:
      pd.DataFrame: One row per supported image with the columns 'Figure' (int parsed from
      the `img_<N>` part of the key), 'Image Key' and 'Image URL'. The columns are present
      even when no image is found.
  """
  AWS_SERVER_PUBLIC_KEY = userdata.get('AWS_SERVER_PUBLIC_KEY')
  AWS_SERVER_SECRET_KEY = userdata.get('AWS_SERVER_SECRET_KEY')

  s3 = boto3.client('s3',
                    region_name=region_name,
                    aws_access_key_id=AWS_SERVER_PUBLIC_KEY,
                    aws_secret_access_key=AWS_SERVER_SECRET_KEY)

  supported_extensions = ('.png', '.jpg', '.jpeg')
  rows = []

  # A single list_objects_v2 call returns at most 1000 keys, so walk every page.
  paginator = s3.get_paginator('list_objects_v2')
  for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name):
    # 'Contents' is absent from the response when the prefix matches nothing.
    for obj in page.get('Contents', []):
      img_key = obj['Key']

      if img_key.endswith('/'):
        continue  # the prefix's own placeholder key (e.g. 'MMD-Figures/'), not a file

      if not img_key.lower().endswith(supported_extensions):
        file_type = img_key.rsplit('.', 1)[-1] if '.' in img_key else ''
        print(f'File type {file_type} not supported. Skipping {img_key}')
        continue

      number_match = re.search(r'img_(\d+)', img_key)
      if number_match is None:
        # Without an img_<N> marker the image cannot be tied to a figure number.
        print(f'No figure number (img_<N>) found in key. Skipping {img_key}')
        continue

      rows.append({
          'Figure': int(number_match.group(1)),
          'Image Key': img_key,
          'Image URL': f'https://{bucket_name}.s3.{region_name}.amazonaws.com/{img_key}'
      })

  return pd.DataFrame(rows, columns=['Figure', 'Image Key', 'Image URL'])

# Build the image-metadata table from the default bucket/prefix and render it.
df_images = get_objects()
display(df_images)


File type es/ not supported. Skipping MMD-Figures/


Unnamed: 0,Figure,Image Key,Image URL
0,2,MMD-Figures/page_15_img_2.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
1,3,MMD-Figures/page_15_img_3.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
2,4,MMD-Figures/page_16_img_4.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
3,5,MMD-Figures/page_16_img_5.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
4,6,MMD-Figures/page_16_img_6.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
5,7,MMD-Figures/page_16_img_7.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
6,10,MMD-Figures/page_17_img_10.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
7,8,MMD-Figures/page_17_img_8.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
8,9,MMD-Figures/page_17_img_9.png,https://ccber-tester-bucket.s3.us-east-1.amazo...
9,240,MMD-Figures/pg_59_img_240.png,https://ccber-tester-bucket.s3.us-east-1.amazo...


# Part 2: Reading in text and standardizing the references to figures

Because they need to be parsed separately, we have put the "keys" section and the "main text" section into two different PDFs in our S3 bucket. We have also blocked out the images so that they do not interfere with text extraction.

In [None]:
import boto3
import io
import pdf2image
import pytesseract
from google.colab import userdata

def extract_text_from_s3_pdf(object_key, bucket_name = "ccber-tester-bucket", aws_region: str = "us-east-1") -> str:
    """
    Downloads a PDF from an S3 bucket, converts its pages to images, and extracts text using OCR.

    The extracted text is also written to `<object_key without extension>_extracted_text.txt`
    (relative to the working directory) so later cells can reload it.

    Args:
        object_key (str): Key (path) to the PDF file in the S3 bucket.
        bucket_name (str, optional): Name of the S3 bucket. Defaults to "ccber-tester-bucket".
        aws_region (str, optional): AWS region where the bucket is located. Defaults to "us-east-1".

    Returns:
        str: Extracted text from the PDF, pages joined by newlines and stripped.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Initialize S3 client
    AWS_SERVER_PUBLIC_KEY = userdata.get('AWS_SERVER_PUBLIC_KEY')
    AWS_SERVER_SECRET_KEY = userdata.get('AWS_SERVER_SECRET_KEY')

    s3 = boto3.client('s3',
                      region_name=aws_region,
                      aws_access_key_id=AWS_SERVER_PUBLIC_KEY,
                      aws_secret_access_key=AWS_SERVER_SECRET_KEY)

    # Download the PDF file into memory
    response = s3.get_object(Bucket=bucket_name, Key=object_key)
    pdf_bytes = response["Body"].read()

    # Convert PDF pages to images
    images = pdf2image.convert_from_bytes(pdf_bytes)

    # Extract text using Tesseract OCR, one page at a time
    page_texts = []
    for page_num, image in enumerate(images):
        # --psm 6 tells Tesseract to treat the page as a single uniform block of text
        page_texts.append(pytesseract.image_to_string(image, config='--psm 6'))
        print(f"Extracted text from page {page_num} of {object_key}")

    extracted_text = "\n".join(page_texts).strip()

    # Strip the real extension (not a fixed 4 characters) and make sure the target
    # directory exists when the key contains a folder prefix.
    output_path = f"{os.path.splitext(object_key)[0]}_extracted_text.txt"
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    return extracted_text


Next, we will need a function to standardize all figure references

In [None]:
import re

def expand_figure_ranges(match):
    """
    Expands a figure range into individual (Fig. X) references.

    Args:
        match (re.Match): Match whose groups 1 and 2 hold the first and last figure number.

    Returns:
        str: Space-separated "(Fig. N)" references for every number in the range. When the
        range is reversed (last < first, e.g. an OCR misread) only the two endpoints are
        emitted, so the reference is not silently deleted from the text.
    """
    start, end = int(match.group(1)), int(match.group(2))
    if end < start:
        # range(start, end + 1) would be empty and re.sub would drop the whole reference.
        return f"(Fig. {start}) (Fig. {end})"
    return " ".join(f"(Fig. {i})" for i in range(start, end + 1))

def expand_figure_list(match):
    """
    Turns an enumerated figure list into separate (Fig. X) references.

    Group 1 of `match` holds the numbers separated by "and" or commas
    (e.g. "469 and 470" or "1, 3 and 9"); each number becomes its own reference.
    """
    separator = re.compile(r'\s*(?:and|,)\s*')
    references = []
    for number in separator.split(match.group(1)):
        references.append(f"(Fig. {number})")
    return " ".join(references)

def standardize_figures(object_key, text):
    """
    Rewrites every figure reference in `text` to the canonical form "(Fig. N)".

    - Expands ranges: Figs. 10-12 -> (Fig. 10) (Fig. 11) (Fig. 12)
    - Expands lists: Figs. 469 and 470 / Figs. 6,7 -> (Fig. 469) (Fig. 470) / (Fig. 6) (Fig. 7)
    - Normalizes single references: Figs 4 / Fig.1 -> (Fig. 4) / (Fig. 1)
    - Figure numbers are limited to 1-3 digits, so a trailing OCR artefact such as a `]`
      read as `1` ("4281") is left outside the reference instead of becoming part of it.
    - Square brackets around a reference are left untouched.

    The result is also written to `<object_key without extension>_standardized_text.txt`
    so it can be edited by hand and reloaded by later cells.

    Args:
        object_key (str): Key of the source PDF; only used to name the output file.
        text (str): Text to standardize.

    Returns:
        str: The standardized text (identical to what is written to the file).
    """
    import os  # local import so this cell does not depend on another cell's imports

    # Expand figure ranges (each bound is 1-3 digits)
    text = re.sub(r'\(?Figs?\.?\s*(\d{1,3})\s*-\s*(\d{1,3})\)?', expand_figure_ranges, text)

    # Expand figure lists separated by "and" / ","
    text = re.sub(r'\(?Figs?\.?\s*((?:\d{1,3}\s*(?:and|,)\s*)+\d{1,3})\)?', expand_figure_list, text)

    # Standardize individual references; the optional parentheses are consumed so that
    # an already-parenthesized reference is not wrapped twice
    text = re.sub(r'\(?Figs?\.?\s*(\d{1,3})\)?', r'(Fig. \1)', text)

    # Catch any remaining bare "Fig.N" that is not already parenthesized
    standardized_text = re.sub(r'(?<!\()Fig\.\s*(\d{1,3})(?!\))', r'(Fig. \1)', text)

    # Save the *final* text (the same value that is returned) for manual editing
    output_path = f"{os.path.splitext(object_key)[0]}_standardized_text.txt"
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(standardized_text)

    return standardized_text

# Example usage: a small smoke test covering ranges, lists, unspaced references and
# already-parenthesized references. Note that standardize_figures also writes
# 'test_object_key_standardized_text.txt' to the working directory.
test_obj_key = 'test_object_key.pdf'
test_text = """
[Figs. 424 - 428] should not turn into [Figs. 424 - 4281].
Another example: [Figs. 50 and 512] should only expand Fig. 50. [Figs. 510-515]
Also what about if we deal withFig.12 as well as Figs 1 and 4 or Figs 1, 3 and 9 and finally Figs. 1,6 or Figs.1 , 6
also what about (Fig. 1) or ((Fig. 1))
"""
test_standardized_text = standardize_figures(test_obj_key, test_text)
print(test_standardized_text)


[(Fig. 424) (Fig. 425) (Fig. 426) (Fig. 427) (Fig. 428)] should not turn into [(Fig. 424) (Fig. 425) (Fig. 426) (Fig. 427) (Fig. 428)1].
Another example: [(Fig. 50) (Fig. 512)] should only expand (Fig. 50). [(Fig. 510) (Fig. 511) (Fig. 512) (Fig. 513) (Fig. 514) (Fig. 515)]
Also what about if we deal with(Fig. 12) as well as (Fig. 1) (Fig. 4) or (Fig. 1) (Fig. 3) (Fig. 9) and finally (Fig. 1) (Fig. 6) or (Fig. 1) (Fig. 6)
also what about (Fig. 1) or ((Fig. 1))



In [None]:
# Run the OCR + standardization pipeline on both PDFs.
# Each call writes '<key>_extracted_text.txt' / '<key>_standardized_text.txt' to disk;
# later cells reload those files, so the in-memory `real_text` / `standardized_text`
# (overwritten by the second PDF) are not what downstream cells rely on.
object_key_1 = "MMD-Main-Text.pdf"
real_text = extract_text_from_s3_pdf(object_key_1)
standardized_text = standardize_figures(object_key_1, real_text)

object_key_2 =  "MMD-Keys.pdf"
real_text = extract_text_from_s3_pdf(object_key_2)
standardized_text = standardize_figures(object_key_2, real_text)


Extracted text from page 0 of MMD-Main-Text.pdf
Extracted text from page 1 of MMD-Main-Text.pdf
Extracted text from page 2 of MMD-Main-Text.pdf
Extracted text from page 3 of MMD-Main-Text.pdf
Extracted text from page 4 of MMD-Main-Text.pdf
Extracted text from page 5 of MMD-Main-Text.pdf
Extracted text from page 6 of MMD-Main-Text.pdf
Extracted text from page 7 of MMD-Main-Text.pdf
Extracted text from page 8 of MMD-Main-Text.pdf
Extracted text from page 9 of MMD-Main-Text.pdf
Extracted text from page 10 of MMD-Main-Text.pdf
Extracted text from page 11 of MMD-Main-Text.pdf
Extracted text from page 12 of MMD-Main-Text.pdf
Extracted text from page 13 of MMD-Main-Text.pdf
Extracted text from page 14 of MMD-Main-Text.pdf
Extracted text from page 15 of MMD-Main-Text.pdf
Extracted text from page 16 of MMD-Main-Text.pdf
Extracted text from page 17 of MMD-Main-Text.pdf
Extracted text from page 18 of MMD-Main-Text.pdf
Extracted text from page 19 of MMD-Main-Text.pdf
Extracted text from page 20 of

#TODO: ADD CLEANING FUNCTION

# Part 3: Extracting figure references from the text, putting them into dataframe

This script grabs the text surrounding each mention of a figure. This context is what we will pass into the LLM to create an informative description of the image.

In [None]:
import re
import pandas as pd
from IPython.display import display

def extract_figures_from_text(main_text, keys_text, context_size=100, skip_forward=50):
    """
    Extracts mentions of figures in the format (Fig. X) from text using regex and returns a DataFrame.

    Main text: each mention contributes the `context_size` characters before and after it.

    Keys text: each mention contributes the text from the closest preceding stop point up to
    the end of the mention. A stop point is the end of a section break (`);` or `];` followed
    by whitespace) or the end of the previous figure reference. References that directly
    follow one another (only punctuation/whitespace in between, e.g. "[(Fig. 69), (Fig. 70)]")
    share the same starting point, so each of them receives the describing sentence. If no
    stop point exists, the context starts 100 characters before the mention.

    Args:
    - main_text (str): The long text from the main text section of the pdf.
    - keys_text (str): The long text from the keys section of the pdf.
    - context_size (int): Number of characters to include as context before and after a
      main-text match.
    - skip_forward (int): A mention is skipped when it starts within this many characters
      after the end of the previous recorded mention of the *same* figure in the same text.
      The skip is tracked per figure so that neighbouring references to different figures
      (such as an expanded range "(Fig. 2) (Fig. 3) (Fig. 4)") are all kept.

    Returns:
    - DataFrame with the columns 'Figure' (int) and 'Context' (str); the columns exist even
      when nothing is found.
    """
    figure_pattern = re.compile(r'\(Fig\. (\d+)\)')
    section_break_pattern = re.compile(r'[)\]];\s')
    fallback_lookback = 100  # characters of context when no stop point precedes a key mention

    data = []

    # ---- Main text: fixed-size window around every mention ----
    last_end_by_figure = {}
    for match in figure_pattern.finditer(main_text):
        start, end = match.span()
        figure = int(match.group(1))

        previous_end = last_end_by_figure.get(figure)
        if previous_end is not None and start < previous_end + skip_forward:
            continue  # repeat mention of the same figure inside the skip window

        # Slicing clamps at the end of the string, only the lower bound needs a guard.
        context = main_text[max(0, start - context_size): end + context_size]
        data.append({"Figure": figure, "Context": context})
        last_end_by_figure[figure] = end

    # ---- Keys text: context runs from the closest preceding stop point ----
    # Section breaks are located once on the forward text; because figure matches are
    # visited in increasing order, a single cursor is enough to track the latest break.
    break_ends = [m.end() for m in section_break_pattern.finditer(keys_text)]
    break_cursor = 0
    last_break_end = None
    previous_ref_end = None  # end of the previous figure reference (recorded or skipped)
    previous_stop = 0        # stop point that was used for the previous figure reference
    last_end_by_figure = {}

    for match in figure_pattern.finditer(keys_text):
        start, end = match.span()
        figure = int(match.group(1))

        while break_cursor < len(break_ends) and break_ends[break_cursor] <= start:
            last_break_end = break_ends[break_cursor]
            break_cursor += 1

        candidates = []
        if last_break_end is not None:
            candidates.append(last_break_end)
        if previous_ref_end is not None:
            gap = keys_text[previous_ref_end:start]
            if any(ch.isalnum() for ch in gap):
                candidates.append(previous_ref_end)
            else:
                # Adjacent references describe the same statement: reuse its start.
                candidates.append(previous_stop)

        # The largest candidate is the stop point closest to this mention.
        stop_index = max(candidates) if candidates else max(0, start - fallback_lookback)

        previous_ref_end = end
        previous_stop = stop_index

        previous_end = last_end_by_figure.get(figure)
        if previous_end is not None and start < previous_end + skip_forward:
            continue  # repeat mention of the same figure inside the skip window

        data.append({"Figure": figure, "Context": keys_text[stop_index:end]})
        last_end_by_figure[figure] = end

    # Explicit columns keep the schema stable when no figure is mentioned at all.
    return pd.DataFrame(data, columns=["Figure", "Context"])

# #Tester:
# tester_main_text = "distinct scopa will distinguish females from males in many ~ groups (excluding brood parasites, bees that carry pollen oS internally, and queens of advanced eusocial species). Finally, females have stings, and males have male genitalia [(Fig. 20)]; ~ but both are commonly retracted, and in some females the - sting is rudimentary. Other features may need to be examined ww to verify the sex of an individual, including the number of antennal segments and the number of visible metasomal ~ segments. Males have 13 antennal segments (12 in Neopasites ~ nd Holcopasites); females have 12 [(Fig. 10)]. In addition, ~ males and females typically differ in the number of visible ws metasomal segments."
# tester_keys_text = "10(9). Hind tibia of both sexes not over 1.5 times as broad as femur [(Fig. 72)], that of female without ~ COPDICUIA Leese seeeesteteseseescsesessscsessesecacaescseessacssnensasacsnensansussesssssssesessssscststessasscetsessessstseseeee LL ~ — Hind tibia of both sexes over twice as broad as femur [(Fig. 69), (Fig. 70)], that of female with vv COL DICU a sescssssexcssuscsnenssxsnsowenvosensvorapesenensnxaneseavsvsctcnscassswaveavsavbutvensa S70Ti sa ctthSvi east chedencesdadesosessereveseve LO w 11(10). Scutellum with a tubercle on each side [(Fig. 71)]; hind femur usually denticulate beneath ~ [(Fig. 72)]; metasomal terga and sterna without longitudinal median carina (rare)........:.csssscsseeeeseees - soeeeceseseseeseeesesesesessseesesesevsesesesesecsesessesesesscsesecasseesesssessessessssssssssssssssssesstsssssssrasseeresesee LXGerere [158] us — Scutellum flat, not tuberculate; hind femur not denticulate; metasomal terga (especially 3 to 6) ~ and sterna 2 to 4 (also 5 in female) with longitudinal median carina (very rare) ........c.scscesseessseeeeeees ~ sauansaassnTnessstanssesasneisnsestbseeesesssenerssucousesosuenessasasescasonsnessssuesesestsonsersonssensnetreneeersveessseseanenenenaf 2108 [154] ws 12(10). Labrum, mandible, and lower lateral portion of clypeus whitish; body usually brilliantly ~ metallic; posterior tibia of male with hairy groove not reaching rounded apex of tibia [(Fig. 70)]; ~ middle tibia of male with one to three minute velvety patches at the proximal end of the large patch ~ [(Fig. 73)]"
# # Now lets axtually use it
# df_tester_context = extract_figures_from_text(tester_main_text, tester_keys_text)
# display(df_tester_context)


# Now for real: reload the standardized text files written by standardize_figures
# (so any manual edits made to those files are picked up) and extract the figure contexts.
with open(f"MMD-Main-Text_standardized_text.txt", "r", encoding="utf-8") as f:
  main_text = f.read()

with open(f"MMD-Keys_standardized_text.txt", "r", encoding="utf-8") as f:
  keys_text = f.read()

df_context = extract_figures_from_text(main_text, keys_text)
display(df_context)

Unnamed: 0,Figure,Context
0,13,rms. In Apocrita we refer to the\n“mesosoma” (...
1,19,rth. The same is true for sterna: the first ~\...
2,20,queens of advanced eusocial species). Finally...
3,10,~\nsegments. Males have 13 antennal segments (...
4,19,an usually count seven\nvisible metasomal terg...
...,...,...
603,415,158)]; one subantennal suture [(Fig. 157) (Fig...
604,192,rite near base of galea but hidden between exp...
605,111,ent [(Fig. 192)]; first flagellar segment as l...
606,412,"ed from rest of maxilla, represented by small ..."


### Combines all rows by Fig Number, concatenating context

In [None]:
# Collapse all context snippets of a figure into a single row, separated by a marker,
# and order the rows by figure number (sorted again later; done here for readability).
df_context = (
    df_context
    .groupby('Figure', as_index=False)
    .agg({'Context': ' -- NEXT CONTEXT -- '.join})
    .sort_values(by='Figure')
)

# Show the grouped, sorted DataFrame
display(df_context)


Unnamed: 0,Figure,Context
0,2,isible terga and sterna ~\ncan be reduced to a...
1,3,rga and sterna ~\ncan be reduced to as few as ...
2,4,terna ~\ncan be reduced to as few as four visi...
3,5,deep groove on the\n~ underside of the head i...
4,6,ove on the\n~ underside of the head into which...
...,...,...
394,515,visions: Moure (1963); Dressler (1979). (coupl...
395,516,31. Melipona Illiger: Smallish to moderate-si...
396,517,to Genera of Y\nMeliponini) wy\n35. Oxytrigon...
397,518,"chwarz: Moderate-sized (4.5-7 mm), black or te..."


# Part 4: Mapping figure context to the corresponding image metadata

In [None]:
def map_context_to_images(df_images, df_context,):
  """
  Joins image metadata with figure contexts on the 'Figure' column.

  Only figures present in both frames are kept (inner join); the result is
  ordered by ascending figure number.
  """
  matched = df_images.merge(df_context, on='Figure', how='inner')
  return matched.sort_values(by='Figure')

# Keep only the figures for which we have both an image and extracted context.
df_combined = map_context_to_images(df_images, df_context)
display(df_combined)

Unnamed: 0,Figure,Image Key,Image URL,Context
0,2,MMD-Figures/page_15_img_2.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,isible terga and sterna ~\ncan be reduced to a...
1,3,MMD-Figures/page_15_img_3.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,rga and sterna ~\ncan be reduced to as few as ...
2,4,MMD-Figures/page_16_img_4.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,terna ~\ncan be reduced to as few as four visi...
3,5,MMD-Figures/page_16_img_5.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,deep groove on the\n~ underside of the head i...
4,6,MMD-Figures/page_16_img_6.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,ove on the\n~ underside of the head into which...
5,7,MMD-Figures/page_16_img_7.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,e\n~ underside of the head into which the prob...
7,8,MMD-Figures/page_17_img_8.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,side of the head into which the proboscis\n~ f...
8,9,MMD-Figures/page_17_img_9.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,he head into which the proboscis\n~ folds [(Fi...
6,10,MMD-Figures/page_17_img_10.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,~\nsegments. Males have 13 antennal segments (...
9,240,MMD-Figures/pg_59_img_240.png,https://ccber-tester-bucket.s3.us-east-1.amazo...,esesseseneeseeeeseessseeseseeaeeaseneeees JA ~...


# Part 5: Generating descriptions of all of our images

Now that we have all of the text that mentions our figures, we will write an OpenAI API call that takes in this text and generates a description of each figure. Then, we will apply the function to our dataframe.

In [None]:
import pandas as pd
import openai
from openai import OpenAI
from google.colab import userdata
import openai

#Function to take in OpenAI client, figure number, and figure context, and generate a description
def generate_figure_description(client, figure_num, context, model="gpt-4-turbo", temperature=0.3):
    """
    Calls OpenAI's API to generate a figure description optimized for embedding retrieval.

    Args:
    - client: OpenAI client; only `client.chat.completions.create(...)` is used.
    - figure_num (int or str): Bare figure number (e.g. 5); it is interpolated into the
      prompt as "Fig. {figure_num}".
    - context (str): Context related to the figure, as well as some surrounding figures (e.g., labels, features, or extracted text).
    - model (str): Chat model to use. Defaults to "gpt-4-turbo".
    - temperature (float): Sampling temperature. Defaults to 0.3 to keep responses precise.

    Returns:
    - str: Generated description (the content of the first completion choice).
    """
    prompt = f"""
    Generate a **figure description** of Fig. {figure_num} optimized for embedding retrieval.
    - Summarize the **main topic of the figure** in the first sentence.
    - Ignore information about any figures that are not Fig. {figure_num}
    - Clearly describe the **key labeled components** and their significance.
    - Avoid excessive detail that is obvious from viewing the image.
    - Keep it **concise (100-200 words) and semantically rich** for retrieval.
    - Ensure it's **self-contained** (no reliance on external text).

    Context: {context}
    """
    #TODO: CONSIDER FEW SHOT LEARNING
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert at generating precise and informative descriptions of scientific and technical figures."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    print(f"Created a description for Fig. {figure_num}")
    return response.choices[0].message.content

#Apply the API call to every row, add descriptions in new column
def create_descriptions(df, client):
    """
    Generates a description for every row and returns them in a 'Generated Description' column.

    One `generate_figure_description` call is made per row, using the row's 'Figure' and
    'Context' values. The input frame is not modified; a copy with the extra column is
    returned, so re-running the cell always starts from the same input.

    Args:
    - df (pd.DataFrame): Frame with 'Figure' and 'Context' columns.
    - client: OpenAI client forwarded to `generate_figure_description`.

    Returns:
    - pd.DataFrame: Copy of `df` with the added 'Generated Description' column.
    """
    described = df.copy()
    # A plain list also works for an empty frame, where a row-wise apply would not
    # produce a column-shaped result.
    described['Generated Description'] = [
        generate_figure_description(client, figure, context)
        for figure, context in zip(described['Figure'], described['Context'])
    ]
    return described

Now we will use the apply function to generate descriptions for all of our figures.

In [None]:
# Read the OpenAI key from the Colab secrets, generate a description for every figure
# in df_combined, and persist the result to output.csv.
# OPENAI_API_KEY stays defined at notebook level; the embedding cells below read it.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
df_final = create_descriptions(df_combined, client = OpenAI(api_key=OPENAI_API_KEY))
df_final.to_csv("output.csv", index=False)


Created a description for Fig. 2
Created a description for Fig. 3
Created a description for Fig. 4
Created a description for Fig. 5
Created a description for Fig. 6
Created a description for Fig. 7
Created a description for Fig. 8
Created a description for Fig. 9
Created a description for Fig. 10
Created a description for Fig. 240


# Part 6: Embedding / Retrieving our image data

This script takes in our final dataframe and embeds all of our image descriptions with the proper metadata

In [None]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings  # Assuming OpenAI for embeddings

def get_and_store_embeddings(df, chroma_client, collection_name):
    """
    Generates embeddings for the 'Generated Description' column in df and stores them in ChromaDB
    with metadata including Figure Number, Image Key, and Image URL.

    Entries are written with `upsert`, keyed by 'Image Key', so re-running the cell updates
    existing entries instead of colliding with the IDs stored by a previous run.

    Args:
        df (pd.DataFrame): Frame with 'Generated Description', 'Figure', 'Image Key' and
            'Image URL' columns.
        chroma_client: ChromaDB client used to get or create the collection.
        collection_name (str): Name of the target collection.

    Note:
        Reads the notebook-level `OPENAI_API_KEY` for the embedding model.
    """
    collection = chroma_client.get_or_create_collection(name=collection_name)

    if df.empty:
        # Nothing to embed; avoid sending empty batches to the embedding API / ChromaDB.
        print("No descriptions to store in ChromaDB.")
        return

    # Define OpenAI Embedding Model (or replace with another embedding model)
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

    # Prepare data for ChromaDB
    image_descriptions = df["Generated Description"].tolist()  # List of descriptions
    figure_ids = df["Image Key"].tolist()  # Unique IDs: the S3 key of each image

    # Metadata for each row; the figure number is coerced to a plain int
    image_sources = [
        {"Figure": int(figure), "Image Key": image_key, "Image URL": image_url}
        for figure, image_key, image_url in zip(df["Figure"], df["Image Key"], df["Image URL"])
    ]

    # Compute embeddings
    embeddings = embedding_model.embed_documents(image_descriptions)  # List of vector embeddings

    # Store in ChromaDB (insert new IDs, overwrite existing ones)
    collection.upsert(
        documents=image_descriptions,
        embeddings=embeddings,
        metadatas=image_sources,
        ids=figure_ids
    )

    print(f"✅ Successfully stored {len(image_descriptions)} descriptions in ChromaDB.")


In [None]:
# Create a Chroma client that persists its data under ./chroma_db
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Store data persistently
collection_name = 'neww_collection'
# Embed the generated figure descriptions and store them in the collection
get_and_store_embeddings(df_final, chroma_client, collection_name)

  embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


✅ Successfully stored 10 descriptions in ChromaDB.


This next script allows us to send in a query and retrieve the K most relevant embeddings

In [None]:
def get_relevant_docs(question, chroma_client, collection_name, top_k = 3):
    """
    Embeds `question` and returns the `top_k` closest entries of a ChromaDB collection.

    The collection is looked up (or created if missing) by name, the question is embedded
    with OpenAIEmbeddings using the notebook-level OPENAI_API_KEY, and the raw result of
    `collection.query` is returned unchanged.
    """
    target_collection = chroma_client.get_or_create_collection(name=collection_name)

    # Embed the question with the same model family used for the stored documents
    question_vector = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY).embed_query(question)

    return target_collection.query(
        query_embeddings=question_vector,
        n_results=top_k,
    )

Now, let's call our functions above to embed our data and try retrieving some of it

In [None]:
# Reference the same client and connection as before
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Store data persistently
collection_name = 'neww_collection'

# Now, let's see if we can retrieve the right embeddings.
# The bare last expression renders the raw query result as the cell output.
question = 'What are mandibles?'  #EXAMPLE QUESTION - TEXT FOUND ON BOTTOM OF PAGE 16 OF PDF
get_relevant_docs(question, chroma_client, collection_name)

{'ids': [['MMD-Figures/page_16_img_5.png',
   'MMD-Figures/page_16_img_4.png',
   'MMD-Figures/page_16_img_6.png']],
 'embeddings': None,
 'documents': [["**Figure Description of Fig. 5: Bee Mandibular Structures**\n\nFig. 5 illustrates the detailed anatomy of bee mandibles, focusing on the specific structures used for chewing. The figure is a close-up diagram that labels various parts of the mandibles, which are critical for the bee's ability to process food. Key components labeled in the diagram include the preapical teeth, which are situated near the tips of the mandibles and play a significant role in gripping and grinding food particles. The diagram also highlights the basal part of the mandibles, which connects to the bee's head and provides the necessary leverage for mandibular movement. The labels follow the naming conventions established by Michener and Fraser (1978), ensuring consistency with scientific literature. This figure is essential for understanding the functional mor

# Part 7: Embedding our text data

Now, we know we can get the best image data, but let's embed all of our text data as well, and make sure that we get a good mix of image descriptions and general text

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text, max_chunks=200, source="MMD", chunk_size=400, chunk_overlap=30):
  """
  Splits `text` into overlapping chunks and prepares them for storage in ChromaDB.

  Args:
      text (str): Text to split.
      max_chunks (int or None): Keep at most this many chunks from the start of the text.
          Defaults to 200 (the previous hard-coded limit); pass None to keep every chunk.
      source (str): Value stored in each chunk's metadata and used as the ID prefix.
      chunk_size (int): Maximum chunk length in characters.
      chunk_overlap (int): Number of characters shared between consecutive chunks.

  Returns:
      tuple: (text_chunks, chunk_ids, chunk_sources) as three parallel lists, where
      chunk_ids look like "<source>_CHUNK_<index>" and chunk_sources are {"source": source}.
  """
  # Initialize the text chunker with custom parameters
  custom_text_splitter = RecursiveCharacterTextSplitter(
     chunk_size = chunk_size,
     chunk_overlap  = chunk_overlap,
     length_function = len
     )

  # Chunk the text
  chunks = custom_text_splitter.create_documents([text])

  if max_chunks is not None and len(chunks) > max_chunks:
    # Make the truncation visible instead of silently dropping the rest of the text.
    print(f"Keeping the first {max_chunks} of {len(chunks)} chunks.")
    chunks = chunks[:max_chunks]

  # Build the parallel lists expected by collection.add / collection.upsert
  chunk_sources = []
  text_chunks = []
  chunk_ids = []
  for idx, chunk in enumerate(chunks):
    chunk_sources.append({"source": f"{source}"})
    text_chunks.append(f"{chunk.page_content}")
    chunk_ids.append(f"{source}_CHUNK_{idx}")
  return text_chunks, chunk_ids, chunk_sources

In [None]:
def embed_text_chunks(text, chroma_client, collection_name):
    """
    Chunks `text`, embeds every chunk and stores the chunks in a ChromaDB collection.

    Args:
        text (str): Text to chunk and embed.
        chroma_client: ChromaDB client used to get or create the collection.
        collection_name (str): Name of the target collection.

    Note:
        Reads the notebook-level `OPENAI_API_KEY` for the embedding model. Chunks are written
        with `upsert`, so re-running the cell overwrites chunks with the same ID.
    """
    # Use the collection the caller asked for (previously a hard-coded name was used
    # and the `collection_name` argument was ignored).
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # Define OpenAI Embedding Model (or replace with another embedding model)
    embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    text_chunks, chunk_ids, chunk_sources = chunk_text(text)

    if not text_chunks:
        # Nothing to embed; avoid sending empty batches to the embedding API / ChromaDB.
        print("No text chunks to store in ChromaDB.")
        return

    # Compute embeddings
    embeddings = embedding_model.embed_documents(text_chunks)  # List of vector embeddings

    # Store in ChromaDB (insert new IDs, overwrite existing ones)
    collection.upsert(
        documents=text_chunks,
        embeddings=embeddings,
        metadatas=chunk_sources,
        ids=chunk_ids
    )

    print(f"✅ Successfully stored {len(text_chunks)} descriptions in ChromaDB.")

In [None]:
# Reference the same client and connection as before
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Store data persistently
collection_name = 'neww_collection'

# First, read both text files and concatenate them.
# Note: these are the raw OCR outputs (*_extracted_text.txt), not the standardized files.
# Read the Keys text file
with open("MMD-Keys_extracted_text.txt", "r", encoding="utf-8") as file:
    keys_text = file.read()

# Read the Main text file
with open("MMD-Main-Text_extracted_text.txt", "r", encoding="utf-8") as file:
    main_text = file.read()

# Combine both into one string (keys first, then main text)
combined_text = keys_text + "\n" + main_text

# Then chunk the text, embed the chunks and store them in the collection
embed_text_chunks(combined_text, chroma_client, collection_name)

✅ Successfully stored 200 descriptions in ChromaDB.


# Part 8: (Not Quite) Multimodal Embedding Retrieval!

Now, we will once again send a query and see what context is retrieved.

In [None]:
# Query the collection again now that it holds both figure descriptions and text chunks,
# and print the IDs and documents that were retrieved.
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Store data persistently
collection_name = 'neww_collection'
question = 'What are mandible pairs?'
results = get_relevant_docs(question, chroma_client, collection_name)

print(f"Retrieved IDs: {results['ids']}")
print(f"Retrieved Documents: {results['documents']}")

Retrieved IDs: [['MMD-Figures/page_16_img_5.png', 'MMD_CHUNK_169', 'MMD_CHUNK_156']]
Retrieved Documents: [["**Figure Description of Fig. 5: Bee Mandibular Structures**\n\nFig. 5 illustrates the detailed anatomy of bee mandibles, focusing on the specific structures used for chewing. The figure is a close-up diagram that labels various parts of the mandibles, which are critical for the bee's ability to process food. Key components labeled in the diagram include the preapical teeth, which are situated near the tips of the mandibles and play a significant role in gripping and grinding food particles. The diagram also highlights the basal part of the mandibles, which connects to the bee's head and provides the necessary leverage for mandibular movement. The labels follow the naming conventions established by Michener and Fraser (1978), ensuring consistency with scientific literature. This figure is essential for understanding the functional morphology of bee mouthparts, particularly how th

Great! As you can see in the output above, our query retrieved two relevant text chunks, as well as one image! We can then pass this context into our chatbot to help formulate responses, and possibly render the image.

# Part 9: (Not Quite) Multimodal RAG!

Now, we'll write an API call to take in our context, render the images (if there are any), and provide an answer to our query!

In [None]:
from IPython.display import display, HTML  # <-- Add this line

def generate_answer_with_images(client, query, retrieval_results):
    """
    Generates an answer using OpenAI API based on retrieved embeddings from ChromaDB.
    Mentions figures passively instead of listing them separately.

    Retrieved entries whose metadata carries both "Image URL" and "Figure" are treated as
    figure descriptions: their images are rendered inline and the figure number is appended
    to the description. All other entries are passed on as plain text.

    Args:
        client: OpenAI client; only `client.chat.completions.create(...)` is used.
        query (str): The user's question.
        retrieval_results (dict): The output of the ChromaDB query ("documents" and,
            if included, "metadatas").

    Returns:
        str: The generated answer from OpenAI, wrapped to 80 columns.
    """
    import textwrap  # local import: the function must not rely on a later cell's import

    documents = retrieval_results["documents"][0]
    metadatas = retrieval_results.get("metadatas")
    # Metadata may be missing altogether or be None for individual entries.
    metadatas = metadatas[0] if metadatas else [None] * len(documents)

    # Extract text and figures into a cohesive context
    contexts = []
    images_to_render = []

    for doc, meta in zip(documents, metadatas):
        meta = meta or {}
        if "Image URL" in meta and "Figure" in meta:
            contexts.append(f"{doc} (as seen in Figure {meta['Figure']})")
            images_to_render.append(meta["Image URL"])  # Store images for rendering
        else:
            contexts.append(doc)

    # Render images in a Jupyter Notebook environment if available
    if images_to_render:
        html_code = "".join(f'<img src="{url}" alt="Retrieved Image" style="max-width:400px; margin:10px;">' for url in images_to_render)
        display(HTML(html_code))

    # Construct OpenAI Prompt
    prompt = f"""
    You are a scientific assistant helping to answer questions using retrieved figure descriptions and text.

    **User Question:** {query}

    **Relevant Information:** {' '.join(contexts) if contexts else 'No relevant information found.'}

    **Instructions:**
    - Answer the question directly in a clear and concise manner.
    - If figures are available, reference them naturally within the response (e.g., "as shown in Figure 3").
    - Do not list figures separately; integrate them passively in the explanation.
    - Mention all figures which are rendered.
    """

    # Call OpenAI API
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[{"role": "system", "content": "You are an expert assistant that analyzes images and text to answer scientific questions."},
                  {"role": "user", "content": prompt}],
        temperature=0.3
    )

    # The content can be None (e.g. an empty completion); wrap an empty string in that case.
    return textwrap.fill(response.choices[0].message.content or "", width=80)


In [None]:
import textwrap
# Example usage: retrieve context for a question, then let the model answer it.
# Any retrieved figure images are rendered by generate_answer_with_images itself.
question = "What are mandible pairs?"
retrieval_results = get_relevant_docs(question, chroma_client, collection_name)
answer = generate_answer_with_images(OpenAI(api_key=OPENAI_API_KEY), question, retrieval_results)

print(answer)  # Displays the OpenAI-generated response

Mandible pairs refer to the two mandibles found in some insects, such as bees,
which are used primarily for grasping, biting, cutting, or crushing food. In the
context of bees, as illustrated in Figure 5, the mandibles are detailed
anatomical structures that include components like preapical teeth and a basal
part. These structures enable bees to effectively process food, contributing to
their feeding habits. The diagram in Figure 5 provides a close-up view of these
mandibular components, highlighting their importance in the bee's anatomy and
feeding mechanisms.
