In [52]:
# Dependency installation commands, kept inside a string literal so that they
# are not executed; the cell only evaluates to the string itself.
'''
!pip install PyMuPDF
!pip install pillow
!pip install -q --upgrade google-generativeai langchain-google-genai
!pip install langchain faiss-gpu
'''

'\n!pip install PyMuPDF\n!pip install pillow\n!pip install -q --upgrade google-generativeai langchain-google-genai\n!pip install langchain faiss-gpu\n'

In [53]:
# Google Colab drive mount, kept inside a string literal so that it is not
# executed; the cell only evaluates to the string itself.
'''
from google.colab import drive
drive.mount('/content/drive')
'''

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n"

In [54]:
import fitz
import os
import requests
import re
from tqdm import tqdm
from PIL import Image
from tqdm import tqdm
from pathlib import Path
#로컬버전으로 변경하였기 때문에 주석처리
#from google.colab import userdata
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough, RunnableParallel

# Set the path of the PDF to process.
# pdf_path = "/content/drive/MyDrive/프로젝트/data/lp.pdf"

# Re-set for the local (non-Colab) version.
pdf_path = "data/lp.pdf"

In [55]:
# Register the API key in the secrets (key) panel before running.

# Not running on Colab any more, so the block below is disabled (kept as a
# string literal); the key is now read from a local environment variable.
'''
GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
'''

# Gemini configuration.
# os.environ.get returns None when GOOGLE_API_KEY is not set, so no error is
# raised on this line in that case.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
model = genai.GenerativeModel(model_name = "gemini-pro")

In [56]:
# Load the models.
# gemini_vision: called with each extracted image to generate its description.
gemini_vision = genai.GenerativeModel('gemini-pro-vision')
# gemini_pro: chat model used as the answering step of the QA chain.
gemini_pro = ChatGoogleGenerativeAI(model="gemini-pro")
# embed_doc: embedding function handed to both FAISS vector stores.
embed_doc = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type = "RETRIEVAL_DOCUMENT")

In [57]:
# Extract the images embedded in the PDF and store them in a temporary folder.
def extract_images_from_pdf(pdf_path, output_folder):
    """Write every image embedded in a PDF to ``output_folder``.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.
        output_folder: Directory that receives the image files. It is created
            (including missing parents) when it does not exist yet.

    Returns:
        list[str]: Paths of the written files, page by page and, within a
        page, in the order ``page.get_images(full=True)`` lists them. Each
        name is ``page<N>_img<M>.png`` with 1-based page and image numbers.
    """
    extracted_images = []

    # The context manager closes the document even when extraction fails
    # half-way, instead of leaving the PDF handle open.
    with fitz.open(pdf_path) as doc:
        # exist_ok removes the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        for page_number in tqdm(range(doc.page_count), desc="Extracting images"):
            page = doc[page_number]
            for img_index, image in enumerate(page.get_images(full=True)):
                # The first field of each entry is the reference that
                # extract_image expects.
                xref = image[0]
                image_bytes = doc.extract_image(xref)["image"]
                # NOTE(review): the bytes are written unchanged under a .png
                # name; presumably the embedded encoding can differ from PNG --
                # confirm before relying on the extension.
                image_filename = f"{output_folder}/page{page_number + 1}_img{img_index + 1}.png"

                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)

                extracted_images.append(image_filename)

    return extracted_images

# Folder that receives the images extracted from the PDF.
output_folder = "data/temporary"
# List of the file paths written by the extraction.
extracted_images = extract_images_from_pdf(pdf_path, output_folder)

def get_image_files(dir_path):
    """Return the ``*.png`` entries located directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan, as a string or a ``Path``.

    Returns:
        list[Path]: One ``Path`` per match, in the order ``Path.glob`` yields
        them. Sub-directories are not searched.
    """
    return list(Path(dir_path).glob("*.png"))

# Path objects of the extracted .png files.
# NOTE(review): image_files is reassigned from os.listdir in the
# image-description cell, so this list is superseded there.
image_files = get_image_files(output_folder)

Extracting images: 100%|██████████| 64/64 [00:00<00:00, 5332.87it/s]


In [58]:
# Generate a text description for every extracted image.
# image_folder = "/content/temporary/"
# Path re-set for the local version.
image_folder = "data/temporary/"

# os.listdir returns names in arbitrary order; sorting keeps the order of the
# descriptions (and therefore of pdf_img) reproducible between runs.
image_files = sorted(
    f for f in os.listdir(image_folder)
    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
)

responses = []

for image_file in tqdm(image_files, desc="Generating content"):
    image_path = os.path.join(image_folder, image_file)
    # Keep the file open only for the duration of the request so that file
    # handles do not pile up over the loop.
    with Image.open(image_path) as img:
        response = gemini_vision.generate_content(img)
    responses.append({"image_path": image_path, "response": response})

response_values = [result["response"] for result in responses]

# Collect the description texts. Accessing `.text` raises ValueError when a
# response carries no text part (for example a blocked response); such images
# are reported and skipped so one bad response does not discard all others.
descriptions = []
for result in responses:
    try:
        descriptions.append(str(result["response"].text))
    except ValueError as error:
        print(f"No description for {result['image_path']}: {error}")

# Single string holding all image descriptions, separated by spaces.
pdf_img = ' '.join(descriptions)

Generating content: 100%|██████████| 30/30 [02:51<00:00,  5.72s/it]


In [59]:
# Extract the text of the PDF.
def extract_and_clean_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF as one cleaned-up string.

    The page texts are concatenated in page order without a separator and
    then passed through three substitutions, in this order:

    1. every character that is not a word character, whitespace or one of
       ``. , ? !`` is removed;
    2. every stand-alone run of digits is removed;
    3. every run of whitespace is collapsed into a single space.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        str: The cleaned text (not stripped at either end).
    """
    with fitz.open(pdf_path) as pdf_document:
        raw_text = "".join(
            pdf_document[index].get_text()
            for index in range(pdf_document.page_count)
        )

    # (pattern, replacement) pairs; the order matters because digit removal
    # leaves gaps that the final whitespace pass collapses.
    substitutions = (
        (r'[^\w\s.,?!]', ''),
        (r'\b\d+\b', ''),
        (r'\s+', ' '),
    )

    cleaned_text = raw_text
    for pattern, replacement in substitutions:
        cleaned_text = re.sub(pattern, replacement, cleaned_text)

    return cleaned_text

# Cleaned text of the whole PDF, as a single string.
pdf_txt = extract_and_clean_text_from_pdf(pdf_path)

In [60]:
# Embed the PDF text.
# NOTE(review): the store is built from a one-element list, i.e. the whole
# cleaned PDF text is embedded as a single entry, so the retriever has only
# that one entry to return -- confirm this is intended (no chunking).
vectorstore1 = FAISS.from_texts([pdf_txt], embedding=embed_doc)
retriever1 = vectorstore1.as_retriever()

# Embed the image descriptions.
# Same layout as above: all descriptions joined into pdf_img form one entry.
vectorstore2 = FAISS.from_texts([pdf_img], embedding=embed_doc)
retriever2 = vectorstore2.as_retriever()

In [61]:
# Question-answering prompt.
query = "tell me how many images are there and description of them?"
prompt_str = """Answer the question step by step. and you can refer to the following context:
{context_a}
{context_b}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(prompt_str)


def format_docs(docs):
    """Join the text of retrieved documents into one prompt-ready string.

    Args:
        docs: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        str: The ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Each retriever yields a list of Document objects. Without formatting, the
# prompt template would insert the Python repr of that list (escaped newlines,
# metadata) instead of the plain text, so both contexts go through format_docs.
retrieval = RunnableParallel(
    {
        "context_a": retriever1 | RunnableLambda(format_docs),
        "context_b": retriever2 | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
)
output_parser = StrOutputParser()
# retrieve contexts -> fill the prompt -> ask Gemini -> plain string answer
chain = retrieval | prompt | gemini_pro | output_parser

chain.invoke(query)

'There are 19 images in total.\n\n1. A man in a suit pointing at a blackboard with a mathematical equation written on it.\n\n\n2. A man in a turban standing in front of a blackboard with math equations written on it.\n\n\n3. A boa constrictor digesting an elephant.\n\n\n4. The little prince uprooting a baobab tree.\n\n\n5. The little prince standing on a baobab tree looking up at the stars.\n\n\n6. The little prince watering a rose.\n\n\n7. A king sitting on a planet that looks like an asteroid.\n\n\n8. A man standing on a small planet.\n\n\n9. Enrico Fermi.\n\n\n10. A man sitting at a desk, smoking a pipe and reading a document.\n\n\n11. A scholar or rabbi reading a holy book.\n\n\n12. The little prince standing in a garden of roses.\n\n\n13. A rabbit in a hole in the ground, a tree on top of a hill, and flowers at the bottom of the hill.\n\n\n14. The fox and the little prince.\n\n\n15. An elephant being swallowed by a boa constrictor.\n\n\n16. A drawing of a hat.\n\n\n17. A snake eat

In [62]:
# Show the concatenated image descriptions that were embedded into vectorstore2.
print(pdf_img)

 The drawing shows a man in a suit pointing at a blackboard with a mathematical equation written on it. The man is likely a teacher, and the equation is likely a math problem that he is working on. The drawing is in a cartoon style, and the man is drawn with simple lines and shapes. The blackboard is also drawn in a simple style, and the equation is written in a clear and easy-to-read font. The drawing is likely meant to be humorous, as the man is pointing at the equation with a puzzled expression on his face.  The illustration shows a man in a turban standing in front of a blackboard. He is wearing a long white robe and a red sash. On the blackboard, there are a bunch of math equations written.  The illustration shows a boa constrictor digesting an elephant.  The picture shows the little prince. He is standing on a small planet and uprooting a baobab tree. The baobab trees are very dangerous. They can grow very quickly and take over the entire planet. The little prince is trying to sa