In [None]:
import toml
import sys
import os
import pandas as pd

from langchain_openai import AzureChatOpenAI

 # Get the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Make the parent directory importable (presumably where the `chains` package
# imported below lives). Guarded so that re-running this cell does not keep
# appending duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Import the PDFExtractChain class from the chains module
from chains.pdf_extractor import PDFExtractChain
from IPython.display import Markdown

In [None]:
# Location of the TOML file holding the Azure OpenAI settings. Set the
# AZURE_CONFIG_PATH environment variable to point at your own copy; the
# fallback is the original hard-coded local path, kept so existing runs
# continue to work unchanged.
azure_config_path = os.environ.get("AZURE_CONFIG_PATH") or (
    r"C:\Users\rami8629\OneDrive - Esri\Demos & Blogs\ArcGIS Resources\GeoAi & Deep Learning\GenAI\Gen-ai\Examples\config_rami.toml"
)

# The file must contain a `configs` list; its first entry is used.
azure_config = toml.load(azure_config_path)["configs"][0]

llm = AzureChatOpenAI(
    openai_api_version=azure_config["api_version"],
    azure_deployment=azure_config["deployment_name"],
    api_key=azure_config["api_key"],
    azure_endpoint=azure_config["api_base"],
    # `model` and `model_name` were both passed with the same value; `model`
    # is the alias of the `model_name` field, so one of them is sufficient.
    model=azure_config["model_name"],
    temperature=0,
)

# Smoke test: a single round-trip so that bad credentials or a wrong
# deployment name fail here rather than in the middle of the PDF loop.
response = llm.invoke("hi")

In [None]:

# Directory containing PDF files
pdf_directory = "pdfs"

# Collect every PDF in the directory. The extension is matched
# case-insensitively so files ending in ".PDF" are not silently missed, and the
# list is sorted because os.listdir returns entries in arbitrary order.
pdf_paths = sorted(
    os.path.join(pdf_directory, f)
    for f in os.listdir(pdf_directory)
    if f.lower().endswith('.pdf')
)

# Run one PDFExtractChain per PDF and collect the results keyed by
# "<country name>_<global id>".
results = {}
for pdf_path in pdf_paths:
    # The key is built from the first two underscore-separated parts of the
    # file name. The extension is stripped first so that a two-part name such
    # as "X_Y.pdf" yields the id "Y" rather than "Y.pdf".
    file_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    name_parts = file_stem.split('_')
    if len(name_parts) < 2:
        # Previously this aborted the whole batch with an opaque unpacking
        # ValueError; report the offending file and carry on instead.
        print(f"Skipping {pdf_path}: file name does not match '<country>_<global id>...'")
        continue
    country_name, global_id = name_parts[:2]
    countryname_globalid = f"{country_name}_{global_id}"

    if countryname_globalid in results:
        # Same behaviour as before (last file wins), but no longer silent.
        print(f"Warning: {pdf_path} overwrites an earlier result for {countryname_globalid}")

    # Create an instance of PDFExtractChain
    pdf_chain = PDFExtractChain(llm, pdf_path)

    # Extract information from the PDF
    result = pdf_chain.extract_from_pdf()

    # Store the result in the dictionary
    results[countryname_globalid] = result
   

In [None]:

# Build a table with one row per key of `results`; the columns are taken from
# the per-document values (presumably dicts returned by the extraction chain).
df = pd.DataFrame.from_dict(data=results, orient="index")

# Leave the frame as the last expression so the notebook renders it
df

In [None]:
# Pull out the "summary" column (all rows) as a Series
summaries = df.loc[:, "summary"]

In [None]:
# Render each summary as Markdown, one output per row of `summaries`
for doc_key, summary in summaries.items():
    # Markdown() raises TypeError for non-text values (e.g. the NaN pandas uses
    # for a missing entry), which would stop the loop part-way through.
    if not isinstance(summary, str):
        print(f"No summary available for {doc_key}")
        continue
    display(Markdown(summary))

In [None]:
# Assuming df is already defined and contains the 'graphs' column.
# Each cell of 'graphs' is expanded into its own row of columns
# (presumably a list of graph dicts per document -- confirm against the chain output).
graphs_data = df['graphs'].apply(pd.Series)

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when this pandas version has it, and fall back otherwise.
# The check is made on the class so a column that happens to be called "map"
# cannot be mistaken for the method.
_apply_elementwise = graphs_data.map if hasattr(pd.DataFrame, "map") else graphs_data.applymap

# Create two DataFrames from the 'graphs' column. Cells that are not dicts
# (e.g. padding for documents with fewer graphs) become None, and so do dicts
# that lack the requested key instead of raising KeyError.
df_graphs_titles = _apply_elementwise(lambda x: x.get('title') if isinstance(x, dict) else None)
df_graphs_page_numbers = _apply_elementwise(lambda x: x.get('page_number') if isinstance(x, dict) else None)


In [None]:

# Print a plain-text caption, then let the notebook render the titles table
# as the cell's result
titles_caption = "Graph Titles DataFrame:"
print(titles_caption)
df_graphs_titles


In [None]:

# Print a plain-text caption (preceded by a blank line), then let the notebook
# render the page-number table as the cell's result
page_numbers_caption = "\nGraph Page Numbers DataFrame:"
print(page_numbers_caption)
df_graphs_page_numbers