# PROCESSING PDFs IN AN AZURE BLOB STORAGE CONTAINER

IMPORT NECESSARY MODULES

In [None]:
from adlfs import AzureBlobFileSystem
from dotenv import load_dotenv
import os
import json
import meilisearch

import sys
sys.path.append('C:/Users/lloyd/OneDrive/Documents/GitHub/bghinsights/bghinsights')

# Import the modules from bghinsights
from pdf_processor import process_pdf
from text_analyzer import analyze_text_content
from store_data import save_json_to_azure

LOAD ENV VARIABLES FROM .ENV FILE AND CONNECT TO AZURE BLOB STORAGE AND MEILISEARCH

In [None]:
# Read the .env file into the process environment before anything looks up settings
load_dotenv()

# Filesystem handle for Azure Blob Storage; no explicit credentials are passed here
fs = AzureBlobFileSystem()

# Name of the container that holds the PDFs
container_name = os.environ.get("AZURE_PDF_CONTAINER")

# Meilisearch client built from the URL and API key found in the environment
meilisearch_client = meilisearch.Client(
    os.environ.get("MEILISEARCH_URL"),
    os.environ.get("MEILISEARCH_API_KEY"),
)

# Handle to the Meilisearch index the documents will later be added to
index = meilisearch_client.index(os.environ.get("MEILISEARCH_INDEX_NAME"))

PROCESS PDF FILES AND SAVE THE EXTRACTED DATA TO AZURE BLOB STORAGE

In [None]:
# Collect the analyzed content of every PDF found in the container
all_data = []

# Walk the container listing and process each entry whose name ends in ".pdf"
for filename in fs.ls(container_name):
    if filename.endswith(".pdf"):
        print(f"Processing file: {filename}")
        # Extract just the filename without the folder path
        filename_only = os.path.basename(filename)

        # Open the blob inside a context manager so the handle is closed as soon
        # as the text has been extracted (previously one handle leaked per PDF)
        with fs.open(filename) as pdf_file:
            # Extract text from the PDF file
            text = process_pdf(pdf_file)

        # Pass the extracted text to the analyze_text_content function
        analyzed_content = analyze_text_content(text, filename=filename_only)

        # Append the analyzed content to the list
        all_data.append(analyzed_content)

# Save the extracted data to Azure Blob Storage
save_json_to_azure(fs, container_name, all_data)

LOAD THE DATASET AND ADD DOCUMENTS TO THE MEILISEARCH INDEX

In [None]:
# Path of the combined JSON inside the container
# (presumably the file written by save_json_to_azure; verify against store_data)
dataset_path = f"{container_name}/extracted_json/all_data.json"

# Read the raw bytes through the Azure filesystem handle and decode them as JSON
with fs.open(dataset_path, 'rb') as json_file:
    dataset = json.loads(json_file.read())

# Send the loaded documents to the Meilisearch index
index.add_documents(dataset)

# PROCESSING PDFs IN A LOCAL ENVIRONMENT

IMPORT NECESSARY MODULES

In [1]:
import json
import os
import sys

import meilisearch
from dotenv import load_dotenv

sys.path.append('C:/Users/lloyd/OneDrive/Documents/GitHub/bghinsights/bghinsights')

# Import the modules from bghinsights
from pdf_processor import process_pdf
from text_analyzer import analyze_text_content
from store_data import save_json_to_local

CONFIGURE THE MEILISEARCH CLIENT FROM ENVIRONMENT VARIABLES

In [2]:
# Load environment variables from the .env file so the MEILISEARCH_* settings
# below are populated even on a fresh kernel where the Azure section was not run
load_dotenv()

# Create the Meilisearch client from the URL and API key in the environment
meilisearch_client = meilisearch.Client(os.getenv("MEILISEARCH_URL"), os.getenv("MEILISEARCH_API_KEY"))

# Handle to the Meilisearch index the documents will later be added to
index = meilisearch_client.index(os.getenv("MEILISEARCH_INDEX_NAME"))

PROCESS PDF FILES AND SAVE THE EXTRACTED DATA TO THE INPUT DIRECTORY PATH

In [3]:
# Folder containing the PDFs to process; it is also handed to save_json_to_local below
directory_path = "C:/Users/lloyd/OneDrive/Desktop/test"

# Accumulates the analyzed content of each PDF
all_data = []

# Snapshot of the directory entries (names only, no folder prefix)
files_in_directory = os.listdir(directory_path)

for filename in files_in_directory:
    # Ignore anything that is not a PDF
    if not filename.endswith(".pdf"):
        continue

    # Full path used to read the file
    file_path = os.path.join(directory_path, filename)
    print(f"Processing file: {file_path}")

    # Bare file name passed along with the extracted text
    filename_only = os.path.basename(filename)

    # Extract the text, then analyze it
    text = process_pdf(file_path)
    analyzed_content = analyze_text_content(text, filename=filename_only)

    all_data.append(analyzed_content)

# Write everything collected above to a local JSON file
save_json_to_local(directory_path, all_data)

Processing file: C:/Users/lloyd/OneDrive/Desktop/test\anwz(brfg)__32-23.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\enzr__27-20a.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\gsz___1-04.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\iv_zr__69-23.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\ix_zr_210-99.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\i_zr__24-23.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\notz(brfg)___4-22b.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\riz(r)___1-23.pdf
Processing file: C:/Users/lloyd/OneDrive/Desktop/test\vgs___1-16.pdf
All data saved to C:/Users/lloyd/OneDrive/Desktop/test\extracted_json/all_data.json


LOAD THE DATASET AND ADD DOCUMENTS TO THE MEILISEARCH INDEX

In [None]:
# Load the dataset from the local JSON file under directory_path
# (the path reported by the processing cell's "All data saved to ..." message)
dataset_path = f"{directory_path}/extracted_json/all_data.json"

# Use the built-in open(): this is a local file, and the Azure filesystem handle
# `fs` is not defined anywhere in the local workflow
with open(dataset_path, 'rb') as json_file:
    dataset = json.load(json_file)

# Add the documents to the Meilisearch index
index.add_documents(dataset)