In [2]:
import os
import textwrap
import glob

from llama_parse import LlamaParse

# API credentials for the Groq and LlamaCloud clients. Both variables are
# assigned unconditionally, so any value already present in the process
# environment is replaced by the literals below.
os.environ.update(
    GROQ_API_KEY="groqapi",
    LLAMA_CLOUD_API_KEY="llamakey",
)


def print_response(response):
    """Pretty-print the ``"result"`` text of ``response``, wrapped at 100 columns.

    Args:
        response: Mapping whose ``"result"`` entry is the string to print.

    The text is handled one newline-separated line at a time: an empty line
    is printed as a blank line, and any other line is re-wrapped so that no
    output line exceeds 100 characters. Words longer than the limit are kept
    intact rather than split.
    """
    for line in response["result"].split("\n"):
        if line:
            wrapped = textwrap.wrap(line, width=100, break_long_words=False)
            print("\n".join(wrapped))
        else:
            # Preserve paragraph breaks from the original text.
            print()

In [3]:
import os.path
# Input PDFs and the generated Markdown currently share the same folder.
pdf_directory = './data/'
markdown_directory = './data'

# glob returns a list, so pdf_files is simply empty when the handbook is
# not present in pdf_directory.
pdf_files = glob.glob(
    os.path.join(pdf_directory, 'Handbook_IIT Mandi_final_v3.pdf')
)

# LlamaParse client producing Markdown output; no custom parsing
# instruction is supplied.
parser = LlamaParse(max_timeout=89000, result_type="markdown")

In [4]:
import asyncio
import nest_asyncio
# NOTE(review): nest_asyncio.apply() is understood to patch asyncio so that an
# event loop that is already running (as in a notebook kernel) can be
# re-entered — confirm against the nest_asyncio documentation.
nest_asyncio.apply()
from httpx import ReadTimeout

# Retry policy for PDF processing: at most five attempts per file, with a
# three-second pause after a failed attempt.
max_retries, retry_delay = 5, 3

async def process_pdf(pdf_file):
    """Parse one PDF with the shared ``parser`` and save the result as Markdown.

    The parse-and-write step is attempted up to ``max_retries`` times. Any
    exception raised during an attempt (``ReadTimeout`` or otherwise) is
    reported with ``print`` instead of being propagated; while attempts
    remain, the coroutine then pauses for ``retry_delay`` seconds.

    Args:
        pdf_file: Path of the PDF to parse.

    Returns:
        None, both on success and after exhausting all attempts. On success
        the Markdown is written to ``markdown_directory`` under the PDF's
        base name with its extension replaced by ``.md``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"Processing file: {pdf_file} (Attempt {attempt}/{max_retries})")

            documents = await parser.aload_data(pdf_file)
            if not documents:
                # Fail with a clear message instead of an IndexError below.
                raise ValueError("parser returned no documents")
            print(f"Successfully parsed PDF: {pdf_file}")

            # splitext swaps only the real extension, whatever its casing.
            # A plain str.replace('.pdf', '.md') would leave 'X.PDF' unchanged
            # and, with the output folder equal to the input folder, the
            # source PDF itself would be overwritten.
            stem = os.path.splitext(os.path.basename(pdf_file))[0]
            markdown_file = os.path.join(markdown_directory, stem + ".md")

            # Write every returned document, not just the first one, so that
            # multi-document results are not silently truncated.
            with open(markdown_file, "w", encoding="utf-8") as md_file:
                md_file.write("\n\n".join(doc.text for doc in documents))
            print(f"Markdown file saved: {markdown_file}")
            return

        except ReadTimeout:
            print(f"ReadTimeout occurred while processing {pdf_file}. Retrying {attempt}/{max_retries}...")
        except Exception as e:
            print(f"An error occurred while processing {pdf_file}: {e}")

        # Pause only when another attempt will actually follow.
        if attempt < max_retries:
            await asyncio.sleep(retry_delay)

    print(f"Failed to process {pdf_file} after {max_retries} retries.")
    

if asyncio.get_event_loop().is_running():
    tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
    results = await asyncio.gather(*tasks)
else:
    tasks = [process_pdf(pdf_file) for pdf_file in pdf_files]
    results = asyncio.run(asyncio.gather(*tasks))



Processing file: ./data/Handbook_IIT Mandi_final_v3.pdf (Attempt 1/5)
Started parsing the file under job_id 79c96224-1a79-47c6-a10e-877937aeebad
Successfully parsed PDF: ./data/Handbook_IIT Mandi_final_v3.pdf
Markdown file saved: ./data\Handbook_IIT Mandi_final_v3.md
