In [3]:
!pip install pymupdf
!pip install pdfplumber
!pip install faiss-gpu
!pip install faiss-cpu
!pip install tabula-py
!pip install fitz
!pip install sentence-transformers
!sudo apt-get update -y
!sudo apt-get --fix-broken install -y
!sudo apt-get install -y openjdk-11-jdk
!pip uninstall -y autogluon-multimodal 
!pip uninstall -y autogluon-timeseries
!pip install accelerate==0.26.0



Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 

In [2]:
import tempfile
import torch
import io
import pandas as pd
import re
from tqdm import tqdm 
import pymupdf
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import time
import sys

# Load the LLM model
model_nam = 'microsoft/Phi-3.5-mini-instruct'
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    model_nam, device_map="cuda", torch_dtype='auto', trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_nam)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Compile model for performance boost (Torch 2.0+)
# model = torch.compile(model)

def extract_pdf_content(pdf_path):
    doc = pymupdf.open(pdf_path)
    pdfplumber_tables = []
    for i in range(len(doc)):
        page_one = doc[i]
        find = page_one.find_tables()
        len_tb = len(find.tables)
        for k in range(len_tb):
            table1 = find.tables[k]
            tab_dat = table1.extract()
            df_table = pd.DataFrame(tab_dat)
            pdfplumber_tables.append({
                "content": df_table.apply(lambda row: " | ".join(row.values.astype(str)), axis=1).tolist()
            })
    return pdfplumber_tables

def preprocess_tables(tables):
    processed_tables = []
    for table_data in tables:
        processed_tables.append({
            "type": "table",
            "content": table_data['content'],
        })
    return processed_tables

def extract_content_and_embeddings(pdf_path):
    pdf_tab = extract_pdf_content(pdf_path)
    structured_pdfplumber_tables = preprocess_tables(pdf_tab)
    return structured_pdfplumber_tables

def extract_pdf_data(pdf_path):
    pdf_extracted_content = extract_content_and_embeddings(pdf_path)
    pdf_m = []
    for i in range(len(pdf_extracted_content)):
        d = pdf_extracted_content[i]['content']
        ll = '\n'.join(str(item).replace('\n', ' ') for item in d)
        ll = ll.replace('None', '')
        ll = ll.replace('|  |', '|')
        pdf_m.append(ll)
    return pdf_m

def generate_answers(document_text, user_question):
    messages = [
        {"role": "system", "content": "You are an AI chatbot that extracts and summarizes information from documents. Answer the user's question in a helpful and conversational way."},
        {"role": "user", "content": f"Here is a document: {document_text}\nNow, answer the following question: {user_question}"}
    ]

    generation_args = {  
        "max_new_tokens": 200,  
        "return_full_text": False,
        "temperature": None,      
        "do_sample": False,  
        "top_p": None,
        "use_cache":False
    }
    output = pipe(messages, **generation_args)
    return output[0]['generated_text']

def typing_effect(text, speed=0.02):
    """Simulates a typing effect when displaying the chatbot's response."""
    for char in text:
        sys.stdout.write(char)
        sys.stdout.flush()
        time.sleep(speed)
    print()  # Move to the next line after finishing typing

# Main script
print("\n📄 Welcome! I'm your AI chatbot. I'll help you extract information from your PDF.")
pdf_path = input('🔹 Enter the PDF path: ')
document_text = extract_pdf_data(pdf_path)

print("\n✅ PDF processed! Now you can start asking questions.")
print("💬 Type your question or type 'exit' to quit.")

while True:
    user_question = input('\n🟢 You: ')
    if user_question.lower() in ["exit", "quit", "bye"]:
        print("👋 Goodbye! Have a great day!")
        break
    elif len(user_question.strip()) == 0:
        print("⚠️ Please enter a valid question.")
        continue

    start_time = time.time()
    print("\n🤖 AI is typing...", end="", flush=True)
    qa_output = generate_answers('\n'.join(document_text), user_question)
    end_time = time.time()

    print("\r", end="")  # Remove "AI is typing..." before displaying output
    typing_effect(f"🤖 AI: {qa_output}")
    print(f"⏱️ Response time: {end_time - start_time:.2f} seconds")


2025-04-18 06:31:59.187957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744957919.391791      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744957919.449437      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Device set to use cuda



📄 Welcome! I'm your AI chatbot. I'll help you extract information from your PDF.


🔹 Enter the PDF path:  /kaggle/input/prrrrrr/ROJECT.pdf



✅ PDF processed! Now you can start asking questions.
💬 Type your question or type 'exit' to quit.



🟢 You:  what is the application number?



🤖 AI is typing... AI:  The application number is RA2332014010043.
⏱️ Response time: 122.82 seconds



🟢 You:  /kaggle/input/prrrrrr/ROJECT.pdf



🤖 AI is typing...

KeyboardInterrupt: 