# Extracting the text from the doc


In [None]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with so re-runs are reproducible.
%pip install pymupdf==1.26.3

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
import pymupdf
from google.colab import userdata
# Read the Hugging Face token from the Colab secret named "HUGGINGFACE_API_KEY"
# so that no credential is hardcoded in the notebook.
HF_TOKEN = userdata.get("HUGGINGFACE_API_KEY")

In [None]:
# Sanity check: show which PyMuPDF version is active in this kernel.
print(pymupdf.__version__)

1.26.3


In [None]:
# Path of the PDF to process. The file must already exist at this location
# (e.g. uploaded to the session); otherwise opening it raises FileNotFoundError.
file = "/content/pp1.pdf"

In [None]:
import sys, pathlib

# Fail early with an actionable message when the PDF is not where `file` says.
if not pathlib.Path(file).is_file():
    raise FileNotFoundError(
        f"no such file: {file!r} - upload the PDF to the session or update `file`"
    )

# Collect the plain text of every page. Pages are joined with a form feed
# (chr(12)) so that page boundaries stay recognisable in `extracted`.
with pymupdf.open(file) as doc:  # open document
    extracted = chr(12).join(page.get_text() for page in doc)



FileNotFoundError: no such file: '/content/pp1.pdf'

In [None]:
# Show the raw extracted text for a manual check before it is cleaned.
print(extracted)

In [None]:
# Preprocessing the document

import re
import unicodedata
from collections import Counter


def clean_policy_text(raw_text, repeat_threshold=3):
    """Normalise PDF-extracted text and strip page furniture.

    The line-based filters (repeated headers/footers, bare page numbers,
    all-uppercase lines) run while the text still contains its line breaks;
    whitespace is flattened only as the very last step. Flattening earlier
    would turn the whole document into one line and make every line-based
    filter a no-op.

    Args:
        raw_text: Text as returned by the extraction step. Pages may be
            separated by form feeds.
        repeat_threshold: A non-blank line occurring at least this many times
            is treated as a running header/footer and dropped.

    Returns:
        The cleaned, ASCII-only text flattened to a single line in which all
        whitespace runs are collapsed to one space.
    """
    text = unicodedata.normalize("NFKC", raw_text)

    # Unify line endings. A form feed marks a page break, so treat it as a
    # line break to keep the last line of one page apart from the next page.
    text = re.sub(r'\r\n?|\f', '\n', text)
    # Collapse horizontal whitespace only; newlines are still needed below.
    text = re.sub(r'[ \t]+', ' ', text)

    # Remove repeated lines (common for headers/footers). Blank lines are not
    # counted, so they never trip the threshold.
    lines = [line.strip() for line in text.split('\n')]
    occurrences = Counter(line for line in lines if line)
    text = '\n'.join(line for line in lines if occurrences[line] < repeat_threshold)

    # Remove "Page N" / "Page N of M" markers and boilerplate phrases.
    text = re.sub(r'Page\s*\d+(\s*of\s*\d+)?', '', text, flags=re.IGNORECASE)
    # The dot in "Ltd." is escaped so it matches a literal dot only, and the
    # word boundaries keep words such as "Confidentiality" intact.
    text = re.sub(
        r'\b(?:Company Name|Confidential)\b|Insurance Ltd\.',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # Normalize bullets/dashes, then replace remaining non-ASCII runs by a space.
    text = re.sub(r'[•–—]', '-', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Drop lines that are only a number (page numbers) and all-uppercase lines
    # (often section titles or noise).
    kept = []
    for line in text.split('\n'):
        candidate = line.strip()
        if candidate.isdigit() or candidate.isupper():
            continue
        kept.append(candidate)

    # Final cleanup: join the remaining lines and collapse whitespace.
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()


extracted = clean_policy_text(extracted)


In [None]:
# Inspect the cleaned text (the notebook displays the value of the last expression).
extracted

# Dividing into chunks and setting up RAG

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters. Sizes are measured with the splitter's length function,
# which counts characters unless another one is supplied.
splitter_config = dict(
    chunk_size=1000,                          # upper bound per chunk
    chunk_overlap=100,                        # shared context between neighbouring chunks
    separators=["\n\n", "\n", ".", " ", ""],  # tried in this order
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# Cut the cleaned `extracted` text into overlapping chunks for indexing.
chunks = text_splitter.split_text(extracted)


In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -U langchain-community


In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install transformers accelerate langchain langchain-community


In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install faiss-cpu


In [None]:
# Import the integrations from langchain_community, the package this notebook
# installs and already uses for HuggingFacePipeline; the `langchain.embeddings`
# and `langchain.vectorstores` paths are legacy aliases.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# An empty chunk list means extraction/cleaning produced no text; stop here
# with a clear message instead of failing inside the index construction.
if not chunks:
    raise ValueError("No text chunks to index - check the extraction and cleaning steps.")

# Load embedding model (or use OpenAIEmbeddings if using OpenAI)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create the FAISS vector store: embeds every chunk and indexes the vectors
vectorstore = FAISS.from_texts(chunks, embedding_model)


In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -U transformers accelerate huggingface_hub langchain langchain-community


In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. HF_TOKEN is loaded from the Colab
# secrets in an earlier cell - do not paste a literal token into the notebook.
login(token=HF_TOKEN)  # read access is enough


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load model and tokenizer
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# `token=True` uses the credentials stored by huggingface_hub.login();
# it replaces the deprecated `use_auth_token=True`.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    token=True,
)

# Build the HF pipeline
mistral_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,          # temperature only takes effect when sampling is enabled
    temperature=0.2,
    return_full_text=False   # return only the newly generated text, not the prompt
)

# Wrap for LangChain
llm = HuggingFacePipeline(pipeline=mistral_pipe)

# Use your existing vectorstore and build retriever (top 3 chunks per query)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Build RAG chain; the retrieved chunks are returned alongside the answer
rag_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)




In [None]:
query = "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"

# RetrievalQA reads the question from the "query" key and returns a dict;
# the generated answer is stored under "result".
response = rag_chain.invoke({"query": query})
answer_text = response["result"]

print("Answer:", answer_text)

# Setting up LLM for analyzing the data

In [None]:
from huggingface_hub import InferenceClient
from google.colab import userdata

# Token is read from the Colab secrets so it never appears in the notebook.
HF_TOKEN = userdata.get("HUGGINGFACE_API_KEY")

client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=HF_TOKEN,
)


prompt = "46M, knee surgery, Pune, 3-month policy"

# NOTE(review): the complete cleaned document is embedded in the system
# message; for long policies this may exceed the model's context window -
# confirm, or pass only the retrieved chunks instead.
# Use chat_completion instead of chat.completions.create
response = client.chat_completion(
    messages=[
        {
            "role": "system",
            "content": f"You are a legal insurance and policy manager. Based on the following policy document:\n\n{extracted}\n\nDecide if the claim is valid, covered under the policy, and cite the clause and reasoning."
        },
        {
            "role": "user",
            "content": prompt
        }
    ],
    # Without an explicit limit the reply is capped at a small default and the
    # reasoning gets cut off; 512 matches the local pipeline's max_new_tokens.
    max_tokens=512,
)

# Print only the assistant's answer; the full response object stays in `response`.
print(response.choices[0].message.content)
