In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader
from openai import OpenAI
import yaml
import json

In [3]:
# Path of the configuration file, resolved relative to the working directory.
CONFIG_PATH = "config.yaml"

In [4]:
# Parse the YAML file at CONFIG_PATH and keep the OpenAI API key in `api_key`.
# NOTE(review): yaml.load with FullLoader can build more than plain data
# types; if this file only holds simple mappings/scalars, yaml.safe_load is
# the more conservative choice — confirm before switching.
with open(CONFIG_PATH) as file:
    data = yaml.load(file, Loader=yaml.FullLoader)
    # Assumes the top level of the file is a mapping; a missing
    # 'OPENAI_API_KEY' entry then surfaces here as a KeyError.
    api_key = data['OPENAI_API_KEY']

In [5]:
def get_pdf_data(fpath):
    """Return the extracted text of the PDF at ``fpath``.

    The text of each page is taken with ``extract_text()`` and the pages are
    concatenated in order, with no separator inserted between them.

    Args:
        fpath: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A single string holding the text of all pages (empty when the
        document has no pages).
    """
    reader = PdfReader(fpath)
    return "".join(page.extract_text() for page in reader.pages)

In [6]:
def get_llm():
    """Build a new OpenAI client authenticated with the global ``api_key``.

    Returns:
        A freshly constructed ``OpenAI`` client; a new instance is created on
        every call.
    """
    return OpenAI(api_key=api_key)

In [7]:
def _extract_json_payload(raw_reply):
    """Return the JSON text of a model reply, without any Markdown code fence."""
    payload = (raw_reply or "").strip()
    if payload.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        payload = payload.split("\n", 1)[1] if "\n" in payload else ""
        # ... and everything from the closing fence onwards.
        payload = payload.rsplit("```", 1)[0]
    return payload.strip()


def get_invoice_info_from_llm(data):
    """Ask the chat model for the disease and expense amount in an invoice.

    Args:
        data: Raw invoice text; it is sent to the model as the user message
            prefixed with ``INVOICE DETAILS: ``.

    Returns:
        The object decoded from the model's JSON reply. The prompt asks for
        the keys ``disease`` and ``expense``, but the reply is not validated
        against that shape.

    Raises:
        json.JSONDecodeError: If the reply (after removing an optional
            Markdown code fence) is not valid JSON.
    """
    llm = get_llm()

    # The format example is kept in a single-quoted literal: inside a
    # double-quoted one, each "" pair closes and reopens the string, so the
    # example reached the model as {'disease':,'expense':}.
    prompt = (
        "Act as an expert in extracting information from medical invoices. "
        "You are given with the invoice details of a patient. "
        "Go through the given document carefully and extract the 'disease' "
        "and the 'expense amount' from the data. "
        'Return the data in json format = {"disease": "", "expense": ""}'
    )
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": f"INVOICE DETAILS: {data}"},
    ]

    response = llm.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.4,
        max_tokens=2500,
    )

    # Chat models sometimes wrap JSON in a ```json fence; strip it so that
    # json.loads sees only the payload.
    reply_text = _extract_json_payload(response.choices[0].message.content)
    return json.loads(reply_text)

In [16]:
def check_claim_validity(patient_disease_info, disease_exclusion_list, threshold=0.4):
    """Decide whether a claimed disease matches an entry of the exclusion list.

    A bag-of-words vocabulary is fitted on ``patient_disease_info`` only; each
    exclusion entry is then projected onto that vocabulary (words that do not
    occur in the patient's text are ignored) and compared with the patient
    vector by cosine similarity.

    Args:
        patient_disease_info: Disease description taken from the claim.
        disease_exclusion_list: Iterable of excluded disease names. An empty
            iterable or ``None`` means nothing is excluded.
        threshold: The claim is rejected when any similarity is strictly
            greater than this value.

    Returns:
        A status sentence ending in ``Claim Rejected`` or ``Claim Accepted``.

    Raises:
        ValueError: Propagated from ``CountVectorizer.fit_transform`` when
            ``patient_disease_info`` yields no tokens (for example an empty
            string) and the exclusion list is not empty.
    """
    rejected = f"{patient_disease_info} is present is disease exclusion list. Claim Rejected"
    accepted = f"{patient_disease_info} is not present is disease exclusion list. Claim Accepted"

    exclusions = list(disease_exclusion_list or [])
    if not exclusions:
        # Nothing to compare against. The loop-based version left its status
        # variable unassigned in this case and failed with UnboundLocalError.
        return accepted

    vectorizer = CountVectorizer()
    patient_info_vector = vectorizer.fit_transform([patient_disease_info])

    # Vectorise all exclusion entries at once; row 0 of the result holds the
    # similarity of the patient text to each entry.
    exclusion_vectors = vectorizer.transform(exclusions)
    similarities = cosine_similarity(patient_info_vector, exclusion_vectors)[0]

    if (similarities > threshold).any():
        return rejected
    return accepted

In [9]:
# Read the sample bill and let the model pull out the disease and the amount.
data = get_pdf_data(fpath="Bills/MedicalBill1.pdf")
invoice_details = get_invoice_info_from_llm(data=data)

In [10]:
# Keys are single-quoted inside the f-strings: reusing the enclosing double
# quote within a replacement field is a SyntaxError before Python 3.12.
print(f"Disease: {invoice_details['disease']}")
print(f"Claim Amount: {invoice_details['expense']}")

Disease: Bodyache with fever, cold
Claim Amount: 3150


In [17]:
# Check the validator against a hard-coded disease name.
disease_exclusion_list = [
    "cancer",
    "HIV/AIDS",
    "Parkinson's disease",
    "Alzheimer's disease",
]
threshold = 0.4
data = check_claim_validity(
    patient_disease_info="Lung cancer",
    disease_exclusion_list=disease_exclusion_list,
    threshold=threshold,
)
print(data)

Lung cancer is present is disease exclusion list. Claim Rejected


In [18]:
# Run the same check on the disease extracted from the invoice.
disease_exclusion_list = [
    "cancer",
    "HIV/AIDS",
    "Parkinson's disease",
    "Alzheimer's disease",
]
threshold = 0.4
data = check_claim_validity(
    patient_disease_info=invoice_details["disease"],
    disease_exclusion_list=disease_exclusion_list,
    threshold=threshold,
)
print(data)

Bodyache with fever, cold is not present is disease exclusion list. Claim Accepted
