In [18]:
#!pip install pandas
!pip install cryptography

Collecting cryptography
  Downloading cryptography-44.0.0-cp39-abi3-macosx_10_9_universal2.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 6.1 MB/s eta 0:00:01
Installing collected packages: cryptography
Successfully installed cryptography-44.0.0


In [2]:
!curl http://localhost:11434/api/tags
#If above doesnt work start ollama
#ollama serve
#Pulling models
#ollama pull llama2

{"models":[{"name":"llama3.2:latest","model":"llama3.2:latest","modified_at":"2025-02-01T12:50:30.696551832-06:00","size":2019393189,"digest":"a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"3.2B","quantization_level":"Q4_K_M"}},{"name":"phi:latest","model":"phi:latest","modified_at":"2025-01-31T00:05:20.974236818-06:00","size":1602463378,"digest":"e2fd6321a5fe6bb3ac8a4e6f1cf04477fd2dea2924cf53237a995387e152ee9c","details":{"parent_model":"","format":"gguf","family":"phi2","families":["phi2"],"parameter_size":"3B","quantization_level":"Q4_0"}},{"name":"tinyllama:latest","model":"tinyllama:latest","modified_at":"2025-01-31T00:04:15.986647282-06:00","size":637700138,"digest":"2644915ede352ea7bdfaff0bfac0be74c719d5d5202acb63a6fb095b52f394a4","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"1B","quantization_level":"Q4_0"

In [22]:
import os
import json
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import Ollama
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Define the detailed categories
document_categories = {
    "Work-Related": [
        "Employment Contracts", "Performance Reviews", "Meeting Notes", "Technical Documentation",
        "Certifications & Training", "Payslips & Financial Records", "Project Reports",
        "Code Documentation", "Reference Materials"
    ],
    "College/Academics": [
        "Lecture Notes", "Assignments & Homework", "Research Papers", "Exam Papers & Solutions",
        "Course Syllabi", "Academic Transcripts", "Certificates & Diplomas",
        "College Applications", "Study Materials"
    ],
    "Personal Documents": [
        "Identification Documents", "Legal Documents", "Financial Statements",
        "Insurance Policies", "Utility Bills", "Rental Agreements",
        "Personal Letters & Diaries", "Password Lists/Security Info"
    ],
    "Projects/Open Source": [
        "Project Proposals", "Design Documents", "GitHub Readmes", "Documentation",
        "Contribution Guidelines", "Project Reports & Status Updates"
    ],
    "Receipts/Payments": [
        "Online Purchase Receipts", "Bank Transfer Confirmations", "Utility Payment Records",
        "Subscription Invoices", "Tax Receipts", "Reimbursement Forms", "Loan Documents"
    ],
    "Entertainment & Hobbies": [
        "Book PDFs & eBooks", "Game Manuals & Guides", "Music Sheets", "Art/Photography Collections",
        "Movie Scripts", "Cooking Recipes", "Event Tickets/Passes", "Personal Journals/Stories"
    ],
    "Health & Fitness": [
        "Medical Reports", "Prescription Records", "Vaccination Certificates",
        "Health Insurance Documents", "Workout Plans", "Diet Charts",
        "Doctor’s Notes", "Pregnancy & Maternity Docs"
    ],
    "Travel": [
        "Flight Tickets & Itineraries", "Hotel Bookings", "Travel Insurance", "Visa Documents",
        "Maps & Travel Guides", "Travel Plans & Checklists", "Car Rental Agreements", "Cruise Documents"
    ],
    "Finance & Investments": [
        "Tax Documents", "Investment Statements", "Cryptocurrency Reports", "Loan Agreements",
        "Stock Portfolio Reports", "Budget & Expense Trackers"
    ],
    "Legal Documents": [
        "Contracts & Agreements", "Court Documents", "Affidavits", "Property Deeds", "NDAs",
        "Legal Correspondence"
    ],
    "Household & Property": [
        "Property Deeds", "Lease/Rental Agreements", "Home Improvement Receipts",
        "Appliance Manuals", "Maintenance Records", "Real Estate Documents"
    ],
    "Kids & Family": [
        "School Reports", "Birth Certificates", "Family Tree Documents", "Activity Plans", "Parenting Guides"
    ],
    "Miscellaneous": [
        "Scanned Notes", "Random Downloads", "Uncategorized Documents"
    ]
}

# Function to load PDFs from a folder
def load_pdfs_from_folder(folder_path):
    documents = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            file_path = os.path.join(folder_path, file)
            loader = PyPDFLoader(file_path)
            pages = loader.load()
            text = "\n".join([page.page_content for page in pages])
            documents.append((file, text))
    return documents

# Load Llama 2 using Ollama
llm = Ollama(model="mistral")

# Define classification prompt
template = """
You are an AI assistant trained to classify documents into specific categories.
The available categories and subcategories are:

{categories}

Given the following document content, classify it into the top appropriate subcategory only:

{document_text}
"""

prompt = PromptTemplate(
    input_variables=["categories", "document_text"],
    template=template
)

chain = LLMChain(llm=llm, prompt=prompt)

# Classify documents
def classify_documents(documents):
    results = []
    categories_formatted = "\n".join([
        f"{main_category}: {', '.join(subcategories)}"
        for main_category, subcategories in document_categories.items()
    ])

    for file_name, text in documents:
        response = chain.run(categories=categories_formatted, document_text=text[:1000])  # Limit text to 1000 chars
        results.append({"file_name": file_name, "subcategory": response.strip()})
    return results

# Main execution
folder_path = "/Users/nagasudheerravela/Desktop/Sudheer_AllDocs/test_pdf"  # Change this to your actual folder path
documents = load_pdfs_from_folder(folder_path)
classified_docs = classify_documents(documents)


In [23]:
for doc in classified_docs:
    print(f"{doc['file_name']} -> {doc['subcategory']}")

134988218_1657135083804.pdf -> This document falls under the category of **Finance & Investments > Interest Certificate**. The specific subcategory here would be **Interest Certificates**.
Onboarding Logon Docs.pdf -> The document belongs to the subcategory "Work-Related > Technical Documentation". This document appears to provide logon credentials, LAN passwords, and instructions for setting up applications (VAST, VISTA, and COMPASS) and software like Lotus Notes and Outlook. It is a technical document related to employment or contractor work.
CheckStub_2020-02-25.pdf -> The given document is a Payslip & Financial Record under the Work-Related category. Specifically, it falls under the subcategory "Payslips & Financial Records".
CheckStub_2020-01-10.pdf -> Category: Work-Related
   Subcategory: Payslips & Financial Records
134988218_1657135081084.pdf -> This document belongs to the subcategory "Finance & Investments" specifically under "Interest Certificates".
CheckStub_2020-01-24.pdf