In [1]:
!pip install requests sqlalchemy pymilvus sentence-transformers fastapi uvicorn

Collecting pymilvus
  Downloading pymilvus-2.4.9-py3-none-any.whl.metadata (5.6 kB)
Collecting fastapi
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting environs<=9.5.0 (from pymilvus)
  Downloading environs-9.5.0-py2.py3-none-any.whl.metadata (14 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite<2.5.0,>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.10-py3-none-manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.2-py3-none-any.whl.metadata (6.0 kB)
Collecting marshmallow>=3.0.0 (from environs<=9.5.0->pymilvus)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting python-dotenv (from environs<=9.5.0->pymilvus)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.

In [11]:
import json
import os
import time

import requests
from fastapi import FastAPI, HTTPException, Depends
from sqlalchemy import create_engine, Column, String, Integer, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
from sentence_transformers import SentenceTransformer

# FastAPI setup
# Application object; the route decorator further down registers its endpoint on it.
app = FastAPI()

# SQL Database setup
# SQLite database stored in the file `documents.db`, relative to the working directory.
DATABASE_URL = "sqlite:///./documents.db"
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine. Autocommit and autoflush are both off,
# so callers must commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Declarative base class that the ORM model below inherits from.
# NOTE(review): the cell output echoes this line, presumably as part of a
# SQLAlchemy deprecation warning for declarative_base — confirm.
Base = declarative_base()

# SQL Table Definition
class Document(Base):
    """ORM model mapped to the ``documents`` table.

    One row stores a document's filename and category, the OCR text produced
    for it under the Tesseract and EasyOCR columns, and a text summary.
    """
    __tablename__ = "documents"

    # Integer primary key (indexed).
    id = Column(Integer, primary_key=True, index=True)
    # Unique, indexed filename; the /get_summary endpoint looks rows up by it.
    filename = Column(String, unique=True, index=True)
    # Indexed category label.
    category = Column(String, index=True)
    # OCR text stored for the Tesseract engine.
    ocr_text_tesseract = Column(Text)
    # OCR text stored for the EasyOCR engine.
    ocr_text_easyocr = Column(Text)
    # Summary text stored for the document.
    summary = Column(Text)

# Emit CREATE TABLE for every model registered on Base, using the engine above.
Base.metadata.create_all(bind=engine)

# Embedding model
# Instantiates the "all-MiniLM-L6-v2" SentenceTransformer when the cell runs.
# NOTE(review): nothing else in this cell references `embedding_model` — confirm it is still needed.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# OpenRouter API settings
# The key is read from the OPENROUTER_API_KEY environment variable so that no
# secret is stored in the notebook. If the variable is unset an empty string is
# used, so the cell still runs and the API call itself reports the problem.
# NOTE: a key was previously committed in this cell; it should be revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Function to get summary from OpenRouter API
def summarize_text_openrouter(text, timeout=120):
    """Ask the OpenRouter chat-completions API to summarize ``text``.

    Args:
        text: Document text to summarize.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without a timeout a stalled connection would block the cell forever.

    Returns:
        The ``content`` of the first returned choice; ``"No summary returned."``
        when the response carries no choice or no message content; or an
        ``"Error: <status>, <body>"`` string for any non-200 status.

    Raises:
        requests.exceptions.RequestException: Connection failures and timeouts
            raised by ``requests.post`` propagate to the caller.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
    }

    data = {
        "model": "nousresearch/hermes-3-llama-3.1-405b:free",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Summarize the following document:"},
                    {"type": "text", "text": text}
                ]
            }
        ]
    }

    response = requests.post(
        url=API_URL,
        headers=headers,
        data=json.dumps(data),
        timeout=timeout,
    )

    if response.status_code != 200:
        # HTTP-level failures are reported in-band as a string, not raised.
        return f"Error: {response.status_code}, {response.text}"

    # The `or` fallbacks also cover an empty list or an explicit null, which a
    # plain .get() default would let through to an IndexError / AttributeError.
    choices = response.json().get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content", "No summary returned.")

def summarize_text_openrouter_with_retry(text, retries=3, delay=2):
    """Call ``summarize_text_openrouter``, retrying on request-level failures.

    Only ``requests.exceptions.RequestException`` triggers a retry. An
    ``"Error: ..."`` string returned for a non-200 response is passed through
    unchanged as the result.

    Args:
        text: Document text to summarize.
        retries: Maximum number of attempts.
        delay: Seconds to wait between consecutive attempts.

    Returns:
        Whatever ``summarize_text_openrouter`` returns on the first attempt
        that does not raise.

    Raises:
        Exception: When every attempt raised; the last request error is
            attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return summarize_text_openrouter(text)
        except requests.exceptions.RequestException as e:
            last_error = e
            if attempt < retries:
                print(f"Request error: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                # No further attempt follows, so waiting would only add latency.
                print(f"Request error: {e}. No retries left.")
    raise Exception("Failed to get summary from OpenRouter API after retries.") from last_error

def save_to_sql(session, filename, category, ocr_text_tesseract, ocr_text_easyocr, summary):
    """Insert one ``Document`` row and return the persisted object.

    Args:
        session: Database session used for the insert.
        filename: Value for the ``filename`` column (declared unique).
        category: Value for the ``category`` column.
        ocr_text_tesseract: Value for the ``ocr_text_tesseract`` column.
        ocr_text_easyocr: Value for the ``ocr_text_easyocr`` column.
        summary: Value for the ``summary`` column.

    Returns:
        The ``Document`` instance after commit and refresh.

    Raises:
        Exception: Whatever ``session.commit()`` raises is re-raised after the
            session has been rolled back.
    """
    db_doc = Document(
        filename=filename,
        category=category,
        ocr_text_tesseract=ocr_text_tesseract,
        ocr_text_easyocr=ocr_text_easyocr,
        summary=summary
    )
    session.add(db_doc)
    try:
        session.commit()
    except Exception:
        # A failed commit (e.g. a duplicate filename) would otherwise leave the
        # caller's session in a state that rejects further work.
        session.rollback()
        raise
    session.refresh(db_doc)
    return db_doc

def save_document_with_summary(db_session, filename, category, ocr_text_tesseract, ocr_text_easyocr):
    """Summarize both OCR texts and persist the document with the combined summary.

    The Tesseract text is summarized first, then the EasyOCR text. The two
    results are joined into one labelled string, which is stored with the raw
    OCR texts via ``save_to_sql``.

    Returns:
        Whatever ``save_to_sql`` returns for the new row.
    """
    tesseract_part, easyocr_part = [
        summarize_text_openrouter_with_retry(ocr_text)
        for ocr_text in (ocr_text_tesseract, ocr_text_easyocr)
    ]
    combined = f"Tesseract Summary: {tesseract_part}\nEasyOCR Summary: {easyocr_part}"
    return save_to_sql(
        db_session,
        filename,
        category,
        ocr_text_tesseract,
        ocr_text_easyocr,
        combined,
    )

# FastAPI Dependency for Database Session
def get_db():
    """Yield a database session and close it once the consumer is finished.

    Written as a generator so it can be used as a FastAPI dependency: the
    session is closed whether the consumer completes normally or raises.
    """
    # The session's context manager closes it on exit, matching an explicit
    # try/finally around the yield.
    with SessionLocal() as session:
        yield session

# FastAPI Endpoint
@app.get("/get_summary")
def get_summary(filename: str, db: Session = Depends(get_db)):
    """Return the stored filename, category and summary for ``filename``.

    Looks up the first ``Document`` row whose filename matches and responds
    with HTTP 404 ("Document not found") when there is none.
    """
    record = db.query(Document).filter(Document.filename == filename).first()
    if record:
        return {
            field: getattr(record, field)
            for field in ("filename", "category", "summary")
        }
    raise HTTPException(status_code=404, detail="Document not found")

# Example: Process your JSON file and save data to SQL
def process_json_data(file_path):
    """Load OCR results from a JSON file, summarize each document and store it.

    The JSON is read as a mapping of category name to a list of document
    entries. Each entry is read through the keys ``filename``,
    ``tesseract.text`` and ``easyocr.text``.

    Args:
        file_path: Path of the JSON file to process.

    Raises:
        Exception: Errors from summarization or saving propagate and stop
            processing. The session opened for the failing document is still closed.
    """
    # JSON is read explicitly as UTF-8 instead of the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    for category, documents in json_data.items():
        for doc in documents:
            filename = doc['filename']
            ocr_text_tesseract = doc['tesseract']['text']
            ocr_text_easyocr = doc['easyocr']['text']

            # One short-lived session per document.
            db_session = SessionLocal()
            try:
                # Generate the summaries and store the document in the SQL database.
                saved_doc = save_document_with_summary(db_session, filename, category, ocr_text_tesseract, ocr_text_easyocr)
                print(f"Document {filename} saved. Summary: {saved_doc.summary}")
            finally:
                # Close the session even when saving fails so it is not leaked.
                db_session.close()

# Example usage: Replace 'path_to_your_json_file.json' with your actual JSON file path
#process_json_data('/content/drive/MyDrive/TRIAL1/ocr_results.json')


  Base = declarative_base()


Document 2073487737.jpg saved. Summary: Tesseract Summary: The document appears to be a carton offer for smokers, with the following key points:

1. The offer is restricted to smokers who are 23 years of age or older.


3. The document contains various text and images related to the offer, which are difficult to interpret from the provided information.

4. The phrases "Doral", "Discover", and "Marlboro" are mentioned, which could indicate the brands associated with the offer.

Please note that the image provided is of poor quality, making it challenging to accurately summarize all the details. The summary is based on the limited legible information available.
EasyOCR Summary: The document appears to be an advertisement or offer from Doral, a cigarette brand owned by Reynolds Tobacco Company. The key points are:

1. Doral is offering a free carton of cigarettes for every 70 pack seals as part of their rewards program for loyal smokers.

2. To learn more about the free carton offer, cust

KeyboardInterrupt: 

In [4]:
import requests
import json
import time
import numpy as np
from sqlalchemy import create_engine, Column, String, Integer, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, Session
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import time

# SQL Database setup
# Uses the same SQLite file (`documents.db`) that the earlier cell writes to.
DATABASE_URL = "sqlite:///./documents.db"
engine = create_engine(DATABASE_URL)
# Session factory bound to the engine. Autocommit and autoflush are both off.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Fresh declarative base for this cell's redefinition of the Document model.
Base = declarative_base()

# SQL Table Definition
class Document(Base):
    """ORM model mapped to the ``documents`` table.

    Declares the same columns as the model in the earlier cell: filename,
    category, the two OCR text columns and the summary.
    """
    __tablename__ = "documents"

    # Integer primary key (indexed).
    id = Column(Integer, primary_key=True, index=True)
    # Unique, indexed filename.
    filename = Column(String, unique=True, index=True)
    # Indexed category label; used below to group documents for the similarity metrics.
    category = Column(String, index=True)
    # OCR text stored for the Tesseract engine; embedded by the model comparison.
    ocr_text_tesseract = Column(Text)
    # OCR text stored for the EasyOCR engine; embedded by the model comparison.
    ocr_text_easyocr = Column(Text)
    # Summary text stored for the document.
    summary = Column(Text)

# Emit CREATE TABLE for every model registered on Base, using the engine above.
Base.metadata.create_all(bind=engine)

# FastAPI Dependency for Database Session
def get_db():
    """Generator that hands out one database session and always closes it.

    The session is created from ``SessionLocal``, yielded to the consumer, and
    closed when the generator is resumed, closed, or an error is thrown into it.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()

def compute_metrics(embeddings_tesseract, embeddings_easyocr, categories, model_name, file):
    """Compute similarity metrics for one embedding model and write them to ``file``.

    Three averages are written, each formatted to four decimals:

    1. Cosine similarity between the Tesseract and EasyOCR embedding of the
       same document.
    2. Intra-category similarity: cosine similarity between Tesseract
       embeddings of two different documents that share a category.
    3. Inter-category similarity: cosine similarity between Tesseract
       embeddings of documents from different categories.

    A metric with no contributing pairs is reported as 0.

    Args:
        embeddings_tesseract: One embedding vector per document.
        embeddings_easyocr: One embedding vector per document, in the same order.
        categories: One hashable category label per document, in the same order.
        model_name: Name written in the ``Model:`` header line.
        file: Object with a ``write`` method that receives the report lines.
    """
    # Metric 1: Average cosine similarity between Tesseract and EasyOCR embeddings per document
    paired_similarities = [
        cosine_similarity([emb_t], [emb_e])[0][0]
        for emb_t, emb_e in zip(embeddings_tesseract, embeddings_easyocr)
    ]
    avg_cosine_similarity = np.mean(paired_similarities) if paired_similarities else 0

    # Metric 2: Intra-category and inter-category cosine similarities
    avg_intra_similarity = 0
    avg_inter_similarity = 0
    if len(categories) > 0:
        # One call yields the full document-by-document similarity matrix
        # instead of one cosine_similarity call per pair of documents.
        similarity = cosine_similarity(np.asarray(embeddings_tesseract))

        # Map each label to an integer code so the categories compare as an array.
        codes = {label: code for code, label in enumerate(dict.fromkeys(categories))}
        labels = np.array([codes[label] for label in categories])

        same_category = labels[:, None] == labels[None, :]
        # Exclude the diagonal: a document is never compared with itself.
        not_self = ~np.eye(len(labels), dtype=bool)

        intra_similarities = similarity[same_category & not_self]
        inter_similarities = similarity[~same_category]

        if intra_similarities.size:
            avg_intra_similarity = intra_similarities.mean()
        if inter_similarities.size:
            avg_inter_similarity = inter_similarities.mean()

    # Write the results to file
    file.write(f"Model: {model_name}\n")
    file.write(f"Average Cosine Similarity between Tesseract and EasyOCR embeddings: {avg_cosine_similarity:.4f}\n")
    file.write(f"Average Intra-Category Similarity: {avg_intra_similarity:.4f}\n")
    file.write(f"Average Inter-Category Similarity: {avg_inter_similarity:.4f}\n\n")

def test_embedding_models(db: Session):
    """Compare several sentence-embedding models on the stored OCR texts.

    For each model, every stored document's Tesseract and EasyOCR text is
    embedded, the average embedding time per document is recorded, and
    ``compute_metrics`` appends its similarity metrics. Results are written to
    ``embedding_model_comparison.txt`` in the working directory, replacing any
    previous content.

    Args:
        db: Database session used to load all ``Document`` rows.
    """
    # Test different embedding models
    models = ["all-MiniLM-L6-v2", "all-mpnet-base-v2", "all-distilroberta-v1", "paraphrase-MiniLM-L12-v2"]

    # The document set does not depend on the model, so it is loaded once
    # rather than once per model.
    documents = db.query(Document).all()

    # Open the file for writing results
    with open("embedding_model_comparison.txt", "w") as file:
        for model_name in models:
            print(f"\nTesting model: {model_name}")
            file.write(f"Testing model: {model_name}\n")

            # Check for documents before loading the model, so an empty
            # database does not cost a model load per entry.
            if not documents:
                print("No documents found in the database. Please process and save documents first.")
                file.write("No documents found in the database. Please process and save documents first.\n")
                continue  # Skip to the next model

            embedding_model = SentenceTransformer(model_name)

            embeddings_tesseract = []
            embeddings_easyocr = []
            categories = []

            # Track embedding computation time with a monotonic clock, which is
            # unaffected by system clock adjustments.
            start_time = time.perf_counter()

            # Compute embeddings for all documents
            for doc in tqdm(documents, desc=f"Computing embeddings with {model_name}"):
                emb_tesseract = embedding_model.encode(doc.ocr_text_tesseract)
                emb_easyocr = embedding_model.encode(doc.ocr_text_easyocr)

                embeddings_tesseract.append(emb_tesseract)
                embeddings_easyocr.append(emb_easyocr)
                categories.append(doc.category)

            end_time = time.perf_counter()
            # Elapsed time covers both encodes of every document.
            avg_embedding_time = (end_time - start_time) / len(documents)
            file.write(f"Average Embedding Time per Document: {avg_embedding_time:.4f} seconds\n")

            # Calculate and store evaluation metrics
            compute_metrics(embeddings_tesseract, embeddings_easyocr, categories, model_name, file)

    print("Embedding model comparison results are saved in 'embedding_model_comparison.txt'")
def main():
    """Open a database session, run the embedding-model comparison, and close the session.

    The session is closed even when the comparison raises, so a failed run
    does not leak it.
    """
    db_session = SessionLocal()
    try:
        test_embedding_models(db_session)
    finally:
        db_session.close()

# Run the model comparison when this code executes as the main module.
if __name__ == "__main__":
    main()


  Base = declarative_base()



Testing model: all-MiniLM-L6-v2


Computing embeddings with all-MiniLM-L6-v2: 100%|██████████| 209/209 [00:58<00:00,  3.57it/s]



Testing model: all-mpnet-base-v2


Computing embeddings with all-mpnet-base-v2: 100%|██████████| 209/209 [07:57<00:00,  2.29s/it]



Testing model: all-distilroberta-v1


Computing embeddings with all-distilroberta-v1: 100%|██████████| 209/209 [04:24<00:00,  1.26s/it]



Testing model: paraphrase-MiniLM-L12-v2


Computing embeddings with paraphrase-MiniLM-L12-v2: 100%|██████████| 209/209 [01:42<00:00,  2.04it/s]


Embedding model comparison results are saved in 'embedding_model_comparison.txt'
