In [1]:
"""
# Document AI Integration for OCR Processing
## Implementing handwritten note processing and cover art analysis
"""

from google.cloud import documentai
import base64
from PIL import Image
import io
import sys
import os

# Setup paths
# The notebook's working directory is taken as the starting point and its
# parent directory is treated as the project root.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Put <project_root>/src at the front of the import search path. This must run
# before the `config` import below — presumably that package lives under src/
# (TODO confirm against the repository layout).
sys.path.insert(0, os.path.join(project_root, 'src'))

# Load environment variables from <project_root>/.env before any project
# configuration is imported.
from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, '.env'))

# Project configuration object; `get_client()` presumably returns a BigQuery
# client given the module name — verify in config/bigquery_config.
from config.bigquery_config import config
client = config.get_client()

# Banner for the cell output.
print("Document AI Integration for Vinyl Collection Processing")
print("=" * 60)

# Initialize Document AI client
def init_document_ai(processor_id=None, location="us"):
    """Create a Document AI client and build the processor resource name.

    Args:
        processor_id: ID of the Document AI processor to address. When omitted,
            the ``DOCUMENT_AI_PROCESSOR_ID`` environment variable is used; if
            that is unset too, the literal placeholder ``"PROCESSOR_ID"`` is
            kept and a note is printed, because such a name cannot be used for
            real requests.
        location: Document AI location of the processor (default ``"us"``).

    Returns:
        ``(doc_client, processor_name)`` on success, or ``(None, None)`` when
        the client cannot be created, in which case the notebook falls back to
        simulated OCR data.
    """
    try:
        project_id = config.project_id

        if processor_id is None:
            processor_id = os.environ.get("DOCUMENT_AI_PROCESSOR_ID")
        if not processor_id:
            processor_id = "PROCESSOR_ID"
            print("Note: no processor ID configured; using the 'PROCESSOR_ID' placeholder")

        if location == "us":
            # Default endpoint already serves the "us" location.
            doc_client = documentai.DocumentProcessorServiceClient()
        else:
            # Any other location has to be reached through its regional endpoint.
            doc_client = documentai.DocumentProcessorServiceClient(
                client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
            )

        processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

        print("Document AI client initialized")
        return doc_client, processor_name
    except Exception as e:
        # Deliberately broad: missing credentials or packages must not stop the
        # demo, so the failure is reported and the caller gets (None, None).
        print(f"Document AI setup: {e}")
        print("Simulating OCR functionality for demo")
        return None, None

doc_client, processor_name = init_document_ai()

# Simulate realistic handwritten collection notes
# Each entry stands in for one scanned image: the file name, the text an OCR
# pass would return for it, and the kind of note it represents.
simulated_handwritten_notes = [
    {
        'image_filename': 'handwritten_001.jpg',
        'simulated_ocr_text': 'Miles Davis - Kind of Blue\nColumbia Records 1959\nCondition: VG+\nPaid: $28\nNotes: Perfect late night album',
        'note_type': 'collection_entry'
    },
    {
        'image_filename': 'shopping_list_001.jpg',
        'simulated_ocr_text': 'Want to buy:\n- Art Blakey Moanin\n- Bill Evans Waltz for Debby\n- Hank Mobley Soul Station\nBudget: $150',
        'note_type': 'wishlist'
    },
    {
        'image_filename': 'listening_notes_001.jpg',
        'simulated_ocr_text': 'A Love Supreme sessions:\nListened 3x this week\nSpiritual intensity incredible\nTrack 2 = favorite\nCondition still mint',
        'note_type': 'listening_log'
    }
]

# Longest OCR excerpt (in characters) echoed per note.
OCR_PREVIEW_CHARS = 100


def format_ocr_preview(text, limit=OCR_PREVIEW_CHARS):
    """Return ``text`` cut to ``limit`` characters.

    The ``...`` marker is appended only when something was actually cut off,
    so short notes are shown verbatim instead of looking truncated.
    """
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


print("Simulated OCR Processing Results:")
for note in simulated_handwritten_notes:
    print(f"\nImage: {note['image_filename']}")
    print(f"Type: {note['note_type']}")
    print(f"OCR Text: {format_ocr_preview(note['simulated_ocr_text'])}")

Document AI Integration for Vinyl Collection Processing
Document AI client initialized
Simulated OCR Processing Results:

Image: handwritten_001.jpg
Type: collection_entry
OCR Text: Miles Davis - Kind of Blue
Columbia Records 1959
Condition: VG+
Paid: $28
Notes: Perfect late night ...

Image: shopping_list_001.jpg
Type: wishlist
OCR Text: Want to buy:
- Art Blakey Moanin
- Bill Evans Waltz for Debby
- Hank Mobley Soul Station
Budget: $15...

Image: listening_notes_001.jpg
Type: listening_log
OCR Text: A Love Supreme sessions:
Listened 3x this week
Spiritual intensity incredible
Track 2 = favorite
Con...


In [None]:
# Advanced metadata extraction using pattern recognition
import re
from typing import Dict, List, Optional

class AdvancedMetadataExtractor:
    """Pull record metadata out of free-form note text with regular expressions.

    Each ``*_patterns`` attribute is a list of regexes tried in order; the
    first one that matches supplies the field, taken from capture group 1.
    Every pattern is searched with ``re.IGNORECASE``. Patterns whose meaning
    depends on capitalisation opt out locally with a scoped ``(?-i:...)``
    group, otherwise any two adjacent words would qualify as a name.
    """

    # Four-digit year 1900-2029 that is neither part of a longer number nor a
    # "$" amount (so "$1959" is a price, not a pressing year).
    _YEAR_PATTERN = r'(?<![\d$])(19\d{2}|20[0-2]\d)(?!\d)'

    # Output fields that count towards the confidence score.
    _SCORED_FIELDS = ('artist', 'album', 'price', 'condition', 'label', 'year')

    def __init__(self):
        self.artist_patterns = [
            r'(Miles Davis|John Coltrane|Art Blakey|Bill Evans|Horace Silver|Lee Morgan|Hank Mobley)',
            r'(?-i:([A-Z][a-z]+ [A-Z][a-z]+))',  # Capitalised "First Last" fallback
        ]

        self.album_patterns = [
            r'(Kind of Blue|A Love Supreme|Moanin|Giant Steps|Blue Train)',
            r'(?-i:([A-Z][a-z]+ (?:of|for|in|with) [A-Z][a-z]+))',  # "Word of Word" fallback
        ]

        self.price_patterns = [
            r'\$(\d+(?:\.\d{2})?)',
            # Keyword forms keep the cents as well, e.g. "paid 28.50".
            r'\bpaid:?\s*\$?(\d+(?:\.\d{2})?)',
            r'\bcost:?\s*\$?(\d+(?:\.\d{2})?)'
        ]

        self.condition_patterns = [
            # An explicit "condition: X" label wins over incidental words such
            # as "sounds good" elsewhere in the note.
            r'condition:?\s*(mint|nm|vg\+?|g\+?|fair|poor)(?![A-Za-z])',
            # Spelled-out grades, bounded so "Goodman" or "fairly" do not match.
            r'\b(Mint|Near Mint|Very Good\+?|Good\+?|Fair|Poor)(?![A-Za-z])',
            # Standalone upper-case grade abbreviations such as "VG+".
            r'(?-i:(?<![A-Za-z0-9])(NM|VG\+?|G\+)(?![A-Za-z0-9+]))'
        ]

        self.label_patterns = [
            r'(Blue Note|Columbia|Atlantic|Impulse!|Verve|Prestige|Riverside)',
            r'(?-i:([A-Z][a-z]+ Records?))'
        ]

    @staticmethod
    def _first_match(patterns: List[str], text: str) -> Optional[str]:
        """Return group 1 of the first pattern that matches ``text``, else None."""
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1)
        return None

    def extract_from_text(self, text: str) -> Dict:
        """Extract structured metadata from unstructured text.

        Args:
            text: Free-form note text. ``None`` or an empty string yields the
                all-``None`` result with a confidence of 0.0.

        Returns:
            Dict with keys ``artist``, ``album``, ``price`` (float),
            ``condition`` (``+`` rewritten as ``_plus``), ``label``, ``year``
            (int) and ``confidence_score``, the fraction of those six fields
            that were found. Fields that were not found are ``None``.
        """
        extracted = {
            'artist': None,
            'album': None,
            'price': None,
            'condition': None,
            'label': None,
            'year': None,
            'confidence_score': 0.0
        }

        if not text:
            return extracted

        extracted['artist'] = self._first_match(self.artist_patterns, text)
        extracted['album'] = self._first_match(self.album_patterns, text)
        extracted['label'] = self._first_match(self.label_patterns, text)

        price_text = self._first_match(self.price_patterns, text)
        if price_text is not None:
            extracted['price'] = float(price_text)

        condition_text = self._first_match(self.condition_patterns, text)
        if condition_text is not None:
            extracted['condition'] = condition_text.replace('+', '_plus')

        year_match = re.search(self._YEAR_PATTERN, text)
        if year_match:
            extracted['year'] = int(year_match.group(1))

        found = sum(extracted[field] is not None for field in self._SCORED_FIELDS)
        extracted['confidence_score'] = found / len(self._SCORED_FIELDS)

        return extracted

# Run the extractor over a handful of sample notes and report what it finds.
extractor = AdvancedMetadataExtractor()

test_texts = [
    "Miles Davis - Kind of Blue Columbia 1959 mint condition bought for $35 incredible trumpet work",
    "A Love Supreme John Coltrane Impulse! 1965 VG+ $42 spiritual masterpiece",
    "Art Blakey Moanin Blue Note 1958 Near Mint condition $38 incredible drumming",
    "Need: Bill Evans Waltz for Debby Riverside Records budget $30-40 range",
]

print("\nADVANCED METADATA EXTRACTION")
print("=" * 50)

extraction_results = []
for position, text in enumerate(test_texts, start=1):
    print(f"\nText {position}: {text}")
    extracted = extractor.extract_from_text(text)
    extraction_results.append(extracted)

    # Only fields that were actually found are listed.
    print("Extracted:")
    found_fields = {name: val for name, val in extracted.items() if val is not None}
    for name, val in found_fields.items():
        print(f"  {name}: {val}")
    confidence_pct = extracted['confidence_score'] * 100
    print(f"Confidence: {confidence_pct:.1f}%")

# Mean per-text confidence across all samples.
confidence_total = sum(result['confidence_score'] for result in extraction_results)
avg_confidence = confidence_total / len(extraction_results)
print(f"\nOverall Extraction Accuracy: {avg_confidence*100:.1f}%")
print("Metadata extraction system operational")