In [1]:
import pandas as pd
import numpy as np
import sqlite3
import time
from pathlib import Path
import json
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from contextlib import contextmanager

# Configure logging once for the notebook session: INFO and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

@dataclass
class DatabaseConfig:
    """Settings used to open and tune the SQLite catalog database.

    ``db_path`` may be supplied as a ``Path`` or as a plain string; it is
    normalised to ``Path`` after construction so that path operations such
    as ``db_path.parent`` always work.
    """
    # Location of the SQLite database file.
    db_path: Path
    # NOTE(review): no code visible in this file reads this flag; the journal
    # mode actually applied comes from ``journal_mode`` below — confirm intent.
    enable_wal_mode: bool = True
    # When true, connections issue ``PRAGMA foreign_keys = ON``.
    enable_foreign_keys: bool = True
    # Page-cache budget in megabytes (used for ``PRAGMA cache_size``).
    cache_size_mb: int = 256
    # Value for ``PRAGMA temp_store``.
    temp_store: str = "MEMORY"
    # Value for ``PRAGMA synchronous``.
    synchronous: str = "NORMAL"
    # Value for ``PRAGMA journal_mode``.
    journal_mode: str = "WAL"

    def __post_init__(self):
        # Accept str / os.PathLike transparently; a no-op for Path inputs.
        self.db_path = Path(self.db_path)

class VinylCatalogDB:
    """SQLite access layer for the vinyl catalog.

    Owns the database file location, hands out tuned connections via
    :meth:`get_connection`, and knows how to create the schema and indexes.

    NOTE(review): ``config.enable_wal_mode`` is not consulted by this class;
    ``config.journal_mode`` is applied unconditionally on every connection.
    """

    def __init__(self, config: DatabaseConfig):
        self.config = config
        self.db_path = config.db_path
        self._ensure_db_directory()

    def _ensure_db_directory(self):
        """Create the directory that will hold the database file, if missing."""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

    @contextmanager
    def get_connection(self):
        """Yield a configured ``sqlite3`` connection and always close it.

        The connection has the configured PRAGMAs applied and uses
        ``sqlite3.Row`` as its row factory, so rows support access by column
        name. Nothing is committed automatically: callers that write must
        call ``conn.commit()`` themselves.

        Raises:
            sqlite3.Error: if connecting or applying a PRAGMA fails.
        """
        conn = sqlite3.connect(str(self.db_path))

        # Everything after connect() lives inside the try so that a failing
        # PRAGMA cannot leak the open connection.
        try:
            cfg = self.config

            # A negative cache_size is interpreted by SQLite as KiB, not pages.
            conn.execute(f"PRAGMA cache_size = -{cfg.cache_size_mb * 1024}")
            conn.execute(f"PRAGMA temp_store = {cfg.temp_store}")
            conn.execute(f"PRAGMA synchronous = {cfg.synchronous}")
            conn.execute(f"PRAGMA journal_mode = {cfg.journal_mode}")

            if cfg.enable_foreign_keys:
                conn.execute("PRAGMA foreign_keys = ON")

            # Enable row factory for dict-like access
            conn.row_factory = sqlite3.Row

            yield conn
        finally:
            conn.close()

    def create_schema(self):
        """Create every catalog table that does not exist yet.

        All statements use ``CREATE TABLE IF NOT EXISTS``, so calling this
        repeatedly is harmless.
        """
        with self.get_connection() as conn:
            # Main releases table with core data
            conn.execute("""
                CREATE TABLE IF NOT EXISTS releases (
                    release_id INTEGER PRIMARY KEY,
                    discogs_id INTEGER UNIQUE,
                    title TEXT NOT NULL,
                    artist TEXT NOT NULL,
                    album TEXT,
                    year INTEGER,
                    genre TEXT,
                    label TEXT,
                    country TEXT,
                    format TEXT,
                    status TEXT,
                    catalog_number TEXT,
                    master_id INTEGER,
                    
                    -- Quality and popularity metrics
                    popularity_score REAL DEFAULT 0.0,
                    rating REAL DEFAULT 0.0,
                    plays INTEGER DEFAULT 0,
                    favorites INTEGER DEFAULT 0,
                    
                    -- Metadata
                    duration INTEGER, -- in seconds
                    data_quality TEXT,
                    source TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Artists table for normalized artist data
            conn.execute("""
                CREATE TABLE IF NOT EXISTS artists (
                    artist_id INTEGER PRIMARY KEY,
                    discogs_artist_id INTEGER UNIQUE,
                    name TEXT NOT NULL,
                    real_name TEXT,
                    profile TEXT,
                    urls TEXT,
                    name_variations TEXT,
                    aliases TEXT,
                    quality_score INTEGER DEFAULT 0,
                    image_count INTEGER DEFAULT 0,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Labels table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS labels (
                    label_id INTEGER PRIMARY KEY,
                    discogs_label_id INTEGER UNIQUE,
                    name TEXT NOT NULL,
                    contact_info TEXT,
                    parent_label_id INTEGER,
                    profile TEXT,
                    data_quality TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (parent_label_id) REFERENCES labels(label_id)
                )
            """)

            # Masters table for release grouping
            conn.execute("""
                CREATE TABLE IF NOT EXISTS masters (
                    master_id INTEGER PRIMARY KEY,
                    discogs_master_id INTEGER UNIQUE,
                    title TEXT NOT NULL,
                    original_year INTEGER,
                    main_release_id INTEGER,
                    num_versions INTEGER DEFAULT 1,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Genres table for normalization
            conn.execute("""
                CREATE TABLE IF NOT EXISTS genres (
                    genre_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT UNIQUE NOT NULL,
                    parent_genre_id INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (parent_genre_id) REFERENCES genres(genre_id)
                )
            """)

            # Release-Artist junction table (many-to-many)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS release_artists (
                    release_id INTEGER,
                    artist_id INTEGER,
                    role TEXT DEFAULT 'artist',
                    join_phrase TEXT,
                    anv TEXT, -- Artist Name Variation
                    PRIMARY KEY (release_id, artist_id, role),
                    FOREIGN KEY (release_id) REFERENCES releases(release_id),
                    FOREIGN KEY (artist_id) REFERENCES artists(artist_id)
                )
            """)

            # Tracklist table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS tracklist (
                    track_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    release_id INTEGER NOT NULL,
                    position TEXT NOT NULL,
                    title TEXT NOT NULL,
                    duration INTEGER, -- in seconds
                    FOREIGN KEY (release_id) REFERENCES releases(release_id)
                )
            """)

            # Reviews table for future MARD integration
            conn.execute("""
                CREATE TABLE IF NOT EXISTS reviews (
                    review_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    release_id INTEGER NOT NULL,
                    reviewer TEXT,
                    rating REAL,
                    review_text TEXT,
                    sentiment_score REAL,
                    source TEXT,
                    review_date DATE,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (release_id) REFERENCES releases(release_id)
                )
            """)

            # User interactions for recommendation system
            conn.execute("""
                CREATE TABLE IF NOT EXISTS user_interactions (
                    interaction_id INTEGER PRIMARY KEY AUTOINCREMENT,
                    user_id TEXT NOT NULL,
                    release_id INTEGER NOT NULL,
                    interaction_type TEXT NOT NULL, -- 'play', 'favorite', 'skip', 'rate'
                    value REAL, -- rating value, play count, etc.
                    timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (release_id) REFERENCES releases(release_id)
                )
            """)

            conn.commit()
            logger.info("✅ Database schema created successfully")

    def create_indexes(self):
        """Create the lookup indexes used by the search queries.

        Each statement is attempted independently: a failure is logged as a
        warning and the remaining indexes are still created. The closing log
        line reports success only when every statement succeeded.
        """
        indexes = [
            # Primary search indexes
            "CREATE INDEX IF NOT EXISTS idx_releases_artist ON releases(artist)",
            "CREATE INDEX IF NOT EXISTS idx_releases_title ON releases(title)",
            "CREATE INDEX IF NOT EXISTS idx_releases_album ON releases(album)",
            "CREATE INDEX IF NOT EXISTS idx_releases_genre ON releases(genre)",
            "CREATE INDEX IF NOT EXISTS idx_releases_year ON releases(year)",
            "CREATE INDEX IF NOT EXISTS idx_releases_label ON releases(label)",

            # Composite indexes for common queries
            "CREATE INDEX IF NOT EXISTS idx_releases_artist_year ON releases(artist, year)",
            "CREATE INDEX IF NOT EXISTS idx_releases_genre_year ON releases(genre, year)",
            "CREATE INDEX IF NOT EXISTS idx_releases_popularity ON releases(popularity_score DESC)",
            "CREATE INDEX IF NOT EXISTS idx_releases_rating ON releases(rating DESC)",

            # Foreign key indexes
            "CREATE INDEX IF NOT EXISTS idx_release_artists_release ON release_artists(release_id)",
            "CREATE INDEX IF NOT EXISTS idx_release_artists_artist ON release_artists(artist_id)",
            "CREATE INDEX IF NOT EXISTS idx_tracklist_release ON tracklist(release_id)",
            "CREATE INDEX IF NOT EXISTS idx_reviews_release ON reviews(release_id)",
            "CREATE INDEX IF NOT EXISTS idx_user_interactions_user ON user_interactions(user_id)",
            "CREATE INDEX IF NOT EXISTS idx_user_interactions_release ON user_interactions(release_id)",

            # Search optimization indexes
            "CREATE INDEX IF NOT EXISTS idx_artists_name ON artists(name)",
            "CREATE INDEX IF NOT EXISTS idx_labels_name ON labels(name)",
            "CREATE INDEX IF NOT EXISTS idx_masters_title ON masters(title)",

            # Full-text search preparation
            "CREATE INDEX IF NOT EXISTS idx_releases_search_terms ON releases(artist, title, album, genre, label)",
        ]

        failures = 0
        with self.get_connection() as conn:
            for index_sql in indexes:
                try:
                    conn.execute(index_sql)
                except sqlite3.Error as e:
                    failures += 1
                    logger.warning(f"Failed to create index: {e}")
                else:
                    # Log the index name without its "idx_" prefix.
                    short_name = index_sql.split('idx_', 1)[1].split(' ', 1)[0]
                    logger.info(f"Created index: {short_name}")

            conn.commit()

        if failures:
            logger.warning(f"⚠️ {failures} of {len(indexes)} indexes could not be created")
        else:
            logger.info("✅ All indexes created successfully")

    def analyze_database(self):
        """Run ``ANALYZE`` so SQLite refreshes its query-planner statistics."""
        with self.get_connection() as conn:
            conn.execute("ANALYZE")
            conn.commit()
            logger.info("✅ Database statistics updated")

# Locate the project root: when the notebook runs from a "notebooks" folder
# the root is one level up, otherwise the working directory is the root.
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    PROJECT_ROOT = _cwd.parent
else:
    PROJECT_ROOT = _cwd

DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'
DB_DIR = DATA_DIR / 'database'

# Database settings: defaults everywhere except a larger 512 MB page cache.
db_config = DatabaseConfig(db_path=DB_DIR / 'vinyl_catalog.db', cache_size_mb=512)

vinyl_db = VinylCatalogDB(db_config)

# Report where everything lives.
print(f"🗄️ Database will be created at: {vinyl_db.db_path}")
print("📁 Project structure:")
for _caption, _location in (
    ("Data directory", DATA_DIR),
    ("Processed directory", PROCESSED_DIR),
    ("Database directory", DB_DIR),
):
    print(f"  - {_caption}: {_location}")

🗄️ Database will be created at: /Users/richpointofview/smart-vinyl-catalog/data/database/vinyl_catalog.db
📁 Project structure:
  - Data directory: /Users/richpointofview/smart-vinyl-catalog/data
  - Processed directory: /Users/richpointofview/smart-vinyl-catalog/data/processed
  - Database directory: /Users/richpointofview/smart-vinyl-catalog/data/database


In [2]:
class DataMigrator:
    """Load catalog rows from a CSV export into the ``releases`` table."""

    # One parameter per column, in the order produced by
    # ``_prepare_release_data``.
    _INSERT_RELEASE_SQL = """
                        INSERT OR REPLACE INTO releases (
                            discogs_id, title, artist, album, year, genre, label, 
                            country, format, status, catalog_number, master_id,
                            popularity_score, rating, plays, favorites, duration,
                            data_quality, source
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """

    def __init__(self, db: VinylCatalogDB):
        self.db = db

    def migrate_catalog_data(self, csv_path: Path, batch_size: int = 1000) -> int:
        """Stream a catalog CSV into the database in chunks.

        Args:
            csv_path: CSV file to import.
            batch_size: Number of CSV rows read (and committed) per chunk.

        Returns:
            Number of rows written. ``0`` when ``csv_path`` does not exist
            (an error is logged in that case).
        """
        logger.info(f"Starting migration from {csv_path}")

        if not csv_path.exists():
            logger.error(f"CSV file not found: {csv_path}")
            return 0

        total_migrated = 0

        # Read CSV in chunks for memory efficiency
        reader = pd.read_csv(csv_path, chunksize=batch_size, low_memory=False)
        for chunk_count, chunk in enumerate(reader, start=1):
            total_migrated += self._process_catalog_chunk(chunk, chunk_count)

            if chunk_count % 10 == 0:
                logger.info(f"Processed {chunk_count} chunks, migrated {total_migrated} records")

        logger.info(f"✅ Migration complete: {total_migrated} records migrated")
        return total_migrated

    def _process_catalog_chunk(self, chunk: pd.DataFrame, chunk_num: int) -> int:
        """Insert one chunk of rows and commit once at the end.

        Rows are written individually so that a bad row (for example one
        violating a NOT NULL constraint) is logged and skipped without
        aborting the rest of the chunk.

        Returns:
            Number of rows of this chunk that were written.
        """
        records_migrated = 0

        with self.db.get_connection() as conn:
            for _, row in chunk.iterrows():
                # Deliberately broad: migration is best-effort per row.
                try:
                    conn.execute(self._INSERT_RELEASE_SQL, self._prepare_release_data(row))
                except Exception as e:
                    logger.warning(f"Failed to migrate record in chunk {chunk_num}: {e}")
                    continue
                records_migrated += 1

            conn.commit()

        return records_migrated

    def _prepare_release_data(self, row: pd.Series) -> tuple:
        """Convert one CSV row into the 19-value tuple expected by the INSERT.

        Most fields are looked up under a preferred column name and then a
        fallback name. The fallback is used whenever the preferred column is
        absent *or* holds a missing value (NaN/None/blank string). Values
        that cannot be converted become ``None``.
        """
        def pick(*columns):
            # First usable value among the candidate columns. ``a or b``
            # cannot be used here: NaN is truthy, so it would mask ``b``.
            for column in columns:
                value = row.get(column)
                if value is None or pd.isna(value):
                    continue
                if isinstance(value, str) and not value.strip():
                    continue
                return value
            return None

        def safe_int(value):
            try:
                return int(float(value)) if pd.notna(value) else None
            except (ValueError, TypeError, OverflowError):
                # OverflowError: int(float('inf'))
                return None

        def safe_float(value):
            try:
                return float(value) if pd.notna(value) else None
            except (ValueError, TypeError):
                return None

        def safe_str(value, max_length=None):
            if pd.isna(value):
                return None
            str_val = str(value).strip()
            if max_length and len(str_val) > max_length:
                str_val = str_val[:max_length]
            return str_val if str_val else None

        return (
            safe_int(pick('discogs_id', 'releases_release_id')),
            safe_str(pick('title', 'releases_release_title'), 500),
            safe_str(pick('artist', 'artists_artist_name'), 200),
            safe_str(pick('album', 'title'), 500),
            safe_int(pick('year', 'releases_release_released')),
            safe_str(pick('genre', 'release_genres_genre'), 100),
            safe_str(pick('label', 'release_labels_label_name'), 200),
            safe_str(pick('country', 'releases_release_country'), 50),
            safe_str(pick('format', 'release_formats_format_name'), 100),
            safe_str(pick('status', 'releases_release_status'), 50),
            safe_str(pick('catalog_number', 'release_labels_label_catno'), 100),
            safe_int(pick('master_id', 'releases_release_master_id')),
            safe_float(row.get('popularity_score')),
            safe_float(row.get('rating')),
            safe_int(row.get('plays')),
            safe_int(row.get('favorites')),
            safe_int(row.get('duration')),
            safe_str(pick('data_quality', 'releases_release_data_quality'), 50),
            safe_str(row.get('source'), 100)
        )

# Make sure the tables and indexes exist before any rows are written.
vinyl_db.create_schema()
vinyl_db.create_indexes()

migrator = DataMigrator(vinyl_db)

# Candidate catalog exports in order of preference; the first one that exists
# on disk is the one that gets migrated.
catalog_files = [
    PROCESSED_DIR / file_name
    for file_name in (
        'final_combined_catalog_real_data.csv',
        'final_enriched_catalog.csv',
        'filtered_catalog.csv',
    )
]
catalog_path = next((candidate for candidate in catalog_files if candidate.exists()), None)

if catalog_path is None:
    print("❌ No catalog file found. Please run the previous notebooks first.")
else:
    print(f"📂 Found catalog: {catalog_path.name}")
    migrated_count = migrator.migrate_catalog_data(catalog_path)
    print(f"✅ Migrated {migrated_count:,} releases to database")

2025-09-03 20:41:16,842 - INFO - ✅ Database schema created successfully
2025-09-03 20:41:16,846 - INFO - Created index: releases_artist
2025-09-03 20:41:16,847 - INFO - Created index: releases_title
2025-09-03 20:41:16,848 - INFO - Created index: releases_album
2025-09-03 20:41:16,850 - INFO - Created index: releases_genre
2025-09-03 20:41:16,851 - INFO - Created index: releases_year
2025-09-03 20:41:16,853 - INFO - Created index: releases_label
2025-09-03 20:41:16,854 - INFO - Created index: releases_artist_year
2025-09-03 20:41:16,855 - INFO - Created index: releases_genre_year
2025-09-03 20:41:16,856 - INFO - Created index: releases_popularity
2025-09-03 20:41:16,857 - INFO - Created index: releases_rating
2025-09-03 20:41:16,859 - INFO - Created index: release_artists_release
2025-09-03 20:41:16,860 - INFO - Created index: release_artists_artist
2025-09-03 20:41:16,861 - INFO - Created index: tracklist_release
2025-09-03 20:41:16,862 - INFO - Created index: reviews_release
2025-09-

📂 Found catalog: final_combined_catalog_real_data.csv


2025-09-03 20:41:19,349 - INFO - Processed 10 chunks, migrated 10000 records
2025-09-03 20:41:22,101 - INFO - Processed 20 chunks, migrated 20000 records
2025-09-03 20:41:24,295 - INFO - Processed 30 chunks, migrated 30000 records
2025-09-03 20:41:26,838 - INFO - Processed 40 chunks, migrated 40000 records
2025-09-03 20:41:29,125 - INFO - Processed 50 chunks, migrated 50000 records
2025-09-03 20:41:30,306 - INFO - ✅ Migration complete: 55000 records migrated


✅ Migrated 55,000 releases to database


In [3]:
class CatalogSearch:
    """Query helpers over the ``releases`` table of the vinyl catalog."""

    # Columns of ``releases`` that may appear in ORDER BY. Sort arguments are
    # checked against this set because identifiers cannot be bound as SQL
    # parameters and would otherwise be spliced into the statement verbatim.
    _SORTABLE_COLUMNS = frozenset({
        'release_id', 'discogs_id', 'title', 'artist', 'album', 'year',
        'genre', 'label', 'country', 'format', 'status', 'catalog_number',
        'master_id', 'popularity_score', 'rating', 'plays', 'favorites',
        'duration', 'data_quality', 'source', 'created_at', 'updated_at',
    })
    _SORT_DIRECTIONS = frozenset({'ASC', 'DESC'})

    def __init__(self, db: VinylCatalogDB):
        self.db = db

    def search_releases(
        self,
        query: Optional[str] = None,
        artist: Optional[str] = None,
        genre: Optional[str] = None,
        year_range: Optional[tuple] = None,
        min_rating: Optional[float] = None,
        min_popularity: Optional[float] = None,
        label: Optional[str] = None,
        format_type: Optional[str] = None,
        limit: int = 100,
        offset: int = 0,
        sort_by: str = 'popularity_score',
        sort_order: str = 'DESC'
    ) -> List[Dict]:
        """
        Advanced search with multiple filters and sorting options

        Text filters are case-insensitive substring matches (SQL ``LIKE``)
        and are skipped when empty. Numeric filters are applied whenever
        they are not ``None``, so a threshold of ``0`` is honoured.

        Args:
            query: Free-text search across artist, title, album and label
            artist: Filter by artist name (partial match)
            genre: Filter by genre (partial match)
            year_range: Tuple of (min_year, max_year); either may be None
            min_rating: Minimum rating threshold
            min_popularity: Minimum popularity score
            label: Filter by record label (partial match)
            format_type: Filter by format (vinyl, CD, etc.)
            limit: Maximum results to return
            offset: Results offset for pagination
            sort_by: Column of ``releases`` to sort by
            sort_order: ASC or DESC (case-insensitive)

        Returns:
            One dict per matching release.

        Raises:
            ValueError: if ``sort_by`` is not a known column or
                ``sort_order`` is not ASC/DESC.
        """
        # Validate the two values that end up inside the SQL text itself.
        sort_column = str(sort_by).strip().lower()
        if sort_column not in self._SORTABLE_COLUMNS:
            raise ValueError(f"Unsupported sort_by column: {sort_by!r}")
        sort_direction = str(sort_order).strip().upper()
        if sort_direction not in self._SORT_DIRECTIONS:
            raise ValueError(f"Unsupported sort_order: {sort_order!r}")

        where_clauses = []
        params = []

        # Free-text search
        if query:
            where_clauses.append(
                "(artist LIKE ? OR title LIKE ? OR album LIKE ? OR label LIKE ?)"
            )
            params.extend([f"%{query}%"] * 4)

        # Substring filters on individual columns
        for column, needle in (
            ('artist', artist),
            ('genre', genre),
            ('label', label),
            ('format', format_type),
        ):
            if needle:
                where_clauses.append(f"{column} LIKE ?")
                params.append(f"%{needle}%")

        # Numeric bounds: compare against None so that 0 is a valid bound.
        if year_range:
            min_year, max_year = year_range
            if min_year is not None:
                where_clauses.append("year >= ?")
                params.append(min_year)
            if max_year is not None:
                where_clauses.append("year <= ?")
                params.append(max_year)

        if min_rating is not None:
            where_clauses.append("rating >= ?")
            params.append(min_rating)

        if min_popularity is not None:
            where_clauses.append("popularity_score >= ?")
            params.append(min_popularity)

        # Build query
        base_query = """
            SELECT 
                release_id, discogs_id, title, artist, album, year, genre, 
                label, country, format, popularity_score, rating, plays, 
                favorites, duration, created_at
            FROM releases
        """

        if where_clauses:
            base_query += " WHERE " + " AND ".join(where_clauses)

        base_query += f" ORDER BY {sort_column} {sort_direction} LIMIT ? OFFSET ?"
        params.extend([limit, offset])

        with self.db.get_connection() as conn:
            cursor = conn.execute(base_query, params)
            results = [dict(row) for row in cursor.fetchall()]

        return results

    def get_release_by_id(self, release_id: int) -> Optional[Dict]:
        """Return the full row for ``release_id`` as a dict, or None if absent."""
        with self.db.get_connection() as conn:
            cursor = conn.execute("""
                SELECT * FROM releases WHERE release_id = ?
            """, (release_id,))

            result = cursor.fetchone()
            return dict(result) if result else None

    def get_popular_releases(self, limit: int = 50) -> List[Dict]:
        """Return up to ``limit`` releases ordered by popularity, highest first."""
        return self.search_releases(
            limit=limit,
            sort_by='popularity_score',
            sort_order='DESC'
        )

    def get_top_rated_releases(self, limit: int = 50) -> List[Dict]:
        """Return up to ``limit`` releases ordered by rating, highest first."""
        return self.search_releases(
            limit=limit,
            sort_by='rating',
            sort_order='DESC'
        )

    def get_releases_by_artist(self, artist: str, limit: int = 50) -> List[Dict]:
        """Return up to ``limit`` releases whose artist contains ``artist``, oldest first."""
        return self.search_releases(
            artist=artist,
            limit=limit,
            sort_by='year',
            sort_order='ASC'
        )

    def get_genre_statistics(self) -> List[Dict]:
        """Aggregate counts, average scores and year span per genre.

        Only genres with at least 10 releases are returned, largest first.
        """
        with self.db.get_connection() as conn:
            cursor = conn.execute("""
                SELECT 
                    genre,
                    COUNT(*) as release_count,
                    AVG(rating) as avg_rating,
                    AVG(popularity_score) as avg_popularity,
                    MIN(year) as earliest_year,
                    MAX(year) as latest_year
                FROM releases 
                WHERE genre IS NOT NULL
                GROUP BY genre
                HAVING COUNT(*) >= 10
                ORDER BY release_count DESC
            """)

            return [dict(row) for row in cursor.fetchall()]

    def get_label_statistics(self) -> List[Dict]:
        """Aggregate counts and average scores per label.

        Only labels with at least 5 releases are considered; the 50 largest
        are returned.
        """
        with self.db.get_connection() as conn:
            cursor = conn.execute("""
                SELECT 
                    label,
                    COUNT(*) as release_count,
                    AVG(rating) as avg_rating,
                    AVG(popularity_score) as avg_popularity
                FROM releases 
                WHERE label IS NOT NULL
                GROUP BY label
                HAVING COUNT(*) >= 5
                ORDER BY release_count DESC
                LIMIT 50
            """)

            return [dict(row) for row in cursor.fetchall()]

# Search facade bound to the catalog database created earlier.
search = CatalogSearch(vinyl_db)

print("🔍 High-performance search interface initialized!")
print("\nAvailable search methods:")
# One line per public helper: method name followed by a short description.
for _method, _summary in (
    ("search_releases", "Advanced multi-filter search"),
    ("get_popular_releases", "Top releases by popularity"),
    ("get_top_rated_releases", "Highest rated releases"),
    ("get_releases_by_artist", "All releases by artist"),
    ("get_genre_statistics", "Genre breakdown and stats"),
    ("get_label_statistics", "Label breakdown and stats"),
):
    print(f"  - {_method}(): {_summary}")

🔍 High-performance search interface initialized!

Available search methods:
  - search_releases(): Advanced multi-filter search
  - get_popular_releases(): Top releases by popularity
  - get_top_rated_releases(): Highest rated releases
  - get_releases_by_artist(): All releases by artist
  - get_genre_statistics(): Genre breakdown and stats
  - get_label_statistics(): Label breakdown and stats


In [4]:
class PerformanceTester:
    """Time representative catalog queries and report database size."""

    def __init__(self, search: CatalogSearch):
        self.search = search

    @staticmethod
    def _elapsed(operation) -> float:
        """Run ``operation()`` once and return how long it took, in seconds.

        Uses ``time.perf_counter`` (monotonic, high resolution) rather than
        the wall clock, which can jump and is too coarse for short queries.
        """
        started = time.perf_counter()
        operation()
        return time.perf_counter() - started

    def benchmark_search_operations(self) -> Dict[str, float]:
        """Time five typical queries, each executed once.

        Returns:
            Mapping of benchmark name to elapsed seconds, with the keys
            ``text_search``, ``filtered_search``, ``popular_releases``,
            ``genre_stats`` and ``large_result_set`` (in that order).
        """
        search = self.search
        timed = self._elapsed

        return {
            # Test 1: Simple text search
            'text_search': timed(
                lambda: search.search_releases(query="Beatles", limit=100)
            ),
            # Test 2: Filtered search with multiple criteria
            'filtered_search': timed(
                lambda: search.search_releases(
                    genre="Rock",
                    year_range=(1970, 1980),
                    min_rating=4.0,
                    limit=100
                )
            ),
            # Test 3: Popular releases (sorted)
            'popular_releases': timed(
                lambda: search.get_popular_releases(limit=100)
            ),
            # Test 4: Genre statistics aggregation
            'genre_stats': timed(search.get_genre_statistics),
            # Test 5: Large result set
            'large_result_set': timed(
                lambda: search.search_releases(limit=1000)
            ),
        }

    def test_database_size(self) -> Dict[str, Any]:
        """Report row counts for the main tables and the database file size.

        A table that cannot be counted (for example because it does not
        exist) is reported with a count of 0.

        Returns:
            Dict with ``db_size_mb`` (size of the database file in MB, 0 if
            the file is missing), ``table_counts`` (table name -> row count)
            and ``total_releases`` (row count of ``releases``).
        """
        db = self.search.db

        with db.get_connection() as conn:
            # Table names come from this fixed list, never from user input,
            # so building the statement with an f-string is safe here.
            tables = ['releases', 'artists', 'labels', 'masters', 'tracklist', 'reviews']
            table_info = {}

            for table in tables:
                try:
                    cursor = conn.execute(f"SELECT COUNT(*) FROM {table}")
                    table_info[table] = cursor.fetchone()[0]
                except sqlite3.Error:
                    table_info[table] = 0

            # Database file size
            db_size = db.db_path.stat().st_size if db.db_path.exists() else 0

            return {
                'db_size_mb': db_size / (1024 * 1024),
                'table_counts': table_info,
                'total_releases': table_info.get('releases', 0)
            }

# Benchmark the search layer and collect size information about the database.
tester = PerformanceTester(search)

print("🚀 Running performance benchmarks...")
benchmarks = tester.benchmark_search_operations()
db_info = tester.test_database_size()

# Timings, one line per benchmark, in seconds with millisecond precision.
print("\n📊 Performance Results:")
for _caption, _key in (
    ("Text search", 'text_search'),
    ("Filtered search", 'filtered_search'),
    ("Popular releases", 'popular_releases'),
    ("Genre statistics", 'genre_stats'),
    ("Large result set", 'large_result_set'),
):
    print(f"  {_caption}: {benchmarks[_key]:.3f}s")

# Size summary; tables without rows are left out of the listing.
print("\n📈 Database Information:")
print(f"  Database size: {db_info['db_size_mb']:.1f} MB")
print(f"  Total releases: {db_info['total_releases']:,}")
for table, count in db_info['table_counts'].items():
    if count <= 0:
        continue
    print(f"  {table}: {count:,} records")

# Refresh SQLite's planner statistics now that the data is loaded.
vinyl_db.analyze_database()

print("\n✅ Performance optimization complete!")
print("🎯 Your vinyl catalog is now running on a high-performance database backend!")

🚀 Running performance benchmarks...

📊 Performance Results:
  Text search: 0.174s
  Filtered search: 0.009s
  Popular releases: 0.002s
  Genre statistics: 0.076s
  Large result set: 0.010s

📈 Database Information:
  Database size: 21.6 MB
  Total releases: 55,000
  releases: 55,000 records


2025-09-03 20:42:42,758 - INFO - ✅ Database statistics updated



✅ Performance optimization complete!
🎯 Your vinyl catalog is now running on a high-performance database backend!


In [None]:
def _fmt_score(value, digits=2):
    """Format a numeric score with ``digits`` decimals, or 'n/a' for NULL.

    Scores read back from the database can be ``None`` (rows migrated
    without a rating/popularity value, or an AVG over only NULLs); applying
    a float format spec to ``None`` would raise ``TypeError``.
    """
    return "n/a" if value is None else f"{value:.{digits}f}"


print("\n" + "="*80)
print("🎵 SMART VINYL CATALOG - PERFORMANCE OPTIMIZED")
print("="*80)

# Example searches to demonstrate capabilities
print("\n🔍 Example Searches:")

# 1. Search for Beatles releases
print("\n1. Beatles releases:")
beatles = search.search_releases(query="Beatles", limit=5)
for release in beatles:
    print(f"  • {release['artist']} - {release['title']} ({release['year']})")

# 2. High-rated rock from the 70s
print("\n2. High-rated 70s Rock:")
rock_70s = search.search_releases(
    genre="Rock", 
    year_range=(1970, 1979), 
    min_rating=4.0, 
    limit=5
)
for release in rock_70s:
    print(f"  • {release['artist']} - {release['album']} ({release['year']}) - ⭐{release['rating']}")

# 3. Most popular releases overall
print("\n3. Most Popular Releases:")
popular = search.get_popular_releases(limit=5)
for release in popular:
    print(f"  • {release['artist']} - {release['title']} - 🔥{_fmt_score(release['popularity_score'])}")

# 4. Genre breakdown
print("\n4. Top Genres by Count:")
genres = search.get_genre_statistics()[:5]
for genre in genres:
    print(f"  • {genre['genre']}: {genre['release_count']} releases (avg rating: {_fmt_score(genre['avg_rating'])})")

print("\n💾 Database Performance Summary:")
print(f"  • {db_info['total_releases']:,} releases indexed and searchable")
print(f"  • Average search time: {np.mean(list(benchmarks.values())):.3f} seconds")
print(f"  • Database size: {db_info['db_size_mb']:.1f} MB")
print("  • Ready for advanced features like recommendations and visual discovery!")

print("\n🚀 Next Steps Available:")
print("  • Add full-text search capabilities")
print("  • Implement caching layer for dashboard")
print("  • Build REST API endpoints")
print("  • Add recommendation engine")
print("  • Integrate album cover computer vision")