In [1]:
"""
# Sprint 4: Advanced Data Processing
## Missing PRD Features Implementation

Implementing:
- Document AI integration for OCR processing
- Advanced collection gap analysis 
- Market trend forecasting
- Multi-modal data fusion
- Metadata extraction from unstructured notes
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import sys
import os

# Setup paths
# The parent of the current working directory is treated as the project root,
# so this cell only resolves correctly when the kernel's cwd is a direct
# child of the project root.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Put <project_root>/src at the front of sys.path; this must happen before the
# `config.bigquery_config` import below.
sys.path.insert(0, os.path.join(project_root, 'src'))

from dotenv import load_dotenv
# Load environment variables from <project_root>/.env before the client is created.
# NOTE(review): presumably the BigQuery config reads credentials/project from
# these variables - confirm against config.bigquery_config.
load_dotenv(os.path.join(project_root, '.env'))

from google.cloud import bigquery
from config.bigquery_config import config

# Shared client used by every query in this notebook (module-level global).
client = config.get_client()

# Banner for the notebook's stdout.
print("Advanced Data Processing - Missing PRD Features")
print("=" * 60)

# Smoke-test BigQuery access with a rule-based (non-AI) mood categorization query
def test_ai_functions_with_connection():
    """Run a small rule-based mood categorization query as a connectivity check.

    The query maps ``genre`` to a fixed ``mood_category`` with a SQL ``CASE``
    expression and fetches five rows; it does not call any AI function and
    does not use a connection id.

    Returns:
        pandas.DataFrame: the query result (``title``, ``genre``,
        ``mood_category``), or an empty DataFrame when anything in the
        query path raises. Failures are printed, never re-raised.
    """

    # Plain SQL only - no connection_id involved at this stage.
    mood_sql = """
    SELECT 
        title,
        genre,
        CASE 
            WHEN genre = 'Jazz' THEN 'Contemplative'
            WHEN genre = 'Rock' THEN 'Energetic' 
            WHEN genre = 'Electronic' THEN 'Experimental'
            ELSE 'Mixed'
        END as mood_category
    FROM `vinyl_catalog.discogs_releases`
    LIMIT 5
    """

    try:
        preview = client.query(mood_sql).to_dataframe()
        print("Traditional categorization working")
    except Exception as err:
        # Best-effort check: report the problem and fall back to an empty frame
        # so the calling cell can keep running.
        print(f"Basic query test: {err}")
        preview = pd.DataFrame()

    return preview

# Execute the smoke test; show the preview table only when rows came back.
mood_test = test_ai_functions_with_connection()
if mood_test.shape[0] > 0:
    print("Mood Categorization Preview:", mood_test.to_string(index=False), sep="\n")

Advanced Data Processing - Missing PRD Features
Traditional categorization working
Mood Categorization Preview:
          title      genre mood_category
 Sample Album 3 Electronic  Experimental
 Sample Album 9 Electronic  Experimental
Sample Album 15 Electronic  Experimental
Sample Album 21 Electronic  Experimental
Sample Album 27 Electronic  Experimental


In [None]:
# Advanced collection gap analysis using scaled data
class AdvancedGapAnalyzer:
    """Report genre, era, and label gaps in the vinyl collection.

    Each ``analyze_*`` method runs one aggregate query through ``self.client``
    (anything exposing ``query(sql).to_dataframe()``) and compares the result
    with a hard-coded reference list held in the class constants below.

    NOTE(review): every query uses an inner JOIN to ``album_reviews``, so
    releases without a review row are invisible to the analysis and may be
    reported as gaps - confirm this is intended.
    """

    # A genre with fewer distinct albums than this is reported as underrepresented.
    MIN_ALBUMS_PER_GENRE = 5

    # First and last decade (inclusive, as the decade's starting year) that the
    # era analysis expects the collection to cover.
    FIRST_DECADE = 1950
    LAST_DECADE = 2020

    # Reference genre taxonomy. Only the top-level keys are checked; the
    # subgenre lists are kept for reference and are not used by any query.
    GENRE_TAXONOMY = {
        'Jazz': ['Bebop', 'Cool Jazz', 'Hard Bop', 'Free Jazz', 'Fusion', 'Swing'],
        'Rock': ['Classic Rock', 'Progressive Rock', 'Alternative Rock', 'Punk Rock'],
        'Electronic': ['Ambient', 'Techno', 'House', 'IDM', 'Drum & Bass'],
        'Classical': ['Baroque', 'Romantic', 'Modern Classical', 'Chamber Music'],
        'Folk': ['Traditional Folk', 'Contemporary Folk', 'World Music'],
        'Soul': ['Motown', 'Neo-Soul', 'Classic Soul', 'R&B']
    }

    # Labels the collection is expected to contain, grouped by genre.
    IMPORTANT_LABELS = {
        'Jazz': ['Blue Note', 'Prestige', 'Riverside', 'Impulse!', 'Verve', 'ECM'],
        'Rock': ['Atlantic', 'Columbia', 'Capitol', 'Elektra', 'Warner Bros'],
        'Electronic': ['Warp', 'Ninja Tune', 'R&S', 'Planet Mu'],
        'Classical': ['Deutsche Grammophon', 'Decca', 'RCA Red Seal']
    }

    def __init__(self, client):
        """Store the query client.

        Args:
            client: object whose ``query(sql)`` returns something with a
                ``to_dataframe()`` method (the notebook passes its BigQuery client).
        """
        self.client = client

    def analyze_genre_gaps(self):
        """Identify top-level genres that are missing or underrepresented.

        Returns:
            tuple: ``(current_genres, gaps)`` where ``current_genres`` is the
            per-genre summary DataFrame and ``gaps`` is a list of message
            strings, ordered like ``GENRE_TAXONOMY``.
        """
        # COUNT(DISTINCT release_id) instead of COUNT(*): the JOIN yields one row
        # per review, so COUNT(*) would count reviews and overstate album totals.
        current_genres_query = """
        SELECT
            genre,
            COUNT(DISTINCT dr.release_id) as album_count,
            AVG(rating) as avg_rating,
            MIN(year) as earliest_year,
            MAX(year) as latest_year
        FROM `vinyl_catalog.discogs_releases` dr
        JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
        GROUP BY genre
        ORDER BY avg_rating DESC
        """

        current_genres = self.client.query(current_genres_query).to_dataframe()

        # Build the genre -> count lookup once; setdefault keeps the first row
        # should a genre ever appear twice.
        album_counts = {}
        for genre, count in zip(current_genres['genre'], current_genres['album_count']):
            album_counts.setdefault(genre, count)

        gaps = []
        for main_genre in self.GENRE_TAXONOMY:
            if main_genre not in album_counts:
                gaps.append(f"Missing entire {main_genre} category")
            elif album_counts[main_genre] < self.MIN_ALBUMS_PER_GENRE:
                gaps.append(
                    f"Underrepresented: {main_genre} ({album_counts[main_genre]} albums)"
                )

        return current_genres, gaps

    def analyze_era_gaps(self):
        """Identify decades between FIRST_DECADE and LAST_DECADE with no albums.

        Returns:
            tuple: ``(era_data, missing_decades)`` where ``era_data`` is the
            per-decade summary DataFrame and ``missing_decades`` is a set of
            ints (decade starting years) absent from it.
        """
        first_year = int(self.FIRST_DECADE)
        # Include the whole final decade (e.g. 2020-2029); otherwise albums
        # released after its first year could never count as covering it.
        last_year = int(self.LAST_DECADE) + 9

        era_query = f"""
        SELECT
            FLOOR(year/10)*10 as decade,
            COUNT(DISTINCT dr.release_id) as album_count,
            AVG(rating) as avg_rating,
            STRING_AGG(DISTINCT genre, ', ') as genres
        FROM `vinyl_catalog.discogs_releases` dr
        JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
        WHERE year BETWEEN {first_year} AND {last_year}
        GROUP BY decade
        ORDER BY decade
        """

        era_data = self.client.query(era_query).to_dataframe()

        # FLOOR(...) produces a floating-point column, so normalise to int
        # before comparing with the integer decades we expect.
        expected_decades = set(range(first_year, int(self.LAST_DECADE) + 10, 10))
        covered_decades = {int(decade) for decade in era_data['decade'].dropna()}
        missing_decades = expected_decades - covered_decades

        return era_data, missing_decades

    def analyze_label_gaps(self):
        """Identify reference labels that do not appear in the collection.

        Matching is an exact, case-sensitive string comparison on ``label``.

        Returns:
            tuple: ``(current_labels, label_gaps)`` where ``current_labels`` is
            the per-label summary DataFrame and ``label_gaps`` is a list of
            message strings, ordered like ``IMPORTANT_LABELS``.
        """
        current_labels_query = """
        SELECT
            label,
            COUNT(DISTINCT dr.release_id) as releases,
            AVG(rating) as avg_rating,
            STRING_AGG(DISTINCT genre, ', ') as genres_covered
        FROM `vinyl_catalog.discogs_releases` dr
        JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
        GROUP BY label
        ORDER BY releases DESC, avg_rating DESC
        """

        current_labels = self.client.query(current_labels_query).to_dataframe()

        # One set build instead of scanning the column for every reference label.
        owned_labels = set(current_labels['label'])

        label_gaps = [
            f"Missing {genre} label: {label}"
            for genre, labels in self.IMPORTANT_LABELS.items()
            for label in labels
            if label not in owned_labels
        ]

        return current_labels, label_gaps

# Run advanced gap analysis
# Report header
print("ADVANCED COLLECTION GAP ANALYSIS", "=" * 50, sep="\n")

gap_analyzer = AdvancedGapAnalyzer(client)

# --- Genres: summary table, then at most five gap messages ---
current_genres, genre_gaps = gap_analyzer.analyze_genre_gaps()
print("Current Genre Distribution:", current_genres.to_string(index=False), sep="\n")

print("\nGenre Gaps Identified:")
for gap in genre_gaps[:5]:
    print(f"  • {gap}")

# --- Eras: per-decade table, plus the uncovered decades when there are any ---
era_data, missing_decades = gap_analyzer.analyze_era_gaps()
print("\nEra Analysis:", era_data.to_string(index=False), sep="\n")

if missing_decades:
    print(f"\nMissing Decades: {sorted(missing_decades)}")

# --- Labels: first eight rows of the table, then at most eight gap messages ---
current_labels, label_gaps = gap_analyzer.analyze_label_gaps()
print("\nTop Labels in Collection:", current_labels.head(8).to_string(index=False), sep="\n")

print("\nImportant Missing Labels:")
for gap in label_gaps[:8]:
    print(f"  • {gap}")