In [1]:
"""
# Sprint 3: Data Scaling & Real Dataset Integration
## Smart Vinyl Catalog - Production Scale Implementation

Moving from 5-album demo to production-scale catalog with real data sources.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import os
import sys

# Setup paths and imports
# NOTE(review): project_root is derived as the parent of the kernel's working
# directory, so this assumes the notebook is launched from a folder one level
# below the project root — confirm before running from elsewhere.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Put <project_root>/src at the front of sys.path; presumably this is where the
# `config` package imported below lives.
sys.path.insert(0, os.path.join(project_root, 'src'))

from dotenv import load_dotenv
# Load variables from <project_root>/.env; this runs before the config import below.
load_dotenv(os.path.join(project_root, '.env'))

from google.cloud import bigquery
from config.bigquery_config import config

# Shared client object; later cells call .project, .query() and
# .load_table_from_dataframe() on it.
client = config.get_client()

print("Sprint 3: Scaling Up to Production Data")
print("=" * 50)
print("Target: 100+ albums, realistic review data, advanced AI processing")

Sprint 3: Scaling Up to Production Data
Target: 100+ albums, realistic review data, advanced AI processing


In [2]:
# Download and process real Discogs data
import pandas as pd
import requests
import zipfile
import os

print("SCALING UP: Real Data Integration")
print("=" * 50)

# Where the Kaggle archive is unpacked, relative to the notebook's working directory.
RAW_DATA_DIR = '../data/raw/'
# Columns that later cells of this notebook read from `releases_df`
# (review generation and the genre/label queries).
REQUIRED_RELEASE_COLUMNS = ('release_id', 'title', 'artist', 'year', 'genre', 'label')

# Value pools cycled through by the synthetic fallback catalog.
genres = ['Jazz', 'Rock', 'Electronic', 'Folk', 'Soul', 'Funk']
labels = ['Blue Note', 'Columbia', 'Atlantic', 'Verve', 'Impulse!', 'ECM']


def build_sample_records(n_albums=100):
    """Return `n_albums` synthetic release records as a list of dicts.

    Record `i` gets release_id ``str(200000 + i)``, title ``Sample Album {i+1}``,
    one of 20 artist names, a year in 1950-2019 (``1950 + i % 70``), and a genre
    and label taken round-robin from `genres` / `labels`. Country is always 'US'.

    Args:
        n_albums: Number of records to generate; 0 yields an empty list.

    Returns:
        list[dict]: One dict per album, suitable for ``pd.DataFrame(...)``.
    """
    return [
        {
            'release_id': f'{200000 + i}',
            'title': f'Sample Album {i+1}',
            'artist': f'Artist {i%20}',  # 20 different artists
            'year': 1950 + (i % 70),  # Span 1950-2019
            'genre': genres[i % len(genres)],
            'label': labels[i % len(labels)],
            'country': 'US'
        }
        for i in range(n_albums)
    ]


# Download Discogs dataset from Kaggle
# Note: You'll need Kaggle API credentials
try:
    import kaggle

    # Download Discogs data
    kaggle.api.dataset_download_files(
        'ofurkancoban/discogs-data-dumps-april-2025',
        path=RAW_DATA_DIR,
        unzip=True
    )
    print("✅ Downloaded Discogs dataset")

    # Process releases data
    releases_df = pd.read_csv(
        os.path.join(RAW_DATA_DIR, 'discogs_releases.csv'),
        nrows=10000,  # Sample first 10k
    )

    # Fail here, with a clear message, instead of with a KeyError in a later cell
    # when the downloaded CSV does not carry the columns this notebook relies on.
    missing_columns = [c for c in REQUIRED_RELEASE_COLUMNS if c not in releases_df.columns]
    if missing_columns:
        raise ValueError(f"Discogs CSV is missing columns needed downstream: {missing_columns}")

    print(f"Loaded {len(releases_df)} releases from Discogs")

except Exception as e:
    # Deliberate best-effort: any failure on the real-data path (missing package,
    # missing credentials, download error, unreadable or mismatched CSV) falls
    # back to a synthetic catalog so the rest of the notebook can still run.
    print(f"Kaggle download failed: {e}")
    print("Creating larger sample dataset instead...")

    # Create expanded sample data for demonstration
    expanded_sample = build_sample_records(100)  # 100 albums instead of 5

    releases_df = pd.DataFrame(expanded_sample)
    print(f"Created expanded sample: {len(releases_df)} releases")

SCALING UP: Real Data Integration
Kaggle download failed: Could not find kaggle.json. Make sure it's located in ~/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/
Creating larger sample dataset instead...
Created expanded sample: 100 releases


In [3]:
# Simulate processing MARD review data
print("Processing Review Data...")

# In real implementation, download from MARD project
# For now, generate realistic review data
import random

# Upper bound on how many releases receive a synthetic review.
MAX_REVIEWS = 200
# Fixed seed so that Restart & Run All regenerates the same reviews and ratings.
REVIEW_SEED = 42
# Private generator: reproducible output without reseeding the global `random` state.
review_rng = random.Random(REVIEW_SEED)

review_templates = [
    "A {adjective} {genre} album that {verb} the listener. The {instrument} work is {quality}.",
    "This {year}s release {action} with {emotion}. {artist}'s {style} approach creates {atmosphere}.",
    "Essential {genre} recording. The {aspect} demonstrates {quality} throughout."
]

# Word pools used to fill the template placeholders.
adjectives = ['masterful', 'groundbreaking', 'contemplative', 'energetic', 'innovative']
verbs = ['captivates', 'challenges', 'soothes', 'energizes', 'inspires']
instruments = ['saxophone', 'piano', 'trumpet', 'bass', 'drums']
qualities = ['exceptional', 'outstanding', 'remarkable', 'sublime', 'brilliant']
actions = ['resonates', 'connects', 'strikes a chord', 'makes an impression']
emotions = ['deep emotion', 'raw energy', 'subtle beauty', 'complex feelings']
styles = ['unique', 'traditional', 'experimental', 'refined', 'bold']
atmospheres = ['an immersive experience', 'lasting impact', 'memorable moments']
aspects = ['composition', 'performance', 'production', 'arrangement']

expanded_reviews = []
# head() selects by position, so the cap holds whatever the frame's index labels
# are, and rows beyond the cap are never iterated.
for _, album in releases_df.head(MAX_REVIEWS).iterrows():
    template = review_rng.choice(review_templates)
    # str.format ignores unused keyword arguments, so every placeholder value is
    # supplied regardless of which template was picked.
    review = template.format(
        adjective=review_rng.choice(adjectives),
        genre=album['genre'].lower(),
        verb=review_rng.choice(verbs),
        instrument=review_rng.choice(instruments),
        quality=review_rng.choice(qualities),
        year=str(album['year'])[:3] + '0',  # e.g. 1957 -> "1950" (decade)
        action=review_rng.choice(actions),
        emotion=review_rng.choice(emotions),
        artist=album['artist'],
        style=review_rng.choice(styles),
        atmosphere=review_rng.choice(atmospheres),
        aspect=review_rng.choice(aspects)
    )

    expanded_reviews.append({
        'album_id': album['release_id'],
        'album_title': album['title'],
        'artist': album['artist'],
        'review_text': review,
        'rating': round(review_rng.uniform(2.5, 5.0), 1),
        'review_source': review_rng.choice(['AllMusic', 'Rolling Stone', 'Pitchfork', 'DownBeat', 'JazzTimes'])
    })

reviews_df = pd.DataFrame(expanded_reviews)
print(f"Generated {len(reviews_df)} reviews")

Processing Review Data...
Generated 100 reviews


In [4]:
# Upload scaled dataset to BigQuery
def upload_large_dataset(df, table_name, batch_size=1000):
    """Upload `df` to `<client.project>.vinyl_catalog.<table_name>` in row batches.

    The first batch is loaded with WRITE_TRUNCATE and every later batch with
    WRITE_APPEND. Each load job is awaited before the next one starts, so if a
    later batch raises, the earlier batches have already been written.

    Uses the notebook-level `client` and `bigquery` names.

    Args:
        df: DataFrame to upload.
        table_name: Table name inside the `vinyl_catalog` dataset.
        batch_size: Maximum number of rows per load job; must be positive.

    Raises:
        ValueError: If `batch_size` is zero or negative.
    """
    # A negative step would make range() empty and silently "succeed" with no upload.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(df)
    if total_rows == 0:
        # No batch would run, so the table is never truncated; say so instead of
        # reporting a successful upload.
        print(f"⚠️ No rows to upload; {table_name} left unchanged")
        return

    table_id = f"{client.project}.vinyl_catalog.{table_name}"

    for batch_number, start in enumerate(range(0, total_rows, batch_size), start=1):
        batch = df.iloc[start:start + batch_size]

        # First batch overwrites the table, subsequent batches append to it.
        disposition = "WRITE_TRUNCATE" if start == 0 else "WRITE_APPEND"
        job_config = bigquery.LoadJobConfig(write_disposition=disposition)

        job = client.load_table_from_dataframe(batch, table_id, job_config=job_config)
        job.result()  # block until this batch's load job finishes

        print(f"Uploaded batch {batch_number}: {len(batch)} rows to {table_name}")

    print(f"✅ Total uploaded: {total_rows} rows to {table_name}")

# Push both frames to BigQuery: catalog first, then the reviews that reference it
upload_large_dataset(df=releases_df, table_name='discogs_releases')
upload_large_dataset(df=reviews_df, table_name='album_reviews')

# Read the row counts back from BigQuery to confirm what actually landed
for table in ('discogs_releases', 'album_reviews'):
    count_query = f"SELECT COUNT(*) as count FROM `vinyl_catalog.{table}`"
    result = client.query(count_query).to_dataframe()
    row_count = result['count'].iloc[0]
    print(f"{table}: {row_count} rows")

Uploaded batch 1: 100 rows to discogs_releases
✅ Total uploaded: 100 rows to discogs_releases
Uploaded batch 1: 100 rows to album_reviews
✅ Total uploaded: 100 rows to album_reviews
discogs_releases: 100 rows
album_reviews: 100 rows


In [5]:
# Test the scaled data system
print("TESTING SCALED SYSTEM")
print("=" * 40)

# Verify data scale: row count per table, kept in tables_info for the summary below
tables_info = {}
for table in ['discogs_releases', 'album_reviews']:
    count_query = f"SELECT COUNT(*) as count FROM `vinyl_catalog.{table}`"
    result = client.query(count_query).to_dataframe()
    tables_info[table] = result['count'].iloc[0]
    print(f"{table}: {result['count'].iloc[0]} rows")

# Test complex queries with scaled data
# The join yields one row per (release, review) pair, so albums are counted with
# COUNT(DISTINCT release_id); COUNT(*) would count reviews and overstate
# album_count as soon as an album has more than one review.
genre_analysis_query = """
SELECT 
    dr.genre,
    COUNT(DISTINCT dr.release_id) as album_count,
    AVG(ar.rating) as avg_rating,
    MIN(dr.year) as earliest_year,
    MAX(dr.year) as latest_year
FROM `vinyl_catalog.discogs_releases` dr
JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
GROUP BY dr.genre
ORDER BY album_count DESC
"""

genre_data = client.query(genre_analysis_query).to_dataframe()
print("\nGENRE ANALYSIS (Scaled Data):")
print(genre_data.to_string(index=False))

# Label distribution
# `label` is qualified with its table alias so the query stays unambiguous if the
# reviews table ever gains a column of the same name; releases are counted
# distinctly for the same reason as album_count above.
label_query = """
SELECT 
    dr.label,
    COUNT(DISTINCT dr.release_id) as releases,
    ROUND(AVG(ar.rating), 1) as avg_rating
FROM `vinyl_catalog.discogs_releases` dr
JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
GROUP BY dr.label
ORDER BY releases DESC
LIMIT 10
"""

label_data = client.query(label_query).to_dataframe()
print("\nTOP LABELS:")
print(label_data.to_string(index=False))

print("\nSystem now operates at realistic scale:")
print(f"- {tables_info['discogs_releases']} album catalog")
print(f"- {tables_info['album_reviews']} reviews")
print("- Multi-genre, multi-decade coverage")
print("- Ready for advanced AI processing")

TESTING SCALED SYSTEM
discogs_releases: 100 rows
album_reviews: 100 rows

GENRE ANALYSIS (Scaled Data):
     genre  album_count  avg_rating  earliest_year  latest_year
Electronic           17    3.811765           1952         2018
      Jazz           17    3.717647           1950         2016
      Rock           17    3.805882           1951         2017
      Folk           17    3.941176           1953         2019
      Funk           16    3.725000           1951         2015
      Soul           16    3.581250           1950         2014

TOP LABELS:
    label  releases  avg_rating
    Verve        17         3.9
 Atlantic        17         3.8
Blue Note        17         3.7
 Columbia        17         3.8
      ECM        16         3.7
 Impulse!        16         3.6

System now operates at realistic scale:
- 100 album catalog
- 100 reviews
- Multi-genre, multi-decade coverage
- Ready for advanced AI processing


In [None]:
# Smoke-test BigQuery AI functions against the scaled catalog
print("TESTING AI FUNCTIONS WITH SCALED DATA")
print("=" * 50)

# Per-album mood categorization prompt built from title, artist and genre
categorization_test = """
SELECT 
    title,
    artist,
    genre,
    AI.GENERATE(
        'Categorize this album by mood in one word - contemplative, energetic, or experimental: ' 
        || title || ' by ' || artist || ' (' || genre || ')'
    ) as ai_mood
FROM `vinyl_catalog.discogs_releases`
WHERE genre IN ('Jazz', 'Rock', 'Electronic')
LIMIT 10
"""

# Assume unavailable until the categorization query has run and printed cleanly
ai_ready = False
try:
    result = client.query(categorization_test).to_dataframe()
    print("✅ AI categorization working!")
    print(result.to_string(index=False))
    ai_ready = True
except Exception as e:
    # Best-effort probe: report the failure and continue with ai_ready left False
    print(f"AI functions status: {e}")

# Only attempt free-form recommendations once the categorization probe succeeded
if ai_ready:
    recommendation_test = """
    SELECT AI.GENERATE(
        'Based on these highly-rated albums: Kind of Blue (Jazz), A Love Supreme (Jazz), recommend 3 similar albums with brief explanations'
    ) as recommendations
    """

    try:
        rec_result = client.query(recommendation_test).to_dataframe()
        print("\n🎵 AI RECOMMENDATIONS:")
        print(rec_result['recommendations'].iloc[0])
    except Exception as e:
        print(f"Recommendation test: {e}")

if ai_ready:
    ai_status = 'Active - proceeding with advanced features'
else:
    ai_status = 'Still activating - implementing fallback system'
print(f"\nAI Status: {ai_status}")