In [1]:
"""
# Smart Vinyl Catalog - Data Exploration
## Sprint 1: Foundation & Data Pipeline
"""
import os
import sys

# Add project root to path
# NOTE(review): this treats the kernel's working directory as the notebook
# folder and its parent as the project root — confirm that holds when the
# kernel is started from somewhere other than notebooks/.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# <project_root>/src goes first on sys.path; presumably this is where the
# `config` package imported further down lives.
sys.path.insert(0, os.path.join(project_root, 'src'))

# Echo the resolved locations so a wrong working directory is obvious at a glance
print(f"Notebook directory: {notebook_dir}")
print(f"Project root: {project_root}")

# Load environment from project root
# Done before the `config` import below; presumably the config module reads
# environment variables (TODO confirm).
from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, '.env'))

from google.cloud import bigquery
from config.bigquery_config import config

# Banner marking the start of the exploration output
print("🎵 Smart Vinyl Catalog - Data Exploration")
print("=" * 50)

Notebook directory: /Users/richpointofview/smart-vinyl-catalog/notebooks
Project root: /Users/richpointofview/smart-vinyl-catalog
🎵 Smart Vinyl Catalog - Data Exploration


In [2]:
# Smoke-test the BigQuery connection: build a client, run a trivial query,
# then list the datasets visible in the project. Any failure is reported with
# the project id and credentials path instead of a traceback.
# NOTE(review): on failure `client` stays undefined, so later cells that use
# it will fail with NameError.
try:
    client = config.get_client()
    print(f"✅ Connected to BigQuery project: {config.project_id}")

    # Cheapest possible round trip through the query API
    test_query = "SELECT 1 as test_number"
    test_result = client.query(test_query).to_dataframe()
    print("✅ Basic query test passed")

    # Enumerate whatever datasets the project already has
    datasets = list(client.list_datasets())
    print(f"Available datasets: {len(datasets)}")
    for ds in datasets:
        print(f"  - {ds.dataset_id}")

except Exception as exc:
    # Print the details most useful for diagnosing a misconfigured environment
    print(f"❌ BigQuery connection failed: {exc}")
    print(f"Project ID: {config.project_id}")
    print(f"Credentials path: {os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')}")

✅ Connected to BigQuery project: smart-vinyl-catalog
✅ Basic query test passed
Available datasets: 0


In [3]:
# First probe of BigQuery's generative AI support: ask for one short text
# completion and print it, or print the error if the query is rejected.
# NOTE(review): the recorded run of this cell failed with "Table-valued
# function is not expected here: AI.GENERATE_TEXT", i.e. the query itself was
# rejected — the "This is normal" message below may be misleading; confirm the
# intended call form against the BigQuery documentation.
query = """
SELECT AI.GENERATE_TEXT(
  'Write a short description of jazz music',
  'temperature', 0.3
) AS jazz_description
"""

try:
    ai_job = client.query(query)
    result = ai_job.to_dataframe()
    print("🤖 AI Generation Test Success!")
    first_description = result['jazz_description'].iloc[0]
    print(first_description)
except Exception as exc:
    print(f"⚠️ AI test result: {exc}")
    print("This is normal - AI features may need a moment to activate")

⚠️ AI test result: 400 Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]; reason: invalidQuery, location: query, message: Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]

Location: US
Job ID: 4766d38a-f350-4c8b-975c-684c415c6c90

This is normal - AI features may need a moment to activate


In [4]:
# Create dataset and tables
from google.cloud import bigquery

def create_dataset_and_tables():
    """Create the ``vinyl_catalog`` dataset and its two tables.

    Uses the notebook-level ``client``. The dataset is created in the "US"
    location, followed by the ``discogs_releases`` and ``album_reviews``
    tables. Every outcome is printed; no exception is propagated. An error
    whose text contains "Already Exists" is reported as success.

    NOTE(review): matching on the error text is fragile; the client's
    ``exists_ok`` option may be a cleaner way to get the same effect —
    verify against the google-cloud-bigquery reference.
    """
    project = client.project

    # ---- dataset ----
    try:
        new_dataset = bigquery.Dataset(f"{project}.vinyl_catalog")
        new_dataset.location = "US"
        client.create_dataset(new_dataset)
        print(f"✅ Created dataset: vinyl_catalog")
    except Exception as exc:
        if "Already Exists" not in str(exc):
            print(f"Dataset creation error: {exc}")
        else:
            print(f"✅ Dataset vinyl_catalog already exists")

    # ---- table layouts as (column name, BigQuery type) pairs ----
    table_columns = {
        'discogs_releases': [
            ('release_id', 'STRING'),
            ('title', 'STRING'),
            ('artist', 'STRING'),
            ('year', 'INTEGER'),
            ('genre', 'STRING'),
            ('style', 'STRING'),
            ('label', 'STRING'),
            ('country', 'STRING'),
        ],
        'album_reviews': [
            ('album_id', 'STRING'),
            ('album_title', 'STRING'),
            ('artist', 'STRING'),
            ('review_text', 'STRING'),
            ('rating', 'FLOAT'),
            ('review_source', 'STRING'),
        ],
    }

    # ---- tables ----
    for table_name, columns in table_columns.items():
        schema = [bigquery.SchemaField(col_name, col_type) for col_name, col_type in columns]
        new_table = bigquery.Table(f"{project}.vinyl_catalog.{table_name}", schema=schema)

        try:
            client.create_table(new_table)
            print(f"✅ Created table: {table_name}")
        except Exception as exc:
            if "Already Exists" not in str(exc):
                print(f"Table creation error: {exc}")
            else:
                print(f"✅ Table {table_name} already exists")

# Run the setup now; objects that already exist are reported rather than treated as failures.
create_dataset_and_tables()

✅ Created dataset: vinyl_catalog
✅ Created table: discogs_releases
✅ Created table: album_reviews


In [5]:
# Create and upload sample data
import pandas as pd

# Sample Discogs releases, written one row per album and then transposed into
# the column-oriented dict that the rest of the notebook works with.
_discogs_columns = (
    'release_id', 'title', 'artist', 'year', 'genre', 'style', 'label', 'country',
)
_discogs_rows = [
    ('123456', 'Kind of Blue', 'Miles Davis', 1959, 'Jazz', 'Cool Jazz', 'Columbia', 'US'),
    ('234567', 'A Love Supreme', 'John Coltrane', 1965, 'Jazz', 'Free Jazz', 'Impulse!', 'US'),
    ('345678', 'Giant Steps', 'John Coltrane', 1960, 'Jazz', 'Hard Bop', 'Atlantic', 'US'),
    ('456789', 'Blue Train', 'John Coltrane', 1957, 'Jazz', 'Hard Bop', 'Blue Note', 'US'),
    ('567890', "Somethin' Else", 'Cannonball Adderley', 1958, 'Jazz', 'Soul Jazz', 'Blue Note', 'US'),
]
sample_discogs_data = {
    column: list(values)
    for column, values in zip(_discogs_columns, zip(*_discogs_rows))
}

# Sample album reviews; album_id values mirror the release_id values above so
# the two tables can be joined.
_review_columns = (
    'album_id', 'album_title', 'artist', 'review_text', 'rating', 'review_source',
)
_review_rows = [
    (
        '123456', 'Kind of Blue', 'Miles Davis',
        "A masterpiece of cool jazz. Davis's muted trumpet creates an atmosphere of contemplative beauty.",
        4.8, 'AllMusic',
    ),
    (
        '234567', 'A Love Supreme', 'John Coltrane',
        "Coltrane's spiritual journey manifests in four powerful movements. Represents the pinnacle of his expression.",
        4.9, 'Rolling Stone',
    ),
    (
        '345678', 'Giant Steps', 'John Coltrane',
        "Complex harmonic structures that challenged jazz conventions. Coltrane's technical prowess shines.",
        4.6, 'DownBeat',
    ),
    (
        '456789', 'Blue Train', 'John Coltrane',
        "A hard bop classic featuring Coltrane at his most accessible. Perfect rhythm section support.",
        4.4, 'JazzTimes',
    ),
    (
        '567890', "Somethin' Else", 'Cannonball Adderley',
        "Adderley's alto sax shines in this soul jazz gem. Balances sophistication with accessibility.",
        4.3, 'Pitchfork',
    ),
]
sample_reviews_data = {
    column: list(values)
    for column, values in zip(_review_columns, zip(*_review_rows))
}

# Create DataFrames
discogs_df = pd.DataFrame.from_dict(sample_discogs_data)
reviews_df = pd.DataFrame.from_dict(sample_reviews_data)

# Upload to BigQuery
def upload_dataframe_to_bq(df, table_name):
    """Load a DataFrame into ``<project>.vinyl_catalog.<table_name>``.

    Uses the notebook-level ``client``. The load job is configured with the
    WRITE_TRUNCATE disposition and the call waits for the job to finish
    before printing the uploaded row count.

    Args:
        df: DataFrame whose rows are uploaded.
        table_name: Name of the destination table inside ``vinyl_catalog``.
    """
    destination = f"{client.project}.vinyl_catalog.{table_name}"
    truncate_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")

    # Start the load and wait on it in one step; result() returns once the job is done
    client.load_table_from_dataframe(
        df,
        destination,
        job_config=truncate_config,
    ).result()

    row_count = len(df)
    print(f"✅ Uploaded {row_count} rows to {table_name}")

# Push each sample frame into its matching table; every call waits for its
# load job before returning, so the final message only prints after both loads.
upload_dataframe_to_bq(discogs_df, 'discogs_releases')
upload_dataframe_to_bq(reviews_df, 'album_reviews')

print("\n📊 Sample data uploaded successfully!")

✅ Uploaded 5 rows to discogs_releases
✅ Uploaded 5 rows to album_reviews

📊 Sample data uploaded successfully!


In [6]:
# End-to-end check of the pipeline: join releases to their reviews on the
# shared id, show the rows ordered by rating, then print summary statistics.
test_query = """
SELECT 
    dr.title,
    dr.artist,
    dr.year,
    dr.label,
    ar.rating,
    SUBSTR(ar.review_text, 1, 80) as review_snippet
FROM `vinyl_catalog.discogs_releases` dr
JOIN `vinyl_catalog.album_reviews` ar 
    ON dr.release_id = ar.album_id
ORDER BY ar.rating DESC
"""

join_job = client.query(test_query)
results = join_job.to_dataframe()

print("🎵 Your Vinyl Collection Data:")
collection_table = results.to_string(index=False)
print(collection_table)

# Summary figures derived from the joined rows
album_count = len(results)
mean_rating = results['rating'].mean()
release_years = results['year']

print(f"\n📊 Collection Stats:")
print(f"Total albums: {album_count}")
print(f"Average rating: {mean_rating:.1f}/5.0")
print(f"Years covered: {release_years.min()}-{release_years.max()}")

🎵 Your Vinyl Collection Data:
         title              artist  year     label  rating                                                                   review_snippet
A Love Supreme       John Coltrane  1965  Impulse!     4.9 Coltrane's spiritual journey manifests in four powerful movements. Represents th
  Kind of Blue         Miles Davis  1959  Columbia     4.8 A masterpiece of cool jazz. Davis's muted trumpet creates an atmosphere of conte
   Giant Steps       John Coltrane  1960  Atlantic     4.6 Complex harmonic structures that challenged jazz conventions. Coltrane's technic
    Blue Train       John Coltrane  1957 Blue Note     4.4 A hard bop classic featuring Coltrane at his most accessible. Perfect rhythm sec
Somethin' Else Cannonball Adderley  1958 Blue Note     4.3 Adderley's alto sax shines in this soul jazz gem. Balances sophistication with a

📊 Collection Stats:
Total albums: 5
Average rating: 4.6/5.0
Years covered: 1957-1965


In [7]:
# Second probe of the BigQuery AI functions. Test 2 only runs when Test 1
# succeeded, and the final status line reflects Test 1 alone.
# NOTE(review): the recorded run of this cell failed with an "invalid_grant:
# Invalid JWT" credentials error, so the "not ready" / "Still activating"
# wording may not describe the real cause — confirm before relying on it.
print("Testing BigQuery AI capabilities...")

# Test 1: Simple AI.GENERATE_TEXT
simple_ai_test = """
SELECT AI.GENERATE_TEXT(
  'What is jazz music?'
) AS simple_response
"""

try:
    result = client.query(simple_ai_test).to_dataframe()
    print("✅ AI.GENERATE_TEXT working!")
    response_preview = result['simple_response'].iloc[0][:100]
    print("Response:", response_preview + "...")
except Exception as exc:
    print(f"⚠️ AI.GENERATE_TEXT not ready: {exc}")
    ai_working = False
else:
    # Reached only when the query ran and its response was printed without error
    ai_working = True

# Test 2: AI.GENERATE (newer function)
if ai_working:
    generate_test = """
    SELECT AI.GENERATE(
      'Categorize this album as "chill", "energetic", or "contemplative": Kind of Blue by Miles Davis'
    ) AS category
    """

    try:
        result = client.query(generate_test).to_dataframe()
        print("✅ AI.GENERATE working!")
        album_category = result['category'].iloc[0]
        print("Category:", album_category)
    except Exception as exc:
        print(f"⚠️ AI.GENERATE not ready: {exc}")
        print("Will use alternative approaches")

if ai_working:
    ai_status = 'Ready for processing'
else:
    ai_status = 'Still activating - will create data for later processing'
print(f"\nAI Status: {ai_status}")

Testing BigQuery AI capabilities...
⚠️ AI.GENERATE_TEXT not ready: ('invalid_grant: Invalid JWT: Token must be a short-lived token (60 minutes) and in a reasonable timeframe. Check your iat and exp values in the JWT claim.', {'error': 'invalid_grant', 'error_description': 'Invalid JWT: Token must be a short-lived token (60 minutes) and in a reasonable timeframe. Check your iat and exp values in the JWT claim.'})

AI Status: Still activating - will create data for later processing


In [None]:
# Create realistic messy personal collection data
import random
from datetime import datetime, timedelta

# Messy handwritten notes (simulating OCR'd text)
# Free-form strings on purpose: each one mixes artist, title, label, price and
# condition in a different order.
messy_collection_notes = [
    {
        'note_id': 'NOTE_001',
        'raw_text': 'Miles Davis - Kind of Blue, Columbia pressing, 1959, bought for $28 at Fingerprints Music, condition VG+, sounds incredible on the vintage setup',
        'note_type': 'purchase_record'
    },
    {
        'note_id': 'NOTE_002',
        'raw_text': 'John Coltrane A Love Supreme - Impulse original pressing - $45 - mint condition - spiritual masterpiece - found at estate sale in Berkeley',
        'note_type': 'purchase_record'
    },
    {
        'note_id': 'NOTE_003',
        'raw_text': 'Giant Steps Atlantic Records John Coltrane 1960 $35 very good condition complex harmonies challenging listen but rewarding',
        'note_type': 'listening_notes'
    },
    {
        'note_id': 'NOTE_004',
        'raw_text': 'Blue Train Blue Note Coltrane 1957 Near Mint $32 hard bop classic great for late night listening perfect rhythm section',
        'note_type': 'listening_notes'
    },
    {
        'note_id': 'NOTE_005',
        'raw_text': 'Somethin Else Cannonball Adderley Blue Note 1958 Good+ condition $25 soul jazz gem Miles Davis on trumpet too!',
        'note_type': 'discovery_notes'
    },
    {
        'note_id': 'NOTE_006',
        'raw_text': 'Need to find: Waltz for Debby Bill Evans, The Sidewinder Lee Morgan, Song for My Father Horace Silver - missing key Blue Note albums',
        'note_type': 'wishlist'
    }
]

# Fixed seed so the synthetic purchase dates, ratings and play counts come out
# the same on every run of this cell.
COLLECTION_SEED = 42
# Private generator: seeding it does not disturb the module-level `random` state.
collection_rng = random.Random(COLLECTION_SEED)

# Per-album attributes, aligned by position with `release_ids`.
# Defined once here rather than being rebuilt on every loop iteration.
release_ids = ['123456', '234567', '345678', '456789', '567890']
base_prices = [28, 45, 35, 32, 25]
conditions = ['VG+', 'Mint', 'VG', 'Near Mint', 'Good+']
listening_notes = [
    'Perfect for late night sessions. The trumpet tone is phenomenal.',
    'Spiritual journey in four movements. Life-changing album.',
    'Complex but rewarding. Takes multiple listens to appreciate fully.',
    'Hard bop at its finest. Great entry point for new jazz listeners.',
    'Soul jazz with incredible energy. Miles Davis feature is amazing.'
]

# Create personal collection with purchase history
personal_collection_data = []
start_date = datetime(2020, 1, 1)

album_attributes = zip(release_ids, base_prices, conditions, listening_notes)
for i, (release_id, purchase_price, condition, album_note) in enumerate(album_attributes):
    # One purchase roughly every 120 days plus up to 60 days of jitter; with
    # five albums the latest possible offset is 4 * 120 + 60 = 540 days
    # (about a year and a half after start_date).
    days_offset = i * 120 + collection_rng.randint(0, 60)
    purchase_date = start_date + timedelta(days=days_offset)

    personal_collection_data.append({
        'collection_id': f'PC_{i+1:03d}',
        'release_id': release_id,
        'purchase_date': purchase_date.strftime('%Y-%m-%d'),  # stored as an ISO date string
        'purchase_price': purchase_price,
        'condition': condition,
        'listening_notes': album_note,
        'personal_rating': collection_rng.randint(8, 10),
        'times_played': collection_rng.randint(15, 45)
    })

# Convert to DataFrames
messy_notes_df = pd.DataFrame(messy_collection_notes)
personal_df = pd.DataFrame(personal_collection_data)

print("📝 Created messy collection notes:")
print(messy_notes_df[['note_id', 'note_type']].to_string(index=False))
print("\n💿 Created personal collection:")
print(personal_df[['collection_id', 'purchase_date', 'purchase_price', 'condition']].to_string(index=False))