In [1]:
"""
# Smart Vinyl Catalog - Data Exploration
## Sprint 1: Foundation & Data Pipeline
"""
import os
import sys

# Make the project's `src` directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
notebook_dir = os.getcwd()
project_root = os.path.split(notebook_dir)[0]
src_dir = os.path.join(project_root, 'src')
sys.path.insert(0, src_dir)

# Echo both locations so a wrong working directory is obvious right away.
print(f"Notebook directory: {notebook_dir}")
print(f"Project root: {project_root}")

# Load environment from project root
# The .env file is read from `project_root` before the imports below run.
# NOTE(review): presumably `config.bigquery_config` reads these environment
# variables at import time, so this call must stay ahead of that import - confirm.
from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, '.env'))

# `config` is a project-local package; it is expected to resolve through the
# `<project_root>/src` entry added to sys.path earlier in this cell.
from google.cloud import bigquery
from config.bigquery_config import config

# Banner: title line followed by a 50-character rule.
print("🎵 Smart Vinyl Catalog - Data Exploration")
print("=" * 50)

Notebook directory: /Users/richpointofview/smart-vinyl-catalog/notebooks
Project root: /Users/richpointofview/smart-vinyl-catalog
🎵 Smart Vinyl Catalog - Data Exploration


In [2]:
# Test BigQuery connection
try:
    # Build a brand-new client from an explicit service-account key file.
    # The imports live inside the `try` so that a missing package is reported
    # through the same "Connection failed" message as any other failure.
    from google.cloud import bigquery
    from google.oauth2 import service_account
    import os

    # The key file sits in the project root, i.e. the parent of the notebook's
    # working directory (the same convention the first cell uses). Resolving it
    # this way avoids baking one machine's home directory into the notebook.
    credentials_path = os.path.join(
        os.path.dirname(os.getcwd()), "service-account-key.json"
    )
    credentials = service_account.Credentials.from_service_account_file(credentials_path)
    client = bigquery.Client(credentials=credentials, project="smart-vinyl-catalog")

    # Test basic query: a constant SELECT proves auth and job execution work.
    test_query = "SELECT 1 as test_number"
    result = client.query(test_query).to_dataframe()
    print("✅ BigQuery connection working")

    # List datasets visible to this client's project.
    datasets = list(client.list_datasets())
    print(f"Available datasets: {len(datasets)}")
    for dataset in datasets:
        print(f"  - {dataset.dataset_id}")

except Exception as e:
    # Deliberately broad: this is a smoke test, so every failure is reported
    # rather than raised. When it fails, `client` may be left undefined; the
    # collection-overview cell checks for that before querying.
    print(f"❌ Connection failed: {e}")

✅ BigQuery connection working
Available datasets: 1
  - vinyl_catalog


In [3]:
# Sprint 1 Day 2 Analysis - SQL prepared up front; nothing is executed in this cell.

# Query 1: every release joined to its review, oldest release first.
collection_analysis_query = """
SELECT 
    dr.title,
    dr.artist,
    dr.year,
    dr.label,
    ar.rating as critic_rating,
    ar.review_text
FROM `vinyl_catalog.discogs_releases` dr
JOIN `vinyl_catalog.album_reviews` ar 
    ON dr.release_id = ar.album_id
ORDER BY dr.year ASC
"""

# Query 2: album count and mean critic rating per (genre, style), best rated first.
genre_analysis_query = """
SELECT 
    genre,
    style,
    COUNT(*) as album_count,
    AVG(ar.rating) as avg_rating
FROM `vinyl_catalog.discogs_releases` dr
JOIN `vinyl_catalog.album_reviews` ar ON dr.release_id = ar.album_id
GROUP BY genre, style
ORDER BY avg_rating DESC
"""

# Status summary for the reader, one printed line per entry.
prepared_summary = (
    "Prepared queries for collection analysis:",
    "1. Complete collection overview",
    "2. Genre and style breakdown",
    "3. Ready for AI processing once authentication is resolved",
)
print(*prepared_summary, sep="\n")

next_steps = (
    "\nNext steps after fixing authentication:",
    "- Test AI functions (may be active by now)",
    "- Create personal collection tables",
    "- Process messy notes with AI extraction",
    "- Build recommendation foundation",
)
print(*next_steps, sep="\n")

Prepared queries for collection analysis:
1. Complete collection overview
2. Genre and style breakdown
3. Ready for AI processing once authentication is resolved

Next steps after fixing authentication:
- Test AI functions (may be active by now)
- Create personal collection tables
- Process messy notes with AI extraction
- Build recommendation foundation


In [4]:
# Run the collection overview against the live dataset.
# `client` only exists if the connection cell succeeded, so check for it first.
if 'client' not in globals():
    print("Client not available - fix connection first")
else:
    try:
        query_job = client.query(collection_analysis_query)
        result = query_job.to_dataframe()
        print("Collection Data:")
        print(result.to_string(index=False))
        print(f"\nTotal albums: {len(result)}")
    except Exception as e:
        # Report and carry on; this cell is exploratory.
        print(f"Query failed: {e}")

Collection Data:
         title              artist  year     label  critic_rating                                                                                                   review_text
    Blue Train       John Coltrane  1957 Blue Note            4.4                 A hard bop classic featuring Coltrane at his most accessible. Perfect rhythm section support.
Somethin' Else Cannonball Adderley  1958 Blue Note            4.3                 Adderley's alto sax shines in this soul jazz gem. Balances sophistication with accessibility.
  Kind of Blue         Miles Davis  1959  Columbia            4.8              A masterpiece of cool jazz. Davis's muted trumpet creates an atmosphere of contemplative beauty.
   Giant Steps       John Coltrane  1960  Atlantic            4.6            Complex harmonic structures that challenged jazz conventions. Coltrane's technical prowess shines.
A Love Supreme       John Coltrane  1965  Impulse!            4.9 Coltrane's spiritual journey manifest

In [5]:
# Test BigQuery AI functions (Day 2 focus)
print("Testing AI capabilities...")

# Test 1: Simple AI generation
# NOTE(review): the output recorded for this cell shows BigQuery rejecting this
# statement with "Table-valued function is not expected here: AI.GENERATE_TEXT",
# which points at how the function is invoked rather than at an activation delay.
ai_test_query = """
SELECT AI.GENERATE_TEXT(
    'Describe the musical style of hard bop jazz in one sentence'
) AS description
"""

# Stays False unless the smoke query runs and its result can be read.
ai_ready = False
try:
    result = client.query(ai_test_query).to_dataframe()
    print("✅ AI.GENERATE_TEXT is working!")
    print("Response:", result['description'].iloc[0])
    ai_ready = True
except Exception as e:
    print(f"AI functions still activating: {e}")

# Only try the per-album prompt once the smoke test has passed.
if ai_ready:
    album_analysis_query = """
    SELECT 
        title,
        artist,
        AI.GENERATE_TEXT(
            'Categorize this album as "mellow", "intense", or "experimental" based on the title and artist: ' || 
            title || ' by ' || artist
        ) AS ai_category
    FROM `vinyl_catalog.discogs_releases`
    LIMIT 3
    """

    try:
        result = client.query(album_analysis_query).to_dataframe()
        print("\n🎵 AI Album Categorization:")
        print(result.to_string(index=False))
    except Exception as e:
        print(f"Album analysis failed: {e}")

if ai_ready:
    ai_status_text = 'Ready for processing'
else:
    ai_status_text = 'Still activating - continue with data prep'
print(f"\nAI Status: {ai_status_text}")

Testing AI capabilities...
AI functions still activating: 400 Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]; reason: invalidQuery, location: query, message: Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]

Location: US
Job ID: 65f030f6-57cf-4596-ad61-8a6d98116ba0


AI Status: Still activating - continue with data prep


In [6]:
# Create personal collection tables for messy data processing
def create_remaining_tables():
    """Create the personal-collection tables in the `vinyl_catalog` dataset.

    Creates `personal_collection` and `raw_collection_notes` in the project of
    the module-level `client`. Takes no arguments and returns None; progress is
    reported with `print`.

    A table that already exists is reported and skipped. Any other failure is
    printed and the loop moves on to the next table, so one bad table does not
    block the other.
    """
    # Imported locally so the cell keeps working without touching the import
    # cell. `Client.create_table` documents this exception for an existing table.
    from google.cloud.exceptions import Conflict

    # (column name, BigQuery type) pairs per table.
    tables_schema = {
        'personal_collection': [
            ('collection_id', 'STRING'),
            ('release_id', 'STRING'),
            ('purchase_date', 'DATE'),
            ('purchase_price', 'FLOAT'),
            ('condition', 'STRING'),
            ('listening_notes', 'STRING'),
            ('personal_rating', 'INTEGER'),
            ('times_played', 'INTEGER')
        ],
        'raw_collection_notes': [
            ('note_id', 'STRING'),
            ('raw_text', 'STRING'),
            ('note_type', 'STRING'),
            ('created_date', 'TIMESTAMP')
        ]
    }

    for table_name, schema_list in tables_schema.items():
        table_id = f"{client.project}.vinyl_catalog.{table_name}"
        schema = [bigquery.SchemaField(name, field_type) for name, field_type in schema_list]
        table = bigquery.Table(table_id, schema=schema)

        try:
            client.create_table(table)
        except Conflict:
            # Typed check instead of matching "Already Exists" in the message,
            # which would break if the wording of the error ever changed.
            print(f"Table {table_name} already exists")
        except Exception as e:
            print(f"Error: {e}")
        else:
            print(f"Created table: {table_name}")

create_remaining_tables()

Created table: personal_collection
Created table: raw_collection_notes


In [7]:
# Test alternative AI function syntax
print("Testing alternative AI syntax...")

# Try the newer ML.GENERATE_TEXT syntax
ml_test_query = """
SELECT ML.GENERATE_TEXT(
    'Describe jazz music briefly',
    STRUCT(
        0.3 AS temperature,
        100 AS max_output_tokens
    )
) AS description
"""

try:
    result = client.query(ml_test_query).to_dataframe()
    print("ML.GENERATE_TEXT working!")
    print("Response:", result['description'].iloc[0])
    ml_ready = True
except Exception as e:
    print(f"ML.GENERATE_TEXT: {e}")
    ml_ready = False

# Since AI functions aren't ready, let's prepare for Sprint 2
# Upload the messy collection data we created earlier
if 'messy_notes_df' in globals() and 'personal_df' in globals():
    try:
        # Add timestamp to messy notes
        messy_notes_df['created_date'] = datetime.now()
        
        # Upload both datasets
        upload_dataframe_to_bq(messy_notes_df, 'raw_collection_notes')
        upload_dataframe_to_bq(personal_df, 'personal_collection')
        print("Personal collection data uploaded successfully")
    except Exception as e:
        print(f"Upload error: {e}")
else:
    print("Creating personal collection data for upload...")
    # We'll recreate this data since kernel was restarted

Testing alternative AI syntax...
ML.GENERATE_TEXT: 400 Table-valued function is not expected here: ML.GENERATE_TEXT at [2:8]; reason: invalidQuery, location: query, message: Table-valued function is not expected here: ML.GENERATE_TEXT at [2:8]

Location: US
Job ID: f2d04467-3ea4-465f-9f51-5ca925c3b155

Creating personal collection data for upload...


In [9]:
# Import required libraries
import pandas as pd
import random
from datetime import datetime, timedelta

# Recreate personal collection data (lost during kernel restart)
# Free-text notes, deliberately inconsistent in layout, kept as raw input for
# later AI extraction.
messy_collection_notes = [
    {
        'note_id': 'NOTE_001',
        'raw_text': 'Miles Davis - Kind of Blue, Columbia pressing, 1959, bought for $28 at Fingerprints Music, condition VG+, sounds incredible',
        'note_type': 'purchase_record'
    },
    {
        'note_id': 'NOTE_002',
        'raw_text': 'John Coltrane A Love Supreme - Impulse original pressing - $45 - mint condition - spiritual masterpiece - found at estate sale',
        'note_type': 'purchase_record'
    },
    {
        'note_id': 'NOTE_003',
        'raw_text': 'Giant Steps Atlantic Records John Coltrane 1960 $35 very good condition complex harmonies challenging but rewarding',
        'note_type': 'listening_notes'
    },
    {
        'note_id': 'NOTE_004',
        'raw_text': 'Blue Train Blue Note Coltrane 1957 Near Mint $32 hard bop classic great for late night listening perfect rhythm',
        'note_type': 'listening_notes'
    },
    {
        'note_id': 'NOTE_005',
        'raw_text': 'Somethin Else Cannonball Adderley Blue Note 1958 Good+ condition $25 soul jazz gem Miles Davis on trumpet',
        'note_type': 'discovery_notes'
    }
]

# Personal collection with purchase history.
# A dedicated, seeded generator makes the synthetic dates, ratings and play
# counts identical on every run without touching the global `random` state.
COLLECTION_SEED = 42
collection_rng = random.Random(COLLECTION_SEED)

start_date = datetime(2020, 1, 1)

# Per-album attributes, index-aligned with `release_ids`. Defined once here
# rather than being rebuilt on every loop iteration.
release_ids = ['123456', '234567', '345678', '456789', '567890']
base_prices = [28, 45, 35, 32, 25]
conditions = ['VG+', 'Mint', 'VG', 'Near Mint', 'Good+']
listening_notes = [
    'Perfect for late night sessions. The trumpet tone is phenomenal.',
    'Spiritual journey in four movements. Life-changing album.',
    'Complex but rewarding. Takes multiple listens to appreciate.',
    'Hard bop at its finest. Great entry point for jazz.',
    'Soul jazz with incredible energy. Miles Davis feature amazing.'
]

personal_collection_data = []
for i, release_id in enumerate(release_ids):
    # Purchases are spaced about 120 days apart, with up to 60 days of jitter.
    days_offset = i * 120 + collection_rng.randint(0, 60)
    purchase_date = start_date + timedelta(days=days_offset)

    personal_collection_data.append({
        'collection_id': f'PC_{i+1:03d}',
        'release_id': release_id,
        # Stored as a real `datetime.date`, not a formatted string, so the
        # value matches the DATE column declared for `personal_collection`.
        'purchase_date': purchase_date.date(),
        'purchase_price': base_prices[i],
        'condition': conditions[i],
        'listening_notes': listening_notes[i],
        'personal_rating': collection_rng.randint(8, 10),
        'times_played': collection_rng.randint(15, 45)
    })

# Convert to DataFrames
messy_notes_df = pd.DataFrame(messy_collection_notes)
personal_df = pd.DataFrame(personal_collection_data)

print("Created messy collection notes for AI processing")
print("Created personal collection with purchase history")
print(f"Notes: {len(messy_notes_df)} entries")
print(f"Collection: {len(personal_df)} albums")

Created messy collection notes for AI processing
Created personal collection with purchase history
Notes: 5 entries
Collection: 5 albums


In [None]:
# Define upload function and upload personal collection data
from google.cloud import bigquery

def upload_dataframe_to_bq(df, table_name):
    """Load a DataFrame into a table of the `vinyl_catalog` dataset.

    Args:
        df: DataFrame to upload.
        table_name: Name of the destination table inside `vinyl_catalog`, in
            the project of the module-level `client`.

    The load job uses the WRITE_TRUNCATE disposition. The call waits for the
    job to finish, then prints the number of rows in `df`. Returns None.
    """
    destination = f"{client.project}.vinyl_catalog.{table_name}"

    load_config = bigquery.LoadJobConfig()
    load_config.write_disposition = "WRITE_TRUNCATE"

    load_job = client.load_table_from_dataframe(df, destination, job_config=load_config)
    load_job.result()  # wait for the load job to complete

    row_count = len(df)
    print(f"Uploaded {row_count} rows to {table_name}")

# Upload personal collection data
# `timezone` is imported here because the notebook's import cell only brings in
# `datetime` and `timedelta`.
from datetime import timezone

try:
    # Add timestamp to notes. The value is timezone-aware UTC: `created_date`
    # is declared as a TIMESTAMP column, and a naive local `datetime.now()`
    # carries no offset, so the stored instant would depend on the machine's
    # time zone.
    messy_notes_df['created_date'] = datetime.now(timezone.utc)

    # Upload both datasets
    upload_dataframe_to_bq(messy_notes_df, 'raw_collection_notes')
    upload_dataframe_to_bq(personal_df, 'personal_collection')

    print("Successfully uploaded personal collection data")
    print("Ready for AI processing when functions activate")

except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print(f"Upload error: {e}")