In [1]:
"""
# Smart Vinyl Catalog - Data Exploration
## Sprint 1: Foundation & Data Pipeline
"""
import os
import sys

# Add project root to path
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
sys.path.insert(0, os.path.join(project_root, 'src'))

print(f"Notebook directory: {notebook_dir}")
print(f"Project root: {project_root}")

# Load environment from project root
from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, '.env'))

from google.cloud import bigquery
from config.bigquery_config import config

print("🎵 Smart Vinyl Catalog - Data Exploration")
print("=" * 50)

Notebook directory: /Users/richpointofview/smart-vinyl-catalog/notebooks
Project root: /Users/richpointofview/smart-vinyl-catalog
🎵 Smart Vinyl Catalog - Data Exploration


In [2]:
# Test BigQuery connection with better error handling
try:
    client = config.get_client()
    print(f"✅ Connected to BigQuery project: {config.project_id}")
    
    # Test basic query first
    test_query = "SELECT 1 as test_number"
    test_result = client.query(test_query).to_dataframe()
    print("✅ Basic query test passed")
    
    # List datasets
    datasets = list(client.list_datasets())
    print(f"Available datasets: {len(datasets)}")
    for dataset in datasets:
        print(f"  - {dataset.dataset_id}")
        
except Exception as e:
    print(f"❌ BigQuery connection failed: {e}")
    print(f"Project ID: {config.project_id}")
    print(f"Credentials path: {os.getenv('GOOGLE_APPLICATION_CREDENTIALS')}")

✅ Connected to BigQuery project: smart-vinyl-catalog
✅ Basic query test passed
Available datasets: 0


In [3]:
# Test AI capabilities
query = """
SELECT AI.GENERATE_TEXT(
  'Write a short description of jazz music',
  'temperature', 0.3
) AS jazz_description
"""

try:
    result = client.query(query).to_dataframe()
    print("🤖 AI Generation Test Success!")
    print(result['jazz_description'].iloc[0])
except Exception as e:
    print(f"⚠️ AI test result: {e}")
    print("This is normal - AI features may need a moment to activate")

⚠️ AI test result: 400 Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]; reason: invalidQuery, location: query, message: Table-valued function is not expected here: AI.GENERATE_TEXT; Did you mean ai.generate_int? at [2:8]

Location: US
Job ID: 4766d38a-f350-4c8b-975c-684c415c6c90

This is normal - AI features may need a moment to activate


In [4]:
# Create dataset and tables
from google.cloud import bigquery

def create_dataset_and_tables():
    dataset_id = f"{client.project}.vinyl_catalog"
    
    # Create dataset
    try:
        dataset = bigquery.Dataset(dataset_id)
        dataset.location = "US"
        dataset = client.create_dataset(dataset)
        print(f"✅ Created dataset: vinyl_catalog")
    except Exception as e:
        if "Already Exists" in str(e):
            print(f"✅ Dataset vinyl_catalog already exists")
        else:
            print(f"Dataset creation error: {e}")
    
    # Define table schemas
    tables_to_create = [
        ('discogs_releases', [
            bigquery.SchemaField('release_id', 'STRING'),
            bigquery.SchemaField('title', 'STRING'),
            bigquery.SchemaField('artist', 'STRING'),
            bigquery.SchemaField('year', 'INTEGER'),
            bigquery.SchemaField('genre', 'STRING'),
            bigquery.SchemaField('style', 'STRING'),
            bigquery.SchemaField('label', 'STRING'),
            bigquery.SchemaField('country', 'STRING')
        ]),
        ('album_reviews', [
            bigquery.SchemaField('album_id', 'STRING'),
            bigquery.SchemaField('album_title', 'STRING'),
            bigquery.SchemaField('artist', 'STRING'),
            bigquery.SchemaField('review_text', 'STRING'),
            bigquery.SchemaField('rating', 'FLOAT'),
            bigquery.SchemaField('review_source', 'STRING')
        ])
    ]
    
    # Create tables
    for table_name, schema in tables_to_create:
        table_id = f"{client.project}.vinyl_catalog.{table_name}"
        table = bigquery.Table(table_id, schema=schema)
        
        try:
            table = client.create_table(table)
            print(f"✅ Created table: {table_name}")
        except Exception as e:
            if "Already Exists" in str(e):
                print(f"✅ Table {table_name} already exists")
            else:
                print(f"Table creation error: {e}")

create_dataset_and_tables()

✅ Created dataset: vinyl_catalog
✅ Created table: discogs_releases
✅ Created table: album_reviews


In [None]:
# Create and upload sample data
import pandas as pd

# Sample Discogs releases data
sample_discogs_data = {
    'release_id': ['123456', '234567', '345678', '456789', '567890'],
    'title': ['Kind of Blue', 'A Love Supreme', 'Giant Steps', 'Blue Train', 'Somethin\' Else'],
    'artist': ['Miles Davis', 'John Coltrane', 'John Coltrane', 'John Coltrane', 'Cannonball Adderley'],
    'year': [1959, 1965, 1960, 1957, 1958],
    'genre': ['Jazz', 'Jazz', 'Jazz', 'Jazz', 'Jazz'],
    'style': ['Cool Jazz', 'Free Jazz', 'Hard Bop', 'Hard Bop', 'Soul Jazz'],
    'label': ['Columbia', 'Impulse!', 'Atlantic', 'Blue Note', 'Blue Note'],
    'country': ['US', 'US', 'US', 'US', 'US']
}

# Sample album reviews data
sample_reviews_data = {
    'album_id': ['123456', '234567', '345678', '456789', '567890'],
    'album_title': ['Kind of Blue', 'A Love Supreme', 'Giant Steps', 'Blue Train', 'Somethin\' Else'],
    'artist': ['Miles Davis', 'John Coltrane', 'John Coltrane', 'John Coltrane', 'Cannonball Adderley'],
    'review_text': [
        'A masterpiece of cool jazz. Davis\'s muted trumpet creates an atmosphere of contemplative beauty.',
        'Coltrane\'s spiritual journey manifests in four powerful movements. Represents the pinnacle of his expression.',
        'Complex harmonic structures that challenged jazz conventions. Coltrane\'s technical prowess shines.',
        'A hard bop classic featuring Coltrane at his most accessible. Perfect rhythm section support.',
        'Adderley\'s alto sax shines in this soul jazz gem. Balances sophistication with accessibility.'
    ],
    'rating': [4.8, 4.9, 4.6, 4.4, 4.3],
    'review_source': ['AllMusic', 'Rolling Stone', 'DownBeat', 'JazzTimes', 'Pitchfork']
}

# Create DataFrames
discogs_df = pd.DataFrame(sample_discogs_data)
reviews_df = pd.DataFrame(sample_reviews_data)

# Upload to BigQuery
def upload_dataframe_to_bq(df, table_name):
    table_id = f"{client.project}.vinyl_catalog.{table_name}"
    
    job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()
    
    print(f"✅ Uploaded {len(df)} rows to {table_name}")

upload_dataframe_to_bq(discogs_df, 'discogs_releases')
upload_dataframe_to_bq(reviews_df, 'album_reviews')

print("\n📊 Sample data uploaded successfully!")