In [2]:
# Setup Cell 1 - Imports
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import gzip
import requests
from pathlib import Path
import sqlite3
import json
import re
import warnings
from typing import Dict, List, Optional, Union
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import time

print("✅ All imports successful")

✅ All imports successful


In [3]:
# Setup Cell 2 - Display, warning, and plotting defaults
# pandas: show every column, but cap printed rows at 100.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(_option_name, _option_value)

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Plot styling. Order matters: applying the matplotlib style first and the
# seaborn palette second keeps the "husl" colour cycle from being reset.
plt.style.use('default')  # plain matplotlib style, in case seaborn styles are unavailable
sns.set_palette(palette="husl")

print("✅ Display options configured")

✅ Display options configured


In [4]:
# Setup Cell 3 - Project paths
# When the kernel is started inside notebooks/, the project root is one level up;
# otherwise the current working directory is treated as the root.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == 'notebooks':
    PROJECT_ROOT = PROJECT_ROOT.parent

DATA_DIR = PROJECT_ROOT.joinpath('data')
DISCOGS_DIR = DATA_DIR.joinpath('discogs')
PROCESSED_DIR = DATA_DIR.joinpath('processed')
RAW_DIR = DATA_DIR.joinpath('raw')

# Ensure each working directory exists (parents included; no error if present).
for _directory in (DISCOGS_DIR, PROCESSED_DIR, RAW_DIR):
    _directory.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the reader can confirm where data will land.
for _label, _location in (
    ("Project root", PROJECT_ROOT),
    ("Data directory", DATA_DIR),
    ("Discogs directory", DISCOGS_DIR),
    ("Processed directory", PROCESSED_DIR),
):
    print(f"📁 {_label}: {_location}")
print("✅ Directory structure created")

📁 Project root: /Users/richpointofview/smart-vinyl-catalog
📁 Data directory: /Users/richpointofview/smart-vinyl-catalog/data
📁 Discogs directory: /Users/richpointofview/smart-vinyl-catalog/data/discogs
📁 Processed directory: /Users/richpointofview/smart-vinyl-catalog/data/processed
✅ Directory structure created


In [6]:
# Cell 1 - Load Existing Catalog
# Load the FMA-integrated catalog written by the previous notebook. When that
# file is missing, fall back to a small synthetic catalog so later cells still run.

# Fixed seed and size for the synthetic fallback, so a Restart & Run All
# produces the same sample catalog every time.
SAMPLE_CATALOG_SEED = 42
SAMPLE_CATALOG_SIZE = 100

fma_catalog_path = PROCESSED_DIR / 'fma_integrated.csv'

if fma_catalog_path.exists():
    existing_catalog = pd.read_csv(fma_catalog_path)
    print(f"✅ Loaded existing catalog: {len(existing_catalog):,} tracks")

    # Tag rows with their origin if the saved file did not carry a 'source' column.
    if 'source' not in existing_catalog.columns:
        existing_catalog['source'] = 'fma_data'  # Default source for FMA data
        print("📝 Added 'source' column (set to 'fma_data')")

    print(f"📊 Sources: {existing_catalog['source'].value_counts().to_dict()}")

    # Basic stats; each line degrades gracefully when its column is absent.
    print("\n📈 Catalog Statistics:")
    print(f"   🎵 Unique genres: {existing_catalog['genre'].nunique() if 'genre' in existing_catalog.columns else 'N/A'}")
    print(f"   🎤 Unique artists: {existing_catalog['artist'].nunique() if 'artist' in existing_catalog.columns else 'N/A'}")
    if 'rating' in existing_catalog.columns:
        print(f"   ⭐ Average rating: {existing_catalog['rating'].mean():.2f}")
    else:
        print("   ⭐ No ratings available")
    if 'year' in existing_catalog.columns:
        print(f"   📅 Year range: {existing_catalog['year'].min():.0f} - {existing_catalog['year'].max():.0f}")
    else:
        print("   📅 No year data")

else:
    print("⚠️  FMA catalog not found. Creating sample catalog for demonstration.")
    # Simplified stand-in for the FMA catalog.
    # NOTE(review): the real catalog is keyed by 'release_id' and has more
    # columns (album, tags, license, ...); this sample uses 'track_id' —
    # confirm that downstream cells tolerate the difference.
    _sample_rng = np.random.default_rng(SAMPLE_CATALOG_SEED)
    existing_catalog = pd.DataFrame({
        'track_id': range(SAMPLE_CATALOG_SIZE),
        'title': [f'Sample Track {i+1}' for i in range(SAMPLE_CATALOG_SIZE)],
        'artist': ['Various Artists'] * SAMPLE_CATALOG_SIZE,
        'genre': _sample_rng.choice(['Electronic', 'Rock', 'Jazz', 'Folk'], SAMPLE_CATALOG_SIZE),
        # Normal draws can stray outside the rating scale; keep them within 1-5,
        # the same range create_sample_discogs_data clips its ratings to.
        'rating': np.clip(_sample_rng.normal(3.5, 0.8, SAMPLE_CATALOG_SIZE), 1, 5),
        'year': _sample_rng.integers(1990, 2024, SAMPLE_CATALOG_SIZE),
        'duration': _sample_rng.integers(120, 400, SAMPLE_CATALOG_SIZE),
        'plays': _sample_rng.integers(100, 10000, SAMPLE_CATALOG_SIZE),
        'favorites': _sample_rng.integers(10, 1000, SAMPLE_CATALOG_SIZE),
        'source': 'sample_data'
    })
    print(f"📝 Created sample catalog: {len(existing_catalog)} tracks")

# Display sample
print("\n🔍 Existing catalog sample:")
display(existing_catalog.head())

print("\n📋 Column info:")
print(f"   Columns: {list(existing_catalog.columns)}")
print(f"   Shape: {existing_catalog.shape}")
# deep=True measures the actual size of object (string) columns, not just pointers.
print(f"   Memory usage: {existing_catalog.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

✅ Loaded existing catalog: 5,000 tracks
📝 Added 'source' column (set to 'fma_data')
📊 Sources: {'fma_data': 5000}

📈 Catalog Statistics:
   🎵 Unique genres: 16
   🎤 Unique artists: 3139
   ⭐ Average rating: 2.00
   📅 Year range: 1971 - 2017

🔍 Existing catalog sample:


Unnamed: 0,release_id,title,artist,album,genre,year,duration,tags,plays,favorites,license,popularity_score,rating,label,country,review_text,review_source,source
0,129624,Tentacle Hentaij - Core Noisetone,Ralph Brown,Spettro Records Volume 2 - Decline,Experimental,2015.0,265,[],0,1,Attribution-Noncommercial-Share Alike 3.0 Unit...,0.000202,2.0,FMA,US,Creative Commons track by Ralph Brown. Genre: ...,FMA_Metadata,fma_data
1,24769,Goldfish,Aoiroooasamusi,Root Of Sorrow,,2010.0,308,[],0,17,Attribution-Noncommercial-Share Alike 3.0 Unit...,0.003441,2.0,FMA,US,Creative Commons track by Aoiroooasamusi. Genr...,FMA_Metadata,fma_data
2,14537,The Monk Said,dmyra,Love in the Air,,2009.0,388,"['clinical archives', 'folk', 'rock', 'psych-f...",0,4,Attribution-NonCommercial-NoDerivatives (aka M...,0.00081,2.0,FMA,US,Creative Commons track by dmyra. Genre: nan. [...,FMA_Metadata,fma_data
3,36538,Fac ut animae,The Tudor Consort,Stabat Mater - Domenico Scarlatti,Classical,2010.0,129,['new zealand'],0,11,Attribution 3.0 International,0.002227,2.0,FMA,US,Creative Commons track by The Tudor Consort. G...,FMA_Metadata,fma_data
4,25193,Bartlett Bridge,Vitamin-D,Live at WFMU on Irene Trudel's show 2/8/10,Folk,2010.0,173,[],0,0,Attribution-Noncommercial-No Derivative Works ...,0.0,2.0,FMA,US,Creative Commons track by Vitamin-D. Genre: Fo...,FMA_Metadata,fma_data



📋 Column info:
   Columns: ['release_id', 'title', 'artist', 'album', 'genre', 'year', 'duration', 'tags', 'plays', 'favorites', 'license', 'popularity_score', 'rating', 'label', 'country', 'review_text', 'review_source', 'source']
   Shape: (5000, 18)
   Memory usage: 4.0 MB


In [7]:
# Cell 2 - Kaggle API Setup and Data Download Preparation

def check_kaggle_setup():
    """Report whether the Kaggle client can be imported and authenticated.

    Prints a status line, plus installation or credential instructions on
    failure.

    Returns:
        True when ``import kaggle`` and ``kaggle.api.authenticate()`` both
        succeed, False otherwise.
    """
    credential_steps = (
        "\n🔧 To set up Kaggle API:",
        "1. Install: pip install kaggle",
        "2. Go to kaggle.com → Account → Create New API Token",
        "3. Download kaggle.json file",
        "4. Place kaggle.json in ~/.kaggle/ directory",
        "5. Run: chmod 600 ~/.kaggle/kaggle.json (on Mac/Linux)",
    )

    try:
        import kaggle
        # Authenticating is the actual readiness check; importing alone is not enough.
        kaggle.api.authenticate()
        print("✅ Kaggle API is configured and authenticated")
        return True
    except ImportError:
        # Package missing entirely: only the install hint is relevant.
        for hint in ("❌ Kaggle package not installed", "💡 Install with: pip install kaggle"):
            print(hint)
        return False
    except Exception as auth_error:
        # Anything else (e.g. missing credentials file): show the full setup walkthrough.
        print(f"❌ Kaggle API error: {auth_error}")
        print("\n".join(credential_steps))
        return False

def check_discogs_data_exists(directory: Optional[Union[str, Path]] = None):
    """Check whether Discogs-looking data is already present in a directory.

    Globs the directory (top level only) for names that look like Discogs
    dumps or XML/CSV exports, prints up to ten matches with their sizes, and
    reports whether anything matched.

    Args:
        directory: Directory to inspect. Defaults to the module-level
            ``DISCOGS_DIR`` when omitted, which is the original behaviour.

    Returns:
        True if at least one entry matched, False otherwise.
    """
    # Resolve the default lazily so an explicit directory never touches DISCOGS_DIR.
    target_dir = DISCOGS_DIR if directory is None else Path(directory)
    print(f"\n📁 Checking existing files in {target_dir}...")

    # Look for common Discogs file patterns
    patterns = ['*discogs*', '*releases*', '*artists*', '*labels*', '*.xml*', '*.csv']

    # One entry can match several patterns; the set drops the repeats and
    # sorting keeps the printed listing stable.
    existing_files = sorted({
        match
        for pattern in patterns
        for match in target_dir.glob(pattern)
    })

    if not existing_files:
        print("📭 No existing Discogs files found")
        return False

    print(f"📄 Found {len(existing_files)} existing files:")
    for f in existing_files[:10]:  # Show first 10
        size_mb = f.stat().st_size / 1024 / 1024
        print(f"   📄 {f.name} ({size_mb:.1f} MB)")
    if len(existing_files) > 10:
        print(f"   ... and {len(existing_files) - 10} more files")
    return True

# Check Kaggle setup
print("🔍 Checking Kaggle API setup...")
kaggle_ready = check_kaggle_setup()

# Check for existing data
data_exists = check_discogs_data_exists()

# Summary
print("\n📋 Status Summary:")
print(f"   Kaggle API Ready: {'✅' if kaggle_ready else '❌'}")
print(f"   Existing Data: {'✅' if data_exists else '📭'}")

# Exactly one of three situations applies: data is already on disk, data is
# missing but Kaggle can fetch it, or neither is available. (The former
# trailing "Ready to proceed!" branch could never be reached.)
if data_exists:
    print("\n✅ Can proceed with existing data or download fresh data!")
elif kaggle_ready:
    print("\n✅ Ready to download Discogs data in next cell!")
else:
    print("\n⚠️  Next steps:")
    print("   1. Set up Kaggle API (see instructions above)")
    print("   2. Or manually download Discogs data")

🔍 Checking Kaggle API setup...
❌ Kaggle API error: Could not find kaggle.json. Make sure it's located in /Users/richpointofview/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/

🔧 To set up Kaggle API:
1. Install: pip install kaggle
2. Go to kaggle.com → Account → Create New API Token
3. Download kaggle.json file
4. Place kaggle.json in ~/.kaggle/ directory
5. Run: chmod 600 ~/.kaggle/kaggle.json (on Mac/Linux)

📁 Checking existing files in /Users/richpointofview/smart-vinyl-catalog/data/discogs...
📭 No existing Discogs files found

📋 Status Summary:
   Kaggle API Ready: ❌
   Existing Data: 📭

⚠️  Next steps:
   1. Set up Kaggle API (see instructions above)
   2. Or manually download Discogs data


In [None]:
# Cell 3 - Download Discogs Data (Multiple Methods)

def download_discogs_kaggle(dataset_name: str = "ofurkancoban/discogs-data-dumps-april-2025"):
    """Fetch a Discogs dataset from Kaggle and unzip it into DISCOGS_DIR.

    Args:
        dataset_name: Kaggle dataset slug in ``owner/dataset`` form.

    Returns:
        True when the download finished, False if anything failed (missing
        package, failed authentication, download error). Failures are printed
        rather than raised.
    """
    succeeded = False
    try:
        import kaggle
        kaggle.api.authenticate()

        print(f"📥 Downloading {dataset_name}...")
        # Download straight into the Discogs data folder and extract the archive.
        download_options = {"path": str(DISCOGS_DIR), "unzip": True}
        kaggle.api.dataset_download_files(dataset_name, **download_options)

        print("✅ Download complete!")
        succeeded = True
    except Exception as download_error:
        print(f"❌ Kaggle download failed: {download_error}")
    return succeeded

def download_discogs_manual():
    """Print step-by-step instructions for downloading Discogs data by hand.

    Covers the Kaggle mirror and the official Discogs dump listing, and
    states the directory (DISCOGS_DIR) the files should be placed in.
    Prints only; returns None.
    """
    instructions = [
        "🔗 Manual Download Options:",
        # Kaggle mirror of the dumps
        "\n📋 Option 1 - Kaggle (Recommended):",
        "   1. Go to: https://www.kaggle.com/datasets/ofurkancoban/discogs-data-dumps-april-2025",
        "   2. Click 'Download' button",
        f"   3. Extract files to: {DISCOGS_DIR}",
        # Official dump bucket
        "\n📋 Option 2 - Official Discogs (Large files):",
        "   1. Go to: https://discogs-data-dumps.s3.us-west-2.amazonaws.com/index.html",
        "   2. Download recent releases.xml.gz file",
        f"   3. Save to: {DISCOGS_DIR}",
        f"\n📁 Target directory: {DISCOGS_DIR}",
    ]
    print("\n".join(instructions))
    
def create_sample_discogs_data(
    n_releases: int = 1000,
    seed: Optional[int] = 42,
    output_dir: Optional[Union[str, Path]] = None,
):
    """Create synthetic Discogs-style release data for testing.

    Cycles through ten classic albums to build ``n_releases`` rows, attaches
    a random format and rating to each, writes the table to
    ``sample_discogs_releases.csv`` and returns it.

    Args:
        n_releases: Number of rows to generate (default 1000).
        seed: Seed for the random format/rating draws. The fixed default makes
            the sample identical on every run; pass None for fresh randomness.
        output_dir: Directory for the CSV. Defaults to the module-level
            ``DISCOGS_DIR`` when omitted.

    Returns:
        pandas.DataFrame with one row per generated release.

    Raises:
        ValueError: If ``n_releases`` is negative.
    """
    if n_releases < 0:
        raise ValueError("n_releases must be non-negative")

    print("🎯 Creating sample Discogs data for testing...")

    # Sample classic vinyl releases
    classic_data = [
        {"title": "Kind of Blue", "artist": "Miles Davis", "label": "Columbia", "year": 1959, "genre": "Jazz", "country": "US"},
        {"title": "Pet Sounds", "artist": "The Beach Boys", "label": "Capitol", "year": 1966, "genre": "Pop", "country": "US"},
        {"title": "Sgt. Pepper's Lonely Hearts Club Band", "artist": "The Beatles", "label": "Parlophone", "year": 1967, "genre": "Rock", "country": "UK"},
        {"title": "What's Going On", "artist": "Marvin Gaye", "label": "Tamla", "year": 1971, "genre": "Soul", "country": "US"},
        {"title": "The Dark Side of the Moon", "artist": "Pink Floyd", "label": "Harvest", "year": 1973, "genre": "Progressive Rock", "country": "UK"},
        {"title": "Songs in the Key of Life", "artist": "Stevie Wonder", "label": "Tamla", "year": 1976, "genre": "Soul", "country": "US"},
        {"title": "Rumours", "artist": "Fleetwood Mac", "label": "Warner Bros.", "year": 1977, "genre": "Rock", "country": "US"},
        {"title": "Unknown Pleasures", "artist": "Joy Division", "label": "Factory", "year": 1979, "genre": "Post-Punk", "country": "UK"},
        {"title": "London Calling", "artist": "The Clash", "label": "CBS", "year": 1979, "genre": "Punk", "country": "UK"},
        {"title": "Purple Rain", "artist": "Prince", "label": "Warner Bros.", "year": 1984, "genre": "Pop", "country": "US"}
    ]

    # Draw all random values up front from a dedicated generator, so the
    # result depends only on `seed` and not on global NumPy random state.
    rng = np.random.default_rng(seed)
    formats = rng.choice(['LP', '12"', '7"'], size=n_releases, p=[0.7, 0.2, 0.1])
    # Ratings are normal draws forced into the 1-5 scale, one decimal place.
    ratings = np.clip(rng.normal(4.1, 0.6, size=n_releases), 1, 5).round(1)

    columns = [
        'discogs_id', 'title', 'artist', 'label', 'year', 'genre', 'country',
        'format', 'rating', 'catalog_number', 'status', 'source',
    ]

    # Expand sample data
    sample_releases = []
    for i in range(n_releases):
        base_record = classic_data[i % len(classic_data)]

        # The first 50 rows keep the plain title; later rows are labelled as
        # reissues, numbered per block of 50.
        if i < 50:
            title = base_record['title']
        else:
            title = f"{base_record['title']} (Reissue {i//50})"

        sample_releases.append({
            'discogs_id': f'discogs_{i+1}',
            'title': title,
            'artist': base_record['artist'],
            'label': base_record['label'],
            'year': base_record['year'] + (i // 100),  # Vary years
            'genre': base_record['genre'],
            'country': base_record['country'],
            'format': str(formats[i]),
            'rating': float(ratings[i]),
            'catalog_number': f"CAT-{i+1000}",
            'status': 'Accepted',
            'source': 'discogs_sample'
        })

    # Explicit columns keep the schema intact even when n_releases == 0.
    sample_df = pd.DataFrame(sample_releases, columns=columns)

    # Save to CSV for easier handling
    target_dir = DISCOGS_DIR if output_dir is None else Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    sample_path = target_dir / 'sample_discogs_releases.csv'
    sample_df.to_csv(sample_path, index=False)

    print(f"✅ Created {len(sample_df)} sample Discogs releases")
    print(f"📄 Saved to: {sample_path}")
    return sample_df

# Main execution
print("🎯 Choose your download method:")
print("1. 🔑 Set up Kaggle API first (recommended)")
print("2. 📥 Try Kaggle download anyway")
print("3. 📋 Manual download instructions")
print("4. 🎯 Create sample data for testing")

# Setting DISCOGS_DOWNLOAD_CHOICE (1-4) in the environment skips the prompt,
# so the cell can run unattended (e.g. Restart & Run All, nbconvert).
choice = os.environ.get("DISCOGS_DOWNLOAD_CHOICE", "").strip()
if choice:
    print(f"\nUsing DISCOGS_DOWNLOAD_CHOICE={choice}")
else:
    try:
        choice = input("\nEnter choice (1-4): ").strip()
    except (EOFError, NotImplementedError):
        # No interactive stdin is available. NotImplementedError is intended to
        # cover IPython's StdinNotImplementedError — NOTE(review): confirm
        # against the IPython version in use. Fall back to the option that has
        # no side effects instead of failing the whole run.
        choice = '3'
        print("\n⚠️  No interactive input available; showing manual download instructions.")

if choice == '1':
    print("\n🔑 Kaggle API Setup Instructions:")
    print("1. Go to kaggle.com and log in")
    print("2. Go to Account > API > Create New API Token")
    print("3. Download kaggle.json file")
    print("4. Run these commands in terminal:")
    print("   mkdir -p ~/.kaggle")
    print("   mv ~/Downloads/kaggle.json ~/.kaggle/")
    print("   chmod 600 ~/.kaggle/kaggle.json")
    print("5. Come back and run this cell again with option 2")

elif choice == '2':
    success = download_discogs_kaggle()
    if success:
        # Check what we downloaded
        files = list(DISCOGS_DIR.glob('*'))
        print(f"\n📁 Downloaded files ({len(files)}):")
        for f in files[:10]:
            print(f"   📄 {f.name}")

elif choice == '3':
    download_discogs_manual()

elif choice == '4':
    sample_data = create_sample_discogs_data()
    print("\n📊 Sample data preview:")
    print(sample_data[['title', 'artist', 'label', 'year', 'genre', 'rating']].head())

else:
    print("❌ Invalid choice. Please run the cell again and choose 1-4.")
