In [2]:
# Setup Cell 1 - Imports
# Every dependency of the notebook is imported here, grouped per PEP 8:
# standard library first, then third-party packages.
import gzip
import json
import os
import re
import sqlite3
import time
import warnings
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

# Banner confirming the cell ran to completion (emoji restored from a
# mis-encoded byte sequence).
print("✅ All imports successful")

✅ All imports successful


In [3]:
# Setup Cell 2 - Configure display and warnings
# Show every column of a DataFrame, but cap the number of printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas/numpy deprecation and dtype warnings. Consider narrowing it
# to the specific categories that are actually noisy.
warnings.filterwarnings('ignore')

# Set plot style: matplotlib's built-in 'default' style (the original note says
# this is used in case of seaborn style issues) plus seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Banner confirming the cell ran to completion (emoji restored from a
# mis-encoded byte sequence).
print("✅ Display options configured")

✅ Display options configured


In [4]:
# Setup Cell 3 - Project paths
# If the kernel's working directory is named 'notebooks', the project root is
# its parent; otherwise the working directory itself is used as the root.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data'
DISCOGS_DIR = DATA_DIR / 'discogs'
PROCESSED_DIR = DATA_DIR / 'processed'
RAW_DIR = DATA_DIR / 'raw'

# Create directories. parents=True also creates DATA_DIR when needed, and
# exist_ok=True makes the cell safe to re-run.
for data_subdir in (DISCOGS_DIR, PROCESSED_DIR, RAW_DIR):
    data_subdir.mkdir(parents=True, exist_ok=True)

# Report the resolved locations (emoji restored from mis-encoded byte sequences).
print(f"📁 Project root: {PROJECT_ROOT}")
print(f"📁 Data directory: {DATA_DIR}")
print(f"📁 Discogs directory: {DISCOGS_DIR}")
print(f"📁 Processed directory: {PROCESSED_DIR}")
print("✅ Directory structure created")

📁 Project root: /Users/richpointofview/smart-vinyl-catalog
📁 Data directory: /Users/richpointofview/smart-vinyl-catalog/data
📁 Discogs directory: /Users/richpointofview/smart-vinyl-catalog/data/discogs
📁 Processed directory: /Users/richpointofview/smart-vinyl-catalog/data/processed
✅ Directory structure created


In [None]:
# Cell 1 - Load Existing Catalog
# Load the FMA-integrated catalog produced by the previous notebook. When the
# CSV is missing, fall back to a small synthetic catalog with the same columns
# so the remaining cells can still be exercised.

SAMPLE_SIZE = 100  # number of rows in the synthetic fallback catalog
SAMPLE_SEED = 42   # fixed seed so the fallback catalog is identical on every run


def _numeric_column(frame, column):
    """Return the non-missing numeric values of ``frame[column]``.

    Values that cannot be parsed as numbers are dropped. An empty float Series
    is returned when the column does not exist, so callers only need to check
    ``.empty`` before formatting statistics.
    """
    if column not in frame.columns:
        return pd.Series(dtype='float64')
    return pd.to_numeric(frame[column], errors='coerce').dropna()


fma_catalog_path = PROCESSED_DIR / 'fma_integrated.csv'

if fma_catalog_path.exists():
    existing_catalog = pd.read_csv(fma_catalog_path)
    print(f"✅ Loaded existing catalog: {len(existing_catalog):,} tracks")

    # Check if source column exists, if not add it
    if 'source' not in existing_catalog.columns:
        existing_catalog['source'] = 'fma_data'  # Default source for FMA data
        print("📝 Added 'source' column (set to 'fma_data')")

    print(f"📊 Sources: {existing_catalog['source'].value_counts().to_dict()}")

    # Display basic stats
    print("\n📈 Catalog Statistics:")
    for stat_label, stat_column in (("🎵 Unique genres", 'genre'), ("🎤 Unique artists", 'artist')):
        unique_count = (
            existing_catalog[stat_column].nunique()
            if stat_column in existing_catalog.columns
            else 'N/A'
        )
        print(f"   {stat_label}: {unique_count}")

    # Format numeric stats only when at least one usable value exists; an
    # all-missing or text-typed column would otherwise print 'nan' or raise.
    rating_values = _numeric_column(existing_catalog, 'rating')
    if rating_values.empty:
        print("   ⭐ No ratings available")
    else:
        print(f"   ⭐ Average rating: {rating_values.mean():.2f}")

    year_values = _numeric_column(existing_catalog, 'year')
    if year_values.empty:
        print("   📅 No year data")
    else:
        print(f"   📅 Year range: {year_values.min():.0f} - {year_values.max():.0f}")

else:
    print("⚠️  FMA catalog not found. Creating sample catalog for demonstration.")
    # Create sample data that matches FMA structure. A locally seeded generator
    # keeps the synthetic rows reproducible under Restart & Run All.
    rng = np.random.default_rng(SAMPLE_SEED)
    existing_catalog = pd.DataFrame({
        'track_id': range(SAMPLE_SIZE),
        'title': [f'Sample Track {i+1}' for i in range(SAMPLE_SIZE)],
        'artist': ['Various Artists'] * SAMPLE_SIZE,
        'genre': rng.choice(['Electronic', 'Rock', 'Jazz', 'Folk'], SAMPLE_SIZE),
        'rating': rng.normal(3.5, 0.8, SAMPLE_SIZE),
        'year': rng.integers(1990, 2024, SAMPLE_SIZE),       # upper bound exclusive
        'duration': rng.integers(120, 400, SAMPLE_SIZE),
        'plays': rng.integers(100, 10000, SAMPLE_SIZE),
        'favorites': rng.integers(10, 1000, SAMPLE_SIZE),
        'source': 'sample_data'
    })
    print(f"📝 Created sample catalog: {len(existing_catalog)} tracks")

# Display sample
print("\n🔍 Existing catalog sample:")
display(existing_catalog.head())

print("\n📋 Column info:")
print(f"   Columns: {list(existing_catalog.columns)}")
print(f"   Shape: {existing_catalog.shape}")
# deep=True counts the actual bytes held by object (string) columns.
print(f"   Memory usage: {existing_catalog.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")