In [1]:
# Cell 1 - Setup and Load Existing Data
import pandas as pd
import numpy as np
from pathlib import Path
import time

# Resolve the project layout. When the kernel was started inside `notebooks/`,
# the project root is one level up; otherwise the working directory is the root.
if Path.cwd().name == 'notebooks':
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data'
DISCOGS_DIR = DATA_DIR / 'discogs'
PROCESSED_DIR = DATA_DIR / 'processed'

print(f"Project root: {PROJECT_ROOT}")
print(f"Discogs directory: {DISCOGS_DIR}")
print(f"Processed directory: {PROCESSED_DIR}")

# Load the combined catalog written by the integration notebook, if it exists.
existing_catalog_path = PROCESSED_DIR / 'final_combined_catalog_real_data.csv'

if not existing_catalog_path.exists():
    print("Error: Existing catalog not found. Run the integration notebook first.")
    # Later cells check for None before using the catalog.
    existing_catalog = None
else:
    existing_catalog = pd.read_csv(existing_catalog_path, low_memory=False)
    print(f"Loaded existing catalog: {len(existing_catalog):,} records")

    # Summarise the loaded frame: column names and deep memory footprint in MB.
    print(f"Current columns: {list(existing_catalog.columns)}")
    print(f"Memory usage: {existing_catalog.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

# Expected Discogs CSV exports, keyed by entity type.
discogs_files = {
    'artists': DISCOGS_DIR / 'discogs_20250901_artists.csv',
    'labels': DISCOGS_DIR / 'discogs_20250901_labels.csv',
    'masters': DISCOGS_DIR / 'discogs_20250901_masters.csv',
    'releases': DISCOGS_DIR / 'discogs_20250901_releases.csv'
}

# Report which exports are present on disk, with their size in GiB.
print("\nDiscogs files available:")
for name, path in discogs_files.items():
    if not path.exists():
        print(f"  ❌ {name}: Not found")
        continue
    size_gb = path.stat().st_size / (1024**3)
    print(f"  ✅ {name}: {path.name} ({size_gb:.1f} GB)")


Project root: /Users/richpointofview/smart-vinyl-catalog
Discogs directory: /Users/richpointofview/smart-vinyl-catalog/data/discogs
Processed directory: /Users/richpointofview/smart-vinyl-catalog/data/processed
Loaded existing catalog: 55,000 records
Current columns: ['release_id', 'title', 'artist', 'album', 'genre', 'year', 'duration', 'tags', 'plays', 'favorites', 'license', 'popularity_score', 'rating', 'label', 'country', 'review_text', 'review_source', 'source', 'catalog_number', 'discogs_id', 'format', 'status', 'release_videos_video_embed', 'release_labels_label_name', 'release_videos_video_duration', 'tracklist_track_position', 'companies_company_resource_url', 'release_labels_label_id', 'releases_release_data_quality', 'companies_company_catno', 'sub_tracks_track_title', 'release_formats_format_text', 'companies_company_name', 'releases_release_title', 'artists_artist_id', 'releases_release_released', 'companies_company_entity_type_name', 'artists_artist_join', 'releases_rele

In [None]:
# Cell 2 - Process Artists Data from CSV

import pandas as pd

def process_artists_data_final(artists_csv_path, sample_size=10000, output_path=None):
    """Sample the artists CSV and derive flat, per-artist completeness fields.

    The CSV is read in full, a reproducible random sample (``random_state=42``)
    of at most ``sample_size`` rows is drawn, each row is normalised into a flat
    record, and the resulting table is written to CSV.

    Args:
        artists_csv_path: ``pathlib.Path`` of the artists CSV (its ``.name`` is
            used in the progress message).
        sample_size: Maximum number of rows to sample.
        output_path: Destination of the processed CSV. When omitted, the
            notebook-level ``PROCESSED_DIR / 'processed_artists.csv'`` is used.

    Returns:
        A DataFrame with one row per successfully processed artist, or ``None``
        if loading, processing or saving failed (the error and traceback are
        printed instead of being raised).
    """
    print(f"Processing artists data from CSV (sample size: {sample_size})...")

    def first_present(row, *columns):
        """Return the first non-null, truthy value among ``columns``, else None.

        A plain ``a or b`` chain does not work on pandas rows: NaN is truthy,
        so a null in the primary column would be returned as-is and would also
        count as "present" in ``bool()`` checks.
        """
        for column in columns:
            value = row.get(column)
            if pd.notna(value) and value:
                return value
        return None

    def as_text(value, limit=None):
        """Render a cell as text ('' for null), optionally truncated to ``limit``."""
        if pd.isna(value):
            return ''
        text = str(value)
        return text if limit is None else text[:limit]

    try:
        # Load the CSV data
        df_raw = pd.read_csv(artists_csv_path, low_memory=False)
        print(f"✅ Loaded {len(df_raw):,} total artists from: {artists_csv_path.name}")

        # Sample data (fixed seed so reruns pick the same rows)
        sample_data = df_raw.sample(n=min(sample_size, len(df_raw)), random_state=42).copy()

        processed_artists = []

        for i, artist in sample_data.iterrows():
            try:
                # Each field accepts either of two column spellings.
                artist_id = first_present(artist, 'id', 'artist_id')
                name = first_present(artist, 'name', 'artist_name')
                real_name = first_present(artist, 'realname', 'real_name')

                profile = as_text(artist.get('profile'), 500)
                urls_str = as_text(artist.get('urls'), 200)
                name_variations_str = as_text(
                    first_present(artist, 'namevariations', 'name_variations'), 200
                )
                aliases_str = as_text(artist.get('aliases'), 200)
                members_str = as_text(artist.get('members'), 200)
                groups_str = as_text(artist.get('groups'), 200)

                # Only presence of image data is recorded (0 or 1), not a real count.
                image_count = 1 if as_text(artist.get('images')).strip() else 0

                has_name = name is not None
                has_real_name = real_name is not None

                # Quality score: number of populated fields out of eight
                # (groups are stored but not part of the score).
                quality_score = sum([
                    has_name, has_real_name, bool(profile), bool(urls_str),
                    bool(name_variations_str), bool(aliases_str),
                    bool(members_str), image_count > 0
                ])

                processed_artists.append({
                    'artist_id': artist_id,
                    'name': name,
                    'real_name': real_name,
                    'profile': profile,
                    'urls': urls_str,
                    'name_variations': name_variations_str,
                    'aliases': aliases_str,
                    'members': members_str,
                    'groups': groups_str,
                    'image_count': image_count,
                    'quality_score': quality_score,
                    'has_profile': bool(profile),
                    'has_real_name': has_real_name,
                    'has_variations': bool(name_variations_str),
                    'has_aliases': bool(aliases_str),
                    'has_members': bool(members_str),
                    'has_groups': bool(groups_str)
                })

            except Exception as e:
                # Best effort: a malformed row is reported and skipped.
                print(f"⚠️ Error processing row {i}: {e}")
                continue

        df_artists = pd.DataFrame(processed_artists)

        print(f"\n✅ Successfully processed {len(df_artists)} artists")
        print(f"Average quality score: {df_artists['quality_score'].mean():.2f}")
        print(f"Artists with profiles: {df_artists['has_profile'].sum()}")
        print(f"Artists with real names: {df_artists['has_real_name'].sum()}")
        print(f"Artists with name variations: {df_artists['has_variations'].sum()}")
        print(f"Artists with aliases: {df_artists['has_aliases'].sum()}")

        # Save processed data
        if output_path is None:
            output_path = PROCESSED_DIR / 'processed_artists.csv'
        df_artists.to_csv(output_path, index=False)
        print(f"📁 Saved processed artists data to '{output_path}'")

        return df_artists

    except Exception as e:
        print("❌ Failed to process artists data")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Process the artists export (default sample size) and keep the result for the merge step
artists_csv_path = DISCOGS_DIR.joinpath('discogs_20250901_artists.csv')
df_artists = process_artists_data_final(artists_csv_path=artists_csv_path)


In [None]:
# Cell 3 - Process Labels Data from CSV

def process_labels_data(labels_csv_path, sample_size=10000, output_path=None):
    """Sample the labels CSV and reduce each row to a small flat record.

    The CSV is read in full, a reproducible random sample (``random_state=42``)
    of at most ``sample_size`` rows is drawn, and the selected fields are
    written to CSV.

    Args:
        labels_csv_path: ``pathlib.Path`` of the labels CSV (its ``.name`` is
            used in the progress message).
        sample_size: Maximum number of rows to sample.
        output_path: Destination of the processed CSV. When omitted, the
            notebook-level ``PROCESSED_DIR / 'processed_labels.csv'`` is used.

    Returns:
        A DataFrame with columns ``label_id``, ``label_name``, ``contact_info``,
        ``parent_label_id``, ``profile`` (at most 500 characters) and
        ``data_quality``; or ``None`` if loading, processing or saving failed
        (the error and traceback are printed instead of being raised).
    """
    print(f"Processing labels data from CSV (sample size: {sample_size})...")

    def first_present(row, *columns):
        """Return the first non-null, truthy value among ``columns``, else None.

        Replaces ``row.get(a) or row.get(b)``: NaN is truthy, so that form
        never reaches the fallback column when the primary one holds a null.
        """
        for column in columns:
            value = row.get(column)
            if pd.notna(value) and value:
                return value
        return None

    try:
        df_raw = pd.read_csv(labels_csv_path, low_memory=False)
        print(f"✅ Loaded {len(df_raw):,} total labels from: {labels_csv_path.name}")

        # Fixed seed so reruns pick the same rows.
        sample_data = df_raw.sample(n=min(sample_size, len(df_raw)), random_state=42).copy()

        processed_labels = []

        for i, label in sample_data.iterrows():
            try:
                raw_profile = label.get('profile')
                profile = str(raw_profile) if pd.notna(raw_profile) else ''

                processed_labels.append({
                    'label_id': first_present(label, 'id', 'label_id'),
                    'label_name': first_present(label, 'name', 'label_name'),
                    'contact_info': label.get('contact_info', ''),
                    # Series.get returns None when the column is absent.
                    'parent_label_id': label.get('parent_label_id'),
                    'profile': profile[:500],
                    'data_quality': label.get('data_quality', '')
                })
            except Exception as e:
                # Best effort: a malformed row is reported and skipped.
                print(f"⚠️ Error processing label row {i}: {e}")
                continue

        df_labels = pd.DataFrame(processed_labels)

        print(f"\n✅ Successfully processed {len(df_labels)} labels")
        if output_path is None:
            output_path = PROCESSED_DIR / 'processed_labels.csv'
        df_labels.to_csv(output_path, index=False)
        print(f"📁 Saved processed labels data to '{output_path}'")

        return df_labels

    except Exception as e:
        print("❌ Failed to process labels data")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Process the labels export (default sample size) and keep the result for the merge step
labels_csv_path = DISCOGS_DIR.joinpath('discogs_20250901_labels.csv')
df_labels = process_labels_data(labels_csv_path=labels_csv_path)


In [None]:
# Cell 4 - Process Masters Data from CSV

def process_masters_data(masters_csv_path, sample_size=10000, output_path=None):
    """Sample the masters CSV and reduce each row to a small flat record.

    The CSV is read in full, a reproducible random sample (``random_state=42``)
    of at most ``sample_size`` rows is drawn, and the selected fields are
    written to CSV.

    Args:
        masters_csv_path: ``pathlib.Path`` of the masters CSV (its ``.name`` is
            used in the progress message).
        sample_size: Maximum number of rows to sample.
        output_path: Destination of the processed CSV. When omitted, the
            notebook-level ``PROCESSED_DIR / 'processed_masters.csv'`` is used.

    Returns:
        A DataFrame with columns ``master_id``, ``original_year``,
        ``main_release_id``, ``num_versions`` and ``master_title``; or ``None``
        if loading, processing or saving failed (the error and traceback are
        printed instead of being raised).
    """
    print(f"Processing masters data from CSV (sample size: {sample_size})...")

    def first_present(row, *columns):
        """Return the first non-null, truthy value among ``columns``, else None.

        Replaces ``row.get(a) or row.get(b)``: NaN is truthy, so that form
        never reaches the fallback column when the primary one holds a null.
        Falsy values such as 0 are still skipped, exactly as the ``or`` chain
        did, so e.g. a year of 0 keeps being treated as missing.
        """
        for column in columns:
            value = row.get(column)
            if pd.notna(value) and value:
                return value
        return None

    try:
        df_raw = pd.read_csv(masters_csv_path, low_memory=False)
        print(f"✅ Loaded {len(df_raw):,} total masters from: {masters_csv_path.name}")

        # Fixed seed so reruns pick the same rows.
        sample_data = df_raw.sample(n=min(sample_size, len(df_raw)), random_state=42).copy()

        processed_masters = []

        for i, master in sample_data.iterrows():
            try:
                processed_masters.append({
                    'master_id': first_present(master, 'id', 'master_id'),
                    'original_year': first_present(master, 'year', 'original_year'),
                    'main_release_id': first_present(master, 'main_release', 'main_release_id'),
                    # Defaults to 0 only when the column itself is absent.
                    'num_versions': master.get('num_versions', 0),
                    'master_title': first_present(master, 'title', 'master_title')
                })
            except Exception as e:
                # Best effort: a malformed row is reported and skipped.
                print(f"⚠️ Error processing master row {i}: {e}")
                continue

        df_masters = pd.DataFrame(processed_masters)

        print(f"\n✅ Successfully processed {len(df_masters)} masters")
        if output_path is None:
            output_path = PROCESSED_DIR / 'processed_masters.csv'
        df_masters.to_csv(output_path, index=False)
        print(f"📁 Saved processed masters data to '{output_path}'")

        return df_masters

    except Exception as e:
        print("❌ Failed to process masters data")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None

# Process the masters export (default sample size) and keep the result for the merge step
masters_csv_path = DISCOGS_DIR.joinpath('discogs_20250901_masters.csv')
df_masters = process_masters_data(masters_csv_path=masters_csv_path)


In [None]:
# Cell 5 - Filter High-Quality Releases from Existing Catalog

def filter_high_quality_releases(df_catalog, min_popularity=0.7, min_rating=4.0, output_path=None):
    """Keep releases whose popularity score or rating exceeds a threshold.

    ``popularity_score`` and ``rating`` are coerced to numbers (unparseable
    values become NaN; a missing column is treated as all-NaN). Rows where both
    are NaN are discarded, then rows with ``popularity_score > min_popularity``
    OR ``rating > min_rating`` are kept and written to CSV.

    The input frame is not modified; coercion happens on a new frame.

    Args:
        df_catalog: Catalog DataFrame, or ``None``.
        min_popularity: Exclusive lower bound for ``popularity_score``.
        min_rating: Exclusive lower bound for ``rating``.
        output_path: Destination of the filtered CSV. When omitted, the
            notebook-level ``PROCESSED_DIR / 'filtered_catalog.csv'`` is used.

    Returns:
        The filtered DataFrame (with both score columns numeric), or ``None``
        when ``df_catalog`` is ``None`` or empty.
    """
    print("Filtering high-quality releases from existing catalog...")

    if df_catalog is None or df_catalog.empty:
        print("⚠️ Catalog is empty or missing.")
        return None

    def numeric_column(column):
        """Numeric version of ``column``; scalar NaN when the column is absent."""
        if column in df_catalog.columns:
            return pd.to_numeric(df_catalog[column], errors='coerce')
        return float('nan')

    # assign() builds a new frame, so the caller's catalog keeps its original dtypes.
    scored = df_catalog.assign(
        popularity_score=numeric_column('popularity_score'),
        rating=numeric_column('rating'),
    )

    # Drop rows where both scores are NaN; this is the population reported below.
    scored = scored.dropna(subset=['popularity_score', 'rating'], how='all')

    # Comparisons against NaN are False, so a row with one missing score can
    # still qualify through the other.
    keep = (scored['popularity_score'] > min_popularity) | (scored['rating'] > min_rating)
    filtered = scored[keep].copy()

    print(f"✅ Filtered down to {len(filtered):,} high-quality releases (from {len(scored):,})")

    if output_path is None:
        output_path = PROCESSED_DIR / 'filtered_catalog.csv'
    filtered.to_csv(output_path, index=False)
    print(f"📁 Saved filtered catalog to '{output_path}'")

    return filtered

# Select the high-quality subset of the catalog loaded in the setup cell
filtered_catalog = filter_high_quality_releases(df_catalog=existing_catalog)


In [None]:
# Cell 6 - Integration and Final Enhanced Catalog

def _merge_lookup(df, lookup, left_key, right_key, columns, name, renames=None):
    """Left-join selected ``lookup`` columns onto ``df``, or return ``df`` unchanged if not possible."""
    if lookup is None:
        print(f"❌ df_{name}s is None, skipping {name} merge.")
        return df
    if left_key not in df.columns or right_key not in lookup.columns:
        print(f"❌ {name.capitalize()} merge keys missing, skipping {name} merge.")
        return df

    # Tolerate lookups that lack some optional columns instead of raising KeyError.
    present = [column for column in columns if column in lookup.columns]

    # pandas matches null keys against each other, so lookup rows without a key
    # would attach themselves to every catalog row whose key is also null.
    right = lookup[present].dropna(subset=[right_key])

    # A left join emits one output row per matching lookup row; keep one row per
    # key so the catalog's row count is preserved.
    deduped = right.drop_duplicates(subset=right_key)
    if len(deduped) < len(right):
        print(f"⚠️ Dropped {len(right) - len(deduped)} duplicate {name} keys before merging.")

    merged = df.merge(
        deduped.rename(columns=renames or {}),
        left_on=left_key,
        right_on=right_key,
        how='left'
    )
    print(f"✅ Merged {name} data: {len(merged)} rows")
    return merged


def merge_enriched_catalog(df_catalog, df_artists, df_labels, df_masters, save_path):
    """Left-join label, artist and master lookups onto the catalog and save it.

    Each lookup is optional: a ``None`` lookup, or one whose join key is
    missing on either side, is skipped with a message. Join keys are
    ``release_labels_label_id`` -> ``label_id``, ``artists_artist_id`` ->
    ``artist_id`` and ``releases_release_master_id`` -> ``master_id``.

    Both the label and artist lookups carry a ``profile`` column; they are
    stored as ``label_profile`` and ``artist_profile`` so the result does not
    end up with pandas' automatic ``profile_x`` / ``profile_y`` names.

    Args:
        df_catalog: Catalog DataFrame to enrich (not modified), or ``None``.
        df_artists: Artist lookup DataFrame, or ``None``.
        df_labels: Label lookup DataFrame, or ``None``.
        df_masters: Master lookup DataFrame, or ``None``.
        save_path: Destination of the enriched CSV.

    Returns:
        The enriched DataFrame, or ``None`` (and nothing is written) when
        ``df_catalog`` is ``None``.
    """
    print("Integrating enriched data into final catalog...")

    if df_catalog is None:
        # An upstream step can return None for an empty or missing catalog.
        print("❌ df_catalog is None, nothing to integrate.")
        return None

    df = df_catalog.copy()

    # Merge label info if present
    df = _merge_lookup(
        df, df_labels,
        left_key='release_labels_label_id',
        right_key='label_id',
        columns=['label_id', 'label_name', 'contact_info', 'parent_label_id', 'profile', 'data_quality'],
        name='label',
        renames={'profile': 'label_profile'},
    )

    # Merge artist info
    df = _merge_lookup(
        df, df_artists,
        left_key='artists_artist_id',
        right_key='artist_id',
        columns=['artist_id', 'real_name', 'profile', 'quality_score'],
        name='artist',
        renames={'profile': 'artist_profile'},
    )

    # Merge master info
    df = _merge_lookup(
        df, df_masters,
        left_key='releases_release_master_id',
        right_key='master_id',
        columns=['master_id', 'original_year', 'main_release_id', 'num_versions', 'master_title'],
        name='master',
    )

    df.to_csv(save_path, index=False)
    print(f"📁 Final enriched catalog saved to: {save_path}")

    return df

# Build the final enriched catalog from the filtered releases and the three lookup tables
final_catalog = merge_enriched_catalog(
    df_catalog=filtered_catalog,
    df_artists=df_artists,
    df_labels=df_labels,
    df_masters=df_masters,
    save_path=PROCESSED_DIR.joinpath('final_enriched_catalog.csv'),
)
