# DYLUMO - Image to Music Training

This notebook trains a FastMLP model to map image features to music features.

**Architecture:** CLIP Image Features (512-dim) → FastMLP → Spotify Audio Features (13-dim)

## BEFORE RUNNING:

1. **Enable GPU**: Click **Settings** (gear icon) → **Accelerator** → Select **"GPU T4 x2"**
2. Click **"Run All"** and wait ~30 minutes

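**Both datasets download automatically!** If the Spotify download fails, add the dataset manually: **+ Add Data** → search `spotify-1million-tracks` → **Add**.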

---

**Datasets:**
- Spotify 1M Tracks (from Kaggle)
- EMID Images (from HuggingFace)


In [None]:
# Cell 1: Install packages and setup
!pip install open-clip-torch faiss-cpu huggingface_hub pyarrow -q

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
from PIL import Image
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch random seeds so repeated runs draw the same numbers
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')
if device.type == 'cuda':
    print(f'GPU: {torch.cuda.get_device_name(0)}')


In [None]:
# Cell 2: Download and load Spotify dataset
# Audio feature columns read from the Spotify CSV. Their order fixes the column
# order of everything derived from this list.
AUDIO_FEATURES = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                  'duration_ms', 'time_signature']

# Try multiple possible paths for Spotify data (checked in order)
SPOTIFY_PATHS = [
    '/kaggle/input/spotify-1million-tracks/spotify_data.csv',  # If added via UI
    '/kaggle/working/spotify/spotify_data.csv'  # If downloaded via API
]

# First candidate that exists on disk, or None when the dataset is absent
spotify_path = next((path for path in SPOTIFY_PATHS if os.path.exists(path)), None)

# Download if not found
if spotify_path is None:
    print('Spotify dataset not found. Downloading from Kaggle...')
    import subprocess
    os.makedirs('/kaggle/working/spotify', exist_ok=True)

    manual_hint = "Please add it manually: Click '+ Add Data' → Search 'spotify-1million-tracks' → Add"

    # Download using Kaggle API (available in Kaggle notebooks)
    try:
        result = subprocess.run(
            ['kaggle', 'datasets', 'download', '-d', 'amitanshjoshi/spotify-1million-tracks',
             '-p', '/kaggle/working/spotify', '--unzip'],
            capture_output=True, text=True
        )
    except FileNotFoundError as err:
        # subprocess raises FileNotFoundError when the executable itself is missing;
        # re-raise with the same type but with guidance the user can act on.
        raise FileNotFoundError(
            "The 'kaggle' command-line tool is not available, so the Spotify dataset "
            "could not be downloaded.\n" + manual_hint
        ) from err

    if result.returncode != 0:
        print(f'Download error: {result.stderr}')
        raise FileNotFoundError(
            "Could not download Spotify dataset.\n" + manual_hint
        )

    downloaded_csv = '/kaggle/working/spotify/spotify_data.csv'
    # A zero exit code does not prove the archive contained the expected file name,
    # so check before handing the path to pandas.
    if not os.path.exists(downloaded_csv):
        raise FileNotFoundError(
            f"Download finished but {downloaded_csv} was not found.\n" + manual_hint
        )
    spotify_path = downloaded_csv
    print('Download complete!')

print(f'Loading Spotify data from: {spotify_path}')
spotify_df = pd.read_csv(spotify_path)
print(f'Loaded {len(spotify_df):,} Spotify tracks')

# Min-max scale the audio feature columns (MinMaxScaler with its default settings).
# The fitted scaler object stays in `scaler`; the scaled matrix goes to `features_norm`.
scaler = MinMaxScaler()
scaler.fit(spotify_df[AUDIO_FEATURES])
features_norm = scaler.transform(spotify_df[AUDIO_FEATURES])

# Map to emotions
def map_emotion(v, e):
    """Return an emotion label for a (valence, energy) pair.

    `v` selects between two label triples (threshold 0.5); `e` then picks
    within the triple (thresholds 0.7 and 0.5):

        v >= 0.5: 'excitement' / 'amusement' / 'contentment'
        otherwise: 'anger' / 'fear' / 'sadness'

    Any comparison that is false (including comparisons against NaN) falls
    through to the lower branch.
    """
    if v >= 0.5:
        high, mid, low = 'excitement', 'amusement', 'contentment'
    else:
        high, mid, low = 'anger', 'fear', 'sadness'

    if e >= 0.7:
        return high
    if e >= 0.5:
        return mid
    return low

# Label every track with map_emotion(valence, energy). Iterating the two columns
# directly avoids DataFrame.apply(axis=1), which builds a Series object for each
# row and is very slow on a large frame; the resulting labels are the same.
spotify_df['emotion'] = [
    map_emotion(v, e) for v, e in zip(spotify_df['valence'], spotify_df['energy'])
]

# Attach each scaled feature as a '<feature>_norm' column, in AUDIO_FEATURES order
for i, c in enumerate(AUDIO_FEATURES):
    spotify_df[f'{c}_norm'] = features_norm[:, i]

print(spotify_df['emotion'].value_counts())


In [None]:
# Cell 3: Download EMID dataset from HuggingFace (automatic)
from huggingface_hub import list_repo_files, hf_hub_download
import pyarrow.parquet as pq

# Make sure the two working directories used for EMID exist
for folder in ('/kaggle/working/emid/images', '/kaggle/working/emid/raw'):
    os.makedirs(folder, exist_ok=True)

# List the dataset repository and keep only its parquet files
files = list_repo_files('ecnu-aigc/EMID', repo_type='dataset')
parquet_files = [name for name in files if name.endswith('.parquet')]

# Fetch each parquet file into the raw directory
for pf in tqdm(parquet_files, desc='Downloading EMID'):
    hf_hub_download(
        repo_id='ecnu-aigc/EMID',
        repo_type='dataset',
        filename=pf,
        local_dir='/kaggle/working/emid/raw',
    )

# Fetch the metadata CSV next to the raw directory
hf_hub_download(
    repo_id='ecnu-aigc/EMID',
    repo_type='dataset',
    filename='EMID_data.csv',
    local_dir='/kaggle/working/emid',
)
print('Download complete!')


In [None]:
# Cell 4: Extract images from parquet files
images_dir = Path('/kaggle/working/emid/images')
raw_dir = Path('/kaggle/working/emid/raw')
image_cols = ['Image1_filename', 'Image2_filename', 'Image3_filename']
saved = 0
failed = 0  # images whose bytes could not be written to disk

# Search recursively so the parquet files are found whichever sub-folder of
# raw_dir they were downloaded into.
parquet_paths = sorted(raw_dir.rglob('*.parquet'))
if not parquet_paths:
    print(f'WARNING: no parquet files found under {raw_dir}')

for pq_file in tqdm(parquet_paths):
    df = pq.read_table(pq_file).to_pandas()
    # Skip image columns that this file does not contain instead of failing per row
    present_cols = [col for col in image_cols if col in df.columns]
    for idx, row in df.iterrows():
        for col in present_cols:
            d = row[col]
            # Only dict cells carrying a non-empty 'bytes' payload hold an image
            if not isinstance(d, dict) or not d.get('bytes'):
                continue
            # 'path' may be missing or None; fall back to a generated name
            fn = os.path.basename(d.get('path') or f'{col}_{idx}.jpg')
            p = images_dir / fn
            if p.exists():
                continue
            try:
                p.write_bytes(d['bytes'])
            except OSError as err:
                # Report the failure rather than swallowing it, but keep going
                failed += 1
                print(f'Could not write {p}: {err}')
            else:
                saved += 1
print(f'Extracted {saved} images')
if failed:
    print(f'{failed} images could not be written')


In [None]:
# Cell 5: Prepare EMID metadata
emid_df = pd.read_csv('/kaggle/working/emid/EMID_data.csv')

# Flatten the three image slots of every CSV row into (filename, emotion) records,
# skipping slots whose filename is missing.
img_data = []
for _, record in emid_df.iterrows():
    for slot in (1, 2, 3):
        name = record[f'Image{slot}_filename']
        label = record[f'Image{slot}_tag']
        if pd.isna(name):
            continue
        img_data.append({'filename': name, 'emotion': label})

# Keep the first record per filename, flag which files are present in images_dir,
# and retain only those, renumbering the index from zero.
unique_images = pd.DataFrame(img_data).drop_duplicates('filename')
unique_images['exists'] = unique_images['filename'].apply(lambda x: (images_dir / x).exists())
images_df = unique_images[unique_images['exists']].reset_index(drop=True)
print(f'{len(images_df)} images ready')


In [None]:
# Cell 6: Extract CLIP features from images (GPU accelerated)
import open_clip

print('Loading CLIP model...')
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
clip_model = clip_model.to(device).eval()
print('CLIP model loaded!')

# Encode the images in chunks of 64, collecting one array of unit-length
# embeddings per chunk.
all_feats = []
for start in tqdm(range(0, len(images_df), 64), desc='CLIP'):
    chunk = images_df.iloc[start:start + 64]

    # Load and preprocess every image of the chunk
    pixel_tensors = []
    for name in chunk['filename']:
        rgb = Image.open(images_dir / name).convert('RGB')
        pixel_tensors.append(clip_preprocess(rgb))

    with torch.no_grad():
        stacked = torch.stack(pixel_tensors).to(device)
        embeddings = clip_model.encode_image(stacked)
        # Divide each embedding by its L2 norm
        embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
    all_feats.append(embeddings.cpu().numpy())

image_features = np.vstack(all_feats).astype(np.float32)
print(f'Features: {image_features.shape}')

# Free GPU memory
del clip_model
torch.cuda.empty_cache()
print('CLIP model unloaded to free GPU memory')


In [None]:
# Cell 7: Create image-song training pairs
norm_cols = [f'{c}_norm' for c in AUDIO_FEATURES]
SONGS_PER_IMAGE = 5

# Group the songs by emotion once, keeping only the target columns. Filtering the
# full Spotify frame inside the loop would repeat the same scan for every image.
songs_by_emotion = {
    emotion: group[norm_cols]
    for emotion, group in spotify_df.groupby('emotion', sort=False)
}

X, y = [], []
for idx, emotion in tqdm(images_df['emotion'].items(), total=len(images_df)):
    songs = songs_by_emotion.get(emotion)
    if songs is None:
        continue  # no song carries this image's emotion label
    # The seed depends on the image index, so each image draws a reproducible sample
    picked = songs.sample(min(SONGS_PER_IMAGE, len(songs)), random_state=42 + idx)
    targets = picked.to_numpy(dtype=np.float32)
    # One (image features, song features) pair per sampled song
    X.extend([image_features[idx]] * len(targets))
    y.extend(targets)

X, y = np.array(X), np.array(y)
print(f'Pairs: {len(X)}')


In [None]:
# Cell 8: Split into train/val/test sets
n = len(X)

# Pairs built from the same image carry an identical row in X. Splitting pair by
# pair would place the same input in train and in val/test, so the split is made
# per group of identical rows (80% / 10% / 10% of the groups).
if n:
    _, group_of_pair = np.unique(X, axis=0, return_inverse=True)
    group_of_pair = group_of_pair.reshape(-1)  # inverse may come back with an extra axis
    n_groups = int(group_of_pair.max()) + 1
else:
    group_of_pair = np.zeros(0, dtype=np.int64)
    n_groups = 0

# Assign each group to a split: 0 = train, 1 = val, 2 = test
group_order = np.random.permutation(n_groups)
train_end, val_end = int(0.8 * n_groups), int(0.9 * n_groups)
split_of_group = np.empty(n_groups, dtype=np.int64)
split_of_group[group_order[:train_end]] = 0
split_of_group[group_order[train_end:val_end]] = 1
split_of_group[group_order[val_end:]] = 2

# Shuffle the pairs, then route every pair to the split of its group
idx = np.random.permutation(n)
split_of_pair = split_of_group[group_of_pair[idx]]
train_idx = idx[split_of_pair == 0]
val_idx = idx[split_of_pair == 1]
test_idx = idx[split_of_pair == 2]

train_X, train_y = X[train_idx], y[train_idx]
val_X, val_y = X[val_idx], y[val_idx]
test_X, test_y = X[test_idx], y[test_idx]
print(f'Train: {len(train_X)}, Val: {len(val_X)}, Test: {len(test_X)}')


In [None]:
# Cell 9: Define FastMLP model
class FastMLP(nn.Module):
    """Maps CLIP features (512-dim) to Spotify audio features (13-dim).

    A feed-forward network: every hidden layer is Linear -> BatchNorm1d -> ReLU
    -> Dropout, followed by a final Linear projection with no activation.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Size of each predicted feature vector.
        hidden_dims: Widths of the hidden layers, in order. The default builds
            the 512 -> 256 -> 128 stack, with the same layer order and
            state-dict keys ('net.0.weight', ...) as a hand-written Sequential.
        dropout: Dropout probability applied after every hidden layer.
    """
    def __init__(self, input_dim=512, output_dim=13, hidden_dims=(512, 256, 128), dropout=0.2):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = width
        # Output projection; no activation so the caller's loss sees raw values
        layers.append(nn.Linear(in_features, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run a batch `x` of shape (batch, input_dim) through the network."""
        return self.net(x)

# Build the network with its default dimensions and move it to the selected device
model = FastMLP()
model = model.to(device)
# Total number of elements across all parameter tensors
param_count = sum(p.numel() for p in model.parameters())
print(f'FastMLP Parameters: {param_count:,}')


In [None]:
# Cell 10: Training loop
EPOCHS = 100
BATCH_SIZE = 128
PATIENCE = 15  # Early stopping

train_ds = TensorDataset(torch.FloatTensor(train_X), torch.FloatTensor(train_y))
val_ds = TensorDataset(torch.FloatTensor(val_X), torch.FloatTensor(val_y))

# BatchNorm1d cannot run in training mode on a batch holding a single sample.
# Drop the trailing batch only when it would contain exactly one sample.
drop_singleton = len(train_ds) > BATCH_SIZE and len(train_ds) % BATCH_SIZE == 1
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_singleton)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate when the validation loss stops improving
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5)

best_loss = float('inf')
patience_counter = 0

print(f'Training for up to {EPOCHS} epochs (early stopping patience: {PATIENCE})')
print('-' * 50)

for epoch in range(EPOCHS):
    # Training
    model.train()
    train_loss, train_seen = 0.0, 0
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        loss = criterion(model(bx), by)
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so a short last batch is not over-counted
        train_loss += loss.item() * bx.size(0)
        train_seen += bx.size(0)
    train_loss /= train_seen

    # Validation
    model.eval()
    val_loss, val_seen = 0.0, 0
    with torch.no_grad():
        for bx, by in val_loader:
            bx, by = bx.to(device), by.to(device)
            val_loss += criterion(model(bx), by).item() * bx.size(0)
            val_seen += bx.size(0)
    val_loss /= val_seen

    scheduler.step(val_loss)

    # Progress: every 10th epoch, plus every epoch that sets a new best
    if epoch % 10 == 0 or val_loss < best_loss:
        print(f'Epoch {epoch+1:3d}/{EPOCHS} | Train: {train_loss:.4f} | Val: {val_loss:.4f}')

    # Save best
    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), '/kaggle/working/best_model.pt')
    else:
        patience_counter += 1

    # Early stopping
    if patience_counter >= PATIENCE:
        print(f'\nEarly stopping at epoch {epoch+1}')
        break

print('-' * 50)
print(f'Training complete! Best val loss: {best_loss:.4f}')


In [None]:
# Cell 11: Save all output files
import copy

print('Saving output files...')

# Load best model (map_location keeps the load working whichever device saved it)
model.load_state_dict(torch.load('/kaggle/working/best_model.pt', map_location=device))

# 1. Save model with metadata
# Export from a CPU copy so the checkpoint can be loaded on a machine without a
# GPU; the live `model` stays on its current device.
export_model = copy.deepcopy(model).to('cpu')
torch.save({
    'model_state_dict': export_model.state_dict(),
    'input_dim': 512,
    'output_dim': 13,
    'hidden_dims': [512, 256, 128],
    'best_val_loss': best_loss
}, '/kaggle/working/dylumo_model.pt')
print('1. Saved: dylumo_model.pt')

# 2. Save scaler
with open('/kaggle/working/spotify_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print('2. Saved: spotify_scaler.pkl')

# 3. Save Spotify features for FAISS
np.save('/kaggle/working/spotify_features.npy', spotify_df[norm_cols].values.astype(np.float32))
print(f'3. Saved: spotify_features.npy ({len(spotify_df):,} songs)')

# 4. Save Spotify metadata (only the listed columns that the CSV actually has)
cols = ['track_id', 'track_name', 'artist_name', 'popularity', 'emotion']
available = [c for c in cols if c in spotify_df.columns]
spotify_df[available].to_parquet('/kaggle/working/spotify_metadata.parquet')
print('4. Saved: spotify_metadata.parquet')

print('\n' + '=' * 60)
print('TRAINING COMPLETE!')
print('=' * 60)
print('\nDownload these files from the OUTPUT tab (right sidebar):')
print('  - dylumo_model.pt        -> Put in checkpoints/')
print('  - spotify_scaler.pkl     -> Put in checkpoints/')
print('  - spotify_features.npy   -> Put in data/processed/')
print('  - spotify_metadata.parquet -> Put in data/processed/')
