# DYLUMO - Image to Music Training (Kaggle GPU)

This notebook trains the FastMLP model to map image features to music features.

**Architecture:** CLIP Image Features (512-dim) → FastMLP → Spotify Audio Features (13-dim)

**Run this on Kaggle with GPU enabled!**


In [None]:
# Install required packages
!pip install open-clip-torch faiss-cpu huggingface_hub datasets pyarrow -q

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from PIL import Image
from pathlib import Path
import pickle

# Run on the GPU when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')


In [None]:
# Spotify audio-feature columns that get min-max scaled below (13 in total).
AUDIO_FEATURES = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

# Read the Spotify track table from the attached Kaggle dataset.
spotify_df = pd.read_csv('/kaggle/input/spotify-1million-tracks/spotify_data.csv')
print(f'Loaded {len(spotify_df):,} tracks')

# Fit the min-max scaler on the selected columns, then scale those same columns.
# The fitted scaler is kept in `scaler` so it can be exported later.
scaler = MinMaxScaler().fit(spotify_df[AUDIO_FEATURES])
features_norm = scaler.transform(spotify_df[AUDIO_FEATURES])

# Map to emotions
def map_emotion(v, e):
    """Return the emotion label for a (valence, energy) pair.

    Valence ``v`` selects the label family (>= 0.5: excitement / amusement /
    contentment, otherwise: anger / fear / sadness). Energy ``e`` selects the
    position inside the family: >= 0.7 first, >= 0.5 second, anything else
    third.
    """
    if e >= 0.7:
        level = 0
    elif e >= 0.5:
        level = 1
    else:
        level = 2

    high_valence = ('excitement', 'amusement', 'contentment')
    low_valence = ('anger', 'fear', 'sadness')
    family = high_valence if v >= 0.5 else low_valence
    return family[level]

# Label every track with an emotion derived from its raw valence and energy.
# The two columns are iterated directly instead of using
# DataFrame.apply(axis=1), which builds a Series object for every row; the
# labels are the same because map_emotion receives the same two values.
spotify_df['emotion'] = [
    map_emotion(v, e)
    for v, e in zip(spotify_df['valence'], spotify_df['energy'])
]

# Store each scaled feature next to its raw column as '<feature>_norm'.
for i, c in enumerate(AUDIO_FEATURES):
    spotify_df[f'{c}_norm'] = features_norm[:, i]

print(spotify_df['emotion'].value_counts())


In [None]:
# Download EMID from HuggingFace
from huggingface_hub import list_repo_files, hf_hub_download
import pyarrow.parquet as pq

# Create the working folders: one for extracted images, one for raw downloads.
for folder in ('/kaggle/working/emid/images', '/kaggle/working/emid/raw'):
    os.makedirs(folder, exist_ok=True)

# List the dataset repository and keep only its parquet files.
files = list_repo_files('ecnu-aigc/EMID', repo_type='dataset')
parquet_files = list(filter(lambda name: name.endswith('.parquet'), files))

# Download every parquet file into the raw folder.
for shard in tqdm(parquet_files, desc='Downloading EMID'):
    hf_hub_download(
        repo_id='ecnu-aigc/EMID',
        repo_type='dataset',
        filename=shard,
        local_dir='/kaggle/working/emid/raw',
    )

# The CSV metadata file is stored one level above the raw folder.
hf_hub_download(
    repo_id='ecnu-aigc/EMID',
    repo_type='dataset',
    filename='EMID_data.csv',
    local_dir='/kaggle/working/emid',
)
print('Download complete!')


In [None]:
# Extract images from parquet
# Write the image bytes embedded in the downloaded parquet files to images_dir,
# one file per image, skipping files that already exist.
images_dir = Path('/kaggle/working/emid/images')
saved = 0
skipped = 0  # cells without a usable image payload, or whose write failed

# Sorted so that, when two files carry the same image name, the one that gets
# written does not depend on directory listing order.
for pq_file in tqdm(sorted(Path('/kaggle/working/emid/raw/data').glob('*.parquet'))):
    df = pq.read_table(pq_file).to_pandas()
    for idx, row in df.iterrows():
        for col in ['Image1_filename', 'Image2_filename', 'Image3_filename']:
            d = row.get(col)
            # Only dict cells holding non-empty bytes can be written; anything
            # else (missing column, NaN, empty payload) is counted and skipped
            # explicitly instead of being hidden by a bare `except`.
            if not isinstance(d, dict) or not d.get('bytes'):
                skipped += 1
                continue

            # dict.get's default only applies when the key is absent, so a
            # 'path' key holding None must be handled with `or`. The fallback
            # name includes the parquet file stem because `idx` restarts in
            # every file and would otherwise collide across files.
            fn = os.path.basename(d.get('path') or f'{pq_file.stem}_{col}_{idx}.jpg')
            p = images_dir / fn
            if p.exists():
                continue

            try:
                p.write_bytes(d['bytes'])
            except OSError as err:
                # Best effort: report the failure and keep extracting the rest.
                skipped += 1
                print(f'Could not write {p}: {err}')
                continue
            saved += 1

print(f'Extracted {saved} images')
if skipped:
    print(f'Skipped {skipped} cells without a writable image')


In [None]:
# Prepare EMID metadata
# Load the EMID CSV and flatten its three image slots per row into one
# {'filename', 'emotion'} record per image.
emid_df = pd.read_csv('/kaggle/working/emid/EMID_data.csv')
img_data = []
for _, record in emid_df.iterrows():
    for slot in (1, 2, 3):
        name = record[f'Image{slot}_filename']
        label = record[f'Image{slot}_tag']
        if pd.isna(name):
            # Empty slot: nothing to record.
            continue
        img_data.append({'filename': name, 'emotion': label})

# Keep the first record for each filename, then keep only images that were
# actually extracted to images_dir.
images_df = pd.DataFrame(img_data).drop_duplicates('filename')
on_disk = images_df['filename'].apply(lambda x: (images_dir/x).exists())
images_df['exists'] = on_disk
images_df = images_df[images_df['exists']].reset_index(drop=True)
print(f'{len(images_df)} images ready')


In [None]:
# Extract CLIP features
import open_clip
# Load the OpenAI-pretrained ViT-B-32 CLIP model; the middle return value is
# discarded. The model is moved to the selected device and put in eval mode.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='openai')
clip_model = clip_model.to(device).eval()

# Encode the images in batches of 64 rows of images_df.
all_feats = []
for start in tqdm(range(0, len(images_df), 64), desc='CLIP'):
    batch_rows = images_df.iloc[start:start+64]

    # Open each image as RGB and apply the CLIP preprocessing transform.
    tensors = []
    for _, rec in batch_rows.iterrows():
        picture = Image.open(images_dir/rec['filename']).convert('RGB')
        tensors.append(clip_preprocess(picture))

    with torch.no_grad():
        stacked = torch.stack(tensors).to(device)
        emb = clip_model.encode_image(stacked)
        # Scale every embedding to unit L2 norm along the feature axis.
        emb = emb / emb.norm(dim=-1, keepdim=True)
    all_feats.append(emb.cpu().numpy())

# Stack the per-batch arrays into a single float32 matrix, one row per image.
image_features = np.vstack(all_feats).astype(np.float32)
print(f'Features: {image_features.shape}')


In [None]:
# Create training pairs
# Build (image embedding, normalized song features) training pairs: every
# image is paired with up to 5 songs that carry the same emotion label.
norm_cols = [f'{c}_norm' for c in AUDIO_FEATURES]

# Split the track table by emotion once, instead of re-filtering the whole
# table with a boolean mask for every image. Rows keep their original order
# inside each group, so sampling with the same seed draws from the same
# sequence of rows as filtering did.
songs_by_emotion = {
    emotion: group[norm_cols]
    for emotion, group in spotify_df.groupby('emotion', sort=False)
}

X, y = [], []
for idx, row in tqdm(images_df.iterrows(), total=len(images_df)):
    songs = songs_by_emotion.get(row['emotion'])
    if songs is None:
        # No track carries this image's emotion label.
        continue
    # Per-image seed keeps the draw reproducible while varying across images.
    picked = songs.sample(min(5, len(songs)), random_state=42+idx)
    targets = picked.to_numpy(dtype=np.float32)
    # The same image embedding is repeated once per sampled song.
    X.extend([image_features[idx]] * len(targets))
    y.extend(targets)

X, y = np.array(X), np.array(y)
print(f'Pairs: {len(X)}')


In [None]:
# Split data
# Shuffle the pairs and split them 80% / 10% / 10% into train / val / test.
n = len(X)

# Seeded generator: pair sampling is already seeded, so seeding the shuffle as
# well makes the whole split reproducible from run to run.
rng = np.random.default_rng(42)
idx = rng.permutation(n)

train_end, val_end = int(0.8*n), int(0.9*n)
train_idx, val_idx, test_idx = idx[:train_end], idx[train_end:val_end], idx[val_end:]

train_X, train_y = X[train_idx], y[train_idx]
val_X, val_y = X[val_idx], y[val_idx]
test_X, test_y = X[test_idx], y[test_idx]

# NOTE(review): the shuffle is over (image, song) pairs and one image can
# contribute several pairs, so the same image embedding can appear in more
# than one split. Split by image instead if a leakage-free score is needed.
print(f'Train: {len(train_X)}, Val: {len(val_X)}, Test: {len(test_X)}')


In [None]:
# FastMLP Model
class FastMLP(nn.Module):
    """Fully connected regressor from an input vector to an output vector.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
    a final Linear layer with no activation. The defaults build the
    512 -> 512 -> 256 -> 128 -> 13 network, with the same ``net.<i>`` state-dict
    keys as a hand-written Sequential of those layers.

    Args:
        input_dim: Size of each input vector.
        hidden_dims: Output width of each hidden stage, in order.
        output_dim: Size of each output vector.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim=512, hidden_dims=(512, 256, 128), output_dim=13, dropout=0.2):
        super().__init__()
        layers = []
        width = input_dim
        for hidden in hidden_dims:
            layers += [
                nn.Linear(width, hidden),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            width = hidden
        layers.append(nn.Linear(width, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for a batch ``x`` of input vectors."""
        return self.net(x)

# Build the network and move its parameters to the selected device.
model = FastMLP()
model = model.to(device)

# Report the total number of elements across all parameter tensors.
print(f'Parameters: {sum(p.numel() for p in model.parameters()):,}')


In [None]:
# Training
# Wrap the numpy splits in tensor datasets; only the training data is shuffled.
train_loader = DataLoader(TensorDataset(torch.FloatTensor(train_X), torch.FloatTensor(train_y)), batch_size=128, shuffle=True)
val_loader = DataLoader(TensorDataset(torch.FloatTensor(val_X), torch.FloatTensor(val_y)), batch_size=128)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate when the validation loss has not improved for 5 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=5)

best_loss = float('inf')
for epoch in range(100):
    model.train()
    # Accumulate the training loss inside the optimisation loop. A separate
    # forward pass over the training set in train mode would double the forward
    # work, track gradients that are never used, and update the BatchNorm
    # running statistics a second time per epoch.
    running_loss = 0.0
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        optimizer.zero_grad()
        loss = criterion(model(bx), by)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    t_loss = running_loss / len(train_loader)  # mean of the per-batch losses

    model.eval()
    with torch.no_grad():
        v_loss = sum(criterion(model(bx.to(device)), by.to(device)).item() for bx, by in val_loader) / len(val_loader)

    scheduler.step(v_loss)
    if epoch % 10 == 0:
        print(f'Epoch {epoch}: train={t_loss:.4f}, val={v_loss:.4f}')
    # Checkpoint whenever the validation loss reaches a new minimum.
    if v_loss < best_loss:
        best_loss = v_loss
        torch.save(model.state_dict(), '/kaggle/working/best_model.pt')

print(f'Best val loss: {best_loss:.4f}')


In [None]:
# Save artifacts
# Reload the weights saved at the best validation loss before exporting.
best_state = torch.load('/kaggle/working/best_model.pt')
model.load_state_dict(best_state)

# Bundle the weights with the layer sizes of the network.
export_payload = {
    'model_state_dict': model.state_dict(),
    'input_dim': 512,
    'output_dim': 13,
    'hidden_dims': [512, 256, 128],
}
torch.save(export_payload, '/kaggle/working/dylumo_model.pt')

# Persist the fitted scaler object.
with open('/kaggle/working/spotify_scaler.pkl', 'wb') as fh:
    pickle.dump(scaler, fh)

# Normalized feature matrix (float32) and the matching track metadata.
norm_matrix = spotify_df[norm_cols].values.astype(np.float32)
np.save('/kaggle/working/spotify_features.npy', norm_matrix)

meta_cols = ['track_id', 'track_name', 'artist_name', 'popularity', 'emotion']
spotify_df[meta_cols].to_parquet('/kaggle/working/spotify_metadata.parquet')

print('Saved: dylumo_model.pt, spotify_scaler.pkl, spotify_features.npy, spotify_metadata.parquet')
print('Download these files and place in checkpoints/ folder!')
