In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
import math
import json

In [2]:
torch.set_float32_matmul_precision('high')

In [3]:
class MelodyDataset(Dataset):
    def __init__(self, data, sequence_length=10, device='cuda'):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.sequence_length = sequence_length
        self.device = device
        
    def __len__(self):
        return len(self.data) - self.sequence_length
    
    def __getitem__(self, idx):
        x = self.data[idx:idx + self.sequence_length]
        y = self.data[idx + self.sequence_length]
        # Move to device if specified
        if self.device == 'cuda':
            x = x.to(self.device)
            y = y.to(self.device)
        return x, y

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                           (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

In [5]:
class MelodyTransformer(nn.Module):
    def __init__(self, input_dim=4, d_model=64, nhead=4, num_layers=3, 
                 dim_feedforward=256, dropout=0.3):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        
        self.input_projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.output_projection = nn.Linear(d_model, input_dim)
        
        self._init_weights()
    
    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
    
    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        x = self.input_projection(x)
        x = x.transpose(0, 1)
        x = self.pos_encoder(x)
        x = x.transpose(0, 1)
        transformer_out = self.transformer(x)
        last_token = transformer_out[:, -1, :]
        output = self.output_projection(last_token)
        return output

In [6]:
class MelodyGenerator:
    def __init__(self, model_params=None, force_cpu=False):
        self.model_params = model_params or {
            'd_model': 128,
            'nhead': 8,
            'num_layers': 6,
            'dim_feedforward': 512,
            'dropout': 0.1
        }
        
        # CUDA setup with fallback
        if force_cpu:
            self.device = torch.device('cpu')
            print("Forced to use CPU")
        else:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            print(f"Using device: {self.device}")
        
        self.model = None
        self.data_stats = None
        
        # Enable CUDA optimizations if available
        if torch.cuda.is_available() and not force_cpu:
            torch.backends.cudnn.benchmark = True  # Optimize for consistent input sizes
            torch.backends.cudnn.deterministic = False  # Allow non-deterministic algorithms for speed
    
    def normalize_data(self, data):
        data = np.array(data, dtype=np.float32)
        self.data_stats = {
            'mean': np.mean(data, axis=0),
            'std': np.std(data, axis=0) + 1e-8
        }
        normalized_data = (data - self.data_stats['mean']) / self.data_stats['std']
        return normalized_data.tolist()
    
    def denormalize_data(self, data):
        if self.data_stats is None:
            return data
        
        data = np.array(data)
        denormalized = data * self.data_stats['std'] + self.data_stats['mean']
        denormalized[:, [0, 3]] = np.round(denormalized[:, [0, 3]])
        denormalized[:, [1, 2]] = np.round(denormalized[:, [1, 2]], 2)
        return denormalized.tolist()
    
    def train(self, data, sequence_length=10, epochs=100, batch_size=32, 
              learning_rate=0.001, validation_split=0.2, use_amp=True):
        """
        Train with CUDA optimizations
        
        Args:
            use_amp: Use Automatic Mixed Precision for faster training on modern GPUs
        """
        print(f"Training on {len(data)} samples using {self.device}...")
        
        # CUDA memory management
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()  # Clear cache
            print(f"GPU Memory before training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        
        # Normalize data
        normalized_data = self.normalize_data(data)
        
        # Split data
        split_idx = int(len(normalized_data) * (1 - validation_split))
        train_data = normalized_data[:split_idx]
        val_data = normalized_data[split_idx:]
        
        # Create datasets - pin_memory for faster GPU transfer
        pin_memory = self.device.type == 'cuda'
        
        train_dataset = MelodyDataset(train_data, sequence_length, device='cpu')  # Keep on CPU for DataLoader
        val_dataset = MelodyDataset(val_data, sequence_length, device='cpu')
        
        train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True, 
            pin_memory=pin_memory,
            num_workers=2 if self.device.type == 'cuda' else 0  # Parallel loading for GPU
        )
        val_loader = DataLoader(
            val_dataset, 
            batch_size=batch_size, 
            shuffle=False, 
            pin_memory=pin_memory,
            num_workers=2 if self.device.type == 'cuda' else 0
        )
        
        # Initialize model and move to device
        self.model = MelodyTransformer(**self.model_params).to(self.device)
        
        # Enable compilation for PyTorch 2.0+ (massive speedup on GPU)
        if hasattr(torch, 'compile') and self.device.type == 'cuda':
            try:
                self.model = torch.compile(self.model)
                print("Model compiled for faster execution!")
            except Exception as e:
                print(f"Compilation failed (using uncompiled model): {e}")
        
        # Loss and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.5)
        
        # Mixed precision training for modern GPUs
        scaler = torch.amp.GradScaler('cuda') if use_amp and self.device.type == 'cuda' else None
        
        # Training loop
        train_losses = []
        val_losses = []
        
        for epoch in range(epochs):
            # Training
            self.model.train()
            train_loss = 0
            
            for batch_x, batch_y in train_loader:
                # Move data to device
                batch_x, batch_y = batch_x.to(self.device, non_blocking=True), batch_y.to(self.device, non_blocking=True)
                
                optimizer.zero_grad()
                
                # Forward pass with optional mixed precision
                if scaler is not None:
                    with torch.amp.autocast('cuda'):
                        output = self.model(batch_x)
                        loss = criterion(output, batch_y)
                    
                    # Backward pass with gradient scaling
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    output = self.model(batch_x)
                    loss = criterion(output, batch_y)
                    loss.backward()
                    optimizer.step()
                
                train_loss += loss.item()
            
            # Validation
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    batch_x, batch_y = batch_x.to(self.device, non_blocking=True), batch_y.to(self.device, non_blocking=True)
                    
                    if scaler is not None:
                        with torch.amp.autocast('cuda'):
                            output = self.model(batch_x)
                            loss = criterion(output, batch_y)
                    else:
                        output = self.model(batch_x)
                        loss = criterion(output, batch_y)
                    
                    val_loss += loss.item()
            
            train_loss /= len(train_loader)
            val_loss /= len(val_loader)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            
            scheduler.step()
            
            if (epoch + 1) % 10 == 0:
                memory_info = f", GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB" if self.device.type == 'cuda' else ""
                print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}{memory_info}')
        
        print("Training completed!")
        
        # Clear cache after training
        if self.device.type == 'cuda':
            torch.cuda.empty_cache()
        
        return train_losses, val_losses
    
    def generate(self, seed_sequence, num_generate=10):
        if self.model is None:
            raise ValueError("Model not trained yet. Call train() first.")
        
        self.model.eval()
        
        # Normalize seed sequence
        seed_normalized = []
        for tuple_data in seed_sequence:
            normalized_tuple = ((np.array(tuple_data) - self.data_stats['mean']) / 
                              self.data_stats['std']).tolist()
            seed_normalized.append(normalized_tuple)
        
        generated = []
        current_sequence = seed_normalized.copy()
        
        with torch.no_grad():
            for _ in range(num_generate):
                # Convert to tensor and move to device
                input_tensor = torch.tensor([current_sequence], dtype=torch.float32).to(self.device)
                
                # Generate next tuple
                output = self.model(input_tensor)
                next_tuple = output.cpu().numpy()[0].tolist()
                
                generated.append(next_tuple)
                current_sequence = current_sequence[1:] + [next_tuple]
        
        # Denormalize generated data
        generated_denormalized = self.denormalize_data(generated)
        return generated_denormalized

In [7]:
instrument = 0

# Load your data
with open(f'raw_data/{instrument}.json', 'r') as file:
    loaded_data = json.load(file)

print(f"Loaded {len(loaded_data)} samples")

# Create CUDA-enabled generator
generator = MelodyGenerator()

# Train the model with CUDA optimizations
train_losses, val_losses = generator.train(
    data=loaded_data,
    sequence_length=10,
    epochs=100,
    batch_size=64,  # Larger batch size for GPU efficiency
    learning_rate=1e-4,
    use_amp=True  # Enable mixed precision for speed
)

# Generate new patterns
seed = loaded_data[:10]  # Use first 10 tuples as seed
generated_tuples = generator.generate(seed, num_generate=20)

print("\nGenerated tuples:")
for i, tuple_data in enumerate(generated_tuples):
    print(f"  {i+1}: {tuple_data}")

Loaded 1724 samples
Using device: cuda
Training on 1724 samples using cuda...
GPU Memory before training: 0.00 GB
Model compiled for faster execution!
Epoch [10/100], Train Loss: 0.167741, Val Loss: 0.055143, GPU Memory: 0.04 GB
Epoch [20/100], Train Loss: 0.114650, Val Loss: 0.085822, GPU Memory: 0.04 GB
Epoch [30/100], Train Loss: 0.095802, Val Loss: 0.065267, GPU Memory: 0.04 GB
Epoch [40/100], Train Loss: 0.087101, Val Loss: 0.099337, GPU Memory: 0.04 GB
Epoch [50/100], Train Loss: 0.082106, Val Loss: 0.103828, GPU Memory: 0.04 GB
Epoch [60/100], Train Loss: 0.076499, Val Loss: 0.112738, GPU Memory: 0.04 GB
Epoch [70/100], Train Loss: 0.079612, Val Loss: 0.107870, GPU Memory: 0.04 GB
Epoch [80/100], Train Loss: 0.072470, Val Loss: 0.119233, GPU Memory: 0.04 GB
Epoch [90/100], Train Loss: 0.069350, Val Loss: 0.121909, GPU Memory: 0.04 GB
Epoch [100/100], Train Loss: 0.068723, Val Loss: 0.125661, GPU Memory: 0.04 GB
Training completed!

Generated tuples:
  1: [81.0, 42.77, 40.06, 100