In [None]:
import os

# Optional memory tweak: ask PyTorch's CUDA caching allocator to use expandable
# segments, which is the setting PyTorch itself suggests against fragmentation.
# It is placed ahead of `import torch` so the value is already in the
# environment when torch is loaded. Safe to delete.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

import torch


In [None]:


# Pick the GPU when one is usable and fall back to the CPU otherwise, so this
# cell does not crash on a machine without CUDA (same selection rule as the
# training cell further down).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Start from a clean slate: release cached blocks and reset the peak
    # counters so later memory readings reflect this run only.
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Memory usage tracking (bytes -> MiB).
    print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2} MB")
    print(f"Reserved memory: {torch.cuda.memory_reserved() / 1024 ** 2} MB")

print(f"Using device: {device}")

Allocated memory: 0.0 MB
Reserved memory: 0.0 MB
Using device: cuda


In [None]:
import os
import pickle

# Directory holding one pickled sample per "*.p" file.
poisoned_data_dir = "poisoned_data"

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only point this at files you created yourself or otherwise fully trust.
poisoned_dataset = []

# os.listdir returns entries in arbitrary order; sorting makes the sample order
# (and therefore any seeded shuffling downstream) reproducible across runs.
for file_name in sorted(os.listdir(poisoned_data_dir)):
    if not file_name.endswith(".p"):
        continue  # ignore anything that is not a pickled sample
    file_path = os.path.join(poisoned_data_dir, file_name)
    with open(file_path, 'rb') as f:
        poisoned_sample = pickle.load(f)
    poisoned_dataset.append(poisoned_sample)

In [None]:
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, ToTensor


# Image preprocessing pipeline applied to every sample:
#   1. ToTensor   - turns the PIL image into a float tensor.
#   2. Normalize  - (x - 0.5) / 0.5 per channel, i.e. [0, 1] -> [-1, 1].
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

class PoisonedDataset(Dataset):
    """Map-style dataset over an in-memory sequence of poisoned samples.

    Each sample is indexed with the keys ``'img'`` and ``'text'``; items are
    returned as ``{"image": ..., "prompt": ...}``.
    """

    def __init__(self, poisoned_data, transform=None):
        """Store the samples and the optional image transform.

        Args:
            poisoned_data: Indexable, sized collection of samples, each
                providing ``sample['img']`` and ``sample['text']``.
            transform: Optional callable applied to the image of every
                sample. ``None`` (the default) leaves the image untouched.
        """
        self.poisoned_data = poisoned_data
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.poisoned_data)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as an ``{"image", "prompt"}`` dict."""
        sample = self.poisoned_data[idx]
        image = sample['img']
        prompt = sample['text']

        # Compare against None explicitly: a transform object may be falsy
        # (e.g. a callable container that defines __len__ and is empty) and
        # must still be applied when the caller supplied it.
        if self.transform is not None:
            image = self.transform(image)

        return {"image": image, "prompt": prompt}



# Wrap the loaded samples in the dataset, using the preprocessing pipeline
# defined above for the images.
dataset = PoisonedDataset(poisoned_data=poisoned_dataset, transform=transform)

# Shuffled mini-batches of four samples.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)

In [None]:

from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam
from diffusers import StableDiffusionPipeline
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, ToTensor


# Load pre-trained Stable Diffusion model
model_id = "CompVis/stable-diffusion-v1-4"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = StableDiffusionPipeline.from_pretrained(model_id).to(device)

# fp16 autocast + loss scaling only make sense on CUDA. Both are created in a
# disabled state on CPU so the loop below runs unchanged on either device.
# torch.amp.* replaces the deprecated torch.cuda.amp.* entry points.
use_amp = device.type == "cuda"
scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

# More memory saving attempts
try:
    pipeline.unet.enable_xformers_memory_efficient_attention()
except Exception:
    print("Memory-efficient attention is not available for this GPU.")

# Recompute U-Net activations during backward instead of storing them; trades
# extra compute for a lower activation-memory footprint.
pipeline.unet.enable_gradient_checkpointing()

# Only the U-Net is fine-tuned; the VAE and text encoder stay frozen.
pipeline.vae.requires_grad_(False)
pipeline.text_encoder.requires_grad_(False)

pipeline.vae.eval()
pipeline.text_encoder.eval()
pipeline.unet.train()  # We are fine-tuning the U-Net

optimizer = Adam(pipeline.unet.parameters(), lr=5e-5)

# Dataset and DataLoader (with reduced resolution and batch size = 1).
# ToTensor yields values in [0, 1]; the loop rescales them to [-1, 1] right
# before VAE encoding, so no Normalize step belongs in this transform.
# NOTE(review): Resize/ToTensor assume sample['img'] is a PIL image - confirm.
transform = Compose([
    Resize((128, 128)),  # Reduce resolution to 128x128
    ToTensor(),
])

# Build a dataset that actually uses the reduced-resolution transform above.
# Reusing the previously built `dataset` would silently keep its own transform
# (full resolution, already normalised to [-1, 1]).
train_dataset = PoisonedDataset(poisoned_dataset, transform=transform)
dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

num_epochs = 4
accumulation_steps = 4
num_batches = len(dataloader)
# Read the bounds from the loaded components instead of hard-coding 1000 / 77.
num_train_timesteps = pipeline.scheduler.config.num_train_timesteps
max_prompt_length = pipeline.tokenizer.model_max_length

for epoch in range(num_epochs):
    optimizer.zero_grad()  # never carry gradients across epochs

    for step, batch in enumerate(dataloader):
        images = batch["image"].to(device)
        prompts = batch["prompt"]

        text_inputs = pipeline.tokenizer(
            prompts,
            padding="max_length",
            truncation=True,
            max_length=max_prompt_length,
            return_tensors="pt",
        )

        # Frozen components: no autograd bookkeeping needed for either.
        with torch.no_grad():
            text_embeddings = pipeline.text_encoder(text_inputs.input_ids.to(device))[0]
            # Encode images into latents; [0, 1] pixels -> [-1, 1] first.
            latents = pipeline.vae.encode(images * 2 - 1).latent_dist.sample()
            latents = latents * pipeline.vae.config.scaling_factor

        # Add noise to latents for the diffusion process
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            num_train_timesteps,
            (latents.size(0),),
            device=latents.device,
            dtype=torch.long,
        )
        noisy_latents = pipeline.scheduler.add_noise(latents, noise, timesteps)

        # Forward pass with mixed precision (FP16 on CUDA, plain FP32 on CPU)
        with torch.amp.autocast(device.type, dtype=torch.float16, enabled=use_amp):
            model_pred = pipeline.unet(
                noisy_latents, timesteps, encoder_hidden_states=text_embeddings
            ).sample
            loss = torch.nn.functional.mse_loss(model_pred, noise)

        # Divide by the number of accumulated micro-batches so the summed
        # gradient is their mean rather than their sum.
        scaler.scale(loss / accumulation_steps).backward()

        # Step every `accumulation_steps` micro-batches, and also on the last
        # batch of the epoch so a trailing partial group is not thrown away.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        print(f"Epoch {epoch}, Step {step}, Loss: {loss.item()}")

  scaler = GradScaler()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  return t.to(


Memory-efficient attention is not available for this GPU.


  with autocast(dtype=torch.float16):  # Enable FP16


Epoch 0, Step 0, Loss: 0.30144771933555603


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.78 GiB is allocated by PyTorch, and 41.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)