<a href="https://colab.research.google.com/github/rogerpanel/CV/blob/main/Transformer_1E.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Transformer_cloude_+

# Import libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import librosa
import cv2
import numpy as np
import matplotlib.pyplot as plt
from transformers import ViTModel, ViTFeatureExtractor, ASTModel, ASTFeatureExtractor
from torchvision import transforms
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Device location of input datasets

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


# Audio Feature Extraction



In [None]:
import librosa

def extract_audio_features(audio_file, feature_extractor, model):
    # Load audio file
    audio, sr = librosa.load(audio_file, sr=None)  # sr=None preserves the original sampling rate

    # Resample if necessary
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Convert to float32 and normalize
    audio = librosa.util.normalize(audio.astype(np.float32))

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=sr, return_tensors="pt")

    with torch.no_grad():
      outputs = model(**inputs.to(device))
    return outputs.last_hidden_state.to(device)


# Image Feature Extraction

In [None]:

def extract_image_features(image_file, feature_extractor, model):
    image = cv2.imread(image_file)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
      outputs = model(**inputs.to(device))
    return outputs.last_hidden_state.to(device)


# Cross Modal Transformer

In [None]:
class CrossModalTransformer(nn.Module):
    def __init__(self, audio_dim, image_dim, hidden_dim, num_heads, num_layers):
        super(CrossModalTransformer, self).__init__()
        self.audio_proj = nn.Linear(audio_dim, hidden_dim)
        self.image_proj = nn.Linear(image_dim, hidden_dim)

        self.encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_heads, dim_feedforward=hidden_dim*4, dropout=0.1)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers)

        self.cross_attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=0.1)
        self.fc = nn.Linear(hidden_dim, image_dim)

    def forward(self, audio_features, image_features):
        # Project inputs to common hidden dimension
        audio_projected = self.audio_proj(audio_features)
        image_projected = self.image_proj(image_features)

        # Combine and encode features
        combined_features = torch.cat([audio_projected, image_projected], dim=1)
        encoded_features = self.transformer_encoder(combined_features)

        # Split encoded features back into audio and image
        audio_encoded, image_encoded = torch.split(encoded_features, [audio_projected.size(1), image_projected.size(1)], dim=1)

        # Cross-attention
        cross_attn_output, attention_weights = self.cross_attn(audio_encoded, image_encoded, image_encoded)

        # Project back to image dimension
        output = self.fc(cross_attn_output)
        return output, attention_weights


# the stego System development

In [None]:
class StegoSystem(nn.Module):
    def __init__(self, audio_dim, image_dim, hidden_dim, num_heads, num_layers):
        super(StegoSystem, self).__init__()
        self.cmt = CrossModalTransformer(audio_dim, image_dim, hidden_dim, num_heads, num_layers)
        self.decoder = nn.Sequential(
            nn.Linear(image_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, audio_dim)
        )

    def forward(self, audio_features, image_features):
        stego_features, attention_weights = self.cmt(audio_features, image_features)
        reconstructed_audio = self.decoder(stego_features)
        return stego_features, reconstructed_audio, attention_weights


# Set up models and feature extractors

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ViT for image
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").to(device)


# AST for audio
ast_feature_extractor = ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
ast_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

# Initializing the stego system

In [None]:

# Stego System
audio_dim = ast_model.config.hidden_size  # This should be 768 for the AST model
image_dim = vit_model.config.hidden_size  # This should be 768 for the ViT model
hidden_dim = 512  # This can be adjusted
num_heads = 8
num_layers = 6

stego_system = StegoSystem(audio_dim, image_dim, hidden_dim, num_heads, num_layers).to(device)

# Optimization
optimizer = optim.Adam(stego_system.parameters(), lr=1e-4)
scaler = GradScaler()

# Loss functions
mse_loss = nn.MSELoss()
l1_loss = nn.L1Loss()

def train_step(audio_features, image_features, alpha=0.5):
    optimizer.zero_grad()

    audio_features = audio_features.to(device)
    image_features = image_features.to(device)

    with autocast():
        stego_features, reconstructed_audio, _ = stego_system(audio_features, image_features)

        # Reconstruction loss
        recon_loss = mse_loss(reconstructed_audio, audio_features)

        # Imperceptibility loss
        imperceptibility_loss = l1_loss(stego_features, image_features)

        # Total loss
        total_loss = recon_loss + alpha * imperceptibility_loss

    scaler.scale(total_loss).backward()
    scaler.step(optimizer)
    scaler.update()

    return total_loss.item(), recon_loss.item(), imperceptibility_loss.item()




# Add these functions for visualization and metrics

In [None]:

def visualize_attention(attention_weights, save_path):
    plt.figure(figsize=(10, 8))
    sns.heatmap(attention_weights.cpu().numpy(), cmap='viridis')
    plt.title('Attention Weights Heatmap')
    plt.xlabel('Image Patches')
    plt.ylabel('Audio Frames')
    plt.savefig(save_path)
    plt.close()

def plot_losses(losses, save_path):
    plt.figure(figsize=(10, 6))
    for loss_name, loss_values in losses.items():
        plt.plot(loss_values, label=loss_name)
    plt.title('Training Losses')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(save_path)
    plt.close()



# Building the Training loop function

In [None]:
def train(audio_file, image_file, epochs=100):
    audio_features = extract_audio_features(audio_file, ast_feature_extractor, ast_model).to(device)
    image_features = extract_image_features(image_file, vit_feature_extractor, vit_model).to(device)

    # Ensure compatible shapes
    if audio_features.shape[1] != image_features.shape[1]:
        min_length = min(audio_features.shape[1], image_features.shape[1])
        audio_features = audio_features[:, :min_length, :]
        image_features = image_features[:, :min_length, :]

    losses = {'total': [], 'reconstruction': [], 'imperceptibility': []}

    for epoch in range(epochs):
        total_loss, recon_loss, imperceptibility_loss = train_step(audio_features, image_features)

        losses['total'].append(total_loss)
        losses['reconstruction'].append(recon_loss)
        losses['imperceptibility'].append(imperceptibility_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Total Loss: {total_loss:.4f}, Recon Loss: {recon_loss:.4f}, Imperceptibility Loss: {imperceptibility_loss:.4f}")

    # Visualize losses
    plot_losses(losses, 'losses.png')

    # Get final stego image and attention weights
    with torch.no_grad():
        stego_features, reconstructed_audio, attention_weights = stego_system(audio_features, image_features)

    # Visualize attention weights
    visualize_attention(attention_weights[0], 'attention_weights.png')

    # Save model and stego image
    save_model_and_stego_image(stego_system, stego_features, 'stego_model.pth', 'stego_image.png')

    return stego_system



# Capacity measurement

In [None]:
# Modify the measure_capacity function
def measure_capacity(stego_system, audio_file, image_file):
    audio_features = extract_audio_features(audio_file, ast_feature_extractor, ast_model).to(device)
    image_features = extract_image_features(image_file, vit_feature_extractor, vit_model).to(device)

    # Ensure compatible shapes
    if audio_features.shape[1] != image_features.shape[1]:
        min_length = min(audio_features.shape[1], image_features.shape[1])
        audio_features = audio_features[:, :min_length, :]
        image_features = image_features[:, :min_length, :]

    with torch.no_grad():
        stego_features, reconstructed_audio, _ = stego_system(audio_features, image_features)

    # Calculate MSE
    mse = mean_squared_error(image_features.cpu().numpy().flatten(), stego_features.cpu().numpy().flatten())

    # Calculate PSNR
    psnr = 10 * np.log10(1 / mse)

    # Calculate SSIM
    def ssim(x, y):
      c1 = (0.01 * 255) ** 2
      c2 = (0.03 * 255) ** 2

      x = x.view(x.size(0), -1)
      y = y.view(y.size(0), -1)

      mu_x = x.mean(dim=1)
      mu_y = y.mean(dim=1)

      sigma_x = x.std(dim=1)
      sigma_y = y.std(dim=1)
      sigma_xy = ((x - mu_x.unsqueeze(1)) * (y - mu_y.unsqueeze(1))).mean(dim=1)

      ssim_map = ((2 * mu_x * mu_y + c1) * (2 * sigma_xy + c2)) / ((mu_x**2 + mu_y**2 + c1) * (sigma_x**2 + sigma_y**2 + c2))
      return ssim_map.mean()

    ssim_value = ssim(stego_features, image_features)

    # Calculate bit-per-pixel (bpp) capacity
    audio_size = audio_features.numel() * audio_features.element_size() * 8  # in bits
    image_size = image_features.numel() * image_features.element_size() * 8  # in bits
    bpp = audio_size / image_size

    return mse, psnr, ssim_value.item(), bpp


# Save CMT model

In [None]:
# Modified the save_model_and_stego_image function
def save_model_and_stego_image(stego_system, stego_image, model_path, image_path):
    torch.save(stego_system.state_dict(), model_path)
    stego_image_np = stego_image.squeeze().cpu().numpy()
    if stego_image_np.ndim == 3:
        stego_image_np = np.transpose(stego_image_np, (1, 2, 0))
    plt.imsave(image_path, stego_image_np)




# Function to handle the CUDNN error

In [None]:
def set_cudnn_flags():
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cuda.matmul.allow_tf32 = False

# The Stegnography system performance and capacity measurement

Neccessary Libraries

In [None]:
import torch
import numpy as np
import librosa
import cv2
from skimage.metrics import structural_similarity as ssim
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


Extraction, comparison and visualization functions

In [None]:

def extract_stego_content(stego_system, stego_features):
    with torch.no_grad():
        _, reconstructed_audio, _ = stego_system(stego_features, stego_features)
    return reconstructed_audio

def compare_audio(original_audio, reconstructed_audio):
    # Convert to numpy arrays and flatten
    original_audio = original_audio.cpu().numpy().flatten()
    reconstructed_audio = reconstructed_audio.cpu().numpy().flatten()

    # Calculate MSE
    mse = mean_squared_error(original_audio, reconstructed_audio)

    # Calculate PSNR
    max_val = np.max(np.abs(original_audio))
    psnr = 20 * np.log10(max_val / np.sqrt(mse))

    # Calculate SSIM
    # SSIM is typically used for images, so we'll reshape the audio to 2D
    width = int(np.sqrt(len(original_audio)))
    original_2d = original_audio[:width**2].reshape((width, width))
    reconstructed_2d = reconstructed_audio[:width**2].reshape((width, width))
    ssim_value = ssim(original_2d, reconstructed_2d, data_range=max_val)

    return mse, psnr, ssim_value

def compare_images(original_image, stego_image):
    # Ensure images are in the same format
    original_image = original_image.squeeze().cpu().numpy()
    stego_image = stego_image.squeeze().cpu().numpy()

    # Flatten the arrays
    original_image = original_image.reshape(-1)
    stego_image = stego_image.reshape(-1)

    # Calculate MSE
    mse = mean_squared_error(original_image, stego_image)

    # Calculate PSNR
    max_val = np.max(original_image)
    psnr = 20 * np.log10(max_val / np.sqrt(mse))

    # Calculate SSIM
    # Reshape to 2D for SSIM calculation
    width = int(np.sqrt(len(original_image)))
    original_2d = original_image[:width**2].reshape((width, width))
    stego_2d = stego_image[:width**2].reshape((width, width))
    ssim_value = ssim(original_2d, stego_2d, data_range=max_val)

    return mse, psnr, ssim_value

def visualize_comparison(original, reconstructed, title):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title("Original")
    plt.imshow(original.squeeze().cpu().numpy(), aspect='auto')
    plt.subplot(1, 2, 2)
    plt.title("Reconstructed")
    plt.imshow(reconstructed.squeeze().cpu().numpy(), aspect='auto')
    plt.suptitle(title)
    plt.savefig(f"{title.lower().replace(' ', '_')}_comparison.png")
    plt.close()


# Main execution

In [None]:
if __name__ == "__main__":
    set_cudnn_flags()

    audio_file = '/content/5. female3.wav'
    image_file = '/content/5. santa.jpg'

    # Train the stego system
    trained_stego_system = train(audio_file, image_file)

    # Extract original features
    original_audio_features = extract_audio_features(audio_file, ast_feature_extractor, ast_model).to(device)
    original_image_features = extract_image_features(image_file, vit_feature_extractor, vit_model).to(device)

    # Ensure compatible shapes
    min_length = min(original_audio_features.shape[1], original_image_features.shape[1])
    original_audio_features = original_audio_features[:, :min_length, :]
    original_image_features = original_image_features[:, :min_length, :]

    # Generate stego content
    with torch.no_grad():
        stego_features, _, _ = trained_stego_system(original_audio_features, original_image_features)

    # Extract audio from stego content
    reconstructed_audio = extract_stego_content(trained_stego_system, stego_features)

    # Compare original and reconstructed audio
    audio_mse, audio_psnr, audio_ssim = compare_audio(original_audio_features, reconstructed_audio)

    # Compare original and stego images
    image_mse, image_psnr, image_ssim = compare_images(original_image_features, stego_features)

    # Print results
    print("Audio Comparison:")
    print(f"MSE: {audio_mse:.4f}")
    print(f"PSNR: {audio_psnr:.2f} dB")
    print(f"SSIM: {audio_ssim:.4f}")

    print("\nImage Comparison:")
    print(f"MSE: {image_mse:.4f}")
    print(f"PSNR: {image_psnr:.2f} dB")
    print(f"SSIM: {image_ssim:.4f}")

    # Visualize comparisons
    visualize_comparison(original_audio_features, reconstructed_audio, "Audio Comparison")
    visualize_comparison(original_image_features, stego_features, "Image Comparison")

    # Calculate capacity
    audio_size = original_audio_features.numel() * original_audio_features.element_size() * 8  # in bits
    image_size = original_image_features.numel() * original_image_features.element_size() * 8  # in bits
    bpp = audio_size / image_size

    print(f"\nEmbedding Capacity: {bpp:.4f} bits per pixel")





Epoch 10/100, Total Loss: 1.2926, Recon Loss: 1.1572, Imperceptibility Loss: 0.2708
Epoch 20/100, Total Loss: 1.0289, Recon Loss: 0.8216, Imperceptibility Loss: 0.4146
Epoch 30/100, Total Loss: 0.8421, Recon Loss: 0.6606, Imperceptibility Loss: 0.3629
Epoch 40/100, Total Loss: 0.7429, Recon Loss: 0.5954, Imperceptibility Loss: 0.2949
Epoch 50/100, Total Loss: 0.6681, Recon Loss: 0.5248, Imperceptibility Loss: 0.2866
Epoch 60/100, Total Loss: 0.6222, Recon Loss: 0.4888, Imperceptibility Loss: 0.2669
Epoch 70/100, Total Loss: 0.5748, Recon Loss: 0.4487, Imperceptibility Loss: 0.2524
Epoch 80/100, Total Loss: 0.5326, Recon Loss: 0.4116, Imperceptibility Loss: 0.2420
Epoch 90/100, Total Loss: 0.4949, Recon Loss: 0.3777, Imperceptibility Loss: 0.2344
Epoch 100/100, Total Loss: 0.4578, Recon Loss: 0.3436, Imperceptibility Loss: 0.2284
Audio Comparison:
MSE: 0.3752
PSNR: 28.81 dB
SSIM: 0.7719

Image Comparison:
MSE: 0.1140
PSNR: 11.13 dB
SSIM: 0.1392

Embedding Capacity: 1.0000 bits per pixel