<a href="https://colab.research.google.com/github/shreyas-shrestha/VizFoldAutoencoder/blob/main/vizfoldfinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil

# Directory that collects the individual .npy files so the loader can glob them.
individual_files_dir = "individual_protein_files"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(individual_files_dir, exist_ok=True)

# List of the four individual .npy files (looked up in the current working directory)
npy_files_to_move = [
    '7b3a_A_pair_block_22.npy',
    '7b3a_A_pair_block_47.npy',
    '7b3a_A_pair_block_35.npy',
    '7b3a_A_pair_block_5.npy'
]

# Move the files into the new directory
print(f"Moving individual .npy files to '{individual_files_dir}'...")
for file_name in npy_files_to_move:
    destination = os.path.join(individual_files_dir, file_name)
    if os.path.exists(file_name):
        shutil.move(file_name, destination)
        print(f"Moved {file_name}")
    elif os.path.exists(destination):
        # A previous run already moved this file; distinguish that from a
        # file that is genuinely missing so re-runs are not misleading.
        print(f"Already in place: {destination}")
    else:
        print(f"File not found: {file_name}. Skipping.")
print("Finished moving files.")

Moving individual .npy files to 'individual_protein_files'...
File not found: 7b3a_A_pair_block_22.npy. Skipping.
File not found: 7b3a_A_pair_block_47.npy. Skipping.
File not found: 7b3a_A_pair_block_35.npy. Skipping.
File not found: 7b3a_A_pair_block_5.npy. Skipping.
Finished moving files.


In [20]:
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

def load_protein_data(data_directory="Proteins_layer47", normalize=True, target_L=700):
    """
    Load and preprocess protein data for autoencoder training.

    Every ``.npy`` file in ``data_directory`` must hold a square array of
    shape ``(L, L, channels)``. Each channel matrix is zero-padded or
    truncated to ``target_L × target_L`` and flattened (row-major) into one
    row of the returned matrix. Files that cannot be loaded or have an
    unexpected shape are reported and skipped.

    Args:
        data_directory (str): Path to folder containing protein .npy files
        normalize (bool): Whether to standardize the data (StandardScaler,
            applied per feature column across all rows)
        target_L (int): Side length each channel matrix is padded/truncated
            to. The default 700 yields 700 * 700 = 490,000 features per row.

    Returns:
        tuple: (data_matrix, protein_names, original_L)
            - data_matrix: numpy array of shape
              (total number of channels, target_L * target_L)
            - protein_names: list with one "<file stem>_chNNN" label per row
            - original_L: int, L dimension of the first file that was
              loaded successfully (other files may have a different L)

    Raises:
        FileNotFoundError: if ``data_directory`` does not exist
        ValueError: if ``target_L`` is not positive, the directory has no
            .npy files, or no file could be loaded
    """
    if target_L <= 0:
        raise ValueError(f"target_L must be a positive integer, got {target_L}")

    data_path = Path(data_directory)
    if not data_path.exists():
        raise FileNotFoundError(f"Directory {data_directory} not found")

    # glob() order depends on the filesystem; sort so the row order (and any
    # positional train/validation split built on it) is reproducible.
    npy_files = sorted(data_path.glob("*.npy"))
    if not npy_files:
        raise ValueError(f"No .npy files found in {data_directory}")

    print(f"Found {len(npy_files)} protein files")

    per_file_blocks = []
    all_protein_names = []
    original_L = None

    for file_path in npy_files:
        try:
            # Load the protein data (L×L×channels)
            data = np.load(file_path)
            if data.ndim != 3 or data.shape[0] != data.shape[1]:
                raise ValueError(
                    f"expected a square (L, L, channels) array, got shape {data.shape}"
                )

            L, n_channels = data.shape[0], data.shape[2]

            # Pad (L < target_L) or truncate (L > target_L) all channels at
            # once: copy the overlapping top-left corner into a zero canvas.
            side = min(L, target_L)
            padded = np.zeros((target_L, target_L, n_channels), dtype=data.dtype)
            padded[:side, :side, :] = data[:side, :side, :]

            # Channels first, then flatten each target_L×target_L matrix so
            # that row c equals padded[:, :, c].flatten().
            vectors = np.moveaxis(padded, -1, 0).reshape(n_channels, target_L * target_L)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Error loading {file_path.name}: {e}")
            continue

        if original_L is None:
            original_L = L  # Store original L dimension of the first good file

        per_file_blocks.append(vectors)
        all_protein_names.extend(
            f"{file_path.stem}_ch{channel:03d}" for channel in range(n_channels)
        )

    if not all_protein_names:
        raise ValueError("No protein data was successfully loaded")

    # Stack the per-file (channels, features) blocks into one matrix
    data_matrix = np.concatenate(per_file_blocks, axis=0)

    # Normalize the data if requested
    if normalize:
        scaler = StandardScaler()
        data_matrix = scaler.fit_transform(data_matrix)

    print(f"Loaded data shape: {data_matrix.shape}")
    print(f"Total vectors: {data_matrix.shape[0]}")
    print(
        f"Vector size: {data_matrix.shape[1]} "
        f"(padded to {target_L}×{target_L} = {target_L * target_L:,})"
    )
    print(f"Original L dimension: {original_L}")

    return data_matrix, all_protein_names, original_L

# Usage demo: only runs when this cell/script is executed directly
if __name__ == "__main__":
    # Read every .npy file from the folder holding the individual files
    data, names, original_L = load_protein_data("individual_protein_files")

    # Emit the summary in a single call; one line per item, same text as before
    print(
        "\nData ready for autoencoder:",
        f"Shape: {data.shape}",
        f"Data type: {data.dtype}",
        f"Value range: [{data.min():.3f}, {data.max():.3f}]",
        f"Original L dimension: {original_L}",
        sep="\n",
    )

Found 4 protein files
Loaded data shape: (512, 490000)
Total vectors: 512
Vector size: 490000 (padded to 700×700 = 49,000)
Original L dimension: 280

Data ready for autoencoder:
Shape: (512, 490000)
Data type: float32
Value range: [-11.610, 14.519]
Original L dimension: 280


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from Data_Ingestion import load_protein_data

# --- Model ---
class SimpleAutoencoder(nn.Module):
    """Fully-connected autoencoder with a symmetric encoder/decoder.

    With the defaults the encoder is ``input_dim -> 1024 -> 128 -> 8`` and the
    decoder mirrors it (``8 -> 128 -> 1024 -> input_dim``), with a ReLU after
    every layer except the last of each half.

    Args:
        input_dim (int): Number of features per input vector.
            NOTE: the default (49000) is kept for backward compatibility, but
            a flattened 700×700 matrix has 490,000 features, so pass
            ``input_dim`` explicitly (e.g. ``data.shape[1]``).
        hidden_dims (sequence of int): Widths of the encoder's hidden layers,
            in order; the decoder uses them reversed.
        latent_dim (int): Size of the bottleneck representation.
    """

    def __init__(self, input_dim=49000, hidden_dims=(1024, 128), latent_dim=8):
        super().__init__()
        layer_sizes = (input_dim, *tuple(hidden_dims), latent_dim)
        # Encoder is built first, then decoder, so parameter creation order
        # (and therefore seeded initialisation) matches the fixed layout.
        self.encoder = self._build_mlp(layer_sizes)
        self.decoder = self._build_mlp(layer_sizes[::-1])

    @staticmethod
    def _build_mlp(sizes):
        """Return Linear layers for consecutive ``sizes`` with ReLU in between."""
        layers = []
        for in_features, out_features in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU: the last layer of each half is linear.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        return self.decoder(z)

# --- Training ---
def train_model(model, train_loader, val_loader, lr=1e-3, wd=1e-5, epochs=20, device="cpu"):
    """Train an autoencoder with Adam and MSE reconstruction loss.

    Args:
        model: Module whose output is compared against its own input.
        train_loader: Iterable of batches; the input tensor is ``batch[0]``.
        val_loader: Iterable of validation batches in the same format. May be
            ``None`` or empty, in which case the validation loss is ``nan``.
        lr (float): Adam learning rate.
        wd (float): Adam weight decay.
        epochs (int): Number of passes over ``train_loader``.
        device (str): Device the model and batches are moved to.

    Returns:
        tuple: ``(train_losses, val_losses)``, one float per epoch. Each value
        is the mean loss per sample, i.e. batch losses are weighted by batch
        size so a short final batch is not over-represented.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    criterion = nn.MSELoss()

    def mean_or_nan(loss_sum, n_samples):
        # An empty loader has no defined loss; report nan instead of
        # raising ZeroDivisionError.
        return loss_sum / n_samples if n_samples else float("nan")

    train_losses, val_losses = [], []
    for epoch in range(epochs):
        # Train
        model.train()
        loss_sum, n_samples = 0.0, 0
        for batch in train_loader:
            data = batch[0].to(device)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, data)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; re-weight by batch size
            loss_sum += loss.item() * data.shape[0]
            n_samples += data.shape[0]
        train_losses.append(mean_or_nan(loss_sum, n_samples))

        # Validate
        model.eval()
        loss_sum, n_samples = 0.0, 0
        with torch.no_grad():
            for batch in (val_loader if val_loader is not None else ()):
                data = batch[0].to(device)
                out = model(data)
                loss = criterion(out, data)
                loss_sum += loss.item() * data.shape[0]
                n_samples += data.shape[0]
        val_losses.append(mean_or_nan(loss_sum, n_samples))

        print(f"Epoch {epoch+1}/{epochs} | Train {train_losses[-1]:.6f} | Val {val_losses[-1]:.6f}")

    return train_losses, val_losses

# --- Visualization ---
def visualize(model, data, train_losses, val_losses, protein_names, original_L, device="cpu"):
    """Plot the loss curves and the reconstruction of the first data row.

    Args:
        model: Trained autoencoder; called on the first row of ``data``.
        data: 2-D array of flattened square matrices, one per row.
        train_losses, val_losses: Per-epoch losses to plot.
        protein_names: Row labels; the first one (if any) titles the
            reconstruction figure.
        original_L: Un-padded side length; only the top-left
            ``original_L × original_L`` corner is shown. ``None`` shows the
            full matrix.
        device (str): Device the sample is moved to before the forward pass.

    Raises:
        ValueError: if ``data`` has no rows or its row length is not a
            perfect square.
    """
    if len(data) == 0:
        raise ValueError("visualize() needs at least one data row")

    # Derive the matrix side from the data instead of assuming 700×700, so the
    # function works for any square padding size.
    n_features = data.shape[1]
    side = int(round(n_features ** 0.5))
    if side * side != n_features:
        raise ValueError(f"Row length {n_features} is not a perfect square")
    L_vis = side if original_L is None else min(original_L, side)

    # Loss curves
    fig, ax = plt.subplots()
    ax.plot(train_losses, label="Train")
    ax.plot(val_losses, label="Val")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("MSE")
    ax.set_title("Training Progress")
    ax.legend()
    plt.show()

    # Show one reconstruction
    model.eval()
    with torch.no_grad():
        sample = torch.FloatTensor(data[:1]).to(device)
        recon = model(sample)[0].cpu().reshape(side, side).numpy()
    original = data[0].reshape(side, side)

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(original[:L_vis, :L_vis], cmap="viridis")
    axes[0].set_title("Original")
    axes[1].imshow(recon[:L_vis, :L_vis], cmap="viridis")
    axes[1].set_title("Reconstructed")
    axes[2].imshow(np.abs(original - recon)[:L_vis, :L_vis], cmap="Reds")
    axes[2].set_title("Error")
    if protein_names:
        # Label which row is being shown
        fig.suptitle(str(protein_names[0]))
    plt.show()

# --- Main ---
def main():
    """Load the protein vectors, train the autoencoder, save it and plot results.

    Raises:
        ValueError: if the 80/20 split leaves the training or validation set
            empty (fewer than two usable rows).
    """
    # Fix the seed so weight initialisation and batch shuffling are
    # reproducible across Restart & Run All.
    torch.manual_seed(42)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Loading data...")
    data_matrix, protein_names, original_L = load_protein_data("individual_protein_files", normalize=True)
    print("Data shape:", data_matrix.shape)

    # Simple positional split: first 80% of rows train, remaining 20% validate
    split = int(0.8 * len(data_matrix))
    train_data, val_data = data_matrix[:split], data_matrix[split:]
    if len(train_data) == 0 or len(val_data) == 0:
        raise ValueError(
            f"Need at least 2 rows for an 80/20 split, got {len(data_matrix)}"
        )

    train_loader = DataLoader(TensorDataset(torch.FloatTensor(train_data)), batch_size=64, shuffle=True)
    val_loader = DataLoader(TensorDataset(torch.FloatTensor(val_data)), batch_size=64)

    # Train model (input size taken from the data, not the class default)
    model = SimpleAutoencoder(input_dim=train_data.shape[1])
    train_losses, val_losses = train_model(model, train_loader, val_loader, device=device)

    # Save before plotting so a failure in the (blocking) visualisation step
    # cannot lose the trained weights.
    torch.save(model.state_dict(), "simple_autoencoder.pth")
    print("Model saved.")

    # Visualize
    visualize(model, data_matrix, train_losses, val_losses, protein_names, original_L, device=device)

if __name__ == "__main__":
    main()

Loading data...
Found 4 protein files
Loaded data shape: (512, 490000)
Total vectors: 512 (10 proteins × 128 channels)
Vector size: 490000 (padded to 700×700 = 49,000)
Original L dimension: 280
Data shape: (512, 490000)
