In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import tarfile

protein_tar_path = '/content/drive/MyDrive/MUST_CNN/4Protein.tar.gz'
cb513_tar_path = '/content/drive/MyDrive/MUST_CNN/cb513_culldb.tar.gz'

with tarfile.open(protein_tar_path, "r:gz") as tar:
    tar.extractall(path="/content/4Protein")
with tarfile.open(cb513_tar_path, "r:gz") as tar:
    tar.extractall(path="/content/cb513")

protein_data_path = '/content/4Protein/4Protein/data/aa1.dat'
protein_label_path = '/content/4Protein/4Protein/data/dssp.lab.tag.dat'

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

class ProteinDataset(Dataset):
    def __init__(self, aa_file, label_file, max_length=300, num_classes=21):
        self.max_length = max_length
        self.num_classes = num_classes

        # Load amino acid sequences
        with open(aa_file, 'r') as f:
            self.sequences = [list(map(int, line.strip().split())) for line in f]

        # Load secondary structure labels
        with open(label_file, 'r') as f:
            self.labels = [list(map(int, line.strip().split())) for line in f]

        assert len(self.sequences) == len(self.labels), "Mismatch between sequences and labels"

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long) - 1  # Convert to zero-based index
        labels = torch.tensor(self.labels[idx], dtype=torch.long)

        # One-hot encode and pad sequences
        one_hot_sequence = F.one_hot(sequence, num_classes=self.num_classes).float()
        padded_sequence = F.pad(one_hot_sequence, (0, 0, 0, self.max_length - one_hot_sequence.shape[0]))

        # Pad labels to match max length, using -1 for padding
        padded_labels = F.pad(labels, (0, self.max_length - labels.shape[0]), value=-1)

        return padded_sequence, padded_labels

# Instantiate and verify the dataset
dataset = ProteinDataset(protein_data_path, protein_label_path)
print("Dataset length:", len(dataset))
print("Sample sequence shape:", dataset[0][0].shape)
print("Sample label shape:", dataset[0][1].shape)


Dataset length: 11794
Sample sequence shape: torch.Size([300, 21])
Sample label shape: torch.Size([300])


In [10]:
# Mapping for indices to amino acids
index_to_amino_acid = [
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
]

# Decode function with index range check
def decode_to_amino_acids(indices):
    return [index_to_amino_acid[idx] for idx in indices if 0 <= idx < len(index_to_amino_acid)]

# Decode the sample protein sequence
decoded_amino_acids = decode_to_amino_acids(decoded_sequence)
print("Sample protein sequence (as amino acid symbols):")
print("".join(decoded_amino_acids))


Sample protein sequence (as amino acid symbols):
CDEFGFHIGGFFFKCLLHMNMLFFPQCLRELSSMTVDWLGLYGTHWNWRPLMSFPHHRNNCWLLPNNNREHPDRCILVMNRMGPLCEKEITQYEKSGCFHQHCVVHQGPMHGCHFFLFSDLFFSGWHHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA


In [11]:
import torch.nn as nn
import torch.nn.functional as F

class MUSTCNN(nn.Module):
    def __init__(self, input_channels=21, num_classes=3, max_length=300):
        super(MUSTCNN, self).__init__()

        # Shift-and-stitch convolution layers with different dilations
        self.shift_convs = nn.ModuleList([
            nn.Conv1d(input_channels, 64, kernel_size=3, padding=1, dilation=1),
            nn.Conv1d(input_channels, 64, kernel_size=3, padding=2, dilation=2),
            nn.Conv1d(input_channels, 64, kernel_size=3, padding=3, dilation=3)
        ])

        # Additional convolutional layers
        self.conv2 = nn.Conv1d(64 * len(self.shift_convs), 128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)

        # Pooling and output layer
        self.pool = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
        self.output = nn.Conv1d(256, num_classes, kernel_size=1)

    def forward(self, x):
        # Transpose input to (batch, channels, sequence_length)
        x = x.permute(0, 2, 1)

        # Apply shift-and-stitch convolutions and concatenate results
        shift_outputs = [F.relu(conv(x)) for conv in self.shift_convs]
        x = torch.cat(shift_outputs, dim=1)

        # Additional convolutions and pooling
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        # Output layer to produce class predictions for each position
        x = self.output(x)

        # Transpose to (batch, sequence_length, num_classes) for compatibility with labels
        return x.permute(0, 2, 1)

# Instantiate and verify the model
model = MUSTCNN(input_channels=21, num_classes=3)
print("Model structure:", model)
print("Output shape for a sample input:", model(torch.randn(1, 300, 21)).shape)

Model structure: MUSTCNN(
  (shift_convs): ModuleList(
    (0): Conv1d(21, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): Conv1d(21, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
    (2): Conv1d(21, 64, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
  )
  (conv2): Conv1d(192, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=1, padding=1, dilation=1, ceil_mode=False)
  (output): Conv1d(256, 3, kernel_size=(1,), stride=(1,))
)
Output shape for a sample input: torch.Size([1, 301, 3])


In [15]:
import torch.optim as optim
from torch.utils.data import random_split

# Split dataset into training and validation
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders for batch processing
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MUSTCNN(input_channels=21, num_classes=3).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, val_loader, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        running_loss, correct, total = 0.0, 0, 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            outputs = outputs.reshape(-1, outputs.shape[-1])  # Flatten for loss calculation
            labels = labels.reshape(-1)  # Flatten labels

            # Compute loss, ignoring padding (-1)
            loss = criterion(outputs, labels)

            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Track accuracy and loss
            _, predicted = torch.max(outputs, 1)
            mask = labels != -1  # Ignore padding in accuracy calculation
            correct += (predicted[mask] == labels[mask]).sum().item()
            total += mask.sum().item()
            running_loss += loss.item()

        # Print training metrics
        train_loss = running_loss / len(train_loader)
        train_acc = 100 * correct / total
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")

# Run the updated training function
train_model(model, train_loader, val_loader, num_epochs=10)


ValueError: Expected input batch_size (9632) to match target batch_size (9600).