<a href="https://colab.research.google.com/github/sqbitegh/Colabs/blob/main/MiniNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install 'portalocker>=2.0.0'
!pip install torchtext==0.15.1 --no-cache-dir
!pip install numpy==1.24.1

In [None]:
#!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install --upgrade torchtext

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

import portalocker
import numpy
import random
import math

In [14]:
import torch
import torchtext
# Show the installed version of each pinned dependency, one per line, in the
# order torch, torchtext, portalocker, numpy.
print(
    *(module.__version__ for module in (torch, torchtext, portalocker, numpy)),
    sep="\n",
)

# Reference versions to compare the printed values against. This is a bare
# string expression, so as the last statement of the cell the notebook echoes it.
"""
expected
2.0.0+cu117
0.15.1+cpu
3.1.1
1.24.1
"""


2.0.0+cu117
0.15.1+cpu
3.1.1
1.24.1


'\nexpected\n2.0.0+cu117\n0.15.1+cpu\n3.1.1\n1.24.1\n'

In [28]:

# Device configuration: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and the two IMDB splits used to build the vocabulary.
tokenizer = get_tokenizer('basic_english')
train_iter, test_iter = IMDB()

def yield_tokens(data_iter):
    """Lazily yield the token list of every sample in ``data_iter``.

    Args:
        data_iter: iterable of 2-item samples; the first item is ignored and
            the second is the raw text handed to the module-level ``tokenizer``.

    Yields:
        Whatever ``tokenizer`` returns for each text, in iteration order.
    """
    yield from (tokenizer(review) for _ignored, review in data_iter)

# Register "<pad>" as a real special token next to "<unk>". collate_batch pads
# with vocab["<pad>"]; without a dedicated entry that lookup falls back to the
# default index, so padding would share its id (and embedding) with unknown words.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Any token that is not in the vocabulary maps to "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Dataset and DataLoader
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into two batch tensors.

    Args:
        batch: sequence of ``(label, text)`` pairs as delivered by the DataLoader.

    Returns:
        A tuple ``(labels, texts)``:
        * ``labels`` - int64 tensor holding ``label - 1`` for every sample
          (presumably shifting 1-based dataset labels to 0-based class ids;
          confirm against the dataset).
        * ``texts`` - int64 tensor of token ids, batch first, with shorter
          samples padded on the right with ``vocab["<pad>"]`` (the vocabulary's
          default index if that token is not registered).
    """
    pad_id = vocab["<pad>"]
    shifted_labels = [label - 1 for label, _ in batch]
    encoded_texts = [
        torch.tensor(vocab(tokenizer(text)), dtype=torch.int64)
        for _, text in batch
    ]
    label_tensor = torch.tensor(shifted_labels, dtype=torch.int64)
    padded_texts = nn.utils.rnn.pad_sequence(
        encoded_texts, batch_first=True, padding_value=pad_id
    )
    return label_tensor, padded_texts

batch_size = 32

# Materialise the IMDB training split and shuffle it in place before splitting.
train_iter, test_iter = IMDB()
train_data = list(train_iter)
random.shuffle(train_data)

# 80/20 split of the *training* data: the first 80% is trained on, the remaining
# 20% is held out (named test_data here; the IMDB test split itself is not used
# by these loaders).
train_size = int(0.8 * len(train_data))
test_data = train_data[train_size:]
train_data = train_data[:train_size]

# Both loaders reshuffle their samples on every pass.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    collate_fn=collate_batch,
    shuffle=True,
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    collate_fn=collate_batch,
    shuffle=True,
)

"""
print("test_data print:")
for dp in test_data:
  print(dp)
print("test_data print end.")
"""
# Single-Head Transformer Model
class SingleHeadTransformer(nn.Module):
    """Text classifier: token embedding + fixed sinusoidal positions + one attention head.

    The attention output is averaged over the sequence axis and projected to
    ``num_class`` logits by ``self.fc``.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: embedding / attention width.
        num_class: number of output logits.
    """

    def __init__(self, vocab_size, embed_size, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Fixed (non-learned) table of shape (max_len, 1, embed_size).
        # Registered as a non-persistent buffer: it follows the module through
        # .to(device) (no per-forward host->device copy) while the state_dict
        # keeps exactly the keys it had when this was a plain attribute.
        self.register_buffer(
            "pos_encoder", self.positional_encoding(embed_size), persistent=False
        )
        self.attention = nn.MultiheadAttention(embed_size, num_heads=1)
        self.fc = nn.Linear(embed_size, num_class)

    def forward(self, x):
        """Return class logits for a batch of token ids.

        Args:
            x: integer tensor of token ids, indexed as (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_class).

        Raises:
            ValueError: if ``seq_len`` exceeds the positional table length.
        """
        embedded = self.embedding(x)  # (batch, seq_len, embed_size)
        seq_len = embedded.size(1)
        max_len = self.pos_encoder.size(0)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional encoding "
                f"table length {max_len}"
            )

        # (seq_len, 1, embed) -> (1, seq_len, embed). Slicing is a view, and the
        # leading singleton axis broadcasts over any batch size, so no per-length
        # cache is needed. (A cache of batch-expanded tensors keyed only by
        # seq_len breaks as soon as the same length shows up with another batch
        # size, e.g. the short final batch.)
        positions = self.pos_encoder[:seq_len, 0, :].unsqueeze(0)
        embedded = embedded + positions

        # nn.MultiheadAttention is used with its default layout here:
        # (seq_len, batch, embed_size).
        embedded = embedded.transpose(0, 1)
        # NOTE: no padding mask is passed, so padded positions take part both in
        # the attention and in the mean below.
        attended, _ = self.attention(embedded, embedded, embedded)
        pooled = attended.mean(dim=0)  # average over the sequence axis
        return self.fc(pooled)

    def positional_encoding(self, d_model, max_len=5000):
        """Build the sinusoidal position table.

        Even feature columns hold ``sin(pos * w_i)``, odd columns ``cos(pos * w_i)``
        with ``w_i = exp(-2i * ln(10000) / d_model)``.

        Args:
            d_model: feature width of the table (odd values are supported).
            max_len: number of positions to precompute.

        Returns:
            Float tensor of shape (max_len, 1, d_model).
        """
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns, so
        # trim the angle matrix to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return pe.unsqueeze(1)

# Instantiate model, loss, optimizer
# Hyper-parameters of the classifier.
num_class = 2
embed_size = 4 * 64
vocab_size = len(vocab)

# Build the model and move it to the selected device.
model = SingleHeadTransformer(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_class=num_class,
)
model = model.to(device)

# Cross-entropy over the class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Function
def train(dataloader, model, criterion, optimizer):
    """Run one optimisation pass over every batch of ``dataloader``.

    Args:
        dataloader: iterable yielding ``(labels, texts)`` tensor pairs.
        model: module called on ``texts`` to produce the outputs fed to ``criterion``.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.

    Batches are moved to the module-level ``device`` before the forward pass.
    """
    model.train()
    for batch_labels, batch_texts in dataloader:
        batch_labels = batch_labels.to(device)
        batch_texts = batch_texts.to(device)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        optimizer.zero_grad()
        logits = model(batch_texts)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

# Evaluation Function with Activation Logging
def evaluate_and_log_activations(dataloader, model, activation_path=None):
    """Evaluate ``model`` on ``dataloader`` and dump the last batch's ``fc`` outputs.

    A forward hook on ``model.fc`` captures its output for each batch; only the
    most recent one is kept. After the pass, that last batch is written to a
    text file, one row per line, followed by the literal ``end``.

    Args:
        dataloader: yields ``(labels, texts)`` tensor pairs; must expose
            ``.dataset`` and support ``len()``.
        model: module with an ``fc`` sub-module; called on ``texts``.
        activation_path: file to write the activations to. When ``None`` the
            name is ``activations_epoch_{epoch + 1}.txt``, built from the
            module-level ``epoch`` variable.

    Returns:
        ``(mean_loss, accuracy)``: the summed batch losses divided by the number
        of batches, and the number of correct predictions divided by the
        dataset size.

    Uses the module-level ``device`` and ``criterion``.
    """
    model.eval()
    # Holds at most one array: the fc output of the most recent forward pass.
    # Keeping only the latest avoids accumulating every batch in memory.
    last_activation = []

    def forward_hook(module, inputs, output):
        last_activation[:] = [output.cpu().detach().numpy()]

    handle = model.fc.register_forward_hook(forward_hook)

    corrects, total_loss = 0, 0
    try:
        with torch.no_grad():
            for labels, texts in dataloader:
                labels, texts = labels.to(device), texts.to(device)
                outputs = model(texts)

                total_loss += criterion(outputs, labels).item()
                corrects += (outputs.argmax(1) == labels).sum().item()
    finally:
        # Always detach the hook, even when the pass is interrupted; otherwise it
        # stays registered on model.fc and fires on every later forward.
        handle.remove()

    # Write the dump once, after the loop: only the last batch ever survived,
    # since the file used to be reopened in 'w' mode for every batch.
    if last_activation:
        if activation_path is None:
            activation_path = f'activations_epoch_{epoch + 1}.txt'
        with open(activation_path, 'w') as f:
            f.writelines(str(activation) + '\n' for activation in last_activation[0])
            f.write("end")

    print(len(dataloader.dataset))
    accuracy = corrects / len(dataloader.dataset)
    print(f"Dataset size: {len(dataloader.dataset)}, Dataloader length: {len(dataloader)}, Accuracy: {accuracy:.4f}, Corrects: {corrects}, total_loss {total_loss}") # Modified: Print formatting

    return total_loss / len(dataloader), accuracy

# Training Loop
num_epochs = 25

# Baseline evaluation of the untrained model ("epoch 0").
# evaluate_and_log_activations names its dump file after the global ``epoch`` as
# activations_epoch_{epoch + 1}.txt. Starting from -1 sends the baseline dump to
# activations_epoch_0.txt; starting from 0 wrote it to the epoch-1 file, which
# the first training epoch then overwrote.
epoch = -1

# NOTE: "test" below is the 20% slice held out from the training data, and the
# "train" figures are an evaluation pass over the training loader.
val_loss, val_accuracy = evaluate_and_log_activations(train_dataloader, model)
print(f'Epoch {epoch + 1}, train Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
val_loss, val_accuracy = evaluate_and_log_activations(test_dataloader, model)
print(f'Epoch {epoch + 1}, test Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
print(f"train_dataloader size: {len(train_dataloader.dataset)}")
print(f"test_dataloader size: {len(test_dataloader.dataset)}")


# One optimisation pass per epoch, followed by an evaluation on both loaders.
for epoch in range(num_epochs):
    train(train_dataloader, model, criterion, optimizer)
    val_loss, val_accuracy = evaluate_and_log_activations(train_dataloader, model)
    print(f'train Epoch {epoch+1}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    val_loss, val_accuracy = evaluate_and_log_activations(test_dataloader, model)
    print(f'test Epoch {epoch+1}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

print("Training complete.")

x.shape: torch.Size([32, 866, 256]), trimmed_pos_encoder.shape: torch.Size([32, 866, 256])
x.shape: torch.Size([32, 557, 256]), trimmed_pos_encoder.shape: torch.Size([32, 557, 256])
x.shape: torch.Size([32, 973, 256]), trimmed_pos_encoder.shape: torch.Size([32, 973, 256])
x.shape: torch.Size([32, 1130, 256]), trimmed_pos_encoder.shape: torch.Size([32, 1130, 256])
x.shape: torch.Size([32, 625, 256]), trimmed_pos_encoder.shape: torch.Size([32, 625, 256])
x.shape: torch.Size([32, 560, 256]), trimmed_pos_encoder.shape: torch.Size([32, 560, 256])
x.shape: torch.Size([32, 704, 256]), trimmed_pos_encoder.shape: torch.Size([32, 704, 256])
x.shape: torch.Size([32, 909, 256]), trimmed_pos_encoder.shape: torch.Size([32, 909, 256])
x.shape: torch.Size([32, 951, 256]), trimmed_pos_encoder.shape: torch.Size([32, 951, 256])
x.shape: torch.Size([32, 590, 256]), trimmed_pos_encoder.shape: torch.Size([32, 590, 256])
x.shape: torch.Size([32, 521, 256]), trimmed_pos_encoder.shape: torch.Size([32, 521, 256

KeyboardInterrupt: 