<a href="https://colab.research.google.com/github/sqbitegh/Colabs/blob/main/MiniNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies: torchtext and numpy are pinned to exact
# versions, portalocker only has a lower bound.
!pip install 'portalocker>=2.0.0'
!pip install torchtext==0.15.1 --no-cache-dir
!pip install numpy==1.24.1

In [None]:
#!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#!pip install --upgrade torchtext

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from itertools import chain

import portalocker
import numpy
import random
import math

In [2]:
# Print the installed versions of the key packages so they can be compared
# with the expected versions listed in the string below.
import torch
import torchtext
print(torch.__version__)
print(torchtext.__version__)
print(portalocker.__version__)
print(numpy.__version__)

# Reference versions. Being the last expression of the cell, this string is
# also echoed as the cell's output.
"""
expected
2.0.0+cu117
0.15.1+cpu
3.1.1
1.24.1
"""


2.0.0+cu117
0.15.1+cpu
3.1.1
1.24.1


'\nexpected\n2.0.0+cu117\n0.15.1+cpu\n3.1.1\n1.24.1\n'

In [None]:

# Device configuration: use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and raw IMDB iterators; train_iter is consumed below to build the
# vocabulary.
tokenizer = get_tokenizer('basic_english')
train_iter, test_iter = IMDB()

def yield_tokens(data_iter):
    """Lazily yield the tokenized text of every ``(label, text)`` pair.

    The label is ignored; each text is passed through the module-level
    ``tokenizer`` and the resulting token sequence is yielded.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# "<pad>" is registered as a special token so that padding gets an index of
# its own. Without it, vocab["<pad>"] silently falls back to the default
# ("<unk>") index and padded positions look exactly like unknown words.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
# Resolved once instead of on every batch.
PAD_IDX = vocab["<pad>"]

# Dataset and DataLoader
def collate_batch(batch):
    """Turn a list of ``(label, text)`` samples into padded tensors.

    Args:
        batch: iterable of ``(label, text)`` pairs.

    Returns:
        A tuple ``(labels, texts)``:
        * ``labels`` - int64 tensor holding ``label - 1`` for every sample
          (presumably maps raw labels 1/2 to class ids 0/1 - TODO confirm
          against the dataset).
        * ``texts`` - int64 tensor of shape ``(batch, longest_sequence)``
          with the vocabulary ids of each tokenized text, right-padded with
          the "<pad>" index.
    """
    labels = torch.tensor([label - 1 for label, _ in batch], dtype=torch.int64)
    token_ids = [
        torch.tensor(vocab(tokenizer(text)), dtype=torch.int64)
        for _, text in batch
    ]
    padded = nn.utils.rnn.pad_sequence(token_ids, batch_first=True, padding_value=PAD_IDX)
    return labels, padded

batch_size = 32

# Fresh IMDB iterators; only the official train split is used below.
train_iter, test_iter = IMDB()
# Alternative (disabled): merge both official splits.
#merged_iter = chain(train_iter, test_iter)
merged_iter = train_iter

# Materialise the samples, keeping only the first 1000 characters of each text.
train_data = [(label, text[:1000]) for label, text in merged_iter]
# One-time shuffle before splitting (no seed is set in the code shown here, so
# the split presumably differs from run to run).
random.shuffle(train_data)

# 50/50 split of the shuffled samples into a training half and a held-out half.
train_size = int(0.5 * len(train_data))
train_data, test_data = train_data[:train_size], train_data[train_size:]
# No shuffle= argument is passed, so both loaders iterate their list in the
# same fixed order on every pass.
train_dataloader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_batch)

# Disabled debugging dump of test_data, kept as a bare string literal.
"""
print("test_data print:")
for dp in test_data:
  print(dp)
print("test_data print end.")
"""
# Single-Head Transformer Model
class SingleHeadTransformer(nn.Module):
    """Embedding -> self-attention -> mean pooling -> linear classifier.

    Despite the class name, the attention layer is built with ``num_heads=4``.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding / attention width (must be divisible by 4).
        num_class: number of output logits.
        use_dropout: when true, dropout (p=0.4) is applied to the pooled
            representation before the final linear layer.
    """

    def __init__(self, vocab_size, embed_size, num_class, use_dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learnable offset of shape (1, embed_size). It broadcasts over batch
        # and sequence, so the same vector is added to every token: it does
        # not encode the position of a token.
        self.pos_encoder = nn.Parameter(torch.zeros(1, embed_size))
        # Not read by forward(); kept so existing attribute access still works.
        self.trimmed_pos_encoder_cache = {}
        self.attention = nn.MultiheadAttention(embed_size, num_heads=4)
        self.fc = nn.Linear(embed_size, num_class)
        self.use_dropout = use_dropout
        if use_dropout:
            self.dropout = nn.Dropout(0.4)

    def forward(self, x):
        """Return class logits for a batch of token ids.

        Args:
            x: integer tensor of token ids, laid out as (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_class).
        """
        x = self.embedding(x) + self.pos_encoder
        # The attention layer uses the default (seq_len, batch, embed) layout.
        x = x.transpose(0, 1)
        # No key_padding_mask is passed, so any padded positions in x take
        # part both in the attention and in the mean below.
        x, _ = self.attention(x, x, x)
        x = x.mean(dim=0)  # Aggregate over sequence length
        if self.use_dropout:
            x = self.dropout(x)
        return self.fc(x)

    def positional_encoding(self, d_model, max_len=5000):
        """Build a fixed sinusoidal encoding table (not used by forward()).

        Even columns hold sines and odd columns hold cosines.

        Args:
            d_model: width of each encoding vector; odd values are supported.
            max_len: number of positions to generate.

        Returns:
            Float tensor of shape (max_len, 1, d_model).
        """
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so the angles are trimmed to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return pe.unsqueeze(0).transpose(0, 1)

# Instantiate model, loss, optimizer
num_epochs = 5
save_activations = True
# Module-level epoch counter; evaluate_and_log_activations reads it to name
# its output files.
epoch = 0
vocab_size = len(vocab)
# 256-wide embeddings (presumably 64 per head for the model's 4 attention heads).
embed_size = 4*64
num_class = 2
model = SingleHeadTransformer(vocab_size, embed_size, num_class, use_dropout=False).to(device)

criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 0.001.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Function
def train(dataloader, model, criterion, optimizer):
    """Run one optimisation pass over every batch of ``dataloader``.

    Args:
        dataloader: iterable yielding ``(labels, texts)`` tensor pairs.
        model: module called on the texts; switched to training mode first.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for batch_labels, batch_texts in dataloader:
        batch_labels = batch_labels.to(device)
        batch_texts = batch_texts.to(device)
        # Clear gradients left over from the previous batch before the new
        # backward pass accumulates into them.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_texts), batch_labels)
        batch_loss.backward()
        optimizer.step()

# Evaluation Function with Activation Logging
def evaluate_and_log_activations(dataloader, model, mode, save_activations=True):
    """Evaluate ``model`` on ``dataloader`` and optionally dump ``model.fc`` inputs.

    A forward hook on ``model.fc`` records the input of that layer for every
    batch (all batches are kept in memory until the function returns). The
    hook is always removed again, even when evaluation raises.

    Relies on the module-level ``device`` and ``criterion``; when
    ``save_activations`` is true the output file names additionally embed the
    module-level ``epoch`` value.

    Args:
        dataloader: yields ``(labels, texts)`` batches and exposes ``.dataset``.
        model: module with an ``fc`` sub-module; switched to eval mode.
        mode: free-form tag (e.g. "train" / "test") used in the file names.
        save_activations: when true, write the recorded activations and the
            per-sample correctness flags to text files in the working directory.

    Returns:
        ``(mean loss per batch, accuracy over len(dataloader.dataset))``.
    """
    model.eval()
    activations = []

    def forward_hook(module, input, output):
        # `input` is the tuple of positional arguments given to model.fc;
        # record the first one (the fc input), not the fc output.
        activations.append(input[0].cpu().detach().numpy())

    handle = model.fc.register_forward_hook(forward_hook)

    corrects, total_loss = 0, 0
    corrects_list = []
    try:
        with torch.no_grad():
            for labels, texts in dataloader:
                labels, texts = labels.to(device), texts.to(device)
                outputs = model(texts)

                total_loss += criterion(outputs, labels).item()
                # Compare predictions with labels once and reuse the result
                # for both the running count and the per-sample list.
                batch_corrects = outputs.argmax(1) == labels
                corrects += batch_corrects.sum().item()
                corrects_list.extend(batch_corrects.cpu().numpy())
    finally:
        # Never leave the hook attached: a leaked hook would keep appending
        # to a stale list on every later forward pass.
        handle.remove()

    if save_activations:
        with open(f'activations_fc_input_d64_h1_epoch_{epoch}_{mode}.txt', 'w') as f:
            if not activations:
                f.write("empty")
            else:
                # One savetxt block per batch, separated by a blank line.
                for activation in activations:
                    numpy.savetxt(f, activation, fmt='%f')
                    f.write('\n')
            f.write("end")
        with open(f'corrects_list_d64_h1_epoch_{epoch}_{mode}.txt', 'w') as f:
            for item in corrects_list:
                f.write(str(item) + '\n')
            f.write("end")

    accuracy = corrects / len(dataloader.dataset)
    print(f"Dataset size: {len(dataloader.dataset)}, Dataloader length: {len(dataloader)}, Accuracy: {accuracy:.4f}, Corrects: {corrects}, total_loss {total_loss}") # Modified: Print formatting

    return total_loss / len(dataloader), accuracy

def cleanup_memory():
    """Drop the module-level data objects and release cached CUDA memory.

    Removes ``train_dataloader``, ``test_dataloader``, ``train_data`` and
    ``test_data`` from the module namespace (names that are already gone are
    skipped, so the function can be called repeatedly). The model itself is
    intentionally left in place.
    """
    # A plain `del name` inside a function would treat the name as a local
    # and raise UnboundLocalError, so the globals are removed explicitly.
    module_globals = globals()
    for name in ("train_dataloader", "test_dataloader", "train_data", "test_data"):
        module_globals.pop(name, None)
    torch.cuda.empty_cache()

# Training Loop


# Baseline: evaluate the untrained model on both halves. The module-level
# `epoch` is still 0 here, which is the value embedded in the output file names.
val_loss, val_accuracy = evaluate_and_log_activations(train_dataloader, model, "train", save_activations)
print(f'Epoch {0}, train Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
val_loss, val_accuracy = evaluate_and_log_activations(test_dataloader, model, "test", save_activations)
print(f'Epoch {0}, test Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
print(f"train_dataloader size: {len(train_dataloader.dataset)}")
print(f"test_dataloader size: {len(test_dataloader.dataset)}")

# The scheduler is created but its step() call below is commented out, so the
# learning rate is never changed by it.
scheduler = StepLR(optimizer, step_size=2, gamma=0.9)
# range(1, num_epochs) performs num_epochs - 1 training passes. The loop
# variable rebinds the module-level `epoch` that evaluate_and_log_activations
# uses in its file names.
for epoch in range(1, num_epochs):
    train(train_dataloader, model, criterion, optimizer)
    val_loss, val_accuracy = evaluate_and_log_activations(train_dataloader, model, "train",save_activations)
    print(f'train Epoch {epoch}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    val_loss, val_accuracy = evaluate_and_log_activations(test_dataloader, model, "test",save_activations)
    print(f'test Epoch {epoch}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
    #scheduler.step()
print("Training complete.")
cleanup_memory()
print("Cleaning complete.")

