In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.fft as fft
import numpy as np
import pandas as pd
import re

In [6]:
device='cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [7]:
from datasets import load_dataset
datasets=load_dataset('wikitext','wikitext-2-raw-v1')

In [9]:
def preprocess_function(sentence):
    text=sentence['text'].lower()
    text=re.sub(r'[^a-zA-Z0-9\s]','',text)
    text=re.sub('\s\s',' ',text)
    sentence['text']=text
    return sentence
datasets['train']=datasets['train'].map(preprocess_function)
datasets['validation']=datasets['validation'].map(preprocess_function)
datasets['test']=datasets['test'].map(preprocess_function)  


datasets['train']=datasets['train'].filter(lambda example: len(example['text'])>20)
datasets['validation']=datasets['validation'].filter(lambda example: len(example['text'])>20)
datasets['test']=datasets['test'].filter(lambda example: len(example['text'])>20)

  text=re.sub('\s\s',' ',text)


Map:   0%|          | 0/19530 [00:00<?, ? examples/s]

Map:   0%|          | 0/2083 [00:00<?, ? examples/s]

Map:   0%|          | 0/2383 [00:00<?, ? examples/s]

Filter:   0%|          | 0/19530 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2083 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2383 [00:00<?, ? examples/s]

tokenization

In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding 
from transformers import AutoTokenizer

checkpoint='distilbert-base-uncased-finetuned-sst-2-english'
tokenizer=AutoTokenizer.from_pretrained(checkpoint)
def tokenize(sentence):
    sentence=tokenizer(sentence['text'],truncation=True,max_length=512)
    return sentence
tokenized_inputs=datasets.map(tokenize,batched=True)
tokenized_inputs.remove_columns(['text'])
batch=16
data_collator=DataCollatorWithPadding(tokenizer=tokenizer,padding=True,return_tensors='pt')
dataloader=DataLoader(tokenized_inputs['train'],batch_size=batch,shuffle=True,collate_fn=data_collator)

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Map:   0%|          | 0/2333 [00:00<?, ? examples/s]

Map:   0%|          | 0/19067 [00:00<?, ? examples/s]

Map:   0%|          | 0/2034 [00:00<?, ? examples/s]

In [17]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.d_model = d_model
        self.max_sequence_length = max_sequence_length
        pe = self._generate_positional_encoding()
        # Register as buffer so it moves with .to(device)
        self.register_buffer("positional_encoding", pe)

    def _generate_positional_encoding(self):
        positional_encoding = np.zeros((self.max_sequence_length, self.d_model))
        for pos in range(self.max_sequence_length):
            for i in range(0, self.d_model, 2):
                positional_encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i) / self.d_model)))
                if i + 1 < self.d_model:
                    positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * (i + 1)) / self.d_model)))
        return torch.from_numpy(positional_encoding).float()

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        pe = self.positional_encoding.unsqueeze(0).expand(x.size(0), -1, -1)
        return x + pe[:, : x.size(1), :]


class PositionalEncodding(nn.Module):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = PositionalEncoding(embed_dim, sequence_length)

    def forward(self, x):
        embedded_tokens = self.token_embedding(x)
        return self.position_embedding(embedded_tokens)


In [18]:
class FNetEncoder(nn.Module):
    def __init__(self,embed_dim,ffn_dim,dropout):
        super(FNetEncoder,self).__init__()
        self.layer_norm1=nn.LayerNorm(embed_dim)
        self.layer_norm2=nn.LayerNorm(embed_dim)
        self.ffn=nn.Sequential(
            nn.Linear(embed_dim,ffn_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ffn_dim,embed_dim),
            nn.Dropout(dropout)
        )
    def forward(self,x):
        y=self.layer_norm1(x)
        y_fft=fft.fft2(y).real
        x=x+y_fft
        y=self.layer_norm2(x)
        y=self.ffn(y)
        output=x+y
        return output

In [19]:
class FNetDecoder(nn.Module):
    def __init__(self,num_layers,sequence_length,vocab_size,embed_dim,ffn_dim,dropout):
        super(FNetDecoder,self).__init__()
        self.embedding=PositionalEncodding(sequence_length,vocab_size,embed_dim)
        self.layers=nn.ModuleList([FNetEncoder(embed_dim,ffn_dim,dropout) for _ in range(num_layers)])
        self.layer_norm=nn.LayerNorm(embed_dim)
        self.output_layer=nn.Linear(embed_dim,vocab_size)
    def forward(self,x):
        x=self.embedding(x)
        for layer in self.layers:
            x=layer(x)
        x=self.layer_norm(x)
        output=self.output_layer(x)
        return output

FNet model 

In [20]:
class FNetModel(nn.Module):
    def __init__(self,num_layers,sequence_length,vocab_size,embed_dim,ffn_dim,dropout):
        super(FNetModel,self).__init__()
        self.decoder=FNetDecoder(num_layers,sequence_length,vocab_size,embed_dim,ffn_dim,dropout)
    def forward(self,x):
        output=self.decoder(x)
        return output

In [21]:
optimizer=torch.optim.AdamW
model=FNetModel(num_layers=6,
               sequence_length=512,
               vocab_size=tokenizer.vocab_size,
               embed_dim=256,
               ffn_dim=1024,
               dropout=0.1).to(device)
loss_fn=nn.CrossEntropyLoss()
optimizer=optimizer(model.parameters(), lr=1e-4)    
epochs=10
for epoch in range(epochs):
    model.train()
    total_loss=0
    for batch in dataloader:
        inputs=batch['input_ids'].to(device)
        attention_mask=batch['attention_mask'].to(device)
        labels=inputs.clone().to(device)
        optimizer.zero_grad()
        outputs=model(inputs)
        loss=loss_fn(outputs.view(-1,tokenizer.vocab_size),labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    avg_loss=total_loss/len(dataloader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).