In [64]:
import re
import random
import torch
import torch.nn as nn

In [65]:
text='Science is the systematic pursuit of knowledge through observation, experimentation, and logical reasoning. It helps us understand the natural world, from the smallest particles to the vastness of the universe. Through scientific inquiry, humans uncover patterns, develop theories, and create technologies that improve daily life. Science encourages curiosity, critical thinking, and evidence-based decision-making. It evolves continuously as new discoveries challenge old ideas and expand our understanding. Whether exploring biological systems, studying energy, or investigating cosmic phenomena, science provides a reliable method for explaining how things work. Its progress shapes society and guides future innovation for the benefit of all.'

In [66]:
words = [token for token in re.split(r'([,.:;?_!"()\']|--|\s)', text) if token.strip() != '']
words

['Science',
 'is',
 'the',
 'systematic',
 'pursuit',
 'of',
 'knowledge',
 'through',
 'observation',
 ',',
 'experimentation',
 ',',
 'and',
 'logical',
 'reasoning',
 '.',
 'It',
 'helps',
 'us',
 'understand',
 'the',
 'natural',
 'world',
 ',',
 'from',
 'the',
 'smallest',
 'particles',
 'to',
 'the',
 'vastness',
 'of',
 'the',
 'universe',
 '.',
 'Through',
 'scientific',
 'inquiry',
 ',',
 'humans',
 'uncover',
 'patterns',
 ',',
 'develop',
 'theories',
 ',',
 'and',
 'create',
 'technologies',
 'that',
 'improve',
 'daily',
 'life',
 '.',
 'Science',
 'encourages',
 'curiosity',
 ',',
 'critical',
 'thinking',
 ',',
 'and',
 'evidence-based',
 'decision-making',
 '.',
 'It',
 'evolves',
 'continuously',
 'as',
 'new',
 'discoveries',
 'challenge',
 'old',
 'ideas',
 'and',
 'expand',
 'our',
 'understanding',
 '.',
 'Whether',
 'exploring',
 'biological',
 'systems',
 ',',
 'studying',
 'energy',
 ',',
 'or',
 'investigating',
 'cosmic',
 'phenomena',
 ',',
 'science',
 'pro

In [67]:
unique_words = sorted(list(set(words)))
vocab={
    '[CLS]':0,
    '[SEP]':1,
    '[MASK]':2,
    '[PAD]':3
}
for i, word in enumerate(unique_words):
  vocab[word]=i+4
vocab

{'[CLS]': 0,
 '[SEP]': 1,
 '[MASK]': 2,
 '[PAD]': 3,
 ',': 4,
 '.': 5,
 'It': 6,
 'Its': 7,
 'Science': 8,
 'Through': 9,
 'Whether': 10,
 'a': 11,
 'all': 12,
 'and': 13,
 'as': 14,
 'benefit': 15,
 'biological': 16,
 'challenge': 17,
 'continuously': 18,
 'cosmic': 19,
 'create': 20,
 'critical': 21,
 'curiosity': 22,
 'daily': 23,
 'decision-making': 24,
 'develop': 25,
 'discoveries': 26,
 'encourages': 27,
 'energy': 28,
 'evidence-based': 29,
 'evolves': 30,
 'expand': 31,
 'experimentation': 32,
 'explaining': 33,
 'exploring': 34,
 'for': 35,
 'from': 36,
 'future': 37,
 'guides': 38,
 'helps': 39,
 'how': 40,
 'humans': 41,
 'ideas': 42,
 'improve': 43,
 'innovation': 44,
 'inquiry': 45,
 'investigating': 46,
 'is': 47,
 'knowledge': 48,
 'life': 49,
 'logical': 50,
 'method': 51,
 'natural': 52,
 'new': 53,
 'observation': 54,
 'of': 55,
 'old': 56,
 'or': 57,
 'our': 58,
 'particles': 59,
 'patterns': 60,
 'phenomena': 61,
 'progress': 62,
 'provides': 63,
 'pursuit': 64,
 '

In [68]:
class Tokenizer:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {idx: word for word, idx in vocab.items()}

  def encode(self,text):
    return [self.str_to_int[token] for token in re.split(r'([,.:;?_!"()\']|--|\s)', text) if token.strip() != '']

  def decode(self,token_ids):
    return [self.int_to_str[id] for id in token_ids]

In [69]:
tokenizer=Tokenizer(vocab)

ids=tokenizer.encode('Science is the systematic')
ids

[8, 47, 77, 73]

In [70]:
print(tokenizer.decode(ids))

['Science', 'is', 'the', 'systematic']


In [None]:
class BertDataset:
    def __init__(self, sentence_1, sentence_2, max_len=20, max_pred=10):
        self.tokenizer = Tokenizer(vocab)

        CLS = vocab['[CLS]']
        SEP = vocab['[SEP]']
        MASK = vocab['[MASK]']

        tokens1 = self.tokenizer.encode(sentence_1)
        tokens2 = self.tokenizer.encode(sentence_2)

        # build input
        input_ids = [CLS] + tokens1 + [SEP] + tokens2 + [SEP]

        # segment ids
        segment_ids = (
            [0] * (1 + len(tokens1) + 1) +
            [1] * (len(tokens2) + 1)
        )

        # ----- MLM MASKING -----
        cand_pos = [i for i, tid in enumerate(input_ids) if tid not in [CLS, SEP]]
        random.shuffle(cand_pos)

        mask_len = min(max_pred, max(1, int(len(input_ids) * 0.15)))

        masked_tokens = []
        masked_position = []

        for i in range(mask_len):
            pos = cand_pos[i]
            masked_tokens.append(input_ids[pos])
            masked_position.append(pos)

            prob = random.random()

            if prob < 0.8:
                input_ids[pos] = MASK  # 80% mask token
            elif prob < 0.9:
                rand_id = random.randint(0, len(vocab) - 1)
                input_ids[pos] = rand_id  # 10% random
            else:
                pass  # 10% keep original

        # pad input to max_len
        padding = max_len - len(input_ids)
        input_ids += [0] * padding
        segment_ids += [0] * padding

        # pad masked labels
        pad_mlm = max_pred - len(masked_tokens)
        masked_tokens += [0] * pad_mlm
        masked_position += [0] * pad_mlm

        # save
        self.input_ids = input_ids
        self.segment_ids = segment_ids
        self.masked_tokens = masked_tokens
        self.masked_position = masked_position
        self.is_next = True  # or False for NSP


In [72]:
d=BertDataset('Science is the systematic','knowledge through observation')
print('Input IDs: ',d.input_ids)
print('Segment IDs: ',d.segment_ids)
print('Mask Token: ',d.masked_tokens)
print('Mask Token Position: ',d.masked_position)


Input IDs:  [0, 8, 47, 2, 73, 1, 48, 81, 54, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Segment IDs:  [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Mask Token:  [77, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Mask Token Position:  [3, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [73]:
class SelfAttention(nn.Module):
  def __init__(self,d_in,d_out):
    super().__init__()
    self.d_out=d_out
    self.d_in=d_in


    self.w_k=nn.Linear(d_in,d_out)
    self.w_q=nn.Linear(d_in,d_out)
    self.w_v=nn.Linear(d_in,d_out)

  def forward(self,x):
    keys=self.w_k(x)
    values=self.w_v(x)
    queries=self.w_q(x)

    attn_scores=queries @ keys.transpose(-1, -2)
    scaled_weight=torch.softmax(attn_scores/torch.sqrt(self.d_out),dim=-1)
    attn_matrix=scaled_weight @ values
    return attn_matrix




In [74]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads,d_in,d_out):
    super().__init__()
    self.heads=nn.Modulelist(
        [SelfAttention(d_in,d_out) for _ in range(num_heads)]
    )
  def forward(self,x):
    return torch.cat([head(x) for head in self.heads],dim=-1)


In [75]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2/torch.pi))*(x+0.044715*x**3)))


In [76]:
class FeedForward(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.layers=nn.Sequential(
        nn.Linear(emb_dim,4*emb_dim),
        GELU(),
        nn.Linear(4*emb_dim,emb_dim)
    )

  def forward(self,x):
    return self.layers(x)


In [77]:
class LayerNormalization(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))
    self.eps=1e-5

  def forward(self,x):
    mean=torch.mean(x,dim=-1,keepdim=True)
    variance=torch.var(x,dim=-1,keepdim=True, unbiased=False)
    norm=(x-mean)/torch.sqrt(variance+self.eps)
    return self.scale*norm+self.shift

In [78]:
class BERTEmbedding(nn.Module):
  def __init__(self,vocab_size, emb_dim):
    super().__init__()
    self.token_embedding=nn.Embedding(vocab_size,emb_dim,segment_token_type=2,max_token=512)
    self.segmentation_embedding=nn.Embedding(segment_token_type,emb_dim)
    self.position_embedding=nn.Embedding(max_token,emb_dim)


