In [6]:
import torch
import torch.nn as nn
from argparse import Namespace


In [7]:
# Notebook-wide configuration, collected in a single Namespace.
args = Namespace(
    # BERT model settings
    VOCAB_SIZE=30_000,
    N_SEGMENTS=3,
    MAX_LEN=512,
    EMBED_DIM=768,
    N_LAYERS=12,
    ATTN_HEADS=12,
    DROPOUT=0.1,
    # Data and file-path settings
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='data/yelp/reviews_with_splits_lite.csv',
    save_dir='model_storage/ch3/yelp/',
    vectorizer_file='vectorizer.json',
    # Training-loop settings
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=1e-3,
    num_epochs=100,
    seed=1337,
    # (runtime options are not listed here)
)

### Embeddings

In [14]:
class BERTEmbedding(nn.Module):
    """Sum of token, segment and learned position embeddings, followed by dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        n_segments: number of rows in the segment embedding table.
        max_len: number of rows in the position embedding table, i.e. the
            longest sequence this module can embed.
        embed_dim: width of every embedding vector.
        dropout: probability handed to ``nn.Dropout``, applied to the summed
            embedding.
    """

    def __init__(self,
                 vocab_size,
                 n_segments,
                 max_len,
                 embed_dim,
                 dropout):
        super().__init__()
        self.tok_embed = nn.Embedding(vocab_size, embed_dim)
        self.seg_embed = nn.Embedding(n_segments, embed_dim)
        self.pos_embed = nn.Embedding(max_len, embed_dim)

        self.drop = nn.Dropout(dropout)
        # Position indices 0..max_len-1. Registered as a buffer so that
        # module.to(device) / .cuda() moves it together with the embedding
        # tables; persistent=False keeps it out of state_dict so the set of
        # checkpoint keys is unchanged.
        self.register_buffer("pos_inp", torch.arange(max_len), persistent=False)

    def forward(self, seq, seg):
        """Embed a sequence of token ids.

        Args:
            seq: integer tensor of token ids, shape ``(seq_len,)`` or
                ``(batch, seq_len)`` with ``seq_len <= max_len``.
            seg: integer tensor of segment ids, same shape as ``seq``.

        Returns:
            Tensor of shape ``seq.shape + (embed_dim,)``.
        """
        # Only look up as many positions as the input actually has, so inputs
        # shorter than max_len work; the (seq_len, embed_dim) result
        # broadcasts over an optional leading batch dimension.
        positions = self.pos_inp[: seq.size(-1)]
        embed_val = self.tok_embed(seq) + self.seg_embed(seg) + self.pos_embed(positions)
        return self.drop(embed_val)
    
class BERT(nn.Module):
    """BERT-style encoder: ``BERTEmbedding`` followed by a stack of
    ``nn.TransformerEncoderLayer`` blocks.

    Args:
        vocab_size: passed to ``BERTEmbedding``.
        n_segments: passed to ``BERTEmbedding``.
        max_len: passed to ``BERTEmbedding``.
        embed_dim: model width; the feed-forward width is ``4 * embed_dim``.
        n_layers: number of encoder layers in the stack.
        attn_heads: number of attention heads per layer.
        dropout: dropout probability used by the embedding and by every
            encoder layer.
    """

    def __init__(self,
                 vocab_size,
                 n_segments,
                 max_len,
                 embed_dim,
                 n_layers,
                 attn_heads,
                 dropout):
        super().__init__()
        self.embedding = BERTEmbedding(vocab_size, n_segments, max_len, embed_dim, dropout)
        # dropout is forwarded explicitly; otherwise the encoder silently uses
        # the library default instead of the configured value.
        # batch_first=True matches the embedding output layout
        # (batch, seq_len, embed_dim) for batched input, so attention runs
        # over sequence positions rather than across the batch. Unbatched
        # (seq_len, embed_dim) input is unaffected by this flag.
        # NOTE(review): nn.TransformerEncoder clones the layer it is given, so
        # this template's own parameters are presumably never used in
        # forward(); it stays an attribute so existing attribute access and
        # state_dict keys are unchanged.
        self.enc_layer = nn.TransformerEncoderLayer(
            embed_dim,
            attn_heads,
            embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.enc_block = nn.TransformerEncoder(self.enc_layer, n_layers)

    def forward(self, seq, seg):
        """Encode token ids.

        Args:
            seq: integer tensor of token ids, ``(seq_len,)`` or
                ``(batch, seq_len)``.
            seg: integer tensor of segment ids, same shape as ``seq``.

        Returns:
            Tensor of shape ``seq.shape + (embed_dim,)``.
        """
        embed_val = self.embedding(seq, seg)
        return self.enc_block(embed_val)
        

In [15]:
# One full-length sequence of random token ids and random segment ids.
sample_seq = torch.randint(0, args.VOCAB_SIZE, (args.MAX_LEN,))
sample_seg = torch.randint(0, args.N_SEGMENTS, (args.MAX_LEN,))

# Smoke-test the embedding layer on its own.
embedding = BERTEmbedding(
    vocab_size=args.VOCAB_SIZE,
    n_segments=args.N_SEGMENTS,
    max_len=args.MAX_LEN,
    embed_dim=args.EMBED_DIM,
    dropout=args.DROPOUT,
)
embedding_tensor = embedding(sample_seq, sample_seg)
print(embedding_tensor.shape)  # expected [max_len, embed_dim] = [512, 768]

# Smoke-test the full encoder stack on the same inputs.
bert = BERT(
    vocab_size=args.VOCAB_SIZE,
    n_segments=args.N_SEGMENTS,
    max_len=args.MAX_LEN,
    embed_dim=args.EMBED_DIM,
    n_layers=args.N_LAYERS,
    attn_heads=args.ATTN_HEADS,
    dropout=args.DROPOUT,
)
out = bert(sample_seq, sample_seg)
print(out.shape)  # expected [max_len, embed_dim] = [512, 768]

torch.Size([512, 768])




torch.Size([512, 768])


# BERT MODEL

In [None]:
import torch
import torch.nn as nn
from embed import BERTEmbedding, PositionalEmbeddings
from encoder import MultiHeadedAttention, FeedForward, EncoderLayer

class BERT(nn.Module):
    """BERT encoder built from the project's ``BERTEmbedding`` and a stack of
    ``EncoderLayer`` modules.

    Args:
        vocab_size: passed to ``BERTEmbedding``.
        d_in: model (hidden) width.
        n_layers: number of ``EncoderLayer`` modules in the stack.
        n_heads: number of attention heads handed to each ``EncoderLayer``.
        dropout: dropout value handed to each ``EncoderLayer``.
    """

    def __init__(self,
                vocab_size,
                d_in=768,
                n_layers=12,
                n_heads=12,
                dropout=0.1):
        super().__init__()

        self.d_in = d_in
        self.n_layers = n_layers
        self.heads = n_heads

        # paper has 4*hidden_size for ff_hidden_size
        self.feed_forward_hidden = 4 * d_in

        # NOTE(review): the signature of embed.BERTEmbedding is not visible
        # here - confirm it accepts (vocab_size, d_in).
        self.embedding = BERTEmbedding(vocab_size, d_in)

        # Stack of n_layers encoder layers, applied in order by forward().
        self.encoder_block = nn.ModuleList(
            [EncoderLayer(d_in, n_heads, self.feed_forward_hidden, dropout)
             for _ in range(n_layers)]
        )

    def forward(self, x, segment_info):
        """Run the embedding and every encoder layer.

        Args:
            x: 2-D tensor of token ids, indexed as ``(batch, seq_len)`` by the
                mask construction below.
            segment_info: passed to the embedding together with ``x``.

        Returns:
            Output of the last encoder layer.
        """
        # Boolean mask of shape (batch, 1, seq_len, seq_len): entry
        # [b, 0, i, j] is True when token j of sequence b has id > 0.
        # Presumably id 0 is the padding token - verify against the tokenizer.
        mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)

        x = self.embedding(x, segment_info)

        for layer in self.encoder_block:
            x = layer(x, mask)
        return x
    
