# **Step by step implementation of Transformer model from the scratch using Numpy**

### **Import libraries**

libraries for data handling, text preprocessing, and embedding creation

In [None]:
import numpy as np
import pandas as pd
import re
import math
from nltk.corpus import stopwords

# Ensure you have the stopwords corpus
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Data loading and Pre-processing**

loading the data from a CSV file and preparing it for training by creating source and target columns

In [None]:
file_path = r'/content/sample_data/en-fr.csv'
df = pd.read_csv(file_path)
df['source'] = df['English words/sentences']
df['target'] = df['French words/sentences'].apply(lambda x: '[start] ' + x + ' [end]')
df = df.drop(['English words/sentences', 'French words/sentences'], axis=1)

print(df.head(5))


  source                    target
0    Hi.      [start] Salut! [end]
1   Run!     [start] Cours ! [end]
2   Run!    [start] Courez ! [end]
3   Who?       [start] Qui ? [end]
4   Wow!  [start] Ça alors ! [end]


shuffle the data and split it into training, validation, and test sets

In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.2)
test_size = int(len(df) * 0.1)

print(f"Train size: {train_size}, Val size: {val_size}, Test size: {test_size}")  # Check split sizes

train_df = df[:train_size]
val_df = df[train_size:train_size + val_size]
test_df = df[train_size + val_size:]

print(f"Train set size: {len(train_df)}, Validation set size: {len(val_df)}, Test set size: {len(test_df)}")  # Verify dataset splits

Train size: 122934, Val size: 35124, Test size: 17562
Train set size: 122934, Validation set size: 35124, Test set size: 17563


Normalize the text by converting to lowercase, removing punctuation, and removing stopwords and very short tokens

In [None]:
# Preprocess sentences to normalize text
stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove punctuation (except for tokens like [start] and [end])
    sentence = re.sub(r'[^a-z0-9\s\[\]]', '', sentence)
    # Remove stop words and very short tokens
    sentence = ' '.join([word for word in sentence.split() if word not in stop_words and len(word) > 2])
    return sentence

In [None]:
# Apply preprocessing
train_df['source'] = train_df['source'].apply(preprocess_sentence)
train_df['target'] = train_df['target'].apply(preprocess_sentence)
val_df['source'] = val_df['source'].apply(preprocess_sentence)
val_df['target'] = val_df['target'].apply(preprocess_sentence)
test_df['source'] = test_df['source'].apply(preprocess_sentence)
test_df['target'] = test_df['target'].apply(preprocess_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['source'] = train_df['source'].apply(preprocess_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['target'] = train_df['target'].apply(preprocess_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['source'] = val_df['source'].apply(preprocess_sentence)
A value 

In [None]:
print(train_df.head())  # Verify preprocessing

                  source                                             target
0                    let                [start] laissemoi faire place [end]
1             call party                      [start] jai annuler fte [end]
2                   want               [start] veux parcourir nouveau [end]
3         world come end      [start] quand monde connatratil une fin [end]
4  owe part success luck  [start] nous devons une part notre succs chanc...


### **Create Vocabulary**

create a simple vocabulary from the training data

In [None]:
def create_vocab(sentences):
    vocab = set()
    for sentence in sentences:
        vocab.update(sentence.split())
    vocab = {word: idx for idx, word in enumerate(vocab)}
    return vocab

# Create vocabulary from the training data
vocab = create_vocab(train_df['source'].tolist() + train_df['target'].tolist())
vocab_size = len(vocab)


# **Input Embedding**

Initialize the embedding matrix with random values and create functions to get word embeddings

In [None]:
# Hyperparameters
embedding_dim = 512

# Initialize the embedding matrix with random values
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Verify shape of embedding matrix

Embedding matrix shape: (35203, 512)


In [None]:
def get_embedding(word, vocab, embedding_matrix):
    idx = vocab.get(word, -1)
    if idx == -1:
        raise ValueError(f"Word '{word}' not in vocabulary.")
    return embedding_matrix[idx]


# **Positional Encoding**



In [None]:
# Function to create positional encoding
def get_positional_encoding(max_len, embedding_dim):
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, embedding_dim, 2) * -(math.log(10000.0) / embedding_dim))
    pos_encoding = np.zeros((max_len, embedding_dim))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)
    return pos_encoding

### **Combine Embeddings and Positional Encodings**

combine word embeddings and positional encodings for a given sentence

In [None]:
sentence = train_df['source'].iloc[0].split()
sentence_len = len(sentence)
print(f"Sentence: {sentence}, Length: {sentence_len}")  # Verify sentence and length

sentence_embeddings = np.array([get_embedding(word, vocab, embedding_matrix) for word in sentence])
print(f"Sentence Embeddings shape: {sentence_embeddings.shape}")  # Verify embeddings shape

positional_encodings = get_positional_encoding(sentence_len, embedding_dim)
print(f"Positional Encodings shape: {positional_encodings.shape}")  # Verify positional encodings shape

# Add input embeddings and positional encodings
input_embedding_with_position = sentence_embeddings + positional_encodings[:sentence_len, :]
print(f"Combined Embedding and Positional Encoding shape: {input_embedding_with_position.shape}")
print(f"Combined Embedding and Positional Encoding:\n{input_embedding_with_position}")

Sentence: ['let'], Length: 1
Sentence Embeddings shape: (1, 512)
Positional Encodings shape: (1, 512)
Combined Embedding and Positional Encoding shape: (1, 512)
Combined Embedding and Positional Encoding:
[[1.73731108e-01 1.90352661e+00 4.58789950e-02 1.41171438e+00
  2.35262215e-01 1.89901722e+00 9.75038506e-02 1.02556805e+00
  4.93127897e-01 1.34464184e+00 7.38890157e-01 1.38065074e+00
  9.29246763e-01 1.07834625e+00 5.59320069e-01 1.10910912e+00
  4.06384533e-01 1.79570402e+00 6.51046344e-01 1.59282110e+00
  1.21720618e-01 1.14286398e+00 7.67434200e-02 1.38213775e+00
  1.50402843e-01 1.34742302e+00 1.68151550e-01 1.06144327e+00
  2.21312866e-01 1.57477523e+00 6.93169151e-02 1.24039111e+00
  4.21081651e-01 1.23549873e+00 6.55963583e-01 1.45318278e+00
  8.92497455e-01 1.29214904e+00 3.82383566e-01 1.07382502e+00
  4.24003743e-01 1.67548654e+00 7.61881782e-01 1.58933606e+00
  9.83268641e-01 1.85585367e+00 4.34684264e-01 1.50688094e+00
  9.39106900e-01 1.59592973e+00 1.90595739e-01 1.65

# **Encoder**

Layer normalization layer

In [None]:
def layer_normalization(x, epsilon=1e-6):
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    normalized_x = (x - mean) / np.sqrt(variance + epsilon)
    return normalized_x

Multi-head Self Attention

In [None]:
def multi_head_attention(query, key, value, num_heads):
    d_k = query.shape[-1]  # Dimensionality of the key/query/value
    assert d_k % num_heads == 0

    # Split the embedding into multiple heads
    def split_heads(x):
        new_shape = x.shape[:-1] + (num_heads, d_k // num_heads)
        x = x.reshape(new_shape)
        return x

    query = split_heads(query)
    key = split_heads(key)
    value = split_heads(value)

    # Scaled Dot-Product Attention
    def scaled_dot_product_attention(q, k, v):
        matmul_qk = np.matmul(q, k.transpose((0, 1, 3, 2)))  # (batch_size, num_heads, seq_len, seq_len)
        dk = q.shape[-1]  # d_k
        scaled_attention_logits = matmul_qk / np.sqrt(dk)
        attention_weights = np.exp(scaled_attention_logits - np.max(scaled_attention_logits, axis=-1, keepdims=True))
        attention_weights /= np.sum(attention_weights, axis=-1, keepdims=True)
        output = np.matmul(attention_weights, v)
        return output

    attention_output = scaled_dot_product_attention(query, key, value)

    # Concatenate heads
    def concatenate_heads(x):
        new_shape = x.shape[:-2] + (d_k,)
        x = x.reshape(new_shape)
        return x

    attention_output = concatenate_heads(attention_output)
    return attention_output


Feed Forward Network

In [None]:
def feed_forward_network(x, d_ff, embedding_dim):
    W1 = np.random.randn(embedding_dim, d_ff)
    b1 = np.random.randn(d_ff)
    W2 = np.random.randn(d_ff, embedding_dim)
    b2 = np.random.randn(embedding_dim)

    def feed_forward(x):
        x = np.maximum(0, np.matmul(x, W1) + b1)  # ReLU activation
        x = np.matmul(x, W2) + b2
        return x

    return feed_forward(x)


Encoder block

In [None]:
def encoder_layer(x, num_heads, d_ff, embedding_dim, epsilon=1e-6):
    # Self-Attention
    attention_output = multi_head_attention(x, x, x, num_heads)
    attention_output += x  # Residual connection
    attention_output = layer_normalization(attention_output, epsilon)  # Add & Normalize

    # Feed-Forward Network
    ff_output = feed_forward_network(attention_output, d_ff, embedding_dim)
    ff_output += attention_output  # Residual connection
    ff_output = layer_normalization(ff_output, epsilon)  # Add & Normalize

    return ff_output


Encoder stack

In [None]:
def encoder_stack(x, num_layers, num_heads, d_ff, embedding_dim, epsilon=1e-6):
    for _ in range(num_layers):
        x = encoder_layer(x, num_heads, d_ff, embedding_dim, epsilon)
    return x


In [None]:
# test
num_layers = 6
num_heads = 8
d_ff = 2048
embedding_dim = 512

batch_size = 1
sentence_length = 10
sentence_embeddings = np.random.rand(batch_size, sentence_length, embedding_dim)

# encoder stack with 6 encoder blocks
encoded_output = encoder_stack(sentence_embeddings, num_layers, num_heads, d_ff, embedding_dim)
print(f"Encoded Output after 6 encoder layers:\n{encoded_output}")


Encoded Output after 6 encoder layers:
[[[ 1.78587034 -0.89151299 -0.8893528  ... -1.75079118  0.06390319
    1.07716843]
  [ 1.79452838 -1.38380661 -1.18681588 ... -1.05053406  1.52258043
    0.44015191]
  [ 2.00914486 -2.2124666  -0.39139131 ... -1.74896168  1.29809918
    0.31650239]
  ...
  [ 0.99193251 -1.32423736 -0.15863095 ... -1.99730583  0.74907146
    1.24960147]
  [ 0.92000762 -0.72016071 -0.85252363 ... -0.64731377  0.08072207
    0.3547932 ]
  [ 0.87252849 -1.5321061  -0.95942637 ... -1.83749966  0.78949547
    0.80534251]]]


# **Decoder**