# **Step by step implementation of Transformer model from the scratch using Numpy**

### **Import libraries**

libraries for data handling, text preprocessing, and embedding creation

In [None]:
import numpy as np
import pandas as pd
import re
import math
from nltk.corpus import stopwords

# Ensure you have the stopwords corpus
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### **Data loading and Pre-processing**

loading the data from a CSV file and preparing it for training by creating source and target columns

In [None]:
file_path = r'/content/sample_data/en-fr.csv'
df = pd.read_csv(file_path)
df['source'] = df['English words/sentences']
df['target'] = df['French words/sentences'].apply(lambda x: '[start] ' + x + ' [end]')
df = df.drop(['English words/sentences', 'French words/sentences'], axis=1)

print(df.head(5))


  source                    target
0    Hi.      [start] Salut! [end]
1   Run!     [start] Cours ! [end]
2   Run!    [start] Courez ! [end]
3   Who?       [start] Qui ? [end]
4   Wow!  [start] Ça alors ! [end]


shuffle the data and split it into training, validation, and test sets

In [None]:
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.2)
test_size = int(len(df) * 0.1)

print(f"Train size: {train_size}, Val size: {val_size}, Test size: {test_size}")  # Check split sizes

train_df = df[:train_size]
val_df = df[train_size:train_size + val_size]
test_df = df[train_size + val_size:]

print(f"Train set size: {len(train_df)}, Validation set size: {len(val_df)}, Test set size: {len(test_df)}")  # Verify dataset splits

Train size: 122934, Val size: 35124, Test size: 17562
Train set size: 122934, Validation set size: 35124, Test set size: 17563


Normalize the text by converting to lowercase, removing punctuation, and removing stopwords and very short tokens

In [None]:
# Preprocess sentences to normalize text
stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove punctuation (except for tokens like [start] and [end])
    sentence = re.sub(r'[^a-z0-9\s\[\]]', '', sentence)
    # Remove stop words and very short tokens
    sentence = ' '.join([word for word in sentence.split() if word not in stop_words and len(word) > 2])
    return sentence

In [None]:
# Apply preprocessing
train_df['source'] = train_df['source'].apply(preprocess_sentence)
train_df['target'] = train_df['target'].apply(preprocess_sentence)
val_df['source'] = val_df['source'].apply(preprocess_sentence)
val_df['target'] = val_df['target'].apply(preprocess_sentence)
test_df['source'] = test_df['source'].apply(preprocess_sentence)
test_df['target'] = test_df['target'].apply(preprocess_sentence)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['source'] = train_df['source'].apply(preprocess_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['target'] = train_df['target'].apply(preprocess_sentence)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['source'] = val_df['source'].apply(preprocess_sentence)
A value 

In [None]:
print(train_df.head())  # Verify preprocessing

                       source  \
0  hes bit rough around edges   
1                    much get   
2   understand tom didnt want   
3          hes worried result   
4                  drank wine   

                                              target  
0                       [start] est peu revche [end]  
1         [start] combien devraientils obtenir [end]  
2  [start] comprends pourquoi tom voulait pas fai...  
3                     [start] proccupe rsultat [end]  
4                        [start] nous bmes vin [end]  


### **Create Vocabulary**

create a simple vocabulary from the training data

In [None]:
def create_vocab(sentences):
    vocab = set()
    for sentence in sentences:
        vocab.update(sentence.split())
    vocab = {word: idx for idx, word in enumerate(vocab)}
    return vocab

# Create vocabulary from the training data
vocab = create_vocab(train_df['source'].tolist() + train_df['target'].tolist())
vocab_size = len(vocab)


### **Input Embedding**

Initialize the embedding matrix with random values and create functions to get word embeddings

In [None]:
# Hyperparameters
embedding_dim = 512

# Initialize the embedding matrix with random values
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

print(f"Embedding matrix shape: {embedding_matrix.shape}")  # Verify shape of embedding matrix

Embedding matrix shape: (35196, 512)


In [None]:
def get_embedding(word, vocab, embedding_matrix):
    idx = vocab.get(word, -1)
    if idx == -1:
        raise ValueError(f"Word '{word}' not in vocabulary.")
    return embedding_matrix[idx]


### **Positional Encoding**



In [None]:
# Function to create positional encoding
def get_positional_encoding(max_len, embedding_dim):
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, embedding_dim, 2) * -(math.log(10000.0) / embedding_dim))
    pos_encoding = np.zeros((max_len, embedding_dim))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)
    return pos_encoding

### **Combine Embeddings and Positional Encodings**

combine word embeddings and positional encodings for a given sentence

In [None]:
sentence = train_df['source'].iloc[0].split()
sentence_len = len(sentence)
print(f"Sentence: {sentence}, Length: {sentence_len}")  # Verify sentence and length

sentence_embeddings = np.array([get_embedding(word, vocab, embedding_matrix) for word in sentence])
print(f"Sentence Embeddings shape: {sentence_embeddings.shape}")  # Verify embeddings shape

positional_encodings = get_positional_encoding(sentence_len, embedding_dim)
print(f"Positional Encodings shape: {positional_encodings.shape}")  # Verify positional encodings shape

# Add input embeddings and positional encodings
input_embedding_with_position = sentence_embeddings + positional_encodings[:sentence_len, :]
print(f"Combined Embedding and Positional Encoding shape: {input_embedding_with_position.shape}")
print(f"Combined Embedding and Positional Encoding:\n{input_embedding_with_position}")

Sentence: ['hes', 'bit', 'rough', 'around', 'edges'], Length: 5
Sentence Embeddings shape: (5, 512)
Positional Encodings shape: (5, 512)
Combined Embedding and Positional Encoding shape: (5, 512)
Combined Embedding and Positional Encoding:
[[ 0.78493689  1.23427811  0.13268065 ...  1.12921899  0.13940888
   1.51599795]
 [ 0.88012215  1.21395799  1.08978388 ...  1.66885422  0.30056578
   1.48030144]
 [ 1.89831024  0.24811682  1.03117638 ...  1.04754841  0.10934511
   1.05131897]
 [ 1.01997201 -0.13534729  0.52024244 ...  1.03105488  0.62201448
   1.72469254]
 [-0.61093644  0.2523216   0.22767455 ...  1.68926288  0.96791953
   1.40688246]]
