From 701683de151ef3bf6d7235b51663375a30857b7d Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Thu, 1 Jun 2023 23:58:15 +0300
Subject: [PATCH 1/3] Update transformer_tutorial.py

Add description for positional encoding calculation for Transformers
---
 beginner_source/transformer_tutorial.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index cce52eefdb3..6710fb72be0 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -103,7 +103,23 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-#
+# The div_term in the code is calculated as
+# torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)).
+# This calculation is based on the original Transformer paper’s formulation
+# for positional encoding. The purpose of this calculation is to create
+# a range of values that decrease exponentially.
+# This allows the model to learn to attend to positions based on their relative distances.
+# The math.log(10000.0) term in the exponent represents the maximum effective
+# input length (in this case, 10000). Dividing this term by d_model scales
+# the values to be within a reasonable range for the exponential function.
+# The negative sign in front of the logarithm ensures that the values decrease exponentially.
+# The reason for writing math.log(10000.0) instead of 4 in the code is to make it clear
+# that this value represents the logarithm of the maximum effective input length
+# (in this case, 10000). This makes the code more readable and easier to understand.
+# Using math.log(10000.0) instead of 4 also makes it easier to change the maximum effective
+# input length if needed. If you want to use a different value for the maximum effective
+# input length, you can simply change the argument of the math.log
+# function instead of recalculating the logarithm manually.
 
 class PositionalEncoding(nn.Module):
 

From 51e989f2a3e477109bb1be2cccf73ca978fb90b0 Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Fri, 2 Jun 2023 00:13:06 +0300
Subject: [PATCH 2/3] Update Positional Encoding description in transformer_tutorial.py

---
 beginner_source/transformer_tutorial.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index 6710fb72be0..b98a658a215 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -103,22 +103,22 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-# The div_term in the code is calculated as
-# torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)).
+# The ``div_term`` in the code is calculated as
+# ``torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))``.
 # This calculation is based on the original Transformer paper’s formulation
 # for positional encoding. The purpose of this calculation is to create
 # a range of values that decrease exponentially.
 # This allows the model to learn to attend to positions based on their relative distances.
-# The math.log(10000.0) term in the exponent represents the maximum effective
-# input length (in this case, 10000). Dividing this term by d_model scales
+# The ``math.log(10000.0)`` term in the exponent represents the maximum effective
+# input length (in this case, ``10000``). Dividing this term by ``d_model`` scales
 # the values to be within a reasonable range for the exponential function.
 # The negative sign in front of the logarithm ensures that the values decrease exponentially.
-# The reason for writing math.log(10000.0) instead of 4 in the code is to make it clear
+# The reason for writing ``math.log(10000.0)`` instead of ``4`` in the code is to make it clear
 # that this value represents the logarithm of the maximum effective input length
-# (in this case, 10000). This makes the code more readable and easier to understand.
-# Using math.log(10000.0) instead of 4 also makes it easier to change the maximum effective
+# (in this case, ``10000``). This makes the code more readable and easier to understand.
+# Using ``math.log(10000.0)`` instead of ``4`` also makes it easier to change the maximum effective
 # input length if needed. If you want to use a different value for the maximum effective
-# input length, you can simply change the argument of the math.log
+# input length, you can simply change the argument of the ``math.log``
 # function instead of recalculating the logarithm manually.
 
 class PositionalEncoding(nn.Module):

From f453a2364ac077b4ead62fae8ccb50b85491168d Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Fri, 2 Jun 2023 15:17:33 +0300
Subject: [PATCH 3/3] Update transformer_tutorial.py

---
 beginner_source/transformer_tutorial.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index b98a658a215..5ed9a0d1390 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -112,14 +112,7 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # The ``math.log(10000.0)`` term in the exponent represents the maximum effective
 # input length (in this case, ``10000``). Dividing this term by ``d_model`` scales
 # the values to be within a reasonable range for the exponential function.
-# The negative sign in front of the logarithm ensures that the values decrease exponentially.
-# The reason for writing ``math.log(10000.0)`` instead of ``4`` in the code is to make it clear
-# that this value represents the logarithm of the maximum effective input length
-# (in this case, ``10000``). This makes the code more readable and easier to understand.
-# Using ``math.log(10000.0)`` instead of ``4`` also makes it easier to change the maximum effective
-# input length if needed. If you want to use a different value for the maximum effective
-# input length, you can simply change the argument of the ``math.log``
-# function instead of recalculating the logarithm manually.
+#
 
 class PositionalEncoding(nn.Module):
 
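
For reviewers, a minimal standalone sketch (not part of the patch above) of what the ``div_term`` expression described in the added comment actually produces; ``d_model = 8`` and ``max_len = 5`` are arbitrary values chosen only for illustration:

import math
import torch

d_model, max_len = 8, 5

position = torch.arange(max_len).unsqueeze(1)  # shape (max_len, 1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
# div_term[i] == 10000 ** (-2 * i / d_model), i.e. the frequencies fall off
# geometrically from 1 toward 1/10000, matching the paper's 1 / 10000**(2i / d_model).

pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)  # even embedding indices
pe[:, 1::2] = torch.cos(position * div_term)  # odd embedding indices

print(div_term)  # approximately [1.0, 0.1, 0.01, 0.001] for d_model = 8
print(pe.shape)  # torch.Size([5, 8])

The sketch follows the same computation as the tutorial's ``PositionalEncoding`` module but drops the module and buffer boilerplate, so the identity exp(-2i * log(10000) / d_model) == 1 / 10000**(2i / d_model) can be checked directly by printing ``div_term``.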