From 701683de151ef3bf6d7235b51663375a30857b7d Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Thu, 1 Jun 2023 23:58:15 +0300
Subject: [PATCH 1/3] Update transformer_tutorial.py

Add description for positional encoding calculation for Transformers
---
 beginner_source/transformer_tutorial.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index cce52eefdb3..6710fb72be0 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -103,7 +103,23 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-#
+# The div_term in the code is calculated as
+# torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)).
+# This calculation is based on the original Transformer paper’s formulation
+# for positional encoding. The purpose of this calculation is to create
+# a range of values that decrease exponentially.
+# This allows the model to learn to attend to positions based on their relative distances.
+# The math.log(10000.0) term in the exponent represents the maximum effective
+# input length (in this case, 10000). Dividing this term by d_model scales
+# the values to be within a reasonable range for the exponential function.
+# The negative sign in front of the logarithm ensures that the values decrease exponentially.
+# The reason for writing math.log(10000.0) instead of 4 in the code is to make it clear
+# that this value represents the logarithm of the maximum effective input length
+# (in this case, 10000). This makes the code more readable and easier to understand.
+# Using math.log(10000.0) instead of 4 also makes it easier to change the maximum effective
+# input length if needed. If you want to use a different value for the maximum effective
+# input length, you can simply change the argument of the math.log
+# function instead of recalculating the logarithm manually.
 
 class PositionalEncoding(nn.Module):
 

From 51e989f2a3e477109bb1be2cccf73ca978fb90b0 Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Fri, 2 Jun 2023 00:13:06 +0300
Subject: [PATCH 2/3] Update Positional Encoding description in transformer_tutorial.py

---
 beginner_source/transformer_tutorial.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index 6710fb72be0..b98a658a215 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -103,22 +103,22 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # positional encodings have the same dimension as the embeddings so that
 # the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
 # different frequencies.
-# The div_term in the code is calculated as
-# torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)).
+# The ``div_term`` in the code is calculated as
+# ``torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))``.
 # This calculation is based on the original Transformer paper’s formulation
 # for positional encoding. The purpose of this calculation is to create
 # a range of values that decrease exponentially.
 # This allows the model to learn to attend to positions based on their relative distances.
-# The math.log(10000.0) term in the exponent represents the maximum effective
-# input length (in this case, 10000). Dividing this term by d_model scales
+# The ``math.log(10000.0)`` term in the exponent represents the maximum effective
+# input length (in this case, ``10000``). Dividing this term by ``d_model`` scales
 # the values to be within a reasonable range for the exponential function.
 # The negative sign in front of the logarithm ensures that the values decrease exponentially.
-# The reason for writing math.log(10000.0) instead of 4 in the code is to make it clear
+# The reason for writing ``math.log(10000.0)`` instead of ``4`` in the code is to make it clear
 # that this value represents the logarithm of the maximum effective input length
-# (in this case, 10000). This makes the code more readable and easier to understand.
-# Using math.log(10000.0) instead of 4 also makes it easier to change the maximum effective
+# (in this case, ``10000``). This makes the code more readable and easier to understand.
+# Using ``math.log(10000.0)`` instead of ``4`` also makes it easier to change the maximum effective
 # input length if needed. If you want to use a different value for the maximum effective
-# input length, you can simply change the argument of the math.log
+# input length, you can simply change the argument of the ``math.log``
 # function instead of recalculating the logarithm manually.
 
 class PositionalEncoding(nn.Module):

From f453a2364ac077b4ead62fae8ccb50b85491168d Mon Sep 17 00:00:00 2001
From: Youshaa Murhij
Date: Fri, 2 Jun 2023 15:17:33 +0300
Subject: [PATCH 3/3] Update transformer_tutorial.py

---
 beginner_source/transformer_tutorial.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index b98a658a215..5ed9a0d1390 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -112,14 +112,7 @@ def generate_square_subsequent_mask(sz: int) -> Tensor:
 # The ``math.log(10000.0)`` term in the exponent represents the maximum effective
 # input length (in this case, ``10000``). Dividing this term by ``d_model`` scales
 # the values to be within a reasonable range for the exponential function.
-# The negative sign in front of the logarithm ensures that the values decrease exponentially.
-# The reason for writing ``math.log(10000.0)`` instead of ``4`` in the code is to make it clear
-# that this value represents the logarithm of the maximum effective input length
-# (in this case, ``10000``). This makes the code more readable and easier to understand.
-# Using ``math.log(10000.0)`` instead of ``4`` also makes it easier to change the maximum effective
-# input length if needed. If you want to use a different value for the maximum effective
-# input length, you can simply change the argument of the ``math.log``
-# function instead of recalculating the logarithm manually.
+#
 
 class PositionalEncoding(nn.Module):
 
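
For reviewers, a minimal standalone sketch (not part of the patch above) of what the ``div_term`` expression described in the added comment actually produces; ``d_model = 8`` and ``max_len = 5`` are arbitrary values chosen only for illustration:

import math
import torch

d_model, max_len = 8, 5

position = torch.arange(max_len).unsqueeze(1)  # shape (max_len, 1)
div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
# div_term[i] == 10000 ** (-2 * i / d_model), i.e. the frequencies fall off
# geometrically from 1 toward 1/10000, matching the paper's 1 / 10000**(2i / d_model).

pe = torch.zeros(max_len, d_model)
pe[:, 0::2] = torch.sin(position * div_term)  # even embedding indices
pe[:, 1::2] = torch.cos(position * div_term)  # odd embedding indices

print(div_term)  # approximately [1.0, 0.1, 0.01, 0.001] for d_model = 8
print(pe.shape)  # torch.Size([5, 8])

The sketch follows the same computation as the tutorial's ``PositionalEncoding`` module but drops the module and buffer boilerplate, so the identity exp(-2i * log(10000) / d_model) == 1 / 10000**(2i / d_model) can be checked directly by printing ``div_term``.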