# Tradutor Português/Inglês utilizando Transformers

## Setup

In [41]:
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
from keras import layers
import tensorflow as tf

# ===== OTIMIZAÇÃO: Mixed Precision para acelerar treinamento =====
# Reduz uso de memória e acelera cálculos em GPUs modernas
from keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
print(f"✅ Mixed Precision ativado: {policy.name}")
print(f"   Compute dtype: {policy.compute_dtype}")
print(f"   Variable dtype: {policy.variable_dtype}")


✅ Mixed Precision ativado: mixed_float16
   Compute dtype: float16
   Variable dtype: float32


## Carregamento dos Dados

In [42]:
# Vamos usar listas normais do Python, que são perfeitas para isso.
portuguese_sentences = []
english_sentences = []

# O 'with open(...)' garante que o arquivo seja fechado corretamente no final.
with open('por.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Ignora linhas em branco que possam existir no arquivo
        if not line.strip():
            continue

        # 1. strip() remove espaços/quebras de linha no início/fim
        # 2. split('\t') quebra a linha no caractere de tabulação
        # O novo formato tem 3 colunas: inglês, português, metadados
        parts = line.strip().split('\t')
        
        # Verificar se a linha tem pelo menos 2 colunas (inglês e português)
        if len(parts) < 2:
            continue
        
        english_part = parts[0]  # Primeira coluna: inglês
        portuguese_part = parts[1]  # Segunda coluna: português
        # parts[2] contém os metadados, que vamos ignorar

        english_sentences.append(english_part)
        portuguese_sentences.append(portuguese_part)

# Vamos conferir quantas frases temos e as 3 primeiras de cada lista
print(f"Total de pares de frases carregados: {len(english_sentences)}")
print("-" * 60)
print("Primeiras 3 frases em inglês:", english_sentences[:3])
print("Primeiras 3 frases em português:", portuguese_sentences[:3])
print("-" * 60)
print("Últimas 3 frases em inglês:", english_sentences[-3:])
print("Últimas 3 frases em português:", portuguese_sentences[-3:])


Total de pares de frases carregados: 168903
------------------------------------------------------------
Primeiras 3 frases em inglês: ['Go.', 'Go.', 'Hi.']
Primeiras 3 frases em português: ['Vai.', 'Vá.', 'Oi.']
------------------------------------------------------------
Últimas 3 frases em inglês: ["No matter how much you try to convince people that chocolate is vanilla, it'll still be chocolate, even though you may manage to convince yourself and a few others that it's vanilla.", 'Some movies make such an impact that one never forgets them. Such is the case with "Life is Beautiful," the emotional Benigni film that mixes drama and comedy in an exceptional manner.', 'A child who is a native speaker usually knows many things about his or her language that a non-native speaker who has been studying for years still does not know and perhaps will never know.']
Últimas 3 frases em português: ['Não importa o quanto você tenta convencer os outros de que chocolate é baunilha, ele ainda será 

## Pré-Processamento: Limpeza, Padronização e Transformação

In [43]:
# Let's keep our original lists for comparison
# portuguese_sentences = ['Vá.', 'Oi.', 'Corra!', ...]
# english_sentences = ['Go.', 'Hi.', 'Run!', ...]

def standardize_text(text):
    """
    Cleans and standardizes a single sentence.
    1. Converts to lowercase.
    2. Removes punctuation.
    3. Removes extra whitespace.
    """
    # Lowercase the text
    text = text.lower()
    
    # Create a translation table to remove punctuation
    # string.punctuation contains characters like '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# 1. Standardize the source (Portuguese) and target (English) phrases
source_phrases_clean = [standardize_text(pt) for pt in portuguese_sentences]
target_phrases_clean = [standardize_text(en) for en in english_sentences]

# 2. NOW, add the special tokens to the CLEANED target phrases
target_phrases_final = [f'[start] {eng} [end]' for eng in target_phrases_clean]

# Let's see the transformation for one example
print(f"Original English phrase: '{english_sentences[2]}'")
print(f"Final target phrase:   '{target_phrases_final[2]}'")
print("-" * 20)
print("Final source phrases (first 3):", source_phrases_clean[:3])
print("Final target phrases (first 3):", target_phrases_final[:3])


Original English phrase: 'Hi.'
Final target phrase:   '[start] hi [end]'
--------------------
Final source phrases (first 3): ['vai', 'vá', 'oi']
Final target phrases (first 3): ['[start] go [end]', '[start] go [end]', '[start] hi [end]']


## Vetorização

In [44]:
# OTIMIZAÇÃO: Calcular vocab_size e sequence_length dinamicamente do dataset
# Isso garante que o modelo se adapte aos dados reais

# Primeiro, vamos analisar os dados para determinar o tamanho máximo de sequência
import numpy as np

# Calcular o comprimento de cada frase (em palavras)
source_lengths = [len(phrase.split()) for phrase in source_phrases_clean]
target_lengths = [len(phrase.split()) for phrase in target_phrases_final]

# Estatísticas dos comprimentos
print("=" * 60)
print("📊 ANÁLISE DO DATASET:")
print("=" * 60)
print(f"Português - Comprimento médio: {np.mean(source_lengths):.1f} palavras")
print(f"Português - Comprimento máximo: {np.max(source_lengths)} palavras")
print(f"Português - Percentil 95: {np.percentile(source_lengths, 95):.0f} palavras")
print("-" * 60)
print(f"Inglês - Comprimento médio: {np.mean(target_lengths):.1f} palavras")
print(f"Inglês - Comprimento máximo: {np.max(target_lengths)} palavras")
print(f"Inglês - Percentil 95: {np.percentile(target_lengths, 95):.0f} palavras")
print("=" * 60)

# Definir sequence_length como o percentil 95 (cobre 95% dos dados)
# Adicionar margem de segurança (+2 para os tokens [start] e [end])
sequence_length = int(max(
    np.percentile(source_lengths, 95),
    np.percentile(target_lengths, 95)
)) + 2

# Criar as camadas de vetorização SEM limite de vocab (None = ilimitado)
source_vectorization = layers.TextVectorization(
    max_tokens=None,  # MUDANÇA: Sem limite de vocabulário
    output_sequence_length=sequence_length,
    standardize=None
)

target_vectorization = layers.TextVectorization(
    max_tokens=None,  # MUDANÇA: Sem limite de vocabulário
    output_sequence_length=sequence_length,
    standardize=None
)

# Treinar as camadas nos datasets
source_vectorization.adapt(source_phrases_clean)
target_vectorization.adapt(target_phrases_final)

# CALCULAR vocab_size dinamicamente após o adapt()
vocab_size_source = len(source_vectorization.get_vocabulary())
vocab_size_target = len(target_vectorization.get_vocabulary())

print("\n🎯 CONFIGURAÇÕES CALCULADAS:")
print("=" * 60)
print(f"✅ Sequence Length: {sequence_length}")
print(f"✅ Vocabulário Português: {vocab_size_source} tokens")
print(f"✅ Vocabulário Inglês: {vocab_size_target} tokens")
print("=" * 60)

# Testar em um exemplo
test_phrase_pt = source_phrases_clean[10]
vectorized_pt = source_vectorization([test_phrase_pt])

test_phrase_en = target_phrases_final[10]
vectorized_en = target_vectorization([test_phrase_en])

print(f"\n📝 TESTE DE VETORIZAÇÃO:")
print(f"Original Portuguese: '{test_phrase_pt}'")
print(f"Vectorized: {vectorized_pt.numpy()}")
print("-" * 60)
print(f"Original English: '{test_phrase_en}'")
print(f"Vectorized: {vectorized_en.numpy()}")

# Verificar tokens especiais
pt_vocab = source_vectorization.get_vocabulary()
en_vocab = target_vectorization.get_vocabulary()
print(f"\n🔍 Primeiras 10 palavras do vocabulário:")
print(f"Português: {pt_vocab[:10]}")
print(f"Inglês: {en_vocab[:10]}")
print(f"\n✅ '[start]' no vocabulário inglês: {'[start]' in en_vocab}")
print(f"✅ '[end]' no vocabulário inglês: {'[end]' in en_vocab}")


📊 ANÁLISE DO DATASET:
Português - Comprimento médio: 6.0 palavras
Português - Comprimento máximo: 33 palavras
Português - Percentil 95: 10 palavras
------------------------------------------------------------
Inglês - Comprimento médio: 8.0 palavras
Inglês - Comprimento máximo: 37 palavras
Inglês - Percentil 95: 12 palavras

🎯 CONFIGURAÇÕES CALCULADAS:
✅ Sequence Length: 14
✅ Vocabulário Português: 22460 tokens
✅ Vocabulário Inglês: 12908 tokens

📝 TESTE DE VETORIZAÇÃO:
Original Portuguese: 'que'
Vectorized: [[3 0 0 0 0 0 0 0 0 0 0 0 0 0]]
------------------------------------------------------------
Original English: '[start] who [end]'
Vectorized: [[ 2 68  3  0  0  0  0  0  0  0  0  0  0  0]]

🔍 Primeiras 10 palavras do vocabulário:
Português: ['', '[UNK]', np.str_('tom'), np.str_('que'), np.str_('o'), np.str_('não'), np.str_('eu'), np.str_('de'), np.str_('a'), np.str_('você')]
Inglês: ['', '[UNK]', np.str_('[start]'), np.str_('[end]'), np.str_('tom'), np.str_('i'), np.str_('to'), np.

## Positional Encoding

In [45]:
class PositionalEncoding(layers.Layer):
    """
    This layer injects positional information into the input embeddings.
    It's a fixed, non-learnable layer.
    """
    def __init__(self, max_length, d_model):
        super().__init__()
        self.d_model = d_model # Dimension of the embedding vector
        self.max_length = max_length # Maximum possible length of a sequence

        # Create a positional encoding matrix of shape (max_length, d_model)
        # This matrix is pre-calculated and will not change during training.
        
        # Create a tensor representing positions (0, 1, ..., max_length-1)
        # Shape: (max_length, 1)
        positions = tf.range(start=0, limit=max_length, delta=1, dtype=tf.float32)
        positions = tf.expand_dims(positions, axis=1)

        # Calculate the denominator term in the formula.
        # 2i/d_model --> [0, 2, 4, ..., d_model-2] / d_model
        div_term = tf.exp(tf.range(0, d_model, 2, dtype=tf.float32) * -(np.log(10000.0) / d_model))

        # Calculate the angles for the sine and cosine functions
        # Broadcasting (positions * div_term) results in a shape of (max_length, d_model/2)
        angles = positions * div_term

        # Calculate sine for even indices and cosine for odd indices
        sin_values = tf.sin(angles)
        cos_values = tf.cos(angles)
        
        # Interleave the sine and cosine values.
        # For example, if sin=[s1,s2] and cos=[c1,c2], the result is [s1,c1,s2,c2]
        # This creates the final positional encoding matrix.
        # Shape: (max_length, d_model)
        pe = tf.stack([sin_values, cos_values], axis=2)  # Shape: (max_length, d_model/2, 2)
        pe = tf.reshape(pe, [max_length, d_model])       # Shape: (max_length, d_model)
        
        # Store as TensorFlow constant for better integration
        self.positional_encoding = tf.constant(pe, dtype=tf.float32)

    def call(self, x):
        """
        The forward pass of the layer.
        Args:
            x: Input embeddings. Shape: (batch_size, sequence_length, d_model)
        Returns:
            Embeddings with added positional information.
        """
        # Get the length of the input sequence as a Python int
        seq_length = x.shape[1] if x.shape[1] is not None else tf.shape(x)[1]
        
        # Add the positional encoding to the input embeddings.
        # We only use the part of the PE matrix that corresponds to the sequence length.
        # Use tf.slice for dynamic slicing which works with both static and dynamic shapes
        pos_encoding = tf.slice(self.positional_encoding, [0, 0], [seq_length, self.d_model])
        
        return x + tf.cast(pos_encoding, dtype=x.dtype)

In [46]:
# --- Configuration ---
# Usar os valores calculados dinamicamente do dataset
d_model = 128  # Dimensão dos embeddings
max_length = sequence_length  # Usar o sequence_length calculado

print(f"📊 Configuração do teste:")
print(f"   d_model: {d_model}")
print(f"   max_length: {max_length}")
print(f"   vocab_size (português): {vocab_size_source}")

# --- Create dummy input data ---
# CORREÇÃO: O tamanho do dummy_input deve ser <= max_length
# Usando max_length-2 para garantir que cabe dentro do limite
test_seq_len = min(10, max_length - 1)  # Usa 10 ou max_length-1, o que for menor
dummy_input = tf.random.uniform((2, test_seq_len), maxval=vocab_size_source, dtype=tf.int64)

print(f"   test_seq_len: {test_seq_len}")

# --- Build and run the layers ---
embedding_layer = layers.Embedding(input_dim=vocab_size_source, output_dim=d_model)
positional_encoding_layer = PositionalEncoding(max_length=max_length, d_model=d_model)

# 1. Pass input through the embedding layer
word_embeddings = embedding_layer(dummy_input)

# 2. Add positional information
final_embeddings = positional_encoding_layer(word_embeddings)

print(f"\n✅ Shape of word embeddings: {word_embeddings.shape}")
print(f"✅ Shape of final embeddings (with positional info): {final_embeddings.shape}")


📊 Configuração do teste:
   d_model: 128
   max_length: 14
   vocab_size (português): 22460
   test_seq_len: 10

✅ Shape of word embeddings: (2, 10, 128)
✅ Shape of final embeddings (with positional info): (2, 10, 128)


## Funções de Máscara (Otimização)

In [47]:
# Substitua a célula de "Funções de Máscara" por esta versão corrigida:

def create_padding_mask(seq):
    """Cria uma máscara de adição para zerar a atenção nos tokens de padding."""
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # Adiciona dimensões extras para broadcasting com os scores de atenção
    return seq[:, tf.newaxis, tf.newaxis, :] * -1e9  # Shape: (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
    """Cria uma máscara de adição para zerar a atenção em tokens futuros."""
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask * -1e9  # Multiplica por um número muito negativo

def create_decoder_masks(tar, inp):
    """Cria todas as máscaras de adição necessárias para o decoder."""
    # Máscara look-ahead para a primeira sub-camada de atenção do decoder
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])

    # Máscara de padding para a primeira sub-camada de atenção do decoder
    dec_target_padding_mask = create_padding_mask(tar)

    # Combina as duas máscaras para a self-attention do decoder
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    # Máscara de padding para a segunda sub-camada (cross-attention), que olha para o encoder
    cross_attention_mask = create_padding_mask(inp)

    return combined_mask, cross_attention_mask


# --- Teste das máscaras ---
print("🧪 Testando funções de máscara CORRIGIDAS:")

# Teste 1: Padding mask
test_seq = tf.constant([[7, 6, 0, 0], [1, 2, 3, 0]])
padding_mask = create_padding_mask(test_seq)
print(f"\n1. Padding Mask:")
print(f"   Input shape: {test_seq.shape}")
print(f"   Mask shape: {padding_mask.shape} ← DEVE SER (2, 1, 1, 4)")
print(f"   Valores únicos: {tf.unique(tf.reshape(padding_mask, [-1]))[0].numpy()}")

# Teste 2: Look-ahead mask
look_mask = create_look_ahead_mask(5)
print(f"\n2. Look-Ahead Mask:")
print(f"   Shape: {look_mask.shape} ← DEVE SER (5, 5)")
print(f"   Valores:\n{look_mask.numpy()}")

# Teste 3: Máscaras combinadas do decoder
test_tar = tf.constant([[1, 2, 3, 0, 0]])
test_inp = tf.constant([[4, 5, 6, 7, 0]])
look_ahead_combined, padding_combined = create_decoder_masks(test_tar, test_inp)
print(f"\n3. Decoder Masks Combinadas:")
print(f"   Look-ahead mask shape: {look_ahead_combined.shape} ← DEVE SER (1, 1, 5, 5)")
print(f"   Cross-attention mask shape: {padding_combined.shape} ← DEVE SER (1, 1, 1, 5)")

print("\n✅ Máscaras corrigidas com sucesso!")

🧪 Testando funções de máscara CORRIGIDAS:

1. Padding Mask:
   Input shape: (2, 4)
   Mask shape: (2, 1, 1, 4) ← DEVE SER (2, 1, 1, 4)
   Valores únicos: [-0.e+00 -1.e+09]

2. Look-Ahead Mask:
   Shape: (5, 5) ← DEVE SER (5, 5)
   Valores:
[[-0.e+00 -1.e+09 -1.e+09 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -1.e+09 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -0.e+00 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -0.e+00 -0.e+00]]

3. Decoder Masks Combinadas:
   Look-ahead mask shape: (1, 1, 5, 5) ← DEVE SER (1, 1, 5, 5)
   Cross-attention mask shape: (1, 1, 1, 5) ← DEVE SER (1, 1, 1, 5)

✅ Máscaras corrigidas com sucesso!


## Self-Attention

In [48]:
# --- Configuration ---
d_model = 128
num_heads = 8

# The dimension of each head is d_model / num_heads
key_dim = d_model // num_heads

# --- Create dummy input data ---
# Batch of 2 sentences, length 15, embedding dim 128
dummy_input = tf.random.uniform((2, 15, d_model))

# --- Build and run the layer ---
mha_layer = layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=key_dim,
    output_shape=d_model # Ensures the output dimension is correct
)

# In self-attention, query, value, and key are the same.
output_keras = mha_layer(query=dummy_input, value=dummy_input, key=dummy_input)

print("Shape of the input:", dummy_input.shape)
print("Shape of the output (from Keras MHA):", output_keras.shape)

Shape of the input: (2, 15, 128)
Shape of the output (from Keras MHA): (2, 15, 128)


## Encoder Block

In [49]:
# --- Encoder Block Configuration ---
d_model = 128       # Dimension of the model (embedding size)
num_heads = 8       # Number of attention heads
dff = 128           # OTIMIZAÇÃO: Reduzido de 512 para 128 (igual ao exemplo Kaggle)
dropout_rate = 0.1  # Dropout rate for regularization

print(f"📊 Configuração do Encoder Block:")
print(f"   d_model: {d_model}")
print(f"   num_heads: {num_heads}")
print(f"   dff: {dff} (otimizado para velocidade)")
print(f"   dropout_rate: {dropout_rate}")

class EncoderBlock(layers.Layer):
    """
    Represents one block of the Transformer's Encoder.
    It consists of Multi-Head Attention and a Feed-Forward Network,
    with residual connections and layer normalization.
    """
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.supports_masking = True

        # Multi-Head Attention Layer
        self.mha = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            output_shape=d_model
        )

        # Feed-Forward Network (consists of two dense layers)
        self.ffn = keras.Sequential([
            layers.Dense(dff, activation='relu'), # (batch_size, seq_len, dff)
            layers.Dense(d_model)                 # (batch_size, seq_len, d_model)
        ])

        # Layer Normalization
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)

        # Dropout for regularization
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, x, training, padding_mask=None):
        # 1. Multi-Head Attention sub-layer
        # The input 'x' is used for query, key, and value in self-attention
        attn_output = self.mha(query=x, value=x, key=x, attention_mask=padding_mask)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection and Layer Normalization
        out1 = self.layernorm1(x + attn_output)

        # 2. Feed-Forward Network sub-layer
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # Residual connection and Layer Normalization
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

# --- Test the EncoderBlock ---
encoder_block = EncoderBlock(d_model=d_model, num_heads=num_heads, dff=dff)

# Create some dummy input (output from positional encoding)
dummy_input = tf.random.uniform((2, 15, d_model)) # (batch_size, sequence_length, d_model)

# The call method requires a 'training' flag for dropout
output_from_block = encoder_block(dummy_input, training=False)

print(f"\n✅ Shape of the input to the Encoder Block: {dummy_input.shape}")
print(f"✅ Shape of the output from the Encoder Block: {output_from_block.shape}")


📊 Configuração do Encoder Block:
   d_model: 128
   num_heads: 8
   dff: 128 (otimizado para velocidade)
   dropout_rate: 0.1

✅ Shape of the input to the Encoder Block: (2, 15, 128)
✅ Shape of the output from the Encoder Block: (2, 15, 128)


## Encoder

In [50]:
class Encoder(layers.Layer):
    """
    The complete Encoder, consisting of an embedding layer, positional encoding,
    and a stack of N EncoderBlocks.
    
    OTIMIZAÇÃO: Agora usa máscaras de padding corretamente!
    """
    def __init__(self, num_layers, d_model, num_heads, dff,
                 input_vocab_size, max_length, rate=0.1):
        super().__init__()

        self.supports_masking = True

        self.d_model = d_model
        self.num_layers = num_layers

        # Input Embedding layer
        self.embedding = layers.Embedding(input_vocab_size, d_model)
        
        # Positional Encoding layer
        self.pos_encoding = PositionalEncoding(max_length, self.d_model)

        # Stack of EncoderBlocks
        # We use a list comprehension to create N encoder blocks
        self.enc_layers = [EncoderBlock(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        # Dropout layer for regularization
        self.dropout = layers.Dropout(rate)

    def call(self, x, training=False, padding_mask=None):
        seq_len = tf.shape(x)[1]

        # 1. Get embeddings
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        
        # A scaling factor is applied to the embeddings as in the original paper
        # CORREÇÃO: Converter para o mesmo dtype dos embeddings (compatível com mixed precision)
        x *= tf.cast(tf.math.sqrt(tf.cast(self.d_model, tf.float32)), x.dtype)

        # 2. Add positional encoding
        x = self.pos_encoding(x)

        # 3. Apply dropout
        x = self.dropout(x, training=training)

        # 4. Pass through the stack of encoder blocks
        # OTIMIZAÇÃO: Agora passa a máscara para cada bloco
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, padding_mask=padding_mask)

        return x  # (batch_size, input_seq_len, d_model)

# --- Test the complete Encoder ---

# OTIMIZAÇÃO: Usar valores calculados dinamicamente do dataset
num_layers_encoder = 2  # Stack 2 blocks for this test
d_model = 128
num_heads = 8
dff = 128  # OTIMIZADO: Reduzido para 128
input_vocab_size = vocab_size_source  # CALCULADO DINAMICAMENTE
max_sentence_length = sequence_length  # CALCULADO DINAMICAMENTE

print(f"📊 Configuração do Encoder completo:")
print(f"   num_layers: {num_layers_encoder}")
print(f"   d_model: {d_model}")
print(f"   num_heads: {num_heads}")
print(f"   dff: {dff}")
print(f"   input_vocab_size: {input_vocab_size}")
print(f"   max_sentence_length: {max_sentence_length}")

# Create an instance of the Encoder
encoder = Encoder(num_layers=num_layers_encoder,
                  d_model=d_model,
                  num_heads=num_heads,
                  dff=dff,
                  input_vocab_size=input_vocab_size,
                  max_length=max_sentence_length)

# Create some dummy input (vectorized portuguese sentences)
dummy_pt_vector = source_vectorization(["eu amo pizza", "onde fica o banheiro"])

# Get the output from the encoder
encoder_output = encoder(dummy_pt_vector, training=False)

print(f"\n✅ Shape of the input to the Encoder (vectorized text): {dummy_pt_vector.shape}")
print(f"✅ Shape of the output from the Encoder (context vectors): {encoder_output.shape}")


📊 Configuração do Encoder completo:
   num_layers: 2
   d_model: 128
   num_heads: 8
   dff: 128
   input_vocab_size: 22460
   max_sentence_length: 14

✅ Shape of the input to the Encoder (vectorized text): (2, 14)
✅ Shape of the output from the Encoder (context vectors): (2, 14, 128)


## Decoder Block

In [51]:
# This code builds on the previous cells in your notebook.
# Make sure you have run the cells defining d_model, num_heads, dff etc.

class DecoderBlock(layers.Layer):
    """
    Represents one block of the Transformer's Decoder.
    It consists of Masked Multi-Head Self-Attention, Cross-Attention,
    and a Feed-Forward Network.
    """
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.supports_masking = True

        # 1. Masked Multi-Head Self-Attention (looks at the target sequence)
        self.mha1 = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            output_shape=d_model
        )

        # 2. Multi-Head Cross-Attention (looks at the encoder output)
        self.mha2 = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            output_shape=d_model
        )

        # 3. Feed-Forward Network
        self.ffn = keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model)
        ])

        # Layer Normalization layers
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

        # Dropout layers
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        self.dropout3 = layers.Dropout(rate)

    def call(self, x, encoder_output, training, look_ahead_mask, cross_attention_mask):
        # Sub-layer 1: Masked Multi-Head Self-Attention
        # The decoder attends to itself, but can't see future tokens.
        attn1 = self.mha1(query=x, value=x, key=x, attention_mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)

        # Sub-layer 2: Cross-Attention
        # The decoder attends to the encoder's output.
        # Query comes from the decoder, Key and Value from the encoder.
        # Usa a máscara de padding vinda do encoder.
        attn2 = self.mha2(query=out1, value=encoder_output, key=encoder_output, attention_mask=cross_attention_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)

        # Sub-layer 3: Feed-Forward Network
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(out2 + ffn_output)

        return out3

# --- Test the DecoderBlock ---
# OTIMIZAÇÃO: Usar valores otimizados
d_model = 128
num_heads = 8
dff = 128  # OTIMIZADO: Reduzido para 128

print(f"📊 Configuração do Decoder Block:")
print(f"   d_model: {d_model}")
print(f"   num_heads: {num_heads}")
print(f"   dff: {dff} (otimizado)")

decoder_block = DecoderBlock(d_model=d_model, num_heads=num_heads, dff=dff)

# Create dummy inputs
# Note: The sequence lengths for the decoder input and encoder output can be different.
dummy_decoder_input = tf.random.uniform((2, 15, d_model)) # (batch_size, target_seq_len, d_model)
dummy_encoder_output = tf.random.uniform((2, 20, d_model)) # (batch_size, input_seq_len, d_model)

# For testing shapes, we can pass masks as None
output_from_block = decoder_block(
    dummy_decoder_input,
    dummy_encoder_output,
    training=False,
    look_ahead_mask=None,
    cross_attention_mask=None
)

print(f"\n✅ Shape of the decoder input: {dummy_decoder_input.shape}")
print(f"✅ Shape of the encoder output: {dummy_encoder_output.shape}")
print(f"✅ Shape of the output from the Decoder Block: {output_from_block.shape}")


📊 Configuração do Decoder Block:
   d_model: 128
   num_heads: 8
   dff: 128 (otimizado)

✅ Shape of the decoder input: (2, 15, 128)
✅ Shape of the encoder output: (2, 20, 128)
✅ Shape of the output from the Decoder Block: (2, 15, 128)


## Decoder

In [52]:
class Decoder(layers.Layer):
    """
    The complete Decoder, consisting of an embedding layer, positional encoding,
    and a stack of N DecoderBlocks.
    """
    def __init__(self, num_layers, d_model, num_heads, dff,
                 target_vocab_size, max_length, rate=0.1):
        super().__init__()

        self.supports_masking = True

        self.d_model = d_model
        self.num_layers = num_layers

        # Input Embedding layer for the target language
        self.embedding = layers.Embedding(target_vocab_size, d_model)

        # Positional Encoding layer
        self.pos_encoding = PositionalEncoding(max_length, d_model)

        # Stack of DecoderBlocks
        self.dec_layers = [DecoderBlock(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        # Dropout layer
        self.dropout = layers.Dropout(rate)

    def call(self, x, encoder_output, training=False, look_ahead_mask=None, cross_attention_mask=None):
        seq_len = tf.shape(x)[1]

        # 1. Get embeddings for the target sequence
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        # CORREÇÃO: Converter para o mesmo dtype dos embeddings (compatível com mixed precision)
        x *= tf.cast(tf.math.sqrt(tf.cast(self.d_model, tf.float32)), x.dtype)

        # 2. Add positional encoding
        x = self.pos_encoding(x)

        # 3. Apply dropout
        x = self.dropout(x, training=training)

        # 4. Pass through the stack of decoder blocks
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, encoder_output, training=training,
                                     look_ahead_mask=look_ahead_mask,
                                     cross_attention_mask=cross_attention_mask)

        # The shape of x is (batch_size, target_seq_len, d_model)
        return x

# --- Test the complete Decoder ---

# OTIMIZAÇÃO: Usar valores calculados dinamicamente
num_layers_decoder = 2  # Stack 2 blocks
d_model = 128
num_heads = 8
dff = 128  # OTIMIZADO: Reduzido para 128
target_vocab_size = vocab_size_target  # CALCULADO DINAMICAMENTE
max_sentence_length = sequence_length  # CALCULADO DINAMICAMENTE

print(f"📊 Configuração do Decoder completo:")
print(f"   num_layers: {num_layers_decoder}")
print(f"   d_model: {d_model}")
print(f"   num_heads: {num_heads}")
print(f"   dff: {dff}")
print(f"   target_vocab_size: {target_vocab_size}")
print(f"   max_sentence_length: {max_sentence_length}")

# Create an instance of the Decoder
decoder = Decoder(num_layers=num_layers_decoder,
                  d_model=d_model,
                  num_heads=num_heads,
                  dff=dff,
                  target_vocab_size=target_vocab_size,
                  max_length=max_sentence_length)

# Create dummy inputs
# We need both a dummy decoder input and the dummy encoder output from the previous test
dummy_en_vector = target_vectorization(["[start] i love pizza", "[start] where is the bathroom"])
# We can reuse the encoder_output from the Encoder test cell
# encoder_output has shape (2, sequence_length, 128)

# Get the output from the decoder
decoder_output = decoder(
    x=dummy_en_vector,
    encoder_output=encoder_output, # This comes from the Encoder
    training=False,
    look_ahead_mask=None # Passing None for shape testing
)

print(f"\n✅ Shape of the decoder input (vectorized English text): {dummy_en_vector.shape}")
print(f"✅ Shape of the encoder output (context vectors): {encoder_output.shape}")
print(f"✅ Shape of the final decoder output: {decoder_output.shape}")


📊 Configuração do Decoder completo:
   num_layers: 2
   d_model: 128
   num_heads: 8
   dff: 128
   target_vocab_size: 12908
   max_sentence_length: 14

✅ Shape of the decoder input (vectorized English text): (2, 14)
✅ Shape of the encoder output (context vectors): (2, 14, 128)
✅ Shape of the final decoder output: (2, 14, 128)


## Modelo Transformer

In [53]:
class Transformer(keras.Model):
    """
    The complete Transformer model, connecting the Encoder and Decoder.
    
    OTIMIZAÇÃO: Agora cria e usa máscaras de padding corretamente!
    Isso melhora a acurácia e acelera a convergência.
    """
    def __init__(self, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, max_length, rate=0.1):
        super().__init__()
        # Instantiate the Encoder
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                               input_vocab_size, max_length, rate)

        # Instantiate the Decoder
        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, max_length, rate)

        # The final linear layer to project decoder output to vocabulary size
        # OTIMIZAÇÃO: dtype='float32' para compatibilidade com mixed precision
        self.final_layer = layers.Dense(target_vocab_size, dtype='float32')

    def call(self, inputs, training=False):
        # The 'inputs' will be a tuple: (source_sequence, target_sequence)
        inp, tar = inputs

        # Criar máscaras de padding
        enc_padding_mask = create_padding_mask(inp)
        
        # Criar as duas máscaras para o decoder
        look_ahead_mask, cross_attention_mask = create_decoder_masks(tar, inp)

        # 1. Pass the source sequence through the encoder
        encoder_output = self.encoder(inp, training=training, padding_mask=enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # 2. Pass the encoder output and the target sequence through the decoder
        decoder_output = self.decoder(
            tar, encoder_output, training=training, 
            look_ahead_mask=look_ahead_mask,
            cross_attention_mask=cross_attention_mask
        ) # (batch_size, tar_seq_len, d_model)

        # 3. Pass the decoder output through the final linear layer
        final_output = self.final_layer(decoder_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output

# --- Test the complete Transformer model ---

# OTIMIZAÇÃO: Usar configurações calculadas dinamicamente do dataset
print("=" * 70)
print("🚀 CRIANDO MODELO TRANSFORMER COM CONFIGURAÇÕES OTIMIZADAS")
print("=" * 70)

PE_MAX_LENGTH = 200

config_summary = {
    "num_layers": num_layers_encoder,
    "d_model": d_model,
    "num_heads": num_heads,
    "dff": dff,
    "input_vocab_size": vocab_size_source,
    "target_vocab_size": vocab_size_target,
    "max_length": PE_MAX_LENGTH
}

for key, value in config_summary.items():
    print(f"   {key:20s}: {value}")

print("=" * 70)

transformer = Transformer(
    num_layers=num_layers_encoder,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size_source,
    target_vocab_size=vocab_size_target,
    max_length=PE_MAX_LENGTH
)

# Reuse the dummy vectors from previous tests
# The call method expects a tuple of (input, target)
output = transformer((dummy_pt_vector, dummy_en_vector), training=False)

print(f"\n✅ Shape of Portuguese input: {dummy_pt_vector.shape}")
print(f"✅ Shape of English input: {dummy_en_vector.shape}")
print(f"✅ Shape of the final Transformer output (logits): {output.shape}")
print(f"\n🎉 Modelo Transformer criado com sucesso!")


🚀 CRIANDO MODELO TRANSFORMER COM CONFIGURAÇÕES OTIMIZADAS
   num_layers          : 2
   d_model             : 128
   num_heads           : 8
   dff                 : 128
   input_vocab_size    : 22460
   target_vocab_size   : 12908
   max_length          : 200

✅ Shape of Portuguese input: (2, 14)
✅ Shape of English input: (2, 14)
✅ Shape of the final Transformer output (logits): (2, 14, 12908)

🎉 Modelo Transformer criado com sucesso!


## Função de Perda

In [54]:
# We need to define our loss function and metrics
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none' # 'none' means we get a loss for each example
)

def masked_loss_function(real, pred):
    # Create a mask to identify where the real target is not a padding token (0)
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    
    # Calculate the loss for every token in the batch
    loss_ = loss_object(real, pred)
    
    # Cast the mask to the same type as the loss
    mask = tf.cast(mask, dtype=loss_.dtype)
    
    # Apply the mask to the loss, effectively zeroing out the loss for padding tokens
    loss_ *= mask
    
    # Return the average loss over the non-padded tokens
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

## Learning Rate Schedule

In [55]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
    def get_config(self):
        """Retorna a configuração para serialização."""
        return {
            "d_model": int(self.d_model.numpy()),
            "warmup_steps": self.warmup_steps
        }

# Instantiate the learning rate schedule and the Adam optimizer
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


## Treinamento

In [None]:
# --- 1. Prepare the Data for Training and Validation ---

from sklearn.model_selection import train_test_split

# Ensure eager execution is enabled
#tf.config.run_functions_eagerly(True)

# OTIMIZAÇÃO: Aumentar batch size para acelerar treinamento
# Batch maior = menos steps por época = treinamento mais rápido
# Valor anterior: 64 | Novo valor: 256 (4x mais rápido)
batch_size = 512  # Ajuste para 128 se tiver problemas de memória

# Vectorize all the sentences
source_vectors = source_vectorization(source_phrases_clean)
target_vectors = target_vectorization(target_phrases_final)

# Convert to numpy arrays for sklearn compatibility
source_vectors_np = source_vectors.numpy()
target_vectors_np = target_vectors.numpy()

# Split the data into training and validation sets (80/20 split)
source_train, source_val, target_train, target_val = train_test_split(
    source_vectors_np, target_vectors_np, test_size=0.2, random_state=42
)

print(f"Total examples: {len(source_vectors_np)}")
print(f"Training examples: {len(source_train)}")
print(f"Validation examples: {len(source_val)}")

# Helper function to create datasets
def create_dataset(source, target):
    """Helper function to create a tf.data.Dataset with shifted targets."""
    # The input to the decoder is the target sequence without the last token
    decoder_inputs = target[:, :-1]
    # The target for the loss function is the target sequence without the first token
    decoder_outputs = target[:, 1:]
    
    # Create the tf.data.Dataset object
    dataset = tf.data.Dataset.from_tensor_slices(
        ((source, decoder_inputs), decoder_outputs)
    )
    # Shuffle, batch, and prefetch
    return dataset.shuffle(buffer_size=len(source)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Create the training and validation datasets
train_dataset = create_dataset(source_train, target_train)
val_dataset = create_dataset(source_val, target_val)

# --- 2. Compile the Model ---

# We need a metric to monitor performance, let's create a masked accuracy
def masked_accuracy(real, pred):
    # Get the predicted word by finding the index with the highest logit
    pred = tf.argmax(pred, axis=2)
    real = tf.cast(real, pred.dtype)
    
    # Check for matches
    match = tf.cast(real == pred, dtype=tf.float32)
    
    # Create the padding mask
    mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=tf.float32)
    
    # Apply the mask to the matches
    match *= mask
    
    # Calculate the accuracy on non-padded tokens
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

# Compile the model with the optimizer, loss, and metric
transformer.compile(
    optimizer=optimizer,
    loss=masked_loss_function,
    metrics=[masked_accuracy]
    #run_eagerly=True  # Run in eager mode to avoid graph mode issues
)

# --- 3. Configure Early Stopping ---

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Early Stopping: para o treinamento se val_loss não melhorar por 'patience' épocas
early_stopping = EarlyStopping(
    monitor='val_loss',           # Monitora a perda de validação
    patience=2,                   # Espera 2 épocas sem melhora antes de parar
    restore_best_weights=True,    # Restaura os pesos da melhor época
    verbose=1,                    # Mostra mensagens quando parar
    mode='min'                    # 'min' porque queremos minimizar a loss
)

# OTIMIZAÇÃO: Model Checkpoint - salva o melhor modelo automaticamente
model_checkpoint = ModelCheckpoint(
    filepath='best_transformer_model.keras',  # Nome do arquivo
    monitor='val_loss',                       # Métrica a monitorar
    save_best_only=True,                      # Salva apenas quando melhorar
    save_weights_only=False,                  # Salva modelo completo
    mode='min',                               # Minimizar a loss
    verbose=1                                 # Mostrar quando salvar
)

print("✅ Callbacks configurados:")
print(f"   - Early Stopping (patience={early_stopping.patience})")
print("   - Model Checkpoint (salva em 'best_transformer_model.keras')")

# --- 4. Train the Model with Validation and Early Stopping ---

print("Starting training with validation and Early Stopping...")
print(f"Training will stop automatically if validation loss doesn't improve for {early_stopping.patience} epochs.")
print("=" * 80)
print("🚀 OTIMIZAÇÕES ATIVAS:")
print("   ✅ Mixed Precision (float16) - Acelera cálculos")
print("   ✅ Máscaras de Padding - Melhora acurácia e convergência")
print(f"   ✅ Batch Size: {batch_size} - Acelera treinamento")
print("   ✅ Model Checkpoint - Salva melhor modelo automaticamente")
print("=" * 80)

# Definimos mais épocas, mas o early stopping pode parar antes
epochs = 5 # 100

history = transformer.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset,
    callbacks=[early_stopping, model_checkpoint]  # OTIMIZAÇÃO: Adiciona checkpoint
)

print("\n" + "=" * 60)
print("Training finished!")
print(f"Total epochs trained: {len(history.history['loss'])}")
print("=" * 60)


## Carregando o Melhor Modelo (Opcional)

In [None]:
# Se você quiser carregar o melhor modelo salvo pelo checkpoint:
# (Útil se você reiniciar o kernel ou quiser usar o modelo em outra sessão)

# Para carregar o modelo completo:
# transformer = keras.models.load_model('best_transformer_model.keras')

# OU para apenas verificar se o arquivo existe:
# import os
# if os.path.exists('best_transformer_model.keras'):
#     print("✅ Melhor modelo salvo em: best_transformer_model.keras")
#     print(f"   Tamanho do arquivo: {os.path.getsize('best_transformer_model.keras') / 1024 / 1024:.2f} MB")
# else:
#     print("⚠️ Modelo ainda não foi salvo (treine primeiro)")


## Visualização das Curvas de Aprendizagem

In [None]:
# Get the training history
train_acc = history.history['masked_accuracy']
val_acc = history.history['val_masked_accuracy']
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Create the epochs range
epochs_range = range(len(train_acc))

# Plot the training and validation curves
plt.figure(figsize=(12, 5))

# --- Plot Accuracy ---
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.grid(True)

# --- Plot Loss ---
plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.grid(True)

plt.tight_layout()
plt.show()

print(f"\nFinal Training Loss: {train_loss[-1]:.4f}")
print(f"Final Validation Loss: {val_loss[-1]:.4f}")
print(f"Final Training Accuracy: {train_acc[-1]:.4f}")
print(f"Final Validation Accuracy: {val_acc[-1]:.4f}")


## Inferência

In [None]:
# Get the target vocabulary
target_vocab = target_vectorization.get_vocabulary()

# Create a dictionary to map the target language index back to a word
index_to_word = {i: word for i, word in enumerate(target_vocab)}

# Define the maximum length for a generated translation
max_output_length = 20

# Find the indices for start and end tokens (with brackets preserved)
start_token_index = target_vocab.index('[start]')
end_token_index = target_vocab.index('[end]')

print(f"Token '[start]' tem índice: {start_token_index}")
print(f"Token '[end]' tem índice: {end_token_index}")

def translate(sentence, debug=False):
    """
    Translates a Portuguese sentence to English using the trained Transformer model.
    """
    # 1. Preprocess the input sentence
    cleaned_sentence = standardize_text(sentence)
    
    # 2. Vectorize the sentence and add a batch dimension
    input_vector = source_vectorization([cleaned_sentence]) # Shape: (1, sequence_length)
    
    if debug:
        print(f"  Input vectorizado: {input_vector.numpy()}")
    
    # 3. Initialize the decoder's input with the '[start]' token
    # The decoder input starts as a tensor with the start token index
    decoder_input = tf.constant([[start_token_index]], dtype=tf.int64)

    generated_tokens = []
    for i in range(max_output_length):
        # 4. Get the model's predictions
        # The model is called in a non-training mode
        predictions = transformer((input_vector, decoder_input), training=False)
        
        # 5. Get the logits for the very last predicted token
        # predictions shape: (batch_size, seq_len, vocab_size) -> (1, i+1, vocab_size)
        # We want the predictions for the last token in the sequence
        last_token_logits = predictions[:, -1, :] # Shape: (1, vocab_size)
        
        # 6. Find the token with the highest probability (greedy search)
        predicted_id = tf.argmax(last_token_logits, axis=-1) # Shape: (1,)
        predicted_id_value = int(predicted_id.numpy()[0])
        
        if debug:
            print(f"  Step {i}: Predicted token ID = {predicted_id_value}, word = '{index_to_word.get(predicted_id_value, '???')}'")
        
        generated_tokens.append(predicted_id_value)
        
        # 7. Add the predicted token to the decoder input for the next iteration
        decoder_input = tf.concat([decoder_input, [predicted_id]], axis=1)
        
        # 8. Check if the predicted token is the '[end]' token
        if predicted_id_value == end_token_index:
            if debug:
                print(f"  Token '[end]' found, stopping generation.")
            break
            
    # 9. Convert the sequence of token IDs back to words
    # We ignore the [start] and [end] tokens
    output_tokens = [token for token in generated_tokens if token != end_token_index]
    translated_text = " ".join(index_to_word[token] for token in output_tokens)
    
    return translated_text

# --- Let's try it out! ---
# Use some sentences from our dataset to see if it learned
print("\n" + "="*40)
print("Original PT: 'por que eu'")
print("Translation EN:", translate("por que eu", debug=True))
print("-" * 40)
print("Original PT: 'eu venci'")
print("Translation EN:", translate("eu venci"))
print("-" * 20)
print("Original PT: 'socorro'")
print("Translation EN:", translate("socorro"))
print("-" * 20)

# Try a new sentence it has never seen (but with words from the vocabulary)
print("Original PT (New): 'eu te entendo'")
print("Translation EN (New):", translate("eu te entendo"))


In [None]:
# ============================================================================
# TESTE: Verificar se todas as dependências estão disponíveis
# ============================================================================

print("🧪 TESTE DE DEPENDÊNCIAS PARA INFERÊNCIA")
print("=" * 70)

# 1. Verificar modelo transformer
try:
    assert transformer is not None
    print("✅ Modelo 'transformer' está disponível")
    print(f"   - Encoder layers: {transformer.encoder.num_layers}")
    print(f"   - Decoder layers: {transformer.decoder.num_layers}")
except:
    print("❌ Modelo 'transformer' NÃO está disponível")

# 2. Verificar vetorização
try:
    assert source_vectorization is not None
    assert target_vectorization is not None
    print("✅ Camadas de vetorização disponíveis")
    print(f"   - Source vocab size: {len(source_vectorization.get_vocabulary())}")
    print(f"   - Target vocab size: {len(target_vectorization.get_vocabulary())}")
except:
    print("❌ Camadas de vetorização NÃO estão disponíveis")

# 3. Verificar funções de máscara
try:
    test_mask = create_padding_mask(tf.constant([[1, 2, 0, 0]]))
    print("✅ Função 'create_padding_mask' funciona")
    
    test_look_ahead = create_look_ahead_mask(5)
    print("✅ Função 'create_look_ahead_mask' funciona")
    
    test_la, test_pad = create_decoder_masks(tf.constant([[1, 2]]), tf.constant([[3, 4]]))
    print("✅ Função 'create_decoder_masks' funciona")
except Exception as e:
    print(f"❌ Funções de máscara com erro: {e}")

# 4. Verificar variáveis de inferência
try:
    assert start_token_index is not None
    assert end_token_index is not None
    assert index_to_word is not None
    assert max_output_length is not None
    print("✅ Variáveis de inferência disponíveis:")
    print(f"   - start_token_index: {start_token_index}")
    print(f"   - end_token_index: {end_token_index}")
    print(f"   - max_output_length: {max_output_length}")
    print(f"   - index_to_word entries: {len(index_to_word)}")
except Exception as e:
    print(f"❌ Variáveis de inferência com erro: {e}")

# 5. Verificar função standardize_text
try:
    test_clean = standardize_text("Olá, mundo!")
    assert test_clean == "olá mundo"
    print("✅ Função 'standardize_text' funciona")
    print(f"   Exemplo: 'Olá, mundo!' → '{test_clean}'")
except Exception as e:
    print(f"❌ Função 'standardize_text' com erro: {e}")

# 6. Teste completo de pipeline de inferência (sem tradução completa)
try:
    print("\n🔬 Teste de pipeline de inferência:")
    
    # a) Preprocessar entrada
    test_sentence = "eu amo python"
    cleaned = standardize_text(test_sentence)
    print(f"   1. Preprocessamento: '{test_sentence}' → '{cleaned}'")
    
    # b) Vetorizar
    input_vec = source_vectorization([cleaned])
    print(f"   2. Vetorização: shape {input_vec.shape}, primeiros tokens: {input_vec.numpy()[0][:5]}")
    
    # c) Criar máscaras
    enc_mask = create_padding_mask(input_vec)
    print(f"   3. Máscara encoder: shape {enc_mask.shape}")
    
    # d) Passar pelo encoder
    encoder_out = transformer.encoder(input_vec, training=False, mask=enc_mask)
    print(f"   4. Encoder output: shape {encoder_out.shape}")
    
    # e) Preparar decoder input com token [start]
    dec_input = tf.constant([[start_token_index]], dtype=tf.int64)
    print(f"   5. Decoder input inicial: {dec_input.numpy()}, palavra: '{index_to_word[start_token_index]}'")
    
    # f) Criar máscaras do decoder
    look_ahead, dec_pad = create_decoder_masks(dec_input, input_vec)
    print(f"   6. Máscaras decoder: look_ahead {look_ahead.shape}, padding {dec_pad.shape}")
    
    # g) Passar pelo decoder
    decoder_out = transformer.decoder(dec_input, encoder_out, training=False,
                                      look_ahead_mask=look_ahead, padding_mask=dec_pad)
    print(f"   7. Decoder output: shape {decoder_out.shape}")
    
    # h) Camada final
    predictions = transformer.final_layer(decoder_out)
    print(f"   8. Predictions: shape {predictions.shape}")
    
    # i) Pegar token previsto
    predicted_id = tf.argmax(predictions[0, -1, :]).numpy()
    predicted_word = index_to_word.get(predicted_id, '???')
    print(f"   9. Token previsto: ID={predicted_id}, palavra='{predicted_word}'")
    
    print("\n✅ Pipeline de inferência completo funciona!")
    
except Exception as e:
    print(f"\n❌ Pipeline de inferência com erro: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "=" * 70)
print("📊 RESUMO: Teste de dependências concluído")
print("=" * 70)


### Diagnóstico do Modelo (Verificação de Colapso)

In [None]:
# ============================================================================
# DIAGNÓSTICO: Vamos verificar se o modelo está realmente treinado
# ============================================================================

print("🔍 DIAGNÓSTICO DO MODELO")
print("=" * 70)

# 1. Verificar se o modelo tem pesos carregados
print("\n1. Verificando pesos do modelo:")
encoder_weights = transformer.encoder.get_weights()
decoder_weights = transformer.decoder.get_weights()
print(f"   ✅ Encoder tem {len(encoder_weights)} tensores de pesos")
print(f"   ✅ Decoder tem {len(decoder_weights)} tensores de pesos")

# 2. Verificar distribuição de probabilidades do modelo
print("\n2. Verificando distribuição de probabilidades:")
test_input = source_vectorization(["eu"])
test_decoder = tf.constant([[start_token_index]], dtype=tf.int64)

# Processar com o modelo
enc_mask = create_padding_mask(test_input)
encoder_out = transformer.encoder(test_input, training=False, mask=enc_mask)
look_ahead, dec_mask = create_decoder_masks(test_decoder, test_input)
decoder_out = transformer.decoder(test_decoder, encoder_out, training=False, 
                                   look_ahead_mask=look_ahead, padding_mask=dec_mask)
predictions = transformer.final_layer(decoder_out)

# Verificar as top-5 predições
probs = tf.nn.softmax(predictions[0, 0, :])
top_5_indices = tf.argsort(probs, direction='DESCENDING')[:5].numpy()
top_5_probs = tf.gather(probs, top_5_indices).numpy()

print(f"   Top 5 palavras previstas para 'eu':")
for idx, (token_id, prob) in enumerate(zip(top_5_indices, top_5_probs)):
    word = index_to_word.get(token_id, '???')
    print(f"      {idx+1}. '{word}' (ID={token_id}): {prob:.4f}")

# 3. Verificar se o modelo está colapsado (sempre prevê o mesmo)
print("\n3. Testando variação nas predições:")
test_sentences = ["eu", "você", "ele", "nós"]
predictions_per_word = []

for sent in test_sentences:
    test_in = source_vectorization([sent])
    dec_in = tf.constant([[start_token_index]], dtype=tf.int64)
    
    enc_mask = create_padding_mask(test_in)
    enc_out = transformer.encoder(test_in, training=False, mask=enc_mask)
    la_mask, dec_mask = create_decoder_masks(dec_in, test_in)
    dec_out = transformer.decoder(dec_in, enc_out, training=False, 
                                   look_ahead_mask=la_mask, padding_mask=dec_mask)
    pred = transformer.final_layer(dec_out)
    
    predicted_token = tf.argmax(pred[0, 0, :]).numpy()
    predicted_word = index_to_word.get(predicted_token, '???')
    predictions_per_word.append((sent, predicted_word, predicted_token))
    print(f"   '{sent}' → '{predicted_word}' (ID={predicted_token})")

# 4. Diagnóstico final
print("\n4. Diagnóstico:")
unique_predictions = len(set([p[2] for p in predictions_per_word]))
if unique_predictions == 1:
    print("   ⚠️  PROBLEMA: Modelo colapsado! Sempre prevê o mesmo token.")
    print("   🔧 SOLUÇÃO:")
    print("      1. Treinar por mais épocas (pelo menos 20-30)")
    print("      2. Verificar se houve overfitting no 'if' durante treinamento")
    print("      3. Considerar reduzir o vocabulário (max_tokens=5000)")
    print("      4. Aumentar o modelo (d_model=256, dff=512)")
else:
    print(f"   ✅ Modelo gera {unique_predictions} tokens diferentes")
    print("   ℹ️  Mas ainda pode precisar de mais treinamento")

print("\n" + "=" * 70)


In [None]:
import sys
import tensorflow as tf

# Mostra o caminho para o executável do Python que o notebook está usando
print("Python Executable:", sys.executable)

# Mostra a versão do TensorFlow que foi importada
print("TensorFlow Version:", tf.__version__)