In [1]:
import os
import re
import numpy as np
import json
import logging
from typing import List, Tuple
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

In [2]:
class BeamSearchTranslator:
    def __init__(self, 
                 model_path: str, 
                 english_tokenizer_path: str, 
                 french_tokenizer_path: str, 
                 max_length_path: str, 
                 beam_width: int = 3):
        """
        Initialize the beam search translator with robust file loading.
        """
        # Set up logging
        logging.basicConfig(level=logging.DEBUG)
        self.logger = logging.getLogger(__name__)
        
        # Validate file paths
        self._validate_file_paths(model_path, english_tokenizer_path, 
                                  french_tokenizer_path, max_length_path)
        
        # Load model
        try:
            self.model = load_model(model_path)
        except Exception as e:
            self.logger.error(f"Failed to load model: {e}")
            raise

        # Load tokenizers
        self.english_tokenizer = self._load_tokenizer(english_tokenizer_path)
        self.french_tokenizer = self._load_tokenizer(french_tokenizer_path)

        # Load max length
        with open(max_length_path, 'r') as f:
            self.max_length = json.load(f)

        # Create reverse mapping for tokenizers
        self.index_to_english = {id: word for word, id in self.english_tokenizer.word_index.items()}
        self.index_to_french = {id: word for word, id in self.french_tokenizer.word_index.items()}
        self.index_to_english[0] = '<PAD>'
        self.index_to_french[0] = '<PAD>'
        
        self.beam_width = beam_width

    def _validate_file_paths(self, *paths):
        """Validate that all provided file paths exist."""
        for path in paths:
            if not os.path.exists(path):
                raise FileNotFoundError(f"File not found: {path}")

    def _load_tokenizer(self, tokenizer_path: str):
        """Load tokenizer from JSON file."""
        try:
            with open(tokenizer_path, 'r') as f:
                tokenizer = tokenizer_from_json(json.load(f))
                self.logger.debug(f"Tokenizer word index size: {len(tokenizer.word_index)}")
                return tokenizer
        except Exception as e:
            self.logger.error(f"Failed to load tokenizer: {e}")
            raise

    def preprocess_text(self, sentence: str) -> str:
        """Preprocess input sentence."""
        sentence = re.sub(r'[^\w\s]', '', sentence).lower()
        return sentence

    def beam_search_translate(self, input_seq: np.ndarray) -> List[int]:
        """Perform beam search decoding with robust error handling."""
        beam_candidates = [([], 0.0)]
        
        for _ in range(self.max_length):
            next_candidates = []
            
            for candidate, score in beam_candidates:
                # Prepare decoder input
                decoder_input = np.zeros((1, self.max_length), dtype=int)
                decoder_input[0, :len(candidate)] = candidate
                
                try:
                    # Predict next token probabilities
                    prediction = self.model.predict(input_seq)[0]
                    logits = prediction[len(candidate)]
                    
                    # Get top beam_width candidates
                    top_indices = np.argsort(logits)[-self.beam_width:]
                    
                    for idx in top_indices:
                        new_candidate = candidate + [idx]
                        new_score = score + np.log(logits[idx] + 1e-10)
                        next_candidates.append((new_candidate, new_score))
                
                except Exception as e:
                    self.logger.error(f"Prediction error: {e}")
                    continue
            
            # Sort and select top candidates
            next_candidates.sort(key=lambda x: x[1], reverse=True)
            beam_candidates = next_candidates[:self.beam_width]
            
            # Stop if all candidates end or reach max length
            if all(len(c[0]) >= self.max_length for c in beam_candidates):
                break
        
        # Return best candidate
        best_candidate = max(beam_candidates, key=lambda x: x[1])[0]
        return best_candidate

    def translate_sentence(self, english_sentence: str) -> str:
        """Translate English sentence to French."""
        # Preprocess and tokenize
        preprocessed = self.preprocess_text(english_sentence)
        input_seq = self.english_tokenizer.texts_to_sequences([preprocessed])
        
        # Pad sequence
        input_seq = pad_sequences(input_seq, maxlen=self.max_length, padding='post')
        
        # Beam search translation
        translated_indices = self.beam_search_translate(input_seq)
        
        # Convert indices to words
        translation = ' '.join([self.index_to_french.get(idx, '<UNK>') for idx in translated_indices])
        
        return translation

In [3]:
def main():
    MODEL_PATH = 'english_to_french_model'
    ENGLISH_TOKENIZER_PATH = 'english_tokenizer.json'
    FRENCH_TOKENIZER_PATH = 'french_tokenizer.json'
    MAX_LENGTH_PATH = 'sequence_length.json'

    try:
        translator = BeamSearchTranslator(
            model_path=MODEL_PATH,
            english_tokenizer_path=ENGLISH_TOKENIZER_PATH,
            french_tokenizer_path=FRENCH_TOKENIZER_PATH,
            max_length_path=MAX_LENGTH_PATH,
            beam_width=3
        )

        sentences = [
            "new jersey is sometimes quiet during autumn",
        ]

        for sentence in sentences:
            french_translation = translator.translate_sentence(sentence)
            print(f"\nEnglish: {sentence}\n")
            print(f"French:  {french_translation}\n")
            print("Correct: le new jersey est parfois calme en automne")

    except Exception as e:
        logging.error(f"Translation failed: {e}", exc_info=True)

if __name__ == "__main__":
    main()

DEBUG:__main__:Tokenizer word index size: 199
DEBUG:__main__:Tokenizer word index size: 344



English: new jersey is sometimes quiet during autumn

French:  new jersey est parfois calme à l' automne <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct: le new jersey est parfois calme en automne
