In [1]:
# Import libraries for text processing and data handling
import os  # File system operations
import pickle  # Saving and loading Python objects
import pandas as pd 

In [2]:
def tokenize_novel(input_file, output_file, language, encoding="utf-8"):
    """
    Tokenize a text file into sentences using a language-specific tokenizer.

    The input text is flattened onto a single line (every run of whitespace,
    line breaks included, becomes one space), split into sentences by the
    pickled tokenizer selected for `language`, and written to `output_file`
    with one sentence per line.

    Parameters:
    - input_file: Path to the input text file.
    - output_file: Path to save the tokenized sentences. Missing parent
      directories are created.
    - language: Language of the text ('french', 'english', or 'russian').
    - encoding: Text encoding used to read the input and write the output.
      Defaults to 'utf-8' so results do not depend on the platform locale.

    Raises:
    - ValueError: If `language` is not one of the supported languages.
    """

    # Pickled tokenizers, resolved relative to the current working directory
    tokenizer_paths = {
        'french': 'punkt/french.pickle',
        'english': 'punkt/english.pickle',
        'russian': 'punkt/russian.pickle',
    }
    if language not in tokenizer_paths:
        raise ValueError(f"Unsupported language: {language}")  # Handle unsupported languages

    # Load the tokenizer from the pickle file.
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # the tokenizer pickles must come from a trusted source.
    with open(tokenizer_paths[language], 'rb') as f:
        tokenizer = pickle.load(f)

    # Read the text with an explicit encoding; the locale default would
    # mis-decode accented or Cyrillic characters on some platforms.
    with open(input_file, "r", encoding=encoding) as f:
        text = f.read()

    # split() with no argument breaks on any whitespace (including "\n"), so
    # this both removes line breaks and normalizes repeated spaces.
    text = ' '.join(text.split())

    # Tokenize the text into sentences
    processed_sentences = tokenizer.tokenize(text)

    # Create the destination folder if needed; open(..., "w") does not.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the tokenized sentences to the output file, one sentence per line
    with open(output_file, "w", encoding=encoding) as f:
        f.write("\n".join(processed_sentences))

In [3]:
from unidecode import unidecode  # Transliterates Unicode text to plain ASCII

# Define directories and novel name
data_folder = "data"  # Folder for input text files, switch to "data_test" for testing
result_folder = "res"  # Folder to save tokenized output files
novel = "alice"  # Base name of the text files (no language suffix)
novel_fr = novel + "_fr.txt"  # French version of the novel
novel_eng = novel + "_eng.txt"  # English version of the novel

# Inputs live in <data_folder>/<novel>/, outputs go to <result_folder>/<novel>/
input_file_fr = os.path.join(data_folder, novel, novel_fr)  # French input file
input_file_eng = os.path.join(data_folder, novel, novel_eng)  # English input file
output_file_fr = os.path.join(result_folder, novel, novel_fr)  # French output file
output_file_eng = os.path.join(result_folder, novel, novel_eng)  # English output file

# Make sure the output folder exists before any tokenized file is written to it
os.makedirs(os.path.join(result_folder, novel), exist_ok=True)

# Normalize French text by removing accents.
# The file is opened explicitly as UTF-8: with the platform's default encoding
# a mis-decoded file would be transliterated into garbage and then written
# back over the original below.
# NOTE(review): assumes the data files are UTF-8 encoded - confirm.
with open(input_file_fr, 'r', encoding='utf-8') as file:
    lines = file.readlines()
normalized_lines = [unidecode(line) for line in lines]  # Remove accents line by line

# Overwrite the original French file with normalized text.
# WARNING: this replaces the source data in place; keep a copy of the raw
# file if the accented original is needed later.
with open(input_file_fr, 'w', encoding='utf-8') as file:
    file.writelines(normalized_lines)

# Tokenize and save sentences for English and French texts
tokenize_novel(input_file_eng, output_file_eng, "english")  # Tokenize English file
tokenize_novel(input_file_fr, output_file_fr, "french")  # Tokenize French file

# Running Hunalign for Text Alignment

Hunalign is a tool for aligning bilingual text files. It uses a dictionary file to improve accuracy, taking the following inputs:

1. **Dictionary file**: A bilingual dictionary to guide alignment.
2. **Source text file**: The first text file (e.g., English).
3. **Target text file**: The second text file (e.g., French).
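4. **Output file**: The file that receives the aligned sentences. Hunalign prints its result to standard output, so the commands below redirect it to a file with `>`.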

## Full Dataset Script

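Use the first command below to align the full dataset and save the result to `accuracy/<novel_folder>/aligned_output.txt`. The second command aligns the test dataset and saves the result to `accuracy/<novel_folder>/test_aligned_output.txt`: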

```bash
# Full dataset alignment for alice (replace every occurrence of "alice" with the name of your own novel)
src/hunalign/hunalign res/eng-fra.dic res/alice/alice_eng.txt res/alice/alice_fr.txt -text > accuracy/alice/aligned_output.txt

# Test dataset alignment
src/hunalign/hunalign res/eng-fra.dic res/alice/alice_test_eng.txt res/alice/alice_test_fr.txt -text > accuracy/alice/test_aligned_output.txt