In [473]:
import pandas as pd
import numpy as np
import csv
import os

# Define directories and novel name
data_folder = "data"  # Main folder containing input text files
result_folder = "res"  # Folder containing intermediate or processed results
accuracy_folder = "accuracy"  # Folder containing alignment results
novel = "invisible_man"  # Base name of the text files (no language suffix)

# Construct paths for French and English versions of the novel
data_folder_for_novel = os.path.join(data_folder, novel)  # Path to the specific novel's data folder
result_folder_for_novel = os.path.join(result_folder, novel)  # Path to the specific novel's result folder
accuracy_folder_for_novel = os.path.join(accuracy_folder, novel)  # Path to the specific novel's accuracy folder
novel_fr = novel + "_fr.txt"  # Filename for the French version of the novel
novel_eng = novel + "_eng.txt"  # Filename for the English version of the novel

# Load English and French text files
english_text = open(os.path.join(data_folder_for_novel, novel_eng), 'r').read()  # Read English text
french_text = open(os.path.join(data_folder_for_novel, novel_fr), 'r').read()  # Read French text

# Load mentions and alignments data
mentions = pd.read_csv(os.path.join(result_folder_for_novel, 'mentions_used.csv'))  # Load mentions data from CSV
alignments = pd.read_csv(
    os.path.join(accuracy_folder_for_novel, 'aligned_output.txt'),
    delimiter='\t',  # Tab-separated file format
    header=None,  # No column headers in the alignment file
    quoting=csv.QUOTE_NONE,  # Disable special CSV quoting
    on_bad_lines='warn'  # Handle problematic lines without interrupting execution
)
alignments.columns = ['eng', 'fr', 'score']  # Assign meaningful column names to alignments

# Add an ID column to alignments
alignments['id'] = alignments.index  # Create a unique ID for each alignment row
alignments = alignments.drop(columns=['score'])  # Remove the 'score' column unnecessary

In [493]:
import pandas as pd
import google.generativeai as genai  # For AI-based alignment checking

# Configure the API key for accessing the Generative AI API
# genai.configure(api_key='AIzaSyDjJtyEDL6zD0VKvEyxyyWdI8lvWZJmk5o')  # Replace with your valid API key
# alternate key:
# genai.configure(api_key='AIzaSyBLe1EzLlTY5QqMIIzlq1SYoSQPDRHd_hM')
# genai.configure(api_key='AIzaSyBBl0kR-93H8KiJFVlaOr3JqbPW6QClu7M')
# genai.configure(api_key='AIzaSyBD2eC7bGy5OdWzjUntWxD5KQ7iPgbHuag')
genai.configure(api_key='AIzaSyDPC-XW7ANS7qlB-JmU0HKVfZ6b57RycZo')
# genai.configure(api_key='AIzaSyAFzNXL8sDmSdNGJaC6DjGVpZx-6tjG2C0')



# Initialize the generative model (e.g., Gemini-1.5-flash)
model = genai.GenerativeModel("gemini-1.5-flash")

# Define safety settings to handle content responsibly
safe = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
]

# function checks if hunalign alignment is correct
def check_alignment(french_text, english_text):
    """
    Check the alignment between a French text and its English translation.

    Parameters:
    - french_text (str): The French phrase to compare.
    - english_text (str): The English phrase to compare.

    Returns:
    - str: 'aligned' if the texts are aligned, otherwise 'not aligned'.
    """
    # Create a prompt for the AI model to evaluate text alignment
    prompt = (
        f"Evaluate the alignment between the following texts:\n\n"
        f"French: '{french_text}'\n"
        f"English: '{english_text}'\n\n"
        f"Are they aligned? Respond with 'aligned' or 'not aligned'."
    )
    
    # Use the AI model to generate a response
    response = model.generate_content(prompt, safety_settings=safe)
    return response.text.strip()  # Return the trimmed response text


In [494]:
import time  # For adding delays between requests to comply with rate limits

# Check if 'checked_alignments.csv' already exists
# This ensures you don't waste time re-checking alignments that were already processed.
if os.path.exists(os.path.join(accuracy_folder_for_novel, 'checked_alignments.csv')):
    # Load the existing alignments if the file exists
    checked_alignments = pd.read_csv(os.path.join(accuracy_folder_for_novel, 'checked_alignments.csv'))
else:
    # If the file doesn't exist, initialize with a copy of the alignments DataFrame
    checked_alignments = alignments.copy()


# Ensure the 'llm alignment result' column exists in the DataFrame
if 'llm alignment result' not in checked_alignments.columns:
    # Add a column to store alignment results if not already present
    checked_alignments['llm alignment result'] = np.nan

# Loop through each row in the DataFrame to check alignment
for index, row in checked_alignments.iterrows():
    # Skip rows that already have a valid alignment result
    if pd.isna(row['llm alignment result']) or row['llm alignment result'] == '':
        # Use the AI model to check alignment
        alignment_result = check_alignment(row['fr'], row['eng'])
        print(alignment_result)

        # Update the DataFrame with the alignment result
        checked_alignments.at[index, 'llm alignment result'] = alignment_result.lower()
        # Save the updated DataFrame to CSV after processing each row
        checked_alignments.to_csv(os.path.join(accuracy_folder_for_novel, 'checked_alignments.csv'), index=False)

        # Pause to comply with the API's 15 requests per minute (RPM) limit
        time.sleep(4)

# Print a message when processing is complete
print("Processing complete. Checked alignments saved to 'checked_alignments.csv'.")


Processing complete. Checked alignments saved to 'checked_alignments.csv'.


In [495]:
# Filter rows where the alignment result is "aligned"
correctly_aligned = checked_alignments[checked_alignments['llm alignment result'].str.lower() == "aligned"].copy()

# Create the DataFrame for incorrectly aligned rows
# Includes rows where:
# 1. The 'llm alignment result' is not "aligned".
# 2. The rows immediately before or after such misaligned rows are also included for context.
incorrectly_aligned = checked_alignments[
    np.isin(checked_alignments.index, np.where(checked_alignments['llm alignment result'] != "aligned")[0]) | 
    np.isin(checked_alignments.index, np.where(checked_alignments['llm alignment result'] != "aligned")[0] - 1) | 
    np.isin(checked_alignments.index, np.where(checked_alignments['llm alignment result'] != "aligned")[0] + 1)
].copy()

# Save the incorrectly aligned rows to a CSV for realignment
incorrectly_aligned.to_csv(accuracy_folder_for_novel + '/to_realign.csv', index=False)

In [496]:
from unidecode import unidecode  # For normalizing French text by removing accents

# gemini realigns unaligned texts
def realign_texts(french_text, english_text):
    """
    Generate realigned English and French texts using a language model.

    Parameters:
    - french_text (str): The original French text to be realigned.
    - english_text (str): The original English text to be realigned.

    Returns:
    - pd.DataFrame: A DataFrame containing the realigned English and French text pairs.
    """
    # Create a prompt for realignment, specifying the desired output format
    prompt = (
        f"Align the following French and English texts accurately. The goal is to produce realigned sentences where each English sentence corresponds directly to a French sentence.\n\n"
        f"French: '{french_text}'\n"
        f"English: '{english_text}'\n\n"
        f"Format the output as follows:\n"
        f"English: [aligned English sentence #1]\n"
        f"French: [aligned French sentence #1]\n\n"
        f"English: [aligned English sentence #2]\n"
        f"French: [aligned French sentence #2]\n\n"
        f"...\n"
        f"Keep the text exactly as it is without rewording or changing punctuation. This task is solely about aligning text, not modifying it."
        f"Use your best judgment to find reasonable alignments, even for partial matches or loosely corresponding sentences.\n"
        f"If there is no equivalent French sentence for an English sentence, or vice versa, attach the phrase to return (No direct equivalent in text).\n"
        )

    # Use the language model to generate a response based on the prompt
    response = model.generate_content(prompt, safety_settings=safe)

    # Split the response into sections based on double newlines
    sections = response.text.strip().split('\n\n')
    
    # Initialize lists to store realigned English and French lines
    english_lines = []
    french_lines = []

    # Process each section to extract aligned texts
    for section in sections:
        lines = section.split('\n')  # Split lines within a section
        for line in lines:
            if line.startswith("English: "):
                eng_line = line.replace("English: ", "").strip()  # Extract the English text
            elif line.startswith("French: "):
                fr_line = unidecode(line.replace("French: ", "").strip())  # Extract and normalize the French text

        # Append extracted lines to their respective lists
        english_lines.append(eng_line)
        french_lines.append(fr_line)

    # Create a DataFrame to organize and return the aligned texts
    result_df = pd.DataFrame({
        'English': english_lines,
        'French': french_lines
    })

    return result_df

In [497]:
# Ensure the 'id' column is numeric, coercing errors to NaN
# This handles cases where 'id' might contain non-numeric values, ensuring proper alignment grouping.
incorrectly_aligned['id'] = pd.to_numeric(incorrectly_aligned['id'], errors='coerce')

# Create a 'group' column to identify clusters of misaligned rows
# A new group is started whenever the current 'id' is not sequential with the previous 'id'.
# This ensures we can iterate over consecutive chunks and provide them to Gemini for realignment.
incorrectly_aligned['group'] = (incorrectly_aligned['id'] != incorrectly_aligned['id'].shift() + 1).cumsum()

# Initialize an empty DataFrame to store the final realigned results
realigned_full = pd.DataFrame(columns=["eng", "fr", "llm alignment result", "id", "group"])

In [498]:
# Check if realigned data already exists
# If it does, load the realigned DataFrame to avoid reprocessing
if os.path.exists(accuracy_folder_for_novel + '/realigned_need_to_transfer_annotations.csv'):
    incorrectly_aligned = pd.read_csv(accuracy_folder_for_novel + '/realigned_need_to_transfer_annotations.csv')
else:
    # Iterate over each group in the incorrectly aligned DataFrame
    for group_id, group in incorrectly_aligned.groupby('group'):
        # Check if the group contains any 'not aligned' results
        if any(group['llm alignment result'] == 'not aligned'):
            aligned_ids = group[group['llm alignment result'] == 'aligned']['id'].tolist()  # Get aligned IDs
            realigned_data = []  # List to store realigned results for this group

            # Combine English and French texts for the group into single strings
            merged_eng = ' '.join(group['eng'].dropna())  # Merge English texts
            merged_fr = ' '.join(group['fr'].dropna())  # Merge French texts

            # Call the LLM for realignment
            realigned_text = realign_texts(merged_fr, merged_eng)

            # Append the realigned results with group info
            for eng, fr in zip(realigned_text["English"], realigned_text["French"]):
                eng = eng.strip()
                fr = fr.strip()
                aligned = "aligned"

                if (eng == "(No direct equivalent in text)" or fr == "(No direct equivalent in text)"):
                    aligned = "not aligned"
                    
                realigned_data.append([eng, fr, aligned, aligned_ids, group_id])  # Replace IDs with start-end ID of chunk

            # Create a DataFrame for the realigned group
            realigned_df = pd.DataFrame(realigned_data, columns=["eng", "fr", "llm alignment result", "id", "group"])

            # Remove the current group from incorrectly aligned and add realigned results
            incorrectly_aligned = incorrectly_aligned[incorrectly_aligned['group'] != group_id]
            incorrectly_aligned = pd.concat([incorrectly_aligned, realigned_df], ignore_index=True)

            # Save the updated incorrectly aligned DataFrame to avoid losing progress
            incorrectly_aligned.to_csv(accuracy_folder_for_novel + '/realigned_need_to_transfer_annotations.csv', index=False)

            # Pause to respect the API rate limit
            time.sleep(4)

In [499]:
import ast

# Convert 'id' column to string if it's not already, to safely evaluate with ast.literal_eval
incorrectly_aligned['id'] = incorrectly_aligned['id'].astype(str)
correctly_aligned['id'] = correctly_aligned['id'].astype(str)

# Remove duplicate aligned IDs from correctly aligned DataFrame
duplicate_aligned_to_remove = set(x for sublist in map(ast.literal_eval, incorrectly_aligned['id'].values) for x in sublist)
correctly_aligned = correctly_aligned[~correctly_aligned['id'].apply(ast.literal_eval).isin(duplicate_aligned_to_remove)]

# Update IDs in incorrectly aligned to the first value from each list and drop 'group' column if it exists
incorrectly_aligned['id'] = incorrectly_aligned['id'].apply(lambda x: ast.literal_eval(x)[0])
if 'group' in incorrectly_aligned.columns:
    incorrectly_aligned = incorrectly_aligned.drop(columns=['group'])

# Convert all 'id' values to integer for consistent data type
incorrectly_aligned['id'] = incorrectly_aligned['id'].astype(int)
correctly_aligned['id'] = correctly_aligned['id'].astype(int)

# Combine correctly and incorrectly aligned DataFrames and sort by ID
updated_alignments = pd.concat([incorrectly_aligned, correctly_aligned]).sort_values(by='id').reset_index(drop=True)
updated_alignments['id'] = updated_alignments.index  # Ensure IDs are sequential

# Save the updated alignments to a CSV file
updated_alignments.to_csv(accuracy_folder_for_novel + '/updated_alignments.csv', index=False)

In [500]:
import re  # For regular expressions

# Function to find the position of a phrase (start and end) in the text if possible
def find_char_position_from_index(text, phrase):
    # Prepare the phrase for flexible whitespace recognition and strip special characters
    phrase_strip = str(phrase).replace(" ~~~ ", r"\s*").replace("~~~ ", r"\s*").replace(" ~~~", r"\s*").strip("'\"><() ")
    substring_pattern = re.escape(phrase_strip).replace(r"\\s\*", r"\s*").replace(r"\ ", r"\s+")

    print(phrase_strip)
    # Try to find the phrase in the text, allowing for flexible spaces
    match = re.search(substring_pattern, text)
    if not match:
        # Define possible sentence delimiters
        delimiters = r'[.!?;:-]'

        # Attempt to extract the first sentence or meaningful phrase using regex patterns
        first_sentence_match = re.match(rf'^(.*?[a-zA-Z]{{6,}}.*?{delimiters})', substring_pattern)
        first_sentence = first_sentence_match.group(1) if first_sentence_match else None

        # Attempt to extract the last sentence or meaningful phrase using regex patterns
        last_sentence_match = re.search(rf'{delimiters}\s+([a-zA-Z\s,\'"-]{{6,}}){delimiters}$', substring_pattern)
        last_sentence = last_sentence_match.group(1) if last_sentence_match else None
        
        # If either the first or last phrase is not found, return invalid indices
        if not first_sentence or not last_sentence:
            return -1, -1

        # Search for the start of the first and end of the last phrase separately
        start_match = re.search(re.escape(first_sentence).replace(r'\ ', r'\s+'), text)
        end_match = re.search(re.escape(last_sentence).replace(r'\ ', r'\s+'), text)

        if not start_match or not end_match:
            return -1, -1
    
        # Return the start and end indices of the matches
        return start_match.start(), end_match.end()

    # Return the start and end indices of the found match
    return match.start(), match.end()

# Function to add start and end positions of phrases to the DataFrame
def add_char_positions(updated_alignments, text, lang_prefix):
    # Initialize lists for start and end positions of phrases
    start_positions = []
    end_positions = []

    # For each phrase, calculate the start and end positions in the text
    for phrase in updated_alignments[lang_prefix]:
        start, end = find_char_position_from_index(text, phrase)
        start_positions.append(start)
        end_positions.append(end)

    # Add the position data as new columns in the DataFrame
    updated_alignments[f'phrase_startByte_{lang_prefix}'] = start_positions
    updated_alignments[f'phrase_endByte_{lang_prefix}'] = end_positions


# Use the function to add positions to the data frames for French and English texts
add_char_positions(updated_alignments, french_text, 'fr')
add_char_positions(updated_alignments, english_text, 'eng')

# Add columns to the DataFrame extracting the text segments using the start and end indices
updated_alignments['extracted_text_fr'] = updated_alignments.apply(
    lambda row: french_text[row['phrase_startByte_fr']:row['phrase_endByte_fr']+1], axis=1
)
updated_alignments['extracted_text_eng'] = updated_alignments.apply(
    lambda row: english_text[row['phrase_startByte_eng']:row['phrase_endByte_eng']+1], axis=1
)

# Export the DataFrame to a CSV file
updated_alignments.to_csv(accuracy_folder_for_novel + '/alignments_with_char_positions.csv', index=False)

L'Homme invisible Herbert George Wells La Revue de Paris, Paris, La Revue de Paris, 1900-1901 Exporte de Wikisource le 9 janvier 2025 I UN ETRANGE VOYAGEUR L'etranger arriva au commencement de fevrier, un jour brumeux, dans un tourbillon de vent et de neige.
Il entra, chancelant, plus mort que vif, dans l'auberge et posant a terre son bagage : -- Du feu, s'ecria-t-il, du feu, par charite ! Une chambre et du feu !
Il venait pedestrement, par la dune, de la station de Bramblehurst, portant, de sa main couverte d'un gant epais, une petite valise noire.
Il etait bien enveloppe des pieds a la tete, et le bord d'un chapeau de feutre mou ne laissait apercevoir de sa figure que le bout luisant de son nez. La neige s'etait amoncelee sur ses epaules, sur sa poitrine ; elle ajoutait aussi une crete blanche au sac dont il etait charge.
Il frappa de la semelle, secoua dans le bar la neige qui le couvrait, puis suivit madame Hall dans le petit salon pour faire ses conditions.
No direct equivalent in

In [501]:
# Identify non-conforming alignments based on invalid byte indices (-1 indicates missing data)
# will be excluded from aligning
# missed annotations from those lines will be added later
phrases_not_found_alignments = updated_alignments[
    # Check for any phrase start or end byte indices set to -1, indicating that the phrase was not found in the text
    (updated_alignments['phrase_startByte_eng'] == -1) |
    (updated_alignments['phrase_endByte_eng'] == -1) |
    (updated_alignments['phrase_startByte_fr'] == -1) |
    (updated_alignments['phrase_endByte_fr'] == -1)
]

# Identify all alignments where valid start and end byte indices are present
phrases_found_alignments = updated_alignments[
    # Check that all start and end byte indices are not -1, indicating valid phrase locations in the text
    (updated_alignments['phrase_startByte_eng'] != -1) &
    (updated_alignments['phrase_endByte_eng'] != -1) &
    (updated_alignments['phrase_startByte_fr'] != -1) &
    (updated_alignments['phrase_endByte_fr'] != -1)
]

# Save the DataFrame of phrases not found (which may need realignment) to a CSV file
phrases_not_found_alignments.to_csv(accuracy_folder_for_novel + '/may_need_realign.csv', index=False)

# Get the number of phrases that were not found, potentially indicating issues with the alignment or text
len(phrases_not_found_alignments)

343

In [502]:
# Function to create the english_merged_mentions DataFrame to hold the merged english data: english pronoun to enlglish phrase
def create_and_merge_mentions(mentions, phrases_found_alignments):
    # Creates the english_merged_mentions DataFrame which will hold the merged df with the aligned phrases and the entities
    english_merged_mentions = pd.DataFrame({
        'eng_source': mentions['source'],
        'eng_idColName': mentions['idColName'],
        'eng_iden': mentions['iden'],
        'eng_text': mentions['text'],
        'eng_startByte': mentions['startByte'],
        'eng_endByte': mentions['endByte'],
        'eng_pdncID': mentions['pdncID'],
        'eng_paraID': mentions['paraID'],
        'eng_chapID': mentions['chapID'],
        'eng_mID': mentions['mID']
    })


    result_rows = []

    # Iterate through each row in english_merged_mentions
    for _, fm_row in english_merged_mentions.iterrows():
        fm_start, fm_end = fm_row["eng_startByte"], fm_row["eng_endByte"]

        # Iterate through each row in phrases_found_alignments
        for _, align_row in phrases_found_alignments.iterrows():
            align_start, align_end = align_row["phrase_startByte_eng"], align_row["phrase_endByte_eng"]
            
            # Ensure that the entity indecies fall within the aligned phrase index range
            if align_start <= fm_start and fm_end <= align_end:
                # Combine the rows and add to the result so that for each entity, we have the aligned phrase and the entity
                combined_row = {**fm_row.to_dict(), **align_row.to_dict()}
                result_rows.append(combined_row)

        merged_mentions = pd.DataFrame(result_rows)

    return merged_mentions

if not os.path.exists(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_without_gemini.csv")):
    # Use the function to combine the mentions with correctly aligned phrases
    df_correctly_aligned = create_and_merge_mentions(mentions, phrases_found_alignments)


In [503]:
from deep_translator import GoogleTranslator
import pandas as pd
from unidecode import unidecode

if not os.path.exists(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_without_gemini.csv")):

    # Load the pronoun translations from a CSV file into a DataFrame
    translations_df = pd.read_csv(data_folder + '/translations.csv')

    # Convert the DataFrame to a dictionary mapping English terms to lists of possible French translations
    translations_dict = {row['english']: row['french'].split('|') for _, row in translations_df.iterrows()}

    # Initialize the GoogleTranslator object for translating from English to French
    translator = GoogleTranslator(source='en', target='fr')

    # Function to retrieve existing translations from a dictionary or translate English terms to French if not in pronoun dictionary
    def translate_or_retrieve(english_term):
        # Check if the translation for the term already exists in the translations dictionary
        if english_term in translations_dict:
            return translations_dict[english_term]
        
        try:
            # Attempt to translate the term using Google Translator
            # Normalize the translated term, remove accents using unidecode and strip whitespace
            french_term = unidecode(translator.translate(english_term).strip())

            # Return the new translation in a list format
            return [french_term]
        except Exception as e:
            # If translation fails, log the error and return None in a list
            print(f"Error translating '{english_term}': {e}")
            return [None]

    # Apply the translation function to each entry in the 'eng_text' column of df_correctly_aligned
    df_correctly_aligned['translations'] = df_correctly_aligned['eng_text'].apply(translate_or_retrieve)

In [504]:
import re

# function to find all occurrences of translated English terms in the French text
def find_in_french_text(dataframe):
    occurrences_list = []  # List to store occurrences for all rows

    for _, row in dataframe.iterrows():
        french_text = row['extracted_text_fr']  # French text to search in
        translations = row['translations']   # List of translated terms to search for
        start_byte = row['phrase_startByte_fr']  # Start byte of the phrase in the original text

        row_occurrences = []  # List to store occurrences for the current row

        # Iterate through each translated term to find all its occurrences in the French text
        for french_term in translations:
            # Create a regex pattern for exact whole-word matches, case insensitive
            pattern = r'\b' + re.escape(french_term.lower()) + r'\b'
            
            # Search for the pattern in the French text, using case insensitive matching
            for match in re.finditer(pattern, french_text.lower()):
                start_index = match.start()  # Start index of the match in the French text
                end_index = match.end()      # End index of the match in the French text

                # Calculate the actual byte positions by adding the start_byte
                row_occurrences.append((french_term, start_index + start_byte, end_index + start_byte))

        # If no occurrences are found for any of the terms, append a placeholder (None, None, None)
        if not row_occurrences:
            row_occurrences = [(None, None, None)]

        # Append the list of occurrences for this row to the main list
        occurrences_list.append(row_occurrences)

    # Add the list of all occurrences to the DataFrame as a new column
    dataframe['occurrences'] = occurrences_list

    return dataframe

if not os.path.exists(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_without_gemini.csv")):
    # Apply the function to the DataFrame with correctly aligned phrases and their translations
    french_mentions = find_in_french_text(df_correctly_aligned)

In [505]:
if not os.path.exists(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_without_gemini.csv")):

    # Convert the list of occurrences in each row into a tuple for immutable group keys
    df_correctly_aligned['occurrences_tuple'] = df_correctly_aligned['occurrences'].apply(tuple)

    # Function to assign values if the group length matches the number of occurrences
    # meaning that if the english phrase has 2 pronouns and the french phrase has 2 pronouns then they are transfered accordingly
    def assign_occurrences(group):
        # Check if the number of rows in the group matches the number of occurrences in the first row
        if len(group) == len(group.iloc[0]['occurrences']):
            group = group.copy()  # Make a copy of the group to avoid SettingWithCopyWarning
            # Unpack the first occurrence in the list into separate columns
            group['fr_text'], group['fr_startByte'], group['fr_endByte'] = zip(*group.iloc[0]['occurrences'])
        else:
            # Assign None if the group size does not match the number of occurrences, indicating a mismatch
            group['fr_text'], group['fr_startByte'], group['fr_endByte'] = None, None, None
        return group

    # Apply the function on groups formed based on the unique occurrence tuples
    grouped_df_correctly_aligned = df_correctly_aligned.groupby('occurrences_tuple', group_keys=False).apply(assign_occurrences)

    # Convert `fr_startByte` and `fr_endByte` to integers or None where applicable
    grouped_df_correctly_aligned['fr_startByte'] = grouped_df_correctly_aligned['fr_startByte'].astype('Int64') 
    grouped_df_correctly_aligned['fr_endByte'] = grouped_df_correctly_aligned['fr_endByte'].astype('Int64')

    # Save the DataFrame to a CSV file
    grouped_df_correctly_aligned.to_csv(accuracy_folder_for_novel + "/pronoun_alignment_without_gemini.csv", index=False)

In [506]:
import re

if os.path.exists(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_with_gemini.csv")):
    grouped_df_correctly_aligned = pd.read_csv(os.path.join(accuracy_folder_for_novel + "/pronoun_alignment_with_gemini.csv"))


# Function to transfer pronouns using Gemini model and update the dataframe with the results
def align_pronouns_with_gemini(dataframe, model, text):
    if "fr_text" not in dataframe.columns:
        dataframe["fr_text"] = None
    if "gemini_used" not in dataframe.columns:
        dataframe["gemini_used"] = False
    for index, row in dataframe.iterrows():        
        # Skip rows where gemini was already used or French text is already available or not null
        if row["gemini_used"] == True:
            continue

        # Extract relevant data from the row for processing
        english_phrase = row['extracted_text_eng']
        french_phrase = row['extracted_text_fr']
        english_pronoun = row['eng_text']
        english_context = text[row['eng_startByte'] - 20:row['eng_endByte'] + 20]
        fr_start_byte = int(row["phrase_startByte_fr"])

        print(row["eng_text"])  # Debugging: print the English pronoun being processed

        # Formulate the prompt for the Gemini model, providing detailed instructions and context
        prompt = (
            f"Your task is to identify the French equivalent of the given English pronoun in the French text, "
            "using the provided context to ensure accuracy. Only align when you are completely certain. "
            "If you cannot find a match, respond with 'None'. \n\n"
            f"English Pronoun: {english_pronoun}\n"
            f"French Phrase: {french_phrase}\n"
            f"English Context: {english_context}\n\n"
            "Important: Use only standard ASCII apostrophes in your response.\n\n"
            "Provide the result in the format: 'french_pronoun: <pronoun>, substring_from_pronoun: <exact substring>'"
        )
    

        # Generate content from the model and normalize apostrophes
        response = model.generate_content(prompt).text.strip().replace("’", "'").replace("‘", "'").replace("`", "'").replace("\\\'", "'").replace("''", "'").strip()
        
        # If the model found no match, pause and continue to the next iteration
        if response == "None":
            dataframe.at[index, 'gemini_used'] = True
            dataframe.to_csv(accuracy_folder_for_novel + "/pronoun_alignment_with_gemini.csv", index=False)
            time.sleep(6)  # Respect API rate limits
            continue
        
        # Debugging: print the model's response
        print(response)

        # Extract and process each match from the response using regex
        result = ()
        for match in re.finditer(r"french_pronoun:\s*(.*),\s*substring_from_pronoun:\s*(.*)", response):
            french_pronoun, substring_from_pronoun = match.groups()
            print("french_pronoun, substring_from_pronoun", french_pronoun, substring_from_pronoun)
            substring_from_pronoun = substring_from_pronoun.strip()

            # Find the start and end character positions of the match in the French phrase
            start_char = french_phrase.lower().find(substring_from_pronoun[:30].lower())
            end_char = start_char + len(french_pronoun)
            print("start_char, end_char", start_char, end_char)

            # Validate and store the correct alignment if it matches the provided pronoun
            if french_phrase[start_char:end_char].lower().strip() == french_pronoun.lower().strip():
                result = (french_pronoun, start_char + fr_start_byte, end_char + fr_start_byte)

        # Update the DataFrame with the found pronoun positions or mark as None if not found
        if result:
            dataframe.at[index, 'fr_text'] = result[0]
            dataframe.at[index, 'fr_startByte'] = result[1]
            dataframe.at[index, 'fr_endByte'] = result[2]
            dataframe.at[index, 'gemini_used'] = True
            print(result)
        else:
            dataframe.at[index, 'fr_text'] = None
            dataframe.at[index, 'fr_startByte'] = None
            dataframe.at[index, 'fr_endByte'] = None
            dataframe.at[index, 'gemini_used'] = True
    
        # Save the DataFrame periodically to ensure data is not lost
        dataframe.to_csv(accuracy_folder_for_novel + "/pronoun_alignment_with_gemini.csv", index=False)
        time.sleep(6)  # Respect API rate limits

    return dataframe

df_with_pronoun_alignment = align_pronouns_with_gemini(grouped_df_correctly_aligned, model, english_text)
    
# Post-process to replace incorrect apostrophe encoding in the dataframe
df_with_pronoun_alignment['fr_text'] = df_with_pronoun_alignment['fr_text'].str.replace("\\\'", "\'")

In [507]:
# Sort the DataFrame by the 'fr_text' column to place non-null values at the top
grouped_grouped_df_correctly_aligned = (
    df_with_pronoun_alignment.sort_values(by='fr_text', na_position='last')
    # Remove duplicate entries based on 'eng_mID', keeping only the first occurrence (so the duplicate isn't the null value)
    .drop_duplicates(subset='eng_mID', keep='first')
)

# Identify and retrieve mentions that are not present in the aligned DataFrame based on 'eng_mID'
forgotten_mentions = mentions[~mentions['mID'].isin(grouped_grouped_df_correctly_aligned['eng_mID'])]
# Rename columns to clearly differentiate between English and potentially missing French counterparts
forgotten_mentions_renamed = forgotten_mentions.rename(columns={
    "source": "eng_source",
    "idColName": "eng_idColName",
    "iden": "eng_iden",
    "text": "eng_text",
    "startByte": "eng_startByte",
    "endByte": "eng_endByte",
    "pdncID": "eng_pdncID",
    "paraID": "eng_paraID",
    "chapID": "eng_chapID",
    "mID": "eng_mID"
})

# Add French-specific columns with None values to account for missing French alignments
forgotten_mentions_renamed["fr_text"] = None
forgotten_mentions_renamed["fr_startByte"] = None
forgotten_mentions_renamed["fr_endByte"] = None

# Combine the previously aligned data with the newly accounted mentions and re-sort by English start byte
grouped_grouped_df_correctly_aligned = pd.concat(
    [grouped_grouped_df_correctly_aligned, forgotten_mentions_renamed], ignore_index=True
).sort_values(by="eng_startByte")

# Select specific columns to form the final DataFrame
selected_df = grouped_grouped_df_correctly_aligned[[
    'eng_source', 'eng_idColName', 'eng_iden', 'eng_text', 'fr_text',
    'eng_startByte', 'eng_endByte', 'fr_startByte', 'fr_endByte',
    'eng_pdncID', 'eng_paraID', 'eng_chapID', 'eng_mID'
]]

# save the french mentions to a CSV file
selected_df.to_csv(result_folder_for_novel + "/mentions_used_fr.csv", index=False)

  grouped_grouped_df_correctly_aligned = pd.concat(


In [508]:
# Count the number of transfered values: non-NA (not missing) values in the 'fr_text' column
non_none_fr_startByte_count = selected_df['fr_text'].notna().sum()

# Count the number of not transfered values: NA (missing) values in the 'fr_text' column
none_fr_startByte_count = selected_df['fr_text'].isna().sum()

# Print the counts of transfered and not-transfered values
print(non_none_fr_startByte_count, none_fr_startByte_count)

3598 1244


In [509]:
# abbey: transfered to not transfered: 2760 1509
# age_innocence: transfered to not transfered: 4632 2632
# dorian_gray: transfered to not transfered: 7435 1358 
# emma: transfered to not transfered: 9223 5978
# gambler: transfered to not transfered: 2227 2530
# invisible_man: transfered to not transfered: 3598 1244


In [513]:
if os.path.exists(os.path.join(accuracy_folder_for_novel + "/with_gemini_correction.csv")):
    # Check if the corrected alignments file already exists
    # If it exists, load the alignments into a DataFrame
    selected_df = pd.read_csv(os.path.join(accuracy_folder_for_novel + "/with_gemini_correction.csv"))

def check_pronoun_translation(dataframe, french_text, english_text):
    # Add a new 'check' column if it doesn't already exist
    if "check" not in dataframe.columns:
        dataframe['check'] = None    
        # Mark rows with missing French text as "wrong"
        dataframe.loc[dataframe['fr_text'].isna(), 'check'] = "wrong"

    # Iterate through each row in the DataFrame
    for index, row in dataframe.iterrows():
        # Only process rows where French text exists, and no check has been performed
        if not pd.isna(row['fr_text']) and row['check'] != "correct" and row['check'] != "wrong":
            
            # Extract the pronoun and context information for both languages
            french_pronoun = row['fr_text']
            english_pronoun = row['eng_text']

            # Extract the surrounding context in English
            english_context_before = english_text[row['eng_startByte'] - 30:row['eng_startByte']]
            english_context_after = english_text[row['eng_startByte']:row['eng_endByte'] + 31]
            
            # Extract the surrounding context in French
            french_context_before = french_text[int(row['fr_startByte']) - 30:int(row['fr_startByte'])]
            french_context_after = french_text[int(row['fr_startByte']):int(row['fr_endByte']) + 31]
        
            # Print pronouns and context for debugging or verification
            print(french_pronoun, " -> ", english_pronoun)
            print(english_context_before, " ---- ", english_pronoun, " ---- ", english_context_after)
            print(french_context_before, " ---- ", french_pronoun, " ---- ", french_context_after)
            
            # Construct a detailed prompt for a model to evaluate pronoun correspondence
            
            prompt = (
                f"French Pronoun: '{french_pronoun}'\n"
                f"English Pronoun: '{english_pronoun}'\n\n"
                f"English Context Before: '{english_context_before}'\n\n"
                f"English Context After: '{english_context_after}'\n\n"
                f"French Context Before: '{french_context_before}'\n\n"
                f"French Context After: '{french_context_after}'\n\n"
                "Using the context before and after the pronouns in both languages, determine if the French pronoun corresponds "
                "to the English pronoun. Ensure that the pronouns match in their grammatical role (e.g., subject, object, possessive, etc.) "
                "and that neither a proper noun (like 'Alice') nor any other non-pronoun term is considered as a pronoun. "
                "Ignore minor differences in word choice or phrasing unrelated to pronoun usage. Respond with 'correct' if the pronouns correspond, otherwise respond with 'wrong'."
                "Respond ONLY with 'correct' or 'wrong'."
            )

            # Generate a response from the model based on the constructed prompt
            response = model.generate_content(prompt, safety_settings=safe).text.strip()  
            print(response)
            
            # Only update the DataFrame with valid responses ('correct' or 'wrong')
            if (response.lower() == "correct" or response.lower() == "wrong"):
                dataframe.at[index, 'check'] = response.lower()

            # Save the updated DataFrame after each row is processed
            dataframe.to_csv(accuracy_folder_for_novel + "/with_gemini_correction.csv", index=False)
            time.sleep(4)  # Pause to respect API rate limits (e.g., 15 requests per minute)
    return dataframe

# Sort the DataFrame by English start byte for logical ordering
selected_df = selected_df.sort_values(by="eng_startByte")

# Apply the pronoun-checking function to the selected DataFrame
checks = check_pronoun_translation(selected_df, french_text, english_text)

In [514]:
# Count the number of correctly transfered annotations: rows where the 'check' column has the value "correct"
correct_count = (checks['check'] == "correct").sum()

# Count the number of incorrectly transfered annotations: rows where the 'check' column has the value "wrong"
wrong_count = (checks['check'] == "wrong").sum()

# Print the number of correctly and incorrectly transfered annotations
print(correct_count, wrong_count)

3316 1526


In [None]:
# abbey correctly and incorrectly transfered annotations 2439 1830
# age_innocence correctly and incorrectly transfered annotations 4129 3135
# dorian_gray correctly and incorrectly transfered annotations 7055 1738
# emma correctly and incorrectly transfered annotations 8261 6940
# gambler correctly and incorrectly transfered annotations 1928 2829
# invisible_man correctly and incorrectly transfered annotations 3316 1526 