## Initialisation

Here we load all the files and models we'll need

In [1]:
# Load the model to differentiate between person nouns and other nouns
from tensorflow.keras.models import load_model
model_person_noun = load_model('classif_common_noun_person.h5')

# Load persons nouns with masculine and feminine forms retrieved from wikidata
import json
with open('formatted_wikidata.json', 'r', encoding="utf-8") as f:
    data_triplets = json.load(f)

from transformers import CamembertModel, CamembertTokenizer
import torch

# Load the model and tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model_embedd = CamembertModel.from_pretrained("camembert-base")

import spacy

# Charger le modèle de langue
nlp = spacy.load("fr_core_news_md")

import numpy as np

from difflib import SequenceMatcher
from SPARQLWrapper import SPARQLWrapper, JSON

import requests

import pandas as pd
# Load data from Lexique_tous_noms_communs.xlsx in a dataframe
df_tous_noms = pd.read_excel('Lexique_tous_noms_communs.xlsx')

import re

from pattern3.text.fr import pluralize
from pattern3.fr import parse
from pattern3.fr import predicative

from pynlg.morphology.fr import FrenchMorphologyRules
from pynlg.lexicon.feature.category import ADJECTIVE, VERB_PHRASE, NOUN_PHRASE, VERB
from pynlg.lexicon.fr import FrenchLexicon
lexicon_fr = FrenchLexicon()
french_rules = FrenchMorphologyRules()

from pynlg.lexicon.feature.gender import MASCULINE, FEMININE
from pynlg.lexicon.feature.number import PLURAL, SINGULAR, BOTH

import time



  from .autonotebook import tqdm as notebook_tqdm


## Embeddings functions

### Embedding for a specific word in context (sentence)

In [2]:
def get_word_embedding_context(word, sentence, tokenizer, model):
    """
    Get the contextualized embedding of a word in a sentence using Camembert.

    Args:
        word (str): The word to extract the embedding for.
        sentence (str): The input sentence containing the word.
        model_name (str): The Hugging Face model to use (default: "camembert-base").

    Returns:
        torch.Tensor: The embedding of the word as a PyTorch tensor.
    """

    # Tokenize the sentence
    tokens = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # Get the embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_dim)

    # Decode tokens to find the indices of the word
    tokenized_sentence = tokenizer.convert_ids_to_tokens(input_ids[0])
    word_tokens = tokenizer.tokenize(word)

    # Find the start and end indices of the word tokens in the sentence
    word_indices = []
    for i in range(len(tokenized_sentence)):
        # Decode consecutive tokens to check for a match
        for j in range(i, len(tokenized_sentence)):
            decoded_span = tokenizer.decode(input_ids[0][i:j+1])
            if decoded_span == word:
                word_indices = list(range(i, j+1))
                break
        if word_indices:
            break

    if not word_indices:
        raise ValueError(f"Word '{word}' not found in the tokenized sentence.")

    # Extract and aggregate the embeddings for the word tokens
    word_embeddings = hidden_states[0, word_indices, :]  # Shape: (word_token_count, hidden_dim)
    aggregated_embedding = word_embeddings.mean(dim=0)  # Aggregate subword embeddings

    return aggregated_embedding, word_indices




### Embedding for a word without context

In [3]:
def get_word_embedding(word, tokenizer, model, model_name="camembert-base"):
    """
    Given a word, outputs its embedding using a pre-trained model from Hugging Face.

    Parameters:
        word (str): The word to get the embedding for.
        model_name (str): The Hugging Face model name to use (default: 'camembert-base').

    Returns:
        numpy.ndarray: The embedding vector for the given word.
    """
    # Tokenize the word
    inputs = tokenizer(word, return_tensors="pt")

    # Get the hidden states (embeddings)
    with torch.no_grad():
        outputs = model(**inputs)
        # `outputs.last_hidden_state` contains embeddings for each token
        hidden_states = outputs.last_hidden_state

    # Aggregate embeddings (e.g., mean pooling for the word's tokens)
    word_embedding = hidden_states.mean(dim=1).squeeze().numpy()

    return word_embedding

## Inclusive functions

### General functions creation

In [4]:
def pos_tagging(sentence):
  doc = nlp(sentence)
  indice_pos = []
  for i, token in enumerate(doc):
    indice_pos.append(token.pos_)
  return(indice_pos)

### Nom Commun

In [5]:
def is_person_word(word, sentence, model):

    # Creates the embedding for the word
    embedding_without_context = get_word_embedding(word, tokenizer, model_embedd)

    # Predict if it's a person
    prediction = (model.predict(np.array([embedding_without_context])) > 0.5).astype("int32")[0][0]
    return prediction

print(is_person_word("étudiants", "Le professeur a distribué les devoirs aux étudiants.", model_person_noun))
# Other example
print(is_person_word("table", "Je mange sur la table.", model_person_noun))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 558ms/step
1
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
0


In [6]:
def detect_gender_and_number_noun_spacy(nom):
    """
    Utilise SpaCy pour déterminer le genre grammatical et le nombre des noms.
    Retourne un tuple (mot, genre).
    """
    # Crée un tuple genre vide
    genre_nombre = []

    # Construire une phrase contextuelle minimale pour forcer le traitement comme un nom
    doc = nlp(f"un(e) {nom}")  # Ajout de l'article "un(e)" pour guider SpaCy

    # Extraire le genre du mot principal
    token = doc[2]  # Le mot analysé est toujours le deuxième dans "le {nom}"
    if token.pos_ == "NOUN":  # Vérifie que le mot est bien un nom
        genre_nombre = ( nom , "masculine" if token.morph.get("Gender") == ["Masc"] else \
                "feminine" if token.morph.get("Gender") == ["Fem"] else \
                "inconnu", "plural" if token.morph.get("Number") == ["Plur"] else \
                "singulier" if token.morph.get("Number") == ["Sing"] else \
                "inconnu")
    else:
        #print(token.pos_)
        genre_nombre = (nom, "non-nom", "non-nom")

    return genre_nombre

In [7]:
print(detect_gender_and_number_noun_spacy("personne"))


('personne', 'feminine', 'singulier')


#### Sparql feminine

In [8]:
def get_feminine_form(word):
    
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.addCustomHttpHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    query = f"""
    SELECT DISTINCT ?feminineLabel
    WHERE {{
      ?noun rdfs:label '{word}'@fr;  # Match the masculine noun in French
            wdt:P2521 ?feminineLabel.     # Get the feminine form (P2521)


    }}

    ORDER BY DESC(?sitelinks)


    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()


    feminines = [result["feminineLabel"]["value"] for result in results["results"]["bindings"] if result["feminineLabel"]['xml:lang']=='fr']
    try:
      closest_match = max(feminines, key=lambda form: SequenceMatcher(None, word, form).ratio())
      return closest_match
    except:
      return None

# Example usage
word = "présentateur ou présentatrice"  # Masculine noun
feminine_forms = get_feminine_form(word)
print(f"Feminine forms of '{word}': {feminine_forms}")


Feminine forms of 'présentateur ou présentatrice': présentatrice


In [9]:
def greedy_search_feminine_forms(word, language="fr", limit=50):
    """
    Search for feminine forms of a noun using the Wikibase API.

    :param word: The word to search for (masculine form).
    :param language: The language of the word (default: French).
    :param limit: The maximum number of results to return.
    :return: A list of feminine forms.
    """
    # If it's a feminine noun, we don't want to make it inclusive since it's likely refering to a specific person:
    if detect_gender_and_number_noun_spacy(word)[1]=='feminine':
       return(word)

    # Wikibase API endpoint
    base_url = "https://www.wikidata.org/w/api.php"

    # Step 1: Search for the noun
    search_params = {
        "action": "wbsearchentities",
        "search": word,
        "language": language,
        "limit": limit,
        "format": "json",
    }
    response = requests.get(base_url, params=search_params)
    response.raise_for_status()
    search_results = response.json().get("search", [])

    feminine_label = []

    # Step 2: Fetch feminine forms for each result
    for result in search_results:
        r = result
        #print(r)
        if (
            r['match']['text'] == word or
            r['match']['text'].startswith(f"{word} ou") or
            r['match']['text'].endswith(f"ou {word}")
          ):
            
            entity_id = result.get("id")  # The Q-ID of the entity
            # Query the entity data to check for P2521 (feminine form)
            entity_params = {
                "action": "wbgetclaims",
                "entity": entity_id,
                "property": "P2521",  # Property for feminine equivalent
                "format": "json",
            }
            entity_response = requests.get(base_url, params=entity_params)
            entity_response.raise_for_status()
            claims = entity_response.json().get("claims", {}).get("P2521", [])

            # Extract the feminine forms
            for claim in claims:
                target = claim["mainsnak"]["datavalue"]["value"]
                # Get the label of the feminine form
                if(target["language"] == "fr"):
                    feminine_label.append(target["text"])

    try:
      closest_match = max(feminine_label, key=lambda form: SequenceMatcher(None, word, form).ratio())
      return(closest_match)
    except:
      return None

# Example usage
word = "réparateur"
feminine_forms = greedy_search_feminine_forms(word, limit=4000)

# From all proposed forms in feminine_forms, choose the one that is the closest in terms of letters to word

print("Feminine forms:", feminine_forms)


Feminine forms: réparatrice


In [10]:
def check_epicene(word):
  """
  This function iterates through the 'Word' column of the dataframe and return True if the associated 'genre' column in empty word : the word to check
  """
  # Filter the dataframe to find the row with the specified word
  row = df_tous_noms[df_tous_noms['Word'] == word]

  # Check if the 'genre' column is empty for this word
  if not row.empty:
      return row['genre'].iloc[0] == "" or pd.isna(row['genre'].iloc[0])

  # If the word is not in the dataframe, return False
  return False

In [11]:
check_epicene("partenaire")

True

#### Creation inclusive nouns (used also for adjectives )

In [12]:
def combine_words_from_sentences(sentence1, sentence2):
    """
    Parse two sentences, extract the first words, and create a combination
    based on the common radical they have in common.

    :param sentence1: The first sentence.
    :param sentence2: The second sentence.
    :return: The combined word or None if no common radical exists.
    """
    def extract_first_word(sentence):
        """Extract the first word from a sentence."""
        return re.split(r"\s+", sentence.strip())[0]

    def find_common_radical(word1, word2):
        """Find the common radical between two words."""
        word1= word1.lower()
        word2= word2.lower()
        matcher = SequenceMatcher(None, word1, word2)
        match = matcher.find_longest_match(0, len(word1), 0, len(word2))
        if match.size > 0:
            return word1[match.a:match.a + match.size]  # Common radical
        return ""

    def find_common_suffix(word1, word2):
        """Find the common suffix between two words."""
        min_length = min(len(word1), len(word2))
        for i in range(1, min_length + 1):
            if word1[-i:] != word2[-i:]:
                if i == 1:
                    return None  # No common suffix
                return word1[-(i-1):]  # Return the common suffix
        return word1[-min_length:]  # Entire shorter word is a suffix


    # Extract the first words from both sentences
    word1 = extract_first_word(sentence1)
    word2 = extract_first_word(sentence2)

    # Find the common radical
    common_radical = find_common_radical(word1, word2)

    # Find the common suffix
    common_suffix = find_common_suffix(word1, word2)

    if common_radical:
        # Remove the common radical from both words to find their unique suffixes
        suffix1 = word1[len(common_radical):]
        suffix2 = word2[len(common_radical):]

        # Combine the word with the format: common_radical + suffix1 + '.' + suffix2
        combined_word = f"{common_radical}{suffix1}.{suffix2}"

        if common_suffix:
            suffix1 = suffix1.replace(common_suffix, '')
            suffix2 = suffix2.replace(common_suffix, '')
            combined_word = f"{common_radical}{suffix1}.{suffix2}.{common_suffix}"

        if(word1 == word2):
          return word1
        else:
          return combined_word

    return None  # No common radical found

# Example usage
sentence1 = "nageur est sportif"
sentence2 = "nageuse est sportive"
result = combine_words_from_sentences(sentence1, sentence2)
print("Combined word:", result)

Combined word: nageur.se


#### Pluralize

In [13]:
def get_word_in_plural(word):
    """
    Return the plural form of the word .
    :param word: word to pluralize.
    :return: The plural form of the word.
    """

    # Return the pluralized form
    return pluralize(word)

# Example usage
sentence = "L' enfant est sur la table"
index = 1
plural_word = get_word_in_plural('enfant')
print(f"The plural of '{sentence.split()[index]}' is '{plural_word}'")


The plural of 'enfant' is 'enfants'


#### Detection Plural/Singular

In [14]:
# Avec pattern

def is_singular(word):
    """
    Détermine si un nom commun en français est au singulier, avec 3 tentatives en cas d'erreur.
    """
    attempts = 0
    while attempts < 3:  # Try up to 3 times
        try:
            parsed = parse(word)
            if not parsed or '/' not in parsed:
                raise ValueError("Parsing failed")  # Force a retry if parsing is empty

            tag = parsed.split('/')[1]

            if tag == "NN":
                return True  # Singular
            elif tag == "NNS":
                return False  # Plural
            return None  # Unknown case

        except Exception as e:
            attempts += 1
            print(f"⚠️ Erreur lors de l'analyse du mot '{word}' (Tentative {attempts}/3): {e}")
            if attempts < 3:
                time.sleep(1)  # Optional: Pause before retrying

    print(f"Impossible de traiter '{word}' après 3 tentatives.")
    return None  # Return None after 3 failed attempts

# Exemple d'utilisation
print(is_singular("chat"))  # True
print(is_singular("chats"))  # False
print(is_singular("maison"))  # True
print(is_singular("avocats"))  # False


⚠️ Erreur lors de l'analyse du mot 'chat' (Tentative 1/3): generator raised StopIteration
⚠️ Erreur lors de l'analyse du mot 'chat' (Tentative 2/3): generator raised StopIteration
True
False
True
False


In [15]:
print (predicative('chats') )

chat


In [16]:
def combination_plural(word):
    sing = predicative(word)
    fem = greedy_search_feminine_forms(sing)
    try : 
        fem_plur = pluralize(fem)
    except:
        fem_plur = word
    inclusive_word = combine_words_from_sentences(word, fem_plur)
    return(inclusive_word)

combination_plural('avocats')


'avocat.e.s'

### Déterminant

In [17]:
def find_determiner_index(tokens, noun_index):
    """
    Trouve l'indice du déterminant correspondant à un nom spécifique dans une phrase tokenisée.

    :param tokens: Liste de mots tokenisés représentant une phrase.
    :param noun_index: Index du nom commun dans la liste des tokens.
    :return: Index du déterminant correspondant ou None s'il n'existe pas.
    """
    # Reconstituer la phrase à partir des tokens
    sentence = " ".join(tokens)
    # Analyser la phrase avec SpaCy
    doc = nlp(sentence)

    # Vérifier si le nom à l'indice donné est un NOUN dans SpaCy
    if noun_index < 0 or noun_index >= len(tokens):
        return None  # Indice invalide
    if doc[noun_index].pos_ != "NOUN":
        return None  # Le mot à l'indice donné n'est pas un nom commun

    # Récupérer le token du nom et son déterminant
    noun_token = doc[noun_index]
    for child in noun_token.children:
        if child.dep_ == "det":  # Relation déterminant
            return child.i  # Retourne l'indice du déterminant correspondant

    return None  # Aucun déterminant trouvé pour ce nom

In [None]:
tokens = ["Le", "chat", "mange", "une", "souris"]
noun_index = 1  # "chat"
determiner_index = find_determiner_index(tokens, noun_index)
if determiner_index is not None:
    print(f"Le déterminant correspondant est '{tokens[determiner_index]}' à l'indice {determiner_index}.")
else:
    print("Aucun déterminant correspondant trouvé.")


Le déterminant correspondant est 'Le' à l'indice 0.


In [19]:
def rendre_determinant_inclusif(determinant):
    """
    Transforme un déterminant en écriture inclusive.

    :param determinant: Le déterminant à transformer (str).
    :return: La version inclusive du déterminant (str).
    """
    # Dictionnaire des déterminants et leurs formes inclusives

    inclusifs = {
        # Articles définis et indéfinis
    "le": "le.la",
    "un": "un.e",
    "du": "du.de la",
    "au": "au.à la",

    # Adjectifs démonstratifs
    "ce": "ce.tte",
    "cet": "ce.tte",

    # Adjectifs possessifs
    "mon": "mon.ma",
    "mes": "mes",  # Déjà inclusif
    "ton": "ton.ta",
    "tes": "tes",  # Déjà inclusif
    "son": "son.sa",

    # Déterminants interrogatifs et exclamatifs
    "quel": "quel.le",
    "quels": "quel.le.s",

    # Articles partitifs
    "de l'": "de l'",  # Ne change pas
    "du": "du.de la",

     # Autres
    "certains": "certain.e.s"  ,
    "certain": "certain.e"
    }

    determinant = determinant.lower()  # Ignore la casse
    return inclusifs.get(determinant, determinant)  # Retourne le déterminant original si non trouvé


### Adjective

In [20]:
# Not used atm but could be used

# Load the French SpaCy model
nlp = spacy.load("fr_core_news_md")

def feminize_adjective(adjective):
    """
    Transform a masculine French adjective into its feminine form.

    :param adjective: The masculine adjective (string).
    :return: The feminine form of the adjective (string).
    """
    if adjective.endswith("e"):  # Already feminine
        return adjective
    elif adjective.endswith("é"):
        return adjective + "e"
    elif adjective.endswith("el"):
        return adjective[:-2] + "elle"
    elif adjective.endswith("en"):
        return adjective[:-2] + "enne"
    elif adjective.endswith("on"):
        return adjective[:-2] + "onne"
    elif adjective.endswith("eux"):
        return adjective[:-3] + "euse"
    elif adjective.endswith("if"):
        return adjective[:-2] + "ive"
    elif adjective.endswith("c"):
        return adjective + "he"
    else:
        return adjective + "e"  # Default rule


In [21]:

def masculine_to_feminine_adjective(adjective):

    word = lexicon_fr.first(adjective, category=ADJECTIVE)
    word.feminine_singular = ''
    feminine_form = FrenchMorphologyRules().feminize_singular_element(
    word, word.realisation)
    return french_rules.feminize_singular_element(word, word.realisation)

def transform_adjective_with_pynlg(adjective, gender="feminine", number="singular"):
    """
    Transform a French adjective to agree with the specified gender and number.

    :param adjective: The base (masculine singular) form of the adjective.
    :param gender: The target gender ('feminine' or 'masculine').
    :param number: The target number ('singular' or 'plural').
    :return: The transformed adjective as a string.
    """
    word = lexicon_fr.first(adjective, category=ADJECTIVE)
    word.feminine_singular = ''  # make sure all static rules are tested
    #print(FrenchMorphologyRules().)
    feminine_form = FrenchMorphologyRules().feminize_singular_element(
    word, word.realisation)


print(masculine_to_feminine_adjective("Cher"))


Chère


In [22]:
# If the common noun linked is plural we can deduce the adjective will be plural:
# 1. find its predicative form
# 2. feminize
# 3. merge both versions

def create_plural_adjective(adj_plur):

  adj_sing = predicative(adj_plur)
  adj_sing_fem = masculine_to_feminine_adjective(adj_sing)
  adj_plur_fem = pluralize(adj_sing_fem)
  combination = combine_words_from_sentences(adj_plur, adj_plur_fem)
  # Check if the first letter of adj_plur is uppercase, if so, uppercase the first letter of combination:
  if adj_plur[0].isupper():
    combination = combination[0].upper() + combination[1:]
  return combination

def create_singular_adjective(adj_sing):
  adj_sing_fem = masculine_to_feminine_adjective(adj_sing)
  combination = combine_words_from_sentences(adj_sing, adj_sing_fem)
  return combination

create_plural_adjective('Chers')

'Cher.ère.s'

In [23]:
def find_adjective(tokens, noun_index):
    """
    Identifies the adjective linked to a given common noun, 
    and returns its position in the sentence.

    :param tokens: List of tokenized words (strings).
    :param noun_index: Index of the noun in the tokens list.
    :return: A tuple (adjective_index, feminine_adjective), or None if no linked adjective exists.
    """
    # Reconstruct the sentence from tokens
    sentence = " ".join(tokens)
    # Parse the sentence with SpaCy
    doc = nlp(sentence)

    #for i in doc:
      #print(i.pos_)

    adjective_index = []

    # Validate the noun
    if noun_index < 0 or noun_index >= len(tokens):
        return None  # Invalid index
    noun_token = doc[noun_index]
    if noun_token.pos_ != "NOUN":
        return None  # The token is not a noun

    # Find the linked adjective
    for child in noun_token.children:
        #print(child)
        if child.dep_ == "amod" and child.pos_ == "ADJ":  # Adjectival modifier
            adjective_index.append(child.i)
    
    if adjective_index is not None:
        return adjective_index

    return None  # No linked adjective found

### Past participle

In [24]:
# Exemple de texte
texte = "Il est allé"

# Analyse du texte
doc = nlp(texte)


# Rechercher les participes passés associés à un sujet
def find_past_participles(tokens, sujet_id):
  sujet_cible = tokens[sujet_id]
  past_participles = []
  for token in tokens:
      # Vérifier si le mot est un sujet correspondant au sujet donné
      # Convert token.text and sujet_cible to strings before calling lower()
      if str(token.text).lower() == str(sujet_cible).lower() and token.dep_ in {"nsubj", "nsubj:pass"}:#, 'obl:arg'}:
          # Parcourir les ancêtres pour trouver les participes passés liés
          for ancetre in token.ancestors:
              if ancetre.pos_ in {"VERB", "AUX"} and "VerbForm=Part" in ancetre.morph:

                  past_participles.append((ancetre, ancetre.i) )
                  #print(f"- Participe passé : {ancetre.text} (lemma: {ancetre.lemma_})")
  if past_participles:
    return past_participles
  else:
    return None

find_past_participles(doc, 1)

In [25]:
def inclusive_past_participle(tokens, sujet_id):
    past_participle = find_past_participles(tokens, sujet_id)
    modified_past_participle = []
    if past_participle:
        for id, pp in enumerate(past_participle):
            fem_vers = str(pp[0])
            # Case with auxiliary 'être'
            #print(pp)
            if tokens[pp[1] - 1].lemma_ == 'être' or tokens[pp[1] - 2].lemma_ == 'être':
                verb = lexicon_fr.first(pp[0].lemma_, category=VERB)
                # quand le verbe est comme 'pris' peut être plur ou sing il faut regarder comment est conjugué le verbe copule
                try :
                  number = pp[0].morph.get("Number")
                  number=number[0]
                except:
                  try:
                    if tokens[pp[1] - 1].lemma_ == 'être' and tokens[pp[1] - 1]:
                      number = tokens[pp[1] - 1].morph.get("Number")[0]
                  except:
                    if tokens[pp[1] - 2].lemma_ == 'être' and tokens[pp[1] - 2].morph.get("Number")[0]:
                      number = tokens[pp[1] - 2].morph.get("Number")[0]
                    else:
                       number = 'Sing'
                if number == 'Plur' :
                    fem_vers = french_rules.realise_verb_past_participle(
                        verb, base_word=verb, base_form=pp[0].lemma_, gender=FEMININE, number=PLURAL
                    )
                if number == 'Sing':
                    fem_vers = french_rules.realise_verb_past_participle(
                        verb, base_word=verb, base_form=pp[0].lemma_, gender=FEMININE, number=SINGULAR
                    )
                modified_past_participle = (combine_words_from_sentences(str(pp[0]), str(fem_vers)), past_participle[id][1])

            # Case with auxiliary 'avoir'
            elif tokens[pp[1] - 1].lemma_ == 'avoir' or tokens[pp[1] - 2].lemma_ == 'avoir':
                # Check if the COD precedes the verb
                cod = None
                for child in pp[0].children:
                    if child.dep_ == "obj" and child.i < pp[1]:  # COD is before the participle
                        cod = child
                        break

                if cod:  # Agreement applies if COD exists and precedes the participle
                    #print("cod" + str(cod))
                    verb = lexicon_fr.first(pp[0].lemma_, category=VERB)
                    #print("verb" + str(verb))
                    gender = cod.morph.get("Gender")[0] if cod.morph.get("Gender") else None
                    number = cod.morph.get("Number")[0] if cod.morph.get("Number") else None

                    if gender == "Fem" and number == "Plur":
                        fem_vers = french_rules.realise_verb_past_participle(
                            verb, base_word=verb, base_form=pp[0].lemma_, gender=FEMININE, number=PLURAL
                        )
                    elif gender == "Fem" and number == "Sing":
                        fem_vers = french_rules.realise_verb_past_participle(
                            verb, base_word=verb, base_form=pp[0].lemma_, gender=FEMININE, number=SINGULAR
                        )


                    try:
                        modified_past_participle = (combine_words_from_sentences(str(pp[0]), str(fem_vers)), past_participle[id][1])
                    except:
                        modified_past_participle = (str(pp[0]), past_participle[id][1])
                else:
                    modified_past_participle = (str(pp[0]), past_participle[id][1])
    return modified_past_participle

print(inclusive_past_participle(doc, 0))


('allé.e', 2)


In [26]:
def inclusive_past_participle_specific(tokens, sujet_id):
    past_participle = tokens[sujet_id]
    modified_past_participle = []
    #print(past_participle)
    if past_participle:
            pp = past_participle
            fem_vers = str(pp)
            # Case with auxiliary 'être'
            if tokens[sujet_id - 1].lemma_ == 'être' or tokens[sujet_id - 2].lemma_ == 'être':
                verb = lexicon_fr.first(pp.lemma_, category=VERB)
                if pp.morph.get("Number")[0] == 'Plur' or tokens[sujet_id - 1].morph.get("Number") == 'Plur' or tokens[sujet_id - 2].morph.get("Number") == 'Plur':
                    fem_vers = french_rules.realise_verb_past_participle(
                        verb, base_word=verb, base_form=pp.lemma_, gender=FEMININE, number=PLURAL
                    )
                if pp.morph.get("Number")[0] == 'Sing' or tokens[sujet_id - 1].morph.get("Number") == 'Sing' or tokens[sujet_id - 2].morph.get("Number") == 'Sing':
                    fem_vers = french_rules.realise_verb_past_participle(
                        verb, base_word=verb, base_form=pp.lemma_, gender=FEMININE, number=SINGULAR
                    )
                

                modified_past_participle = (combine_words_from_sentences(str(pp), str(fem_vers)), sujet_id)
                #print(modified_past_participle)
            # Case with auxiliary 'avoir'
            elif tokens[sujet_id - 1].lemma_ == 'avoir' or tokens[sujet_id - 2].lemma_ == 'avoir':
                # Check if the COD precedes the verb
                cod = None
                for child in pp.children:
                    if child.dep_ == "obj" and child.i < sujet_id:  # COD is before the participle
                        cod = child
                        break

                if cod:  # Agreement applies if COD exists and precedes the participle
                    #print("cod" + cod)
                    verb = lexicon_fr.first(pp.lemma_, category=VERB)
                    #print("verb" + verb)
                    gender = cod.morph.get("Gender")[0] if cod.morph.get("Gender") else None
                    number = cod.morph.get("Number")[0] if cod.morph.get("Number") else None

                    if gender == "Fem" and number == "Plur":
                        fem_vers = french_rules.realise_verb_past_participle(
                            verb, base_word=verb, base_form=pp.lemma_, gender=FEMININE, number=PLURAL
                        )
                        #print(fem_vers)
                    elif gender == "Fem" and number == "Sing":
                        fem_vers = french_rules.realise_verb_past_participle(
                            verb, base_word=verb, base_form=pp.lemma_, gender=FEMININE, number=SINGULAR
                        )



                    try:
                        
                        modified_past_participle = (combine_words_from_sentences(str(pp), str(fem_vers)), sujet_id)
                    except:
                        
                        modified_past_participle = (str(pp[0]), past_participle[id][1])
                else:
                    modified_past_participle = (str(pp), sujet_id)
    return modified_past_participle


### To inclusive




Translation from Standard French to Inclusive French using all rules defined above


In [27]:
def to_inclusive(text):
  # Create a list of sentences from the text
  doc = nlp(text)
  sentences = [sent.text for sent in doc.sents]
  #print(sentences)
  output_sent = []
  plural = False
  ids_to_not_change = []
  output_text = []
  for sentence in sentences:
    doc = nlp(sentence)
    #print(sentence)
    output_sent = []
    pos_tag = pos_tagging(sentence)
    sent = nlp(sentence)
    words = [word.text for word in sent]
    output_sent.extend(words) # we will replace the words little by little
    for i, tag in enumerate(pos_tag):
      if tag == 'NOUN':
        if is_person_word(words[i], sentence, model_person_noun):
          #if should_be_made_inclusive(words[i], sentence, model_inclusive): 
            # Check if the word is plural or singular
            if is_singular(words[i]):
              # If the common noun is detected as singular
              try:
                output_sent[i] = (combine_words_from_sentences(words[i], greedy_search_feminine_forms(words[i])) ) #print(f"{i}. {words[i]} should be made inclusive")
                ids_to_not_change.append(i)
              except:
                try:
                  # If it does not work, maybe the word was miscategorized as singular so we try the plural process
                  output_sent[i]=combination_plural(words[i])
                  ids_to_not_change.append(i)
                  plural = True
                except:
                  # If it still does not work, the word is likely epicene
                  if check_epicene(words[i]):
                    ids_to_not_change.append(i)
                    output_sent[i]= words[i]
            else:
              # If the common noun is detected as plural
              output_sent[i]= combination_plural(words[i])
              ids_to_not_change.append(i)
              plural = True
          #else:
          #  output_sent.append(words[i])
            # Then we find the corresponding determiner to modify it also
            id_det = find_determiner_index(words, i)

            if id_det is not None:
              ids_to_not_change.append(id_det)
              output_sent[id_det] = rendre_determinant_inclusif(words[id_det])
            # We then find adjective(s) linked to the modified common noun to modif it(them)
            ids_adj = find_adjective(words, i)
            if ids_adj is not None:
              for id_adj in ids_adj:
                ids_to_not_change.append(id_adj)
                if plural == False:
                  output_sent[id_adj] = create_singular_adjective(words[id_adj])
                else:
                  output_sent[id_adj] = create_plural_adjective(words[id_adj])
            # We then find past participle(s) linked to the common noun to modify it(them)
            past_participle = find_past_participles(sent, i)
            if past_participle is not None:
              for pp in past_participle:
                ids_to_not_change.append(pp[1])
                try:
                  output_sent[pp[1]] = str(inclusive_past_participle(sent, i)[0])
                except:
                  output_sent[pp[1]] = sent[i]
            # If the noun is dependant of another word, this other word should be made inclusive
            if sent[i].dep_ == "nsubj":
              # Find the index of the word which is sent[i].head
              id_head = words.index(sent[i].head.text)
              POS_head = pos_tag[id_head]
              if POS_head == 'ADJ':
                ids_to_not_change.append(id_head)
                # If the adj is singular
                if not plural:
                  output_sent[id_head] = create_singular_adjective(words[id_head])
                else:
                  output_sent[id_head] = create_plural_adjective(words[id_head])
              elif POS_head == 'VERB' and "VerbForm=Part" in sent[id_head].morph:
                  ids_to_not_change.append(id_head)
                  output_sent[id_head] = str(inclusive_past_participle_specific(sent, id_head)[0])
        else:
          # If not a persons noun, remains the same for now
          if i not in ids_to_not_change:
            output_sent[i] = words[i]


      else:
        if i not in ids_to_not_change:
          output_sent[i] = words[i]
    output_text.extend(output_sent)
  for i, w in enumerate(output_text):
    output_text[i] = str(w)
     
  return re.sub(r'\s([?.!,;:»)])', r'\1', ' '.join(output_text)).replace("' ", "'").replace("’ ", "’")




In [33]:
# Try it
text = "Les meilleurs professeurs se sont accordés à dire qu’elle a progressé."



output_sentence = to_inclusive(text)
print(output_sentence)



text = "Les animateurs sont diplômés et aptes à travailler avec un élève."



output_sentence = to_inclusive(text)
print(output_sentence)

text = "Seuls certains bons athlètes sont qualifiés "



output_sentence = to_inclusive(text)
print(output_sentence)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
les meilleur.e.s professeur.e.s se sont accordé.e.s à dire qu’elle a progressé.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
les animateur.rice.s sont diplômé.e.s et aptes à travailler avec un.e élève.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Seul.e.s certain.e.s bon.ne.s athlètes sont qualifié.e.s
