# Extraction des occurrences des référents humains les plus courants

Documents utilisés :  
dict_occ.json (dictionnaire d'occurrences de chaque mot)  
lemme_form.json (dictionnaire de chaque lemme/forme du corpus)  
all_data_cleaned.json (la totalité des données après nettoyage)

Ici, on extrait les occurrences pour 6 lemmes (et leurs formes) : citoyen, élu, député, migrant, représentant, candidat

Extraction des formes du texte qui prennent la forme de SN de type "citoyen et citoyenne", "citoyen ou citoyenne", etc.  

Extraction de contexte pour les formes féminines

In [None]:
from ipynb.fs.defs.fonctions_preprocess import serialisation_data
from ipynb.fs.defs.fonctions_preprocess import open_file
from ipynb.fs.defs.fonctions_preprocess import convert_occ_csv
from ipynb.fs.defs.fonctions_preprocess import convert_occ_csv
from ipynb.fs.defs.Statistics import dict_occ
from collections import Counter
from copy import deepcopy
import pandas as pd
import spacy
import itertools
import json

## Extraction des occurrences des formes des lemmes listés ci-dessus

In [None]:
# Load the French spaCy pipeline; `nlp` is the global tagger that the
# extraction functions further down call on each sentence.

nlp = spacy.load("fr_core_news_lg")

In [None]:
# Load the notebook inputs through the project helper `open_file`.
# NOTE: this rebinds `dict_occ`, shadowing the name of the same spelling
# imported from `Statistics` at the top of the notebook.
dict_occ = open_file("dict_occ.json")
dict_lemme = open_file("lemme_form.json")
data_all = open_file("all_data_cleaned.json")
# NOTE(review): "dict_occ_word.json" is also the file written further down by
# serialisation_data(sortdict, ...); loading it here presumably relies on a
# copy left by a previous run -- confirm.
data = open_file("dict_occ_word.json")
# Lemmas whose forms are extracted in the cells below.
lemme_to_extract = ["citoyen", "élu", "député", "migrant", "représentant", "candidat"]

In [None]:
def extract_occurence_form(dict_occ, dict_lemme, word_list):
    """Collect the occurrence count of every form of the requested lemmas.

    -> dict_occ : mapping form -> occurrence count
    -> dict_lemme : mapping lemma -> iterable of its forms
    -> word_list : lemmas whose forms must be kept
    <- dictionary mapping each kept form to dict_occ[form]

    A KeyError is raised when a kept form is absent from dict_occ.
    """
    # Keep only the lemmas that were asked for, in dict_lemme order.
    selected = {key: dict_lemme[key] for key in dict_lemme if key in word_list}

    counts = {}
    for variants in selected.values():
        for variant in variants:
            counts[variant] = dict_occ[variant]
    return counts

In [None]:
# Occurrence counts restricted to the forms of the lemmas in `lemme_to_extract`.
dict_occ_word = extract_occurence_form(dict_occ, dict_lemme, lemme_to_extract)

In [None]:
# Display the extracted counts for a visual check.
print(dict_occ_word)

## Extraction des SN "citoyen et citoyenne", etc.

In [None]:
def count_SN(sent_list):
    """Count sentences containing a coordinated NP such as "citoyen et citoyenne".

    -> sent_list : iterable of sentences (strings)
    <- dictionary mapping each pattern key (the pattern with spaces replaced
       by underscores, e.g. "citoyens_et_citoyennes") to the number of
       sentences credited to it

    Matching is a plain substring test. A sentence is credited to at most one
    pattern: the first one that matches, in the order of `check_order`.
    """
    # Keys are created up front so that every pattern appears in the result,
    # even with a count of 0.
    dict_SN = {
        "citoyen_ou_citoyenne": 0,
        "citoyenne_ou_citoyen": 0,
        "citoyens_ou_citoyennes": 0,
        "citoyennes_ou_citoyens": 0,
        "citoyen_et_citoyenne": 0,
        "citoyenne_et_citoyen": 0,
        "citoyens_et_citoyennes": 0,
        "citoyennes_et_citoyens": 0,
    }

    # Priority order in which the patterns are tested on each sentence.
    check_order = (
        "citoyen ou citoyenne",
        "citoyenne ou citoyen",
        "citoyens ou citoyennes",
        "citoyennes ou citoyens",
        "citoyen et citoyenne",
        "citoyenne et citoyen",
        "citoyennes et citoyens",
        "citoyens et citoyennes",
    )

    for sentence in sent_list:
        for pattern in check_order:
            if pattern in sentence:
                # Each pattern increments its OWN key; deriving the key from
                # the pattern prevents a plural match from being credited to
                # another counter.
                dict_SN[pattern.replace(" ", "_")] += 1
                break

    return dict_SN

In [None]:
# Count the coordinated NPs over `data_all` (passed to count_SN as its list of sentences).
dict_occ_SN = count_SN(data_all)

In [None]:
# Merge the two dictionaries: form counts first, then the coordinated-NP counts.
# The deep copy keeps `dict_occ_word` itself untouched by the update.

d = deepcopy(dict_occ_word)
d.update(dict_occ_SN)

In [None]:
# Sort the merged dictionary by count, in descending order.

ordered_occ_dict = sorted(d.items(), key=lambda x:x[1], reverse = True)

# Show the 50 most frequent entries, then rebuild a dict that keeps the sorted order.
print(ordered_occ_dict[0:50])
sortdict = dict(ordered_occ_dict)

In [None]:
# Persist the sorted counts to "dict_occ_word.json".
serialisation_data(sortdict, "dict_occ_word.json")

## Mise en tableau et extraction format csv

In [None]:
# Tabulate the sorted counts computed above and export them as CSV.
# `sortdict` is used instead of `data`: `data` was read from
# "dict_occ_word.json" before that file was rewritten in this run, so it can
# hold stale counts (or be missing on a first run).
data_frame = pd.DataFrame.from_dict(sortdict, orient='index', columns = ["Occurence"])
# NOTE(review): the file name starts with a space (" word_occ_ordered.csv");
# kept as-is in case downstream steps rely on it -- confirm whether it is a typo.
data_frame.to_csv(r"./ word_occ_ordered.csv")

## Séparation des noms/adjectifs pour les formes féminines et inclusives

Pour cette partie, il est question de revérifier si les formes extraites sont des adjectifs ou bien des noms, et si les formes inclusives sont "accidentelles" ou non

On retague l'ensemble des données en contraignant légèrement le modèle : si le tag précédent est un nom, on ne peut pas avoir deux noms de suite.

In [None]:
def extract_form(dict_lemme, word_list):
    """Build the list of forms belonging to the requested lemmas.

    -> dict_lemme : dictionary mapping each lemma to its forms
    -> word_list : lemmas whose forms are wanted
    <- list of forms, without duplicates, in order of first appearance
       (dict_lemme order, then order of the forms of each lemma)
    """
    list_form = []
    seen = set()  # O(1) duplicate check; forms are assumed hashable (strings)

    for lemme, lemme_forms in dict_lemme.items():
        # Only the forms of a selected lemma are scanned; the scan used to
        # run for every lemma, re-reading the forms of the last selected one.
        if lemme not in word_list:
            continue
        for form in lemme_forms:
            if form not in seen:
                seen.add(form)
                list_form.append(form)

    return list_form

In [None]:
# Flat list of every form of the lemmas in `lemme_to_extract`.
# Later cells index this list by position, so its order matters.
forms = extract_form(dict_lemme, lemme_to_extract)

print(forms)


In [None]:
def extract_adjectives(sentences_list, word_list):
    """Count the occurrences of each word that are tagged as nouns.

    Each sentence is tagged with the global spaCy pipeline `nlp`. A token is
    counted when its text equals the word, its POS tag is "NOUN" and the
    previous token is not itself tagged "NOUN". The first token of a sentence
    has no previous token and is therefore only subject to the first two
    conditions.

    -> sentences_list : list of sentences (strings)
    -> word_list : list of word forms to count
    <- dictionary {word: count}; words never counted are absent
    """
    extraction = {}
    total = len(sentences_list)

    # The sentence counter has its own name so the token index cannot
    # overwrite it; the progress message is then accurate.
    for sent_index, sent in enumerate(sentences_list):
        if sent_index % 100000 == 0:
            print("phrase n°", sent_index, "/", total)
        doc = nlp(sent)
        for word in word_list:
            for tok_index, token in enumerate(doc):
                if token.text != word or token.pos_ != "NOUN":
                    continue
                # Guard on tok_index > 0: index -1 would wrap around to the
                # LAST token of the sentence.
                if tok_index > 0 and doc[tok_index - 1].pos_ == "NOUN":
                    continue
                extraction[word] = extraction.get(word, 0) + 1

    return extraction
                

In [None]:
# Count the noun-tagged occurrences of every form in `forms`.
# NOTE(review): `extract` is not assigned in any visible cell of this notebook --
# confirm which sentence list is meant here.
word_occ = extract_adjectives(extract, forms)

In [None]:
# Persist the noun counts.
# NOTE(review): written to "word_occ3.json", while the next section loads
# "word_occ.json" -- confirm the intended file name.
serialisation_data(word_occ, "word_occ3.json")

## Mise en tableau et extraction format csv

In [None]:
# Reload counts from disk; this rebinds the global `data`.
# NOTE(review): the file name differs from the "word_occ3.json" saved just above -- confirm.
data = open_file("word_occ.json")

In [None]:
# Sort the reloaded counts in descending order (rebinds `ordered_occ_dict` and `sortdict`).
ordered_occ_dict = sorted(data.items(), key=lambda x:x[1], reverse = True)

# Show the 20 most frequent entries, then rebuild a dict that keeps the sorted order.
print(ordered_occ_dict[0:20])
sortdict = dict(ordered_occ_dict)

In [None]:
# One row per form, single column "Occurence", exported as CSV.
data_frame = pd.DataFrame.from_dict(sortdict, orient='index', columns = ["Occurence"])
# NOTE(review): the file name starts with a space (" word_occ_ordered2.csv") -- confirm it is intended.
data_frame.to_csv(r"./ word_occ_ordered2.csv")

In [None]:
# Load "word_occ_all.json" (rebinds the global `data` once more).
# NOTE(review): no visible cell of this notebook writes this file -- presumably produced elsewhere.
data = open_file("word_occ_all.json")

In [None]:
# Export through the project helper `convert_occ_csv` (presumably writes "word_occ_all.csv" -- helper not visible here).
convert_occ_csv(data, "word_occ_all.csv")

## Extraction d'exemples des occurrences féminines et inclusives

In [None]:
# Pick the feminine / inclusive forms out of `forms` by position.
# NOTE(review): the positions are hard-coded and only valid for the exact
# order of `forms` obtained when they were chosen -- re-check if the corpus
# or the lemma list changes.
fem_form = [
    forms[position]
    for position in (
        2, 6, 9, 10, 11, 16, 19, 20, 22, 23,
        24, 25, 36, 37, 39, 40, 41, 43, 45, 51,
        53, 54, 58, 62, 64, 65, 71, 72, 74, 75,
    )
]

print(fem_form)

In [None]:
def dict_sn_occ(sentences_list, word_list):
    """Collect the left context of each noun occurrence of the given words.

    Each sentence is tagged with the global spaCy pipeline `nlp`. For every
    token whose text equals a word of word_list and whose POS tag is "NOUN",
    the triple (word-2, word-1, word) is recorded, so mistagged nouns can be
    spotted. Missing positions at the start of a sentence are replaced by the
    placeholders "*d1*" and "*d2*".

    -> sentences_list : list of sentences (strings)
    -> word_list : list of words whose contexts are extracted
    <- dictionary {word1: [context1, context2, ...], word2: [...]} where each
       context is a tuple of three strings
    """
    dict_fem_occ = {}

    for sent in sentences_list:
        doc = nlp(sent)
        for word in word_list:
            # Every word gets an entry, even when it never occurs.
            contexts = dict_fem_occ.setdefault(word, [])
            for i, token in enumerate(doc):
                if token.text != word or token.pos_ != "NOUN":
                    continue

                # Only index backwards when the tokens exist: negative
                # indices would wrap around to the END of the sentence.
                if i == 0:
                    context = ("*d1*", "*d2*", str(token))
                elif i == 1:
                    # str() keeps the triple made of plain strings only.
                    context = ("*d1*", str(doc[0]), str(token))
                else:
                    context = (str(doc[i - 2]), str(doc[i - 1]), str(token))
                contexts.append(context)

    return dict_fem_occ

In [None]:
def count_context_occ(dict_form_occ):
    """Count how many times each context occurs for each form.

    -> dict_form_occ : dictionary {form: [context1, context2, ...]}
    <- dictionary {form: Counter({context: number of occurrences})}
    """
    # Work on the argument itself, not on a global dictionary.
    return {form: Counter(contexts) for form, contexts in dict_form_occ.items()}

In [None]:
# Count the contexts collected for each feminine form.
# NOTE(review): `dict_fem_occ` is not assigned in any visible cell; presumably
# it should be the result of dict_sn_occ(<sentences>, fem_form) -- confirm.
docc = count_context_occ(dict_fem_occ)

In [None]:
def serialisation_data(data, title):
    """Write *data* as JSON to the file located at *title*.

    -> data : object that json.dump can serialise
    -> title : path of the output file, as a string (e.g. "title.json")
    <- None; the file is created, or overwritten if it already exists

    This definition replaces the function of the same name imported at the
    top of the notebook.
    """
    with open(title, "w+") as handle:
        json.dump(data, handle)

In [None]:
# Persist the context counts.
# NOTE(review): if the contexts are tuples (as built by dict_sn_occ), the
# Counters in `docc` have tuple keys, which json.dump rejects with a
# TypeError -- confirm this cell runs as intended.
serialisation_data(docc, "data_extract.json")

In [None]:
# Build one dictionary per feminine lemma by slicing `docc` by position.
# NOTE(review): the slice bounds are hard-coded and rely on the insertion
# order of `docc` -- re-check them whenever the list of forms changes.

dict_elue = dict(itertools.islice(docc.items(), 5))
dict_citoyenne = dict(itertools.islice(docc.items(), 5, 12))     
dict_depute = dict(itertools.islice(docc.items(), 12, 19))
dict_representante = dict(itertools.islice(docc.items(), 19, 22))
dict_candidate = dict(itertools.islice(docc.items(), 22, 26))
dict_migrante = dict(itertools.islice(docc.items(), 26, len(docc)))

In [None]:
# Tabulate the first slice of `docc` (dict_elue) and export it as CSV.
df1 = pd.DataFrame(dict_elue) #, orient='index', columns = ["Occurence"])
df1.to_csv(r"./form_fem_elue.csv")

In [None]:
# Tabulate the second slice of `docc` (dict_citoyenne) and export it as CSV; rebinds `df1`.
df1 = pd.DataFrame(dict_citoyenne) #, orient='index', columns = ["Occurence"])
df1.to_csv(r"./form_fem_citoyenne.csv")