# Importation des modules

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from ast import literal_eval
import csv
import re

# Importation des CSV de référence

In [2]:
RefNom = pd.read_csv("./../Tables/ReferentielNoms.csv",converters={"FormesNonNormalisées": literal_eval}).set_index("NomNormalisé")
RefPrenom = pd.read_csv("./../Tables/RéférentielPrénoms.csv", converters={"FormesNonNormalisées": literal_eval}).set_index("PrénomNormalisé")

# 1. Parsing du document xml et extraction des informations en vue de la constitution d'un dataframe

In [10]:
def ParsingXMLTEItoDictList(CheminRelatifFichier,DivCC):
    
    soup = BeautifulSoup(open(CheminRelatifFichier, 'r'))

    ListeDicosPersonnes = []
    
    for div in soup.find_all('div'):
            if div['type'] == 'date':
                Date = div['n']

    for persname in soup.find_all('persname'):

        DicoPersonne = {}
        
        # Loop pour nom non normalisé.
        for surname in persname.find_all('surname'):
            DicoPersonne['nomNN'] = surname.get_text()
            
        # Loop pour nom normalisé.
            DataToComp = surname.get_text()
            DataToComp = re.sub(r"\n", " ", DataToComp)
            if type(DataToComp) == float:
                DicoPersonne['nom'] = ''
            for j in range (0, len(RefNom.index)):
                CheckN = RefNom.iloc[j,1]
                for element in CheckN:
                    if DataToComp == element:
                        DicoPersonne['nom'] = RefNom.index[j]
        
        # Loop pour prénom non normalisé.
        for forename in persname.find_all('forename'):
            DicoPersonne['prénomNN'] = forename.get_text()
        
        # Loop pour prénom normalisé.
            DataToComp = forename.get_text()
            DataToComp = re.sub(r"\n", " ", DataToComp)
            if type(DataToComp) == float:
                DicoPersonne['prénom'] = ''
            for j in range (0, len(RefPrenom.index)):
                CheckN = RefPrenom.iloc[j,2]
                for element in CheckN:
                    if DataToComp == element:
                        DicoPersonne['prénom'] = RefPrenom.index[j]

        for genname in persname.find_all('genname'):
            DicoPersonne['surnom'] = genname.get_text()

        Roles = []
        for rolename in persname.find_all('rolename'):
            Role = rolename.get_text()
            RoleToAppend = Role + '-' + Date
            Roles.append(RoleToAppend)
            DicoPersonne['fonction'] = Roles

        for div in persname.find_parents('div'):
            
            if div['type'] == 'Ordre' :
                DicoPersonne['corps civique'] = div['n']
            
            if div['type'] == 'Corporation':
                DicoPersonne['Corporation'] = div['n']
                
            if div['type'] == 'Poele':
                DicoPersonne['Poêle'] = div['n']
        
        NomCompletNN =  persname.get_text()
        NomCompletNN = re.sub(r"\n", " ", NomCompletNN)
        DicoPersonne['NomCompletNN'] = NomCompletNN    

        ListeDicosPersonnes.append(DicoPersonne)

    return (ListeDicosPersonnes)

In [11]:
ListeDicosPersonnes = ParsingXMLTEItoDictList('../Sources/EditionsXML/aves_aa_195_f69r-77r-v2.xml', 'TEI')
print(ListeDicosPersonnes)

[{'nomNN': 'Schalck', 'nom': 'Schalk', 'prénomNN': 'Obreht', 'prénom': 'Obrecht', 'fonction': ['her-1444', 'ammeister-1444'], 'Poêle': 'Zum_Encker', 'Corporation': 'à_l_Ancre', 'corps civique': 'Échevinat', 'NomCompletNN': 'her Obreht Schalck der ammeister'}, {'nomNN': 'Armbruster In Brantgasse', 'nom': 'Armbruster in Brantgasse', 'prénomNN': 'Cünrat', 'prénom': 'Konrat', 'Poêle': 'Zum_Encker', 'Corporation': 'à_l_Ancre', 'corps civique': 'Échevinat', 'NomCompletNN': 'Cünrat Armbruster In Brantgasse'}, {'nomNN': 'Amlung', 'nom': 'Amelung', 'prénomNN': 'Hanns', 'prénom': 'Johann', 'Poêle': 'Zum_Encker', 'Corporation': 'à_l_Ancre', 'corps civique': 'Échevinat', 'NomCompletNN': 'Hanns Amlung'}, {'nomNN': 'Phige', 'nom': 'Phye', 'prénomNN': 'Reimbolt', 'prénom': 'Reimbolt', 'Poêle': 'Zum_Encker', 'Corporation': 'à_l_Ancre', 'corps civique': 'Échevinat', 'NomCompletNN': 'Reimbolt Phige'}, {'nomNN': 'Lumbart', 'nom': 'Lumbart', 'prénomNN': 'Hanns', 'prénom': 'Johann', 'Poêle': 'Zum_Encker', 

# 2. Normalisation des prénoms

In [12]:
DfListe = pd.DataFrame(ListeDicosPersonnes)
pd.set_option('display.max_rows', DfListe.shape[0]+1)
DfListe.fillna('NaN')

Unnamed: 0,nomNN,nom,prénomNN,prénom,fonction,Poêle,Corporation,corps civique,NomCompletNN,surnom
0,Schalck,Schalk,Obreht,Obrecht,"[her-1444, ammeister-1444]",Zum_Encker,à_l_Ancre,Échevinat,her Obreht Schalck der ammeister,
1,Armbruster In Brantgasse,Armbruster in Brantgasse,Cünrat,Konrat,,Zum_Encker,à_l_Ancre,Échevinat,Cünrat Armbruster In Brantgasse,
2,Amlung,Amelung,Hanns,Johann,,Zum_Encker,à_l_Ancre,Échevinat,Hanns Amlung,
3,Phige,Phye,Reimbolt,Reimbolt,,Zum_Encker,à_l_Ancre,Échevinat,Reimbolt Phige,
4,Lumbart,Lumbart,Hanns,Johann,,Zum_Encker,à_l_Ancre,Échevinat,Hanns Lumbart,
5,Lumbart,Lumbart,Steffan,Stefan,,Zum_Encker,à_l_Ancre,Échevinat,Steffan Lumbart,
6,Armbruster,Armbruster,Hanns,Johann,[rotherre-1444],Zum_Encker,à_l_Ancre,Échevinat,Hanns Armbruster der rotherre,
7,Bisinger,Bisinger,Heinrich,Heinrich,,Zum_Encker,à_l_Ancre,Échevinat,Heinrich Bisinger,
8,Wissenburg,Wissenburg,Hanns,Johann,,Zum_Encker,à_l_Ancre,Échevinat,Hanns Wissenburg,
9,von Lutesheim,"Leutesheim, von",Hanns,Johann,,Zum_Encker,à_l_Ancre,Échevinat,Hanns von Lutesheim,


In [6]:
RefPrenom = pd.read_csv("./../Tables/RéférentielPrénoms.csv", converters={"FormesNonNormalisées": literal_eval}).set_index("PrénomNormalisé")

In [7]:
RefPrenom

Unnamed: 0_level_0,PrénomFrançais,Types,FormesNonNormalisées
PrénomNormalisé,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adalhelm,Alleaume,"['masc', 'germanique']","[Adalhelm, Adelhelm, Alhelm]"
Adam,Adam,"['masc', 'hébraïque']",[Adam]
Adolf,Adolphe,"['masc', 'germanique']","[Adolf, Adolff]"
Andres,André,"['masc', 'grec']",[Andres]
Anshelm,Anselme,"['masc', 'germanique']",[Anshelm]
Behrmann,,"['masc', 'germanique']","[Behrmann, Behrman, Bermann, Berman]"
Bechtolt,Bertold,"['masc', 'germanique']","[Behtolt, Bechtolt]"
Bernhart,Bernard,"['masc', 'germanique']","[Bernhart, Bernhard]"
Burkhart,,"['masc', 'germanique']","[Burkhart, Burkhard, Burckhart, Burckhard, Bur..."
Claus,Nicolas,"['masc', 'grec']","[Claus, Clauwes, Claws, Niclaus]"


In [8]:
'''
ListeNNN = []
ListeNNP = []
for i in range (0,len(DfListe.index)):
    DataToComp = DfListe.iloc[i,1]
    if type(DataToComp) == float:
        ListeNNP.append('') 
    for j in range (0, len(RefPrenom.index)):
        CheckN = RefPrenom.iloc[j,2]
        for element in CheckN:
            if DataToComp == element:
                ListeNNP.append(RefPrenom.index[j])
print(ListeNNP)

for i in range (0,len(DfListe.index)):
    DataToComp = DfListe.iloc[i,0]
    if type(DataToComp) == float:
        ListeNNN.append('')
    for j in range (0, len(RefNom.index)):
        CheckN = RefNom.iloc[j,1]
        for element in CheckN:
            if DataToComp == element:
                ListeNNN.append(RefNom.index[j])
print(ListeNNN)
'''

"\nListeNNN = []\nListeNNP = []\nfor i in range (0,len(DfListe.index)):\n    DataToComp = DfListe.iloc[i,1]\n    if type(DataToComp) == float:\n        ListeNNP.append('') \n    for j in range (0, len(RefPrenom.index)):\n        CheckN = RefPrenom.iloc[j,2]\n        for element in CheckN:\n            if DataToComp == element:\n                ListeNNP.append(RefPrenom.index[j])\nprint(ListeNNP)\n\nfor i in range (0,len(DfListe.index)):\n    DataToComp = DfListe.iloc[i,0]\n    if type(DataToComp) == float:\n        ListeNNN.append('')\n    for j in range (0, len(RefNom.index)):\n        CheckN = RefNom.iloc[j,1]\n        for element in CheckN:\n            if DataToComp == element:\n                ListeNNN.append(RefNom.index[j])\nprint(ListeNNN)\n"

In [9]:
print(len(ListeNNP))
print(len(ListeNNN))
print(len(DfListe.index))
print(DfListe.index)

NameError: name 'ListeNNP' is not defined

In [None]:
DfListe['NomNormalisé'] =  ListeNNN
DfListe['PrénomNormalisé'] = ListeNNP
DfListe

In [None]:
"""
dataframe[["nom","prénom"]].describe()
NomPrénom = dataframe[['nom', 'prénom']].to_csv(index=True)
print (NomPrénom)
with open ('../Tables/NomsPrénoms.csv','w') as Doc:
    Doc.write(NomPrénom)
import matplotlib.pyplot as plt
df2 = dataframe["Corporation"].value_counts()
df3 = dataframe['Poêle'].value_counts()
df2.plot.pie()
plt.show()
df4 = dataframe["nom"].value_counts()
df4
df4.plot.pie()
"""