L'objet de ce notebook est de fournir un exemple d'étude de texte en français par traitement du langage naturel.
Le texte choisi est l'intégralité de la Comédie humaine de Balzac.

# 1. Bibliothèques

In [None]:
# Download the fr_core_news_md and fr_core_news_sm spaCy packages.
# Only fr_core_news_md is imported and loaded further down in this notebook.
!python -m spacy download fr_core_news_md;
!python -m spacy download fr_core_news_sm;

In [None]:
import csv;
def SaveData (Filename="",DataList=None):
    """Write the items of ``DataList`` to ``Filename`` and print a confirmation.

    The whole list is written as a single csv "row" whose field delimiter is
    a newline, so each item lands on its own line. Non-numeric items are
    quoted with a space character (``QUOTE_NONNUMERIC`` + ``quotechar=" "``).

    Parameters
    ----------
    Filename : str
        Path of the file to create or overwrite (written as UTF-8).
    DataList : iterable, optional
        Items to write. ``None`` (the default) is treated as an empty list.
    """
    # Avoid a shared mutable default argument.
    if DataList is None:
        DataList = []
    # The context manager closes the file; no explicit close() is needed.
    with open(Filename, "w", encoding='utf-8', newline='\n') as csvfile:
        # NOTE(review): recent CPython releases validate dialect characters more
        # strictly and may reject a newline delimiter -- confirm on the target version.
        DataWriter = csv.writer(csvfile, delimiter='\n', quotechar=" ", quoting=csv.QUOTE_NONNUMERIC)
        DataWriter.writerow(DataList)
    print("Données enregistrées!")

In [None]:
from pprint import pprint;
import numpy as np;
import pandas as pd;
import seaborn as sns;
import matplotlib.pyplot as plt;
import nltk;
from sklearn.feature_extraction.text import CountVectorizer;
from sklearn.feature_extraction.text import TfidfVectorizer;
from sklearn.preprocessing import LabelBinarizer;
from nltk.corpus import stopwords;
from nltk.stem.porter import PorterStemmer;
from wordcloud import WordCloud,STOPWORDS;
from nltk.stem import WordNetLemmatizer;
from nltk.tokenize import word_tokenize,sent_tokenize;
import os;
import spacy;
import re,string,unicodedata;
from nltk.tokenize.toktok import ToktokTokenizer;
from nltk.stem import LancasterStemmer,WordNetLemmatizer;
from nltk.tag import pos_tag;
from sklearn.linear_model import LogisticRegression,SGDClassifier;
from sklearn.naive_bayes import MultinomialNB;
from sklearn.svm import SVC;
from textblob import TextBlob;
from textblob import Word;
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score;
import fr_core_news_md;
nlp_fr = fr_core_news_md.load();
import spacy.lang.fr;
from stop_words import get_stop_words;
import plotly.graph_objects as go;
import networkx as nx;
from networkx.algorithms import bipartite;
import igraph as ig;


# 2. Import du texte

In [None]:
def getFilenames(root, extension='.txt', separator=' '):
    """Collect the names of the files found under ``root``.

    Walks ``root`` recursively and keeps every file whose name ends with
    ``extension``. For each match, only the part of the file name located
    before the first ``separator`` is returned (the directory is not included).

    Returns a list of strings, in the order produced by ``os.walk``.
    """
    return [
        nom.split(separator)[0]
        for _dossier, _sous_dossiers, noms in os.walk(root)
        for nom in noms
        if nom.endswith(extension)
    ]

In [None]:
# Names of the novel files found in the dataset directory (see getFilenames).
list_files = getFilenames("../input/la-comedie-humaine-de-balzac/");

In [None]:
def nett_nom(texte):
    """Turn a file name into a readable novel title.

    The input is converted to ``str``; then digits are removed, a leading
    underscore is dropped, the ``.txt`` extension is stripped and the
    remaining underscores become spaces.

    Parameters
    ----------
    texte : object
        File name (anything accepted by ``str``).

    Returns
    -------
    str
        The cleaned-up title.
    """
    cleantext = re.sub(r'[0-9]', '', str(texte))  # drop digits
    cleantext = re.sub(r'\A_', '', cleantext)      # drop the underscore left at the start
    # The dot is escaped so that only a literal ".txt" is removed; an
    # unescaped dot would also eat any character followed by "txt".
    cleantext = re.sub(r'\.txt', '', cleantext)
    cleantext = re.sub(r'_', ' ', cleantext)       # underscores separate the words
    return cleantext

In [None]:
# One row per novel: the raw file name and the title derived from it.
# The text itself is added to this frame by the cells below.
df = pd.DataFrame({'Noms_fichier': list_files})
df['Noms_romans'] = df['Noms_fichier'].apply(nett_nom)

In [None]:
# Placeholder for the novels' text, filled in further down. An empty string
# (rather than the integer 0) keeps the column text-typed, so storing the
# novels does not have to upcast an int64 column.
df['Texte'] = ''

In [None]:
def get_text(fileName, root="../input/la-comedie-humaine-de-balzac/"):
    """Return the whole content of one novel file as a string.

    Parameters
    ----------
    fileName : object
        Name of the file; converted with ``str`` and appended to ``root``.
    root : str, optional
        Directory prefix, concatenated as-is in front of ``fileName`` (so it
        must end with a path separator). Defaults to the dataset directory.

    Returns
    -------
    str
        The file content decoded as latin-1.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    chemin = str(root) + str(fileName)
    # The context manager guarantees the handle is closed, even on error.
    with open(chemin, "r", encoding='latin-1') as f:
        return f.read()

In [None]:
# Load the text of every novel into the 'Texte' column.
# The column is assigned in one go: element-wise chained assignment
# (`df.Texte.iloc[i] = ...`) is not guaranteed to write back into `df`.
df['Texte'] = [str(get_text(str(nom_fichier))) for nom_fichier in df['Noms_fichier']]

# 3. Étude des noms de personnages

On commence par extraire les noms propres du premier roman. On procède par NER grâce à spaCy.

In [None]:
# Run the French spaCy pipeline on the first novel.
# The characters \x92 and \x97 are replaced by a space / removed beforehand
# (presumably Windows-1252 apostrophes and dashes left by the latin-1 decoding -- confirm).
texte_premier_roman = df.Texte.iloc[0]
texte_premier_roman = re.sub(r'\x92', ' ', texte_premier_roman)
texte_premier_roman = re.sub(r'\x97', '', texte_premier_roman)
doc = nlp_fr(texte_premier_roman)

In [None]:
# Table of every named entity found in the document: its text ('Word') and
# its entity label ('Label'). The frame is built once from a list of pairs
# instead of being enlarged one row at a time, which is quadratic.
Noms_propres = pd.DataFrame(
    [(entite.text, entite.label_) for entite in doc.ents],
    columns=['Word', 'Label'],
)

On ne conserve que les noms propres de personnes (étiquetés 'PER').

In [None]:
# Keep the entities labelled 'PER'. The explicit copy makes `tmp` an
# independent frame, so the column assignments below do not act on a slice
# of Noms_propres.
tmp = Noms_propres.loc[Noms_propres['Label'] == 'PER'].copy()
# Each row counts for one mention; names are lower-cased so that spelling
# variants differing only by case are counted together.
tmp['Label'] = 1
tmp['Word'] = tmp['Word'].str.lower()
# After the sum, 'Label' holds the number of mentions of each name.
tmp = tmp.groupby(by=["Word"], as_index=False).sum()
# Optional filter, disabled: tmp = tmp.loc[tmp['Label'] > 2]
tmp['Roman'] = str(df.Noms_romans.iloc[0])
tmp

In [None]:
# Noms accumulates the per-novel name counts; it starts with the first novel's table.
Noms=tmp
Noms

In [None]:
# Repeat the person-name extraction for every remaining novel (index 1 and up)
# and append the results to Noms.
# The per-novel tables are gathered in a list and concatenated once at the
# end, rather than growing Noms with pd.concat on every iteration.
tables_par_roman = [Noms]
for i in range(1, len(list_files)):
    doc = nlp_fr(re.sub(r'\x97', '', re.sub(r'\x92', ' ', df.Texte.iloc[i])))
    # All named entities of the novel, built in a single DataFrame call.
    entites = pd.DataFrame(
        [(entite.text, entite.label_) for entite in doc.ents],
        columns=['Word', 'Label'],
    )
    # Explicit copy: the assignments below must not act on a slice of `entites`.
    personnes = entites.loc[entites['Label'] == 'PER'].copy()
    personnes['Label'] = 1
    personnes['Word'] = personnes['Word'].str.lower()
    # 'Label' becomes the number of mentions of each lower-cased name.
    personnes = personnes.groupby(by=["Word"], as_index=False).sum()
    # Optional filter, disabled: personnes = personnes.loc[personnes['Label'] > 2]
    personnes['Roman'] = str(df.Noms_romans.iloc[i])
    tables_par_roman.append(personnes)
Noms = pd.concat(tables_par_roman, ignore_index=False)
del tables_par_roman

Aperçu du data frame obtenu:

In [None]:
# Per-novel name counts: one row per (name, novel) with its number of mentions.
display(Noms)

In [None]:
# Inputs of the character/novel graph built in the next cells.
romans = df['Noms_romans']
noms_pers = set(Noms['Word'])
# Rename the columns to the vocabulary used by the graph and chart cells
# ('weigth' is spelled this way throughout the notebook).
# `set_axis(..., inplace=True)` no longer exists in pandas 2.x, so the renamed
# frame is assigned back. Noms is rebound explicitly because a later cell reads
# Noms['personnages'] and Noms['weigth']; the former in-place rename on `liens`
# presumably reached Noms only through the data the two frames shared.
Noms = Noms.set_axis(['personnages', 'weigth', 'livres'], axis='columns')
liens = Noms

In [None]:
# Graph with two families of nodes, tagged through the 'bipartite' node
# attribute: the novels ('livres') and the character names ('personnages').
G = nx.Graph()
G.add_nodes_from(romans,bipartite='livres')
G.add_nodes_from(noms_pers,bipartite='personnages')

In [None]:
# One edge per (character, novel) row of `liens`. Every edge stores the
# constant 1 under the 'weigth' attribute; the mention count is not used here.
aretes = [
    (personnage, livre, 1)
    for personnage, livre in zip(liens['personnages'], liens['livres'])
]
G.add_weighted_edges_from(aretes, weight='weigth')

In [None]:
#print(G.edges(data=True))
# Two-column layout: novels at x=0, characters at x=1. The y coordinate is the
# row number in `liens`; a node appearing on several rows keeps its last one.
pos = {}
for rang, livre in enumerate(liens['livres']):
    pos[livre] = [0, rang]
for rang, personnage in enumerate(liens['personnages']):
    pos[personnage] = [1, rang]
nx.draw(G, pos, with_labels=False)
# Shift the positions upwards so the labels are drawn above the nodes.
for coordonnees in pos.values():
    coordonnees[1] += 0.25
nx.draw_networkx_labels(G, pos)

In [None]:
# Total number of mentions of each name over all novels, most frequent first.
# Only the count column is aggregated: summing the whole frame would also
# apply `sum` to the column holding the novel titles.
Noms = (
    Noms.groupby(by=["personnages"], as_index=False)[["weigth"]].sum()
        .sort_values(by=['weigth'], ascending=False)
)
# Keep the names mentioned more than 300 times for the bar chart.
Noms = Noms.loc[Noms['weigth'] > 300]
fig = go.Figure(
    data=[go.Bar(y=Noms['weigth'], x=Noms['personnages'])],
    layout_title_text="Noms propres de personnages les plus représentés dans la Comédie humaine")
fig.show()

Bonus : réseau des personnages récurrents dans la Comédie humaine.

In [None]:
# Node and link tables of the recurring-character network (semicolon-separated CSV).
Nodes = pd.read_csv('../input/la-comedie-humaine-de-balzac/nodes.csv', sep=";", encoding='utf-8')
Links = pd.read_csv('../input/la-comedie-humaine-de-balzac/links.csv', sep=";", encoding='utf-8')
# `set_axis(..., inplace=True)` no longer exists in pandas 2.x: assign the result back.
Links = Links.set_axis(['from', 'to'], axis='columns')
L = len(Links)
N = len(Nodes)
# One (from, to) pair per row of Links, in row order.
Edges = list(zip(Links['from'], Links['to']))

In [None]:
# Hover label ('nom') and colour group ('groupe') of each node, in row order.
labels = [Nodes['nom'].iloc[rang] for rang in range(0, N)]
group = [Nodes['groupe'].iloc[rang] for rang in range(0, N)]
labels

In [None]:
# Undirected igraph graph built from the edge list.
# Note: this rebinds G, which held the networkx graph in the previous section.
G=ig.Graph(Edges, directed=False)

In [None]:
# 3D layout of the graph, requested from igraph under the name 'kk'.
layt = G.layout('kk', dim=3)

# Node coordinates, one list per axis (x, y, z).
Xn, Yn, Zn = ([layt[k][axe] for k in range(N)] for axe in range(3))

# Edge coordinates: for each edge, the two end points followed by None,
# so that every edge is stored as its own separate segment.
Xe, Ye, Ze = [], [], []
for source, cible in Edges:
    for coordonnees, axe in ((Xe, 0), (Ye, 1), (Ze, 2)):
        coordonnees.extend([layt[source][axe], layt[cible][axe], None])

In [None]:
!pip install chart_studio
import chart_studio.plotly as py
# Edges of the network, drawn as thin grey lines without hover information.
trace1 = go.Scatter3d(
    x=Xe, y=Ye, z=Ze,
    mode='lines',
    line={'color': 'rgb(125,125,125)', 'width': 1},
    hoverinfo='none',
)

# Nodes of the network: coloured by group, labelled on hover.
trace2 = go.Scatter3d(
    x=Xn, y=Yn, z=Zn,
    mode='markers',
    name='actors',
    marker={
        'symbol': 'circle',
        'size': 6,
        'color': group,
        'colorscale': 'Viridis',
        'line': {'color': 'rgb(50,50,50)', 'width': 0.5},
    },
    text=labels,
    hoverinfo='text',
)

# Settings shared by the three axes: everything is hidden.
axis = {
    'showbackground': False,
    'showline': False,
    'zeroline': False,
    'showgrid': False,
    'showticklabels': False,
    'title': '',
}

layout = go.Layout(
    title="Réseau des personnages récurrents de la Comédie humaine et des romans dans lesquels ils apparaissent",
    width=1000,
    height=1000,
    showlegend=False,
    # Each axis receives its own copy of the shared settings.
    scene={'xaxis': dict(axis), 'yaxis': dict(axis), 'zaxis': dict(axis)},
    margin={'t': 100},
    hovermode='closest',
    annotations=[{
        'showarrow': False,
        'xref': 'paper',
        'yref': 'paper',
        'x': 0,
        'y': 0.1,
        'xanchor': 'left',
        'yanchor': 'bottom',
        'font': {'size': 14},
    }],
)
data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)

In [None]:
import plotly.io as pio
# Render the 3D network figure built in the previous cell.
pio.show(fig)

# 4. Etude des lieux

In [None]:
def _compter_entites(texte, etiquette):
    """Count the named entities of one label in a text.

    The text is cleaned of the \\x92 and \\x97 characters, run through the
    French spaCy pipeline, and the entities whose label equals ``etiquette``
    are lower-cased and counted.

    Returns a DataFrame with the columns 'Word' (entity text) and 'Label'
    (number of mentions), one row per distinct entity.
    """
    doc = nlp_fr(re.sub(r'\x97', '', re.sub(r'\x92', ' ', texte)))
    # One row per mention, each counting for 1; built in a single call
    # instead of enlarging a frame row by row.
    mentions = pd.DataFrame(
        [(entite.text.lower(), 1) for entite in doc.ents if entite.label_ == etiquette],
        columns=['Word', 'Label'],
    )
    return mentions.groupby(by=["Word"], as_index=False).sum()


# Place names ('LOC' entities) of every novel, with the novel they come from.
# The per-novel tables are concatenated once, after the loop.
lieux_par_roman = []
for i in range(len(list_files)):
    lieux_du_roman = _compter_entites(df.Texte.iloc[i], 'LOC')
    # Optional filter, disabled: lieux_du_roman = lieux_du_roman.loc[lieux_du_roman['Label'] > 2]
    lieux_du_roman['Roman'] = str(df.Noms_romans.iloc[i])
    lieux_par_roman.append(lieux_du_roman)
Lieux = pd.concat(lieux_par_roman, ignore_index=False)
del lieux_par_roman

Lieux

In [None]:
# Total number of mentions of each place over all novels, most frequent first.
# Only the count column is aggregated: summing the whole frame would also
# apply `sum` to the column holding the novel titles.
Lieux = (
    Lieux.groupby(by=["Word"], as_index=False)[["Label"]].sum()
         .sort_values(by=['Label'], ascending=False)
)
# Keep the places mentioned more than 200 times for the bar chart.
Lieux = Lieux.loc[Lieux['Label'] > 200]
fig = go.Figure(
    data=[go.Bar(y=Lieux['Label'], x=Lieux['Word'])],
    layout_title_text="Noms propres de lieux les plus représentés dans la Comédie humaine"
)
fig.show()

# 5. Etude du texte

In [None]:
#from spacy.tokens import Token
#stop_words_getter = lambda token: token.is_stop or token.lower_ in stopwords or token.lemma_ in stopwords
#Token.set_extension('is_stop', getter=stop_words_getter, force=True)
#docs = list(nlp_fr.pipe(df.Texte))
#tokens = [[w.lemma_ for w in tokens if w.is_alpha and (len(w.lemma_))>2 and not w.is_stop] for tokens in docs]
#Lemmas = pd.Series(tokens)
#df.loc[:,'lemmas'] =Lemmas

In [None]:
#def suppSW(liste):
#    lem_sw = []
#    for mot in liste:
#        if mot not in stopwords:
#            lem_sw.append(mot)
#    return lem_sw

In [None]:
#lemmy = df_reviews_clean.lemmas.apply(suppSW)
#lem_sery = pd.Series(lemmy)
#df['lemmas_c'] = lem_sery