In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import nltk
import gensim
import re
import pickle
import json
import string
from tqdm import tqdm

In [2]:
import spacy
nlp = spacy.load('fr_core_news_md')

In [3]:
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shaya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from collections import Counter
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [5]:
stop_words = set(stopwords.words('french'))

In [6]:
stop_words.add("n'")
stop_words.add("l'")
stop_words.add("d'")
stop_words.add("c'")
stop_words.add("qu'")
stop_words.add("quelqu'")
stop_words.add("quelqu")
stop_words.add("s'")
stop_words.add("n’")
stop_words.add("l’")
stop_words.add("d’")
stop_words.add("c’")
stop_words.add("qu’")
stop_words.add("quelqu’")
stop_words.add("s’")
len(stop_words)

172

In [7]:
# Specify the path to your JSON file
json_file_path = 'stop_words_french.json'

# Open the JSON file
with open(json_file_path, encoding='utf-8') as json_file:
    # Load the JSON data
    stopwords = json.load(json_file)
len(stopwords)

496

In [8]:
stopwords = set(stopwords).union(stop_words)

In [9]:
with open('memoirs_preprocessed.pkl', 'rb') as f:
    data = pickle.load(f)

In [10]:
def extract_woi_context(commune_memoirs, wois, method, count = 0):

    woi_context_extraction = pd.DataFrame(columns=['filename', 'memoir_len_sans_sb_pb', 'bias', 'woi', 'woi_location', 'extraction_method', 'text'])

    for i, row in tqdm(commune_memoirs.iterrows()):

        filename = row[0]

        # commented out because this will corrupt lengths by token count
        #if filename == "du_camp_1.txt" or filename == "du_camp_2.txt":
        #    filename = "du_camp.txt"
        #if filename == "arnould_1.txt" or filename == "arnould_2.txt" or filename == "arnould_3.txt":
        #    filename = "arnould.txt"
        #if filename == "cluseret_1.txt" or filename == "cluseret_2.txt" or filename == "cluseret_3.txt":
        #    filename = "cluseret.txt"
        #if filename == "da_costa_1.txt" or filename == "da_costa_2.txt" or filename == "da_costa_3.txt":
        #    filename = "da_costa.txt"

        bias = row[1]
        text = row[2]
        
        memoir_len_sans_sb_pb = len([token for token in text.split() if token not in ["<sb>","<pb>"]])

        text_lower = text.lower()
        for woi in wois:
            if woi.lower() in text_lower:
                indices_object = re.finditer(pattern=r'\b{}\b'.format(re.escape(woi.lower())), string=text_lower)
                indices = [index.start() for index in indices_object]
                for location in indices:
                    if method == "token" or method == "character":
                        method_s = str(method) + ", " + str(count) + " either side"
                        window_half = count
                        if method == "token":
                            pre_tokens = ' '.join(text_lower[:location].split()[-window_half:])
                            post_tokens = ' '.join(text_lower[location:].split()[:window_half+len(woi.split())])
                            tmp_sequence = pre_tokens + " " + post_tokens
                            woi_location = len(pre_tokens) + 1
                        if method == "character":
                            pre_string = text_lower[:location][-window_half:]
                            post_string = text_lower[location:][:window_half+len(woi)]
                            tmp_sequence = pre_string + post_string
                            woi_location = 0
                            woi_location = len(pre_string)
                    elif method == "<sb>" or method == "<pb>":
                        method_s = method
                        pre_string = text_lower[text_lower[:location].rfind(method):location]
                        if "<pb>" in pre_string and method == "<sb>":
                            pre_string = pre_string[pre_string.rfind("<pb>"):]
                        post_string_tmp = text_lower[location:]
                        post_string = post_string_tmp[:post_string_tmp.find(method) + 4]
                        if "<pb>" in post_string and method == "<sb>":
                            post_string = post_string[:post_string.find("<pb>")]
                        tmp_sequence = pre_string + post_string
                        woi_location = 0
                        woi_location = len(pre_string)
                    woi_context_extraction.loc[len(woi_context_extraction)] = [filename, memoir_len_sans_sb_pb, bias, woi, woi_location, method_s, tmp_sequence]
    
    return woi_context_extraction

In [11]:
def lda(words):
    df = extract_woi_context(data, words, "<sb>")
    df.text = df.text.str.replace('<pb>', '')
    df.text = df.text.str.replace('<sb>', '')
    df.text = df.text.apply(lambda t : nlp(t))
    df.text = df.text.apply(lambda i :[token.lemma_ for token in i if token.text not in stopwords])
    
    opinions = ['pro', 'anti']
    ldas = []
    for opinion in opinions : 
        df_opi = df[df.bias == opinion]
        opi_values = list(df_opi.text.values)
    
        dic, corpus, words = pre_process(opi_values)
        lda = LdaModel(corpus=corpus, id2word=dic, num_topics=3, 
                    alpha='auto',
                    eta='auto',
                    passes=10,
                    iterations=500,
                    eval_every=None,
                    random_state=12345)
        ldas.append(lda)
    for index, lda in enumerate(ldas) : 
        print(f"{opinions[index]}:")
        for idx, topic in lda.show_topics(num_topics=-1, num_words=10):
            print("Topic %d:" % (idx + 1))
            print(topic)
        print()

-----------------------------

In [12]:
names = ['rigault', 'favre', 'pyat', 'ferré']

In [13]:
surnames = ['raoul', 'jules', 'félix', 'théophile']

In [14]:
def lemmatize(text):
    lemmas = []
    for token in text:
        if token.text.lower() not in stopwords and token.text.lower() not in surnames:
            if token.ent_type_ == "PERSON" or token.text == 'paris':
                lemmas.append(token.text)
            elif token.text.lower() in surnames:
                continue
            else:
                lemmas.append(token.lemma_)
    return lemmas

In [15]:
def pre_process(val):
    dictionary = corpora.Dictionary(val)
    corpus = [dictionary.doc2bow(v) for v in val]
    return dictionary, corpus

In [16]:
def lda_full(words):
    df = extract_woi_context(data, words, "<sb>")
    df = df.drop_duplicates(subset='text', keep='first')
    df.text = df.text.str.replace('<pb>', '')
    df.text = df.text.str.replace('<sb>', '')
    df.text = df.text.apply(lambda t : nlp(t))
    df.text = df.text.apply(lemmatize)
    
    values = list(df.text.values)
    values = [[word for word in group if word not in string.punctuation + " " + "«" + "»" + "—" ] for group in values]
    num_lists = len(values)
    word_count = Counter(w for t in values for w in t)
    min_threshold = 5
    max_threshold = num_lists * 0.9
    values = [[word for word in group if min_threshold <= word_count[word] <= max_threshold] for group in values]

    
    
    dic, corpus = pre_process(values)
    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=3, 
                alpha='auto',
                eta='auto',
                passes=10,
                iterations=500,
                eval_every=None,
                random_state=12345)
    
    for idx, topic in lda.show_topics(num_topics=-1, num_words=10):
        print("Topic %d:" % (idx + 1))
        print(topic)

In [17]:
for name, surname in zip(names,surnames):
    print(surname + " " + name)
    lda_full([name])

raoul rigault


31it [00:01, 19.28it/s]


Topic 1:
0.023*"ordre" + 0.022*"police" + 0.016*"préfecture" + 0.016*"commune" + 0.016*"ferré" + 0.011*"trouver" + 0.011*"donner" + 0.010*"prendre" + 0.010*"exécution" + 0.010*"pouvoir"
Topic 2:
0.035*"commune" + 0.027*"citoyen" + 0.025*"dire" + 0.021*"ferré" + 0.019*"faire" + 0.018*"police" + 0.015*"procureur" + 0.015*"pouvoir" + 0.013*"général" + 0.012*"je"
Topic 3:
0.042*"faire" + 0.013*"arrestation" + 0.012*"pouvoir" + 0.011*"je" + 0.010*"ami" + 0.010*"vouloir" + 0.010*"trouver" + 0.009*"nom" + 0.009*"savoir" + 0.009*"commission"
jules favre


31it [00:01, 25.29it/s]


Topic 1:
0.040*"ministre" + 0.039*"simon" + 0.038*"picard" + 0.034*"m." + 0.025*"trochu" + 0.023*"thier" + 0.022*"paris" + 0.014*"affaire" + 0.014*"maire" + 0.013*"étranger"
Topic 2:
0.033*"paris" + 0.027*"faire" + 0.026*"avocat" + 0.024*"an" + 0.017*"charmont" + 0.016*"jeanne" + 0.015*"assemblée" + 0.015*"vouloir" + 0.015*"maître" + 0.015*"civil"
Topic 3:
0.037*"thier" + 0.025*"pouvoir" + 0.024*"trochu" + 0.024*"homme" + 0.019*"paris" + 0.015*"savoir" + 0.015*"croire" + 0.015*"devoir" + 0.015*"gouvernement" + 0.014*"1"
félix pyat


31it [00:01, 29.39it/s]


Topic 1:
0.049*"citoyen" + 0.038*"journal" + 0.038*"commune" + 0.036*"faire" + 0.031*"pouvoir" + 0.026*"delescluze" + 0.023*"homme" + 0.022*"membre" + 0.020*"vengeur" + 0.020*"jamais"
Topic 2:
0.071*"delescluze" + 0.069*"commission" + 0.046*"tridon" + 0.040*"vermorel" + 0.034*"vaillant" + 0.030*"exécutif" + 0.030*"cournet" + 0.027*"comité" + 0.027*"membre" + 0.025*"eude"
Topic 3:
0.066*"citoyen" + 0.047*"commune" + 0.032*"paris" + 0.027*"léo" + 0.024*"ranvier" + 0.023*"meillet" + 0.022*"dire" + 0.022*"gambon" + 0.022*"rester" + 0.021*"vermorel"
théophile ferré


31it [00:01, 24.48it/s]


Topic 1:
0.027*"commune" + 0.019*"rigault" + 0.016*"ordre" + 0.016*"homme" + 0.016*"juge" + 0.015*"procès" + 0.015*"reconnaître" + 0.015*"membre" + 0.015*"comité" + 0.014*"public"
Topic 2:
0.033*"faire" + 0.022*"police" + 0.020*"ordre" + 0.020*"citoyen" + 0.017*"commune" + 0.016*"sûreté" + 0.015*"général" + 0.015*"délégué" + 0.014*"dire" + 0.014*"venir"
Topic 3:
0.048*"rigault" + 0.020*"dire" + 0.019*"mai" + 0.019*"pouvoir" + 0.019*"th" + 0.015*"24" + 0.013*"devenir" + 0.012*"mort" + 0.012*"prendre" + 0.012*"trinquet"


In [32]:
versaillais = extract_woi_context(data, ['ferré'], "<sb>")
filtered_df = versaillais[versaillais['text'].str.contains('police')]
pro_len = str(len(filtered_df[filtered_df.bias == 'pro']))
anti_len = str(len(filtered_df[filtered_df.bias == 'anti']))
print(pro_len + " " + anti_len)

31it [00:02, 14.92it/s]

11 18





In [19]:
filtered_df

Unnamed: 0,filename,memoir_len_sans_sb_pb,bias,woi,woi_location,extraction_method,text
32,da_costa_1.txt,122048,pro,ferré,51,<sb>,"<pb> puis, par arrêtés des 14 et 16 mai, théop..."
191,da_costa_2.txt,62863,pro,ferré,304,<sb>,"<sb> dégoûté, j'acceptai avec joie le poste pl..."
230,elie.txt,146678,pro,ferré,180,<sb>,<pb> nous jouissons depuis ce matin d’un nouve...
250,lefrancais.txt,144197,pro,ferré,135,<sb>,"<pb> ce qui le fut moins, ce fut le peu de sen..."
267,verges_desboeufs.txt,18081,pro,ferré,89,<sb>,"<sb> fut fusillé au père-lachaise, tandis qu'o..."
270,arsac.txt,141550,anti,ferré,57,<sb>,"<pb> « a la même heure, le délégué à la sûreté..."
291,du_camp_1.txt,118984,anti,ferré,220,<sb>,<sb> le délégué civil à la guerre les écouta e...
297,du_camp_1.txt,118984,anti,ferré,5,<sb>,<sb> ferré depuis deux jours était délégué à l...
302,du_camp_1.txt,118984,anti,ferré,281,<sb>,"<sb> on renvoyait les vieillards, à moins qu'i..."
320,du_camp_2.txt,116707,anti,ferré,257,<sb>,"<pb> rapproche de lui, sous le titre de substi..."


In [20]:
list(filtered_df.text.values)

['<pb> puis, par arrêtés des 14 et 16 mai, théophile ferré était nommé détégué à la sûreté générale en remplacement de cournet et un autre blanquiste, alfred breuillé, le remplaçait comme substitut au parquet de la commune. ',
 "<sb> dégoûté, j'acceptai avec joie le poste plus précis et moins complexe de substitut, chargé spécialement de requérir contre les anciens mouchards mais, afin de suivre les -asaires en cours, je conservai cependant ma fonction de secrétaire du cabinet du comité de sûreté générale, sous cournet et sous ferré. ",
 "<pb> nous jouissons depuis ce matin d’un nouveau préfet de police, au moins en apparence, car tout porte à croire que l'illustre raoul rigault,qui est maintenant avec son collègue ferré dans le comité de sûreté générale.continuera d’être le réel chef de la police, à côté et sous le nom du citoyen cournet, un bon et jovial garçon du æéveil qui, poursuivi plusieurs fois pour délit de presse, ne semblait pas devoir être jamais appelé à la triste fonction

In [21]:
lda_full(['comité de salut public'])

31it [00:01, 25.27it/s]


Topic 1:
0.063*"commune" + 0.021*"membre" + 0.021*"faire" + 0.018*"nommer" + 0.018*"je" + 0.017*"paris" + 0.015*"citoyen" + 0.013*"pouvoir" + 0.012*"mai" + 0.011*"dire"
Topic 2:
0.039*"commune" + 0.027*"voter" + 0.024*"vote" + 0.019*"membre" + 0.018*"institution" + 0.018*"pouvoir" + 0.017*"arrêter" + 0.016*"considérer" + 0.016*"guerre" + 0.016*"minorité"
Topic 3:
0.031*"paris" + 0.031*"membre" + 0.029*"faire" + 0.026*"commune" + 0.024*"ordre" + 0.024*"jour" + 0.018*"pouvoir" + 0.014*"police" + 0.014*"thier" + 0.014*"gouvernement"
