In [1]:
import nltk
import pandas as pd
import random
import numpy as np
import string
import math

from numpy import array
from nltk.corpus import stopwords
from collections import Counter
from scipy.sparse import csr_matrix
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
from gensim import corpora, models, utils
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim.models import LsiModel
from gensim.corpora import Dictionary

from nltk import ngrams


In [2]:
# Use spacy lib
# On https://spacy.io/

import spacy
nlp = spacy.load('fr')

In [4]:
# Datas preprocessing methods.
# Lemmatisation without poncutations

stemmer = nltk.stem.snowball.FrenchStemmer()
fstw = stopwords.words('french')

# French Stop Words, extraits depuis le fichier stopwords-fr.txt + stopwords french de nltk
sourceFST = [x.replace('\n', '') for x in open('stopwords-fr.txt', mode="r", encoding="utf-8").readlines()]+fstw

def lemmatize(article):
    output = []
    outPonc = article.translate(article.maketrans("","", string.punctuation))
    outLem = nlp(outPonc)
    for token in outLem:
        if token.lemma_ not in sourceFST:
            output.append(token.lemma_)
    res = ' '.join(output)
    return res

In [5]:
# Data Reading
data = pd.read_csv('export_articles_EGC_2004_2018.csv', sep='\t', header=0)

In [6]:
# Let's process our corpus, and determine a limit to split it in partitions

limit_years = [2007, 2010, 2014]
limits = []
usable = []
prev_year = data['year'][0]
numArti = 0
for i in range(0, len(data['abstract']), 1):
    if not isinstance(data['abstract'][i], float): #if not null, empty, or whatever (so if there is a abstract)
        year = data['year'][i]
        if year != prev_year:
            prev_year = year
            limits.append(numArti)
        numArti+=1
        usable.append(stemmer.stem(lemmatize(data['abstract'][i])))
limits.append(numArti)

In [8]:
# Display pre-processed datas
print("nombre d'articles =", len(usable))
print("limits =", limits)

usable[0]

nombre d'articles = 1096
limits = [56, 116, 200, 267, 354, 410, 470, 543, 628, 692, 790, 882, 962, 1036, 1096]


'plateforme objectif permettre citoyen danalyserpar euxmême tweet politique dévénement spécifique francepour cas lélection présidentiel 2017 idéo2017 analyser quasitemp réel message candidat fournir principal caractéristiqueslusage lexiqu politique comparaison entrer candidat'

In [13]:
nb_concepts = 30

min_gram = 1
max_gram = 3

cleandocs = []
for t in usable:
    doc = []
    for n in range(min_gram, max_gram+1):
        for gram in ngrams(t.split(), n):
            doc.append(" ".join(gram))
    cleandocs.append(doc)

corpus = []
first = True
if first:
    dictionary = corpora.Dictionary(cleandocs)
    first = False
for doc in cleandocs:
    newVec = dictionary.doc2bow(doc)
    corpus.append(newVec)
tfidf = models.TfidfModel(corpus)

partitions_lsa = []
beg = 0
for l in limits:
    last = l
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, num_topics=nb_concepts, id2word=dictionary)
    corpus_lsi = lsi[corpus_tfidf[beg:last]]
    partitions_lsa.append(corpus_lsi)
    beg = l

In [14]:
for lsa in partitions_lsa:
    i=0
    for doc in lsa:
        if (i<3):
            print("document number ", i)
            i+=1
            print(doc)
corpus_tfidf

document number  0
[(0, -0.003286836486224767), (1, 0.02770979080747823), (2, -0.0109565744252085), (3, 0.01554431094819378), (4, 0.02123046795555608), (5, -0.009548223373637367), (6, -0.0198089103416116), (7, -0.05293605977837071), (8, 0.0019796572979879933), (9, -0.03943646474350758), (10, -0.002916821387276751), (11, -0.0014718341416692692), (12, -0.05196827346974078), (13, -0.023523632671275156), (14, -0.003095491357074982), (15, -0.039955970396926686), (16, 0.012469282289965508), (17, 0.006619662749809259), (18, -0.03770187273299071), (19, 0.01790243417613722), (20, 0.01139278411785319), (21, -0.010901445975026796), (22, 0.03761819330434391), (23, -0.010096280922482131), (24, 0.01794020678702423), (25, -0.007485927757077189), (26, -0.0063172731496533045), (27, 0.010579234456593223), (28, 0.009441680543092662), (29, 0.006435082441349784)]
document number  1
[(0, -0.010498981260769032), (1, 0.09813412962980873), (2, 0.006684332398952965), (3, -0.07730116690575757), (4, -0.0013150481

document number  0
[(0, -0.0059771091645911744), (1, 0.03882230555812892), (2, -0.016594478589902786), (3, -0.004261289915911395), (4, -0.03697349060630694), (5, -0.015601292718172404), (6, -0.02533226420025355), (7, 0.024734588237320863), (8, 0.022613301810531786), (9, 0.047905666414965256), (10, 0.0194090285255312), (11, -0.02869855774846438), (12, 0.023410309915236467), (13, 0.05437804424076472), (14, 0.027694281311036097), (15, -0.038547400578451345), (16, 0.003266728249288183), (17, 0.038000287250890004), (18, 0.01839840702326844), (19, -0.004801328471477444), (20, 0.00472629246108398), (21, 0.09863613532602619), (22, 0.010957314328431795), (23, -0.04219808191889261), (24, 0.011787400334669879), (25, -0.04496152332139001), (26, -0.02785107859355936), (27, 0.042090375427882895), (28, 0.06363820630369273), (29, 0.022479614599581053)]
document number  1
[(0, -0.0068259737661432365), (1, 0.062125584256671494), (2, -0.008940665441738315), (3, -0.009756691500977119), (4, -0.000652931083

document number  0
[(0, 0.0061603065296592695), (1, -0.02758849715384241), (2, 0.02199595470806697), (3, -0.025696775052995703), (4, -0.026029861096465327), (5, -0.019871094758032216), (6, 0.004449662030021974), (7, -0.007760119160213066), (8, 0.046064975063394076), (9, -0.041054775072130595), (10, 0.006764553498124129), (11, -0.08015997759640273), (12, -0.0021364198748898408), (13, 0.005519481384112586), (14, 0.006033443410984919), (15, 0.00569115223688494), (16, 0.006107653674809432), (17, -0.011047811766901833), (18, 0.06197881118954202), (19, -0.0010575800597365517), (20, -0.005086403570148624), (21, -0.035403249418623606), (22, -0.00965786622588335), (23, 0.047641114751707944), (24, -0.013817863059420288), (25, 0.04007247265945399), (26, 0.046552257010563015), (27, -0.012977462245958221), (28, 0.013188979940938017), (29, -0.034426995932237504)]
document number  1
[(0, 0.007834741383127501), (1, -0.05441851753386566), (2, -0.007467146836744603), (3, 0.008517330305124936), (4, -0.05

document number  0
[(0, 0.004307190406313887), (1, -0.04771917126854175), (2, 0.007577580501826285), (3, -0.02843419962073367), (4, -0.00705309392480993), (5, 0.0014256405492496143), (6, -0.0007135719025979849), (7, 0.010045769788802615), (8, 0.002221747055058722), (9, -0.012611668197457649), (10, -0.025796348557996403), (11, -0.0042065826399605485), (12, -0.010264969627852867), (13, -0.008964253522297008), (14, 0.02573002223900241), (15, -0.008660607987732534), (16, -0.00555165331050237), (17, 0.04299335664299042), (18, -0.011064401653172462), (19, 0.004519395131382836), (20, -0.0028900471246223053), (21, 0.0030262136527038157), (22, -0.0026507505505103395), (23, 0.0020435092578563914), (24, -0.010751157958793797), (25, 0.0069529963999397475), (26, 0.013247515297544166), (27, 0.013785665610285368), (28, -0.0056949275166491035), (29, -0.01304010469094553)]
document number  1
[(0, 0.006941238726319017), (1, -0.05999797772671682), (2, 0.00022366249126090526), (3, 0.04412188929470396), (4

<gensim.interfaces.TransformedCorpus at 0x7f44910c5470>

In [15]:
clusters = []
tresh = 0.03

for i in range(0, nb_concepts):
    dic = {}
    num_doc = 0
    for doc in corpus_lsi:
        if abs(doc[i][1]) > tresh:
            dic[num_doc] = doc[i][1]
        num_doc+=1
    clusters.append(dic)

In [21]:
# Display clusters 
clusters[3]

{0: 0.033016229371439436,
 1: -0.05315720144513091,
 2: -0.06521017032500903,
 6: 0.0491946399492331,
 8: 0.03582878512656243,
 10: 0.03947870705591637,
 16: 0.05397680423033342,
 22: 0.04709303416454413,
 23: 0.06785425435838192,
 24: -0.07785237911063717,
 26: -0.07844564634200887,
 28: -0.06151756496852732,
 30: 0.06517660936034447,
 31: 0.09596531568324665,
 32: 0.07367965015547125,
 34: -0.04954819511115152,
 35: 0.04388250423546769,
 38: 0.03606839374988594,
 39: -0.03231459576556111,
 42: 0.05141393979365777,
 45: 0.047183091564491325,
 47: -0.07882347682215196,
 49: -0.03906498466539012,
 52: -0.05652853447908443,
 53: 0.05191180419009053,
 57: 0.045844438243621354,
 58: 0.05493694022668398,
 59: 0.05905413421012602}

In [23]:
nb_labels_by_cluster = 5

# Let's labelize our clusters
# For this, we will use the tfidf matrix

vectorizer = TfidfVectorizer(stop_words=sourceFST, use_idf=True, ngram_range=(min_gram, max_gram))
tfidf = vectorizer.fit_transform(usable)

# We can access the value in the tfidf using:
#tfidf.toarray().item(num_doc, num_word)
# To know the number of the word searched, we will use:
#vectorizer.vocabulary_[word]

# take less than 8h to compute x)
labels = []
for clus in clusters:

    first_arti = True
    for article in clus:
        link = abs(clus[article])
        if first_arti:
            coef_list = (tfidf.toarray()[article] * link)
            first = False
        else:
            # the more an article have a high coeficient, the more he is implied in the labeling step
            coef_list += (tfidf.toarray()[article] * link)
    # Now we have coef_list filled by every coeficient in the multiple tfidf
    # Let's find the best ones, to finally get the labels
    res = dict(zip(vectorizer.get_feature_names(), coef_list))
    
    labels.append(Counter(res).most_common(nb_labels_by_cluster))

# TODO: on observe beaucoup de labels identiques entre deux clusters
# Je pense que c'est parce que l'on a trop de clusters, mais j'aimerais en être sûr

  sorted(inconsistent))


In [24]:
# Display Labels
labels

[[('événement', 0.034139849113969806),
  ('danalyse', 0.02637317448651254),
  ('adaptéau', 0.023568865442611473),
  ('adaptéau application', 0.023568865442611473),
  ('adaptéau application spécifique', 0.023568865442611473)],
 [('tabler', 0.01761827007803228),
  ('publier', 0.011244162233305608),
  ('donnée', 0.009375601082198308),
  ('tabler donnée', 0.008562438308292113),
  ('donnée synthétique', 0.007857942849012268)],
 [('contenir textuel', 0.005355828168732267),
  ('textuel', 0.0033017873862053212),
  ('approcher sinscrit', 0.0030618583147087194),
  ('approcher sinscrit dan', 0.0030618583147087194),
  ('automatique nousmontreron', 0.0030618583147087194)],
 [('tabler', 0.014642316292162331),
  ('publier', 0.009344877739485486),
  ('donnée', 0.007791940744844083),
  ('tabler donnée', 0.007116131685281954),
  ('donnée synthétique', 0.006530634624816881)],
 [('tabler', 0.013449897837992712),
  ('publier', 0.008583863945890143),
  ('donnée', 0.007157392647906604),
  ('tabler donnée', 0