In [1]:
import nltk
import pandas as pd
import random
import numpy as np
import string
import math

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

from numpy import array
from collections import Counter
from scipy.sparse import csr_matrix

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
from gensim import corpora, models, utils
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim.models import LsiModel
from gensim.corpora import Dictionary





In [2]:
# Use spacy lib
# On https://spacy.io/

import spacy
nlp = spacy.load('fr')

In [3]:
# Datas preprocessing methods.
# Lemmatisation without poncutations

stemmer = nltk.stem.snowball.FrenchStemmer()
fstw = stopwords.words('french')

# French Stop Words, extraits depuis le fichier stopwords-fr.txt + stopwords french de nltk
sourceFST = [x.replace('\n', '') for x in open('stopwords-fr.txt', mode="r", encoding="utf-8").readlines()]+fstw

def lemmatize(article):
    output = []
    outPonc = article.translate(article.maketrans("","", string.punctuation))
    outLem = nlp(outPonc)
    for token in outLem:
        if token.lemma_ not in sourceFST:
            output.append(token.lemma_)
    res = ' '.join(output)
    return res

In [4]:
# Data Reading
data = pd.read_csv('export_articles_EGC_2004_2018.csv', sep='\t', header=0)

In [5]:
# Let's process our corpus, and determine a limit to split it in partitions

# usable[] correspond to our corpus processed
# limits[] let us know when to delimit partitions
limits = []
usable = []

# To create ours delimiters, we must first know the years which will be the limits
limit_years = [2007, 2010, 2014]

prev_year = data['year'][0]
numArti = 0
for i in range(0, len(data['abstract']), 1):
    if not isinstance(data['abstract'][i], float): #if not null, empty, or whatever (so if there is a abstract)
        year = data['year'][i]
        if year != prev_year:
            prev_year = year
            if year in limit_years:
                limits.append(numArti)
        numArti+=1
        usable.append(stemmer.stem(lemmatize(data['abstract'][i])))
limits.append(numArti)



In [6]:
# Display pre-processed datas
print("nombre d'articles =", len(usable))
print("limits =", limits)

usable[0]

nombre d'articles = 1096
limits = [267, 543, 790, 1096]


'plateforme objectif permettre citoyen danalyserpar euxmême tweet politique dévénement spécifique francepour cas lélection présidentiel 2017 idéo2017 analyser quasitemp réel message candidat fournir principal caractéristiqueslusage lexiqu politique comparaison entrer candidat'

In [7]:
#params
nb_concepts = 30
min_gram = 1
max_gram = 3

# Creation of cleandocs, which is usable[] with ngrams
cleandocs = []
for t in usable:
    doc = []
    for n in range(min_gram, max_gram+1):
        for gram in ngrams(t.split(), n):
            doc.append(" ".join(gram))
    cleandocs.append(doc)

# Creation of tfidf model, a tool to create ours tfidf
corpus = []
dictionary = corpora.Dictionary(cleandocs)
for doc in cleandocs:
    newVec = dictionary.doc2bow(doc)
    corpus.append(newVec)
tfidf = models.TfidfModel(corpus)

# Creation of partitions_lsa[], which give us the LSA of each partition
partitions_lsa = []
beg = 0
for l in limits:
    last = l
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, num_topics=nb_concepts, id2word=dictionary)
    corpus_lsi = lsi[corpus_tfidf[beg:last]]
    partitions_lsa.append(corpus_lsi)
    beg = l

In [8]:
num_partition = 0
for lsa in partitions_lsa:
    print("Partition numéro:",num_partition)
    num_partition+=1
    i=0
    for doc in lsa:
        if (i<3):
            print("document number ", i)
            i+=1
            print(doc)


Partition numéro: 0
document number  0
[(0, -0.0024738714432981676), (1, 0.025725858959848486), (2, 0.010897726235135094), (3, 0.002088915119678054), (4, -0.005944901364661566), (5, 0.007568059563352835), (6, 0.008793288625785241), (7, 0.0031626869060070945), (8, 0.047708279742918004), (9, -0.027463326654428303), (10, 0.02620071902818843), (11, 0.0009731866249140437), (12, -0.04105947650342904), (13, 0.010784925393505836), (14, -0.03636026730466142), (15, -0.011689967761567499), (16, 0.013060364670370908), (17, -0.03557253576370232), (18, -0.03137421860215876), (19, -0.03637000443815013), (20, -0.02230409015631147), (21, 0.006799142267632275), (22, 0.03076702211417731), (23, -0.005632811842326894), (24, -0.005758752464520116), (25, -0.04667623602450392), (26, 0.02370002234286569), (27, 0.011377821251532841), (28, -0.02304024343921165), (29, 0.026354590088884567)]
document number  1
[(0, -0.011798800815419088), (1, 0.09810500889688491), (2, 0.039856291176258773), (3, 0.09294078473146268

[(0, -0.00883326551114477), (1, -0.09255659273569147), (2, 0.014287639216764361), (3, -0.046420605465063755), (4, 0.09267097849447672), (5, 0.015405572692952023), (6, -0.03190702601682441), (7, -0.03421792359381664), (8, -0.022030197577816744), (9, 0.04055678947203388), (10, 0.02133720847965256), (11, -0.017535057095893977), (12, -0.007300457435961515), (13, 0.03861250341160007), (14, 0.036442066021075595), (15, 0.04585926990288329), (16, -0.000536848492156182), (17, -0.03594774843741222), (18, 0.011112852714998194), (19, 0.050708550962722136), (20, -0.031087756088032284), (21, 0.006951733966797193), (22, -0.018422063755440823), (23, -0.06457645435890354), (24, -0.0457051336661303), (25, 0.008730804029211435), (26, 0.03966877366012255), (27, 0.016366095473602994), (28, -0.01268171513660413), (29, 0.01037252148161957)]
document number  1
[(0, -0.005954548281800257), (1, -0.07428989477475446), (2, 0.003941653109995601), (3, -0.006012277917508063), (4, 0.021136537134105744), (5, -0.029575

In [9]:
# Let's create ours partitions
partitions = []

# You must specify a treshold, to know what are the doc you keep, and what are the doc you drop
tresh = 0.03

for corpus_lsi in partitions_lsa:
    # Let's create ours clusters
    clusters = []

    for i in range(0, nb_concepts):
        dic = {}
        num_doc = 0
        for doc in corpus_lsi:
            if abs(doc[i][1]) > tresh:
                dic[num_doc] = doc[i][1]
            num_doc+=1
        clusters.append(dic)
    partitions.append(clusters)
    
# TODO: it would be nice to know how many articles are in no cluster anymore

In [10]:
# Display clusters 3 of partition 0 
partitions[0][3]

{1: 0.09294078473146268,
 3: 0.03633034707949508,
 4: -0.033597979538271906,
 5: 0.030765634349993413,
 6: 0.04511204152660263,
 8: 0.031418011626709146,
 10: 0.03342599432471933,
 12: -0.06736343299005722,
 14: -0.0311220007757374,
 15: -0.031654142275983795,
 17: -0.03046065533635964,
 18: 0.030365562232329644,
 19: -0.05111297047644732,
 23: 0.033231655979149524,
 26: -0.034406123885720445,
 27: -0.08033971196171126,
 30: -0.03129488625685242,
 34: 0.037195101416340526,
 41: 0.03841495745162948,
 43: -0.046974534621905754,
 44: -0.050109069456024544,
 45: 0.05685036713129945,
 50: 0.06722140330478575,
 53: -0.0450644945235948,
 54: -0.050575752302505425,
 56: -0.04715046177240906,
 60: 0.08940531521159073,
 61: -0.03154192453606096,
 63: -0.055722088088707225,
 69: 0.09949425444196348,
 74: 0.03226583513992607,
 76: -0.0417604537786582,
 78: -0.07336368074270433,
 83: -0.06861860674985723,
 85: -0.0531494843493874,
 86: 0.03529134079166965,
 90: -0.03117028216984973,
 92: -0.0304711

In [11]:
nb_labels_by_cluster = 5

# Let's labelize our clusters
# For this, we will use the tfidf matrix

vectorizer = TfidfVectorizer(stop_words=sourceFST, use_idf=True, ngram_range=(min_gram, max_gram))
tfidf = vectorizer.fit_transform(usable)

# We can access the value in the tfidf using:
#tfidf.toarray().item(num_doc, num_word)
# To know the number of the word searched, we will use:
#vectorizer.vocabulary_[word]

# take less than 8h to compute x)
labels = []
for clusters in partitions:
    l = []
    for clus in clusters:
        first_arti = True
        for article in clus:
            link = abs(clus[article])
            if first_arti:
                coef_list = (tfidf.toarray()[article] * link)
                first = False
            else:
                # the more an article have a high coeficient, the more he is implied in the labeling step
                coef_list += (tfidf.toarray()[article] * link)
        # Now we have coef_list filled by every coeficient in the multiple tfidf
        # Let's find the best ones, to finally get the labels
        res = dict(zip(vectorizer.get_feature_names(), coef_list))

        l.append(Counter(res).most_common(nb_labels_by_cluster))
    labels.append(l)

# TODO: on observe beaucoup de labels identiques entre deux clusters
# Je pense que c'est parce que l'on a trop de clusters, mais j'aimerais en être sûr

In [12]:
# Display labels
# labels is composed by an array for each partition
labels

[[[('xplor', 0.04005849687206974),
   ('xplor everywhere', 0.04005849687206974),
   ('everywhere', 0.03783570961575117),
   ('and', 0.028254465076511328),
   ('dater', 0.020189613761603598)],
  [('hotspot', 0.02679405983234753),
   ('relation', 0.006417877842461409),
   ('photographie', 0.006326824184650093),
   ('détecter', 0.0038242003822581004),
   ('approcher extraire relation', 0.003349257479043441)],
  [('sémiotique', 0.005892694383508915),
   ('style', 0.005565717412234285),
   ('indicateur', 0.00467976961655377),
   ('action suggéré', 0.0029463471917544576),
   ('action suggéré proposer', 0.0029463471917544576)],
  [('changement', 0.008396827799076783),
   ('composer complexe', 0.00667051147145592),
   ('formalisation', 0.006289212590426272),
   ('composer', 0.00603201563636381),
   ('élémentaire', 0.005834057223851985)],
  [('olfactif', 0.00669056655565228),
   ('qualité olfactif', 0.00669056655565228),
   ('neuroscientifique', 0.004460377703768188),
   ('découvrir sousgroupe'

In [36]:
tfidf.toarray()[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
# Diachronic analysis

vectorizer = TfidfVectorizer(stop_words=fstw, use_idf=True, ngram_range=(min_gram, max_gram))
tfidf = vectorizer.fit_transform(usable)

# Feature Recall
# label_f: num of the label
def FR(num_cluster_c, num_label_f, num_partition_C):
    total_singledoc = 0
    for doc in partitions[num_partition_C][num_cluster_c]:
        total_singledoc += tfidf.toarray().item(doc, num_label_f)
    total_everydoc = 0
    for cluster in partitions[num_partition_C]:
        for doc in cluster:
            total_everydoc += tfidf.toarray().item(doc, num_label_f)
    return total_singledoc / total_everydoc

# Feature Precision
def FP(num_cluster_c, num_label_f, num_partition_C):
    total_singledoc = 0
    for doc in partitions[num_partition_C][num_cluster_c]:
        total_singledoc += tfidf.toarray().item(doc, num_label_f)
    total_everydoc = 0
    for doc in partitions[num_partition_C][cluster_c]:
        total_everydoc += sum(tfidf.toarray()[doc])
    return total_singledoc / total_everydoc
    
# Feature F-measure
def FF(num_cluster_c, num_label_f, num_partition_C):
    fr = FR(num_cluster_c, num_label_f, num_partition_C)
    fp = FP(num_cluster_c, num_label_f)
    return 2*fr*fp / (fr + fp)

In [40]:
# Vamos a implementar una nueva tecnicà para labelisar: FF-measure

# list of labels of cluster C from partition P = labels[P][C]
labels = []


# first, we want the mean ffmeasure of each f
ffmeanF = []


#TODO: question: on prend on compte les zeros (genre quand le mot n'apparait pas)? -> non
# second, we want the mean of every ffmeasure
ffmean_total = -1


mean_total = 0
nb_for_mean_total = 0

for f in range(0, len(tfidf.toarray()[0])):
    mean_clus = 0
    nb_for_mean_clus = 0
    for num_parti in range(0, len(partitions)):
        for num_clus in range(0, len(partitions[num_parti])):
            ffmesure = FF(num_clus, f, num_clusters)
            mean_clus += ffmesure
            nb_for_mean_clus += 1
            if ffmesure > 0:
                mean_total += ffmesure
                nb_for_mean_total += 1
    mean_clus /= nb_for_mean_clus
    ffmeanF.append(mean_clus)
ffmean_total = mean_total / nb_for_mean_total
    


# Now we fill labels[]
for clusters in partitions:
    labels_for_clusters = []
    for clus in clusters:
        labels_for_clus = []
        for arti in clus:
            for num_word in range(0, len(tfidf.toarray()[arti])):
                if tfidf.toarray().item(arti, num_word) != 0:
                    ffmes = FF(clus, num_word, partitions)
                    if ffmes > ffmeanF[num_word] and ffmes > ffmean_total:
                        labels_for_clus[num_word] = ffmes
        labels_for_clusters[clus] = labels_for_clus
    labels[clusters] = labels_for_clusters



MemoryError: 

In [32]:
# TODO: sigma are ecart-type :)

def inter(listA, listB):
    return np.intersect1d(listA, listB)
    
# cluster_t and cluster_s must be in two different partitions
def proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S):
    total_inter = 0
    total_t = 0
    for f in range(0, len(labels[num_partition_T][num_cluster_t])):
        if labels[num_partition_T][num_cluster_t][f][0] == labels[num_partition_S][num_cluster_s][f][0]:
            total_inter += labels[partition_T][cluster_t][f][1]
            
        total_t += labels[num_partition_T][num_cluster_t][f][1]
    return total_inter / total_t
    

def P_A(num_cluster_s, num_partition_T, num_partition_S):
    # first, we have to know what are the cluster which got the label
    total = 0
    nb_computation = 0
    for label_s in labels[num_partition_S][num_cluster_s]:
        for num_cluster_t in range(0, len(partitions[num_partition_T])):
            if label_s in labels[num_partition_T][num_cluster_t]:
                total += proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S)
                nb_computation += 1
    if nb_computation == 0:
        return 0
    return total / nb_computation

# Define a coeficient for the activity 
def activity(num_partition_S, num_partition_T):
    res = 0
    for num_cluster_s in range(0, len(partitions[num_partition_S])):
        res += P_A(num_cluster_s, num_partition_T, num_partition_S)
    return res / len(partitions[num_partition_S])

# Standard deviation
# Nothing have been find in the paper, so I put those random values ¯\_(ツ)_/¯
sigma_t = 0.01
sigma_s = 0.01

# Our Graal
# Does cluster_t is similar to cluster_s?
def similar(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S):
    cond1 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > P_A(num_cluster_s, num_partition_T, num_partition_S)
    cond2 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > activity(num_partition_S, num_partition_T) + sigma_s
    
    cond1 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > P_A(num_cluster_s, num_partition_T, num_partition_S)
    cond2 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > activity(num_partition_T, num_partition_S) + sigma_t
    return cond1 and cond2 and cond3 and cond4
    

In [33]:
similar(3, 3, 0, 1)

False

In [29]:
print(labels[0][1])
labels[0][1][0][1]

[('hotspot', 0.02679405983234753), ('relation', 0.006417877842461409), ('photographie', 0.006326824184650093), ('détecter', 0.0038242003822581004), ('approcher extraire relation', 0.003349257479043441)]


0.02679405983234753