In [32]:
import nltk
import pandas as pd
import random
import numpy as np
import string
import math

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

from numpy import array
from collections import Counter
from scipy.sparse import csr_matrix

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
from gensim import corpora, models, utils
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim.models import LsiModel
from gensim.corpora import Dictionary

import re

In [33]:
# Use spacy lib
# On https://spacy.io/

import spacy
nlp = spacy.load('fr')

In [34]:
##############
# Parameters #
##############

min_gram = 1
max_gram = 3

# To create ours partitions, we must first know the years which will be the limits
limit_years = [2007, 2010, 2013, 2016]

# Ignore words that appear at a frequency less than max_frequ in the corpus
max_frequ = 0.8

# Ignore words appearing less than min_appear in the whole corpus
min_appear = 5

# Range fo cluster number you want to test
cluster_ranges = range(2, 30) #range(2, 100) # Warning, long to compute (but nice)

# Number of trial you want to do for each test
nb_trial_by_test = 3

# Number of cluster you finally choose
nb_cluster = 20

# Max iteration for each kmeans (default: 300)
max_iter = 300

In [35]:
# Datas preprocessing methods.

# Lemmatisation without poncutations

stemmer = nltk.stem.snowball.FrenchStemmer()
fstw = stopwords.words('french')

# French Stop Words, extraits depuis le fichier stopwords-fr.txt + stopwords french de nltk
sourceFST = [x.replace('\n', '') for x in open('stopwords-fr.txt', mode="r", encoding="utf-8").readlines()]+fstw
sourceFST += [x.replace('\n', '') for x in open('perso_words-fr.txt', mode="r", encoding="utf-8").readlines()]

# Based on ration of french and english stopwords
def isEnglish(article):
    total_fsw = len([x for x in article.split() if x in sourceFST])
    total_esw = len([x for x in article.split() if x in stopwords.words('english')])
    ratio = 100
    if total_fsw != 0:
        ratio = total_esw/total_fsw
    return ratio > 1 and total_esw > 3

def lemmatize(article):
    arti_lower = article.lower()
    arti_2words = re.sub(" [0-z][0-z] ", " ", arti_lower) # word of length < 2
    arti_e = re.sub("(é|è|ê)", "e", arti_2words)
    arti_o = re.sub("à", "a", arti_e)
    arti_i = re.sub("ô", "o", arti_o)
    artiregex = re.sub("î", "i", arti_i)
    output = []
    outPonc = artiregex.translate(artiregex.maketrans("","", string.punctuation))
    outLem = nlp(outPonc)
    for token in outLem:
        if token.lemma_ not in sourceFST and [x for x in token.lemma_ if x not in "0123456789"] != []:
            output.append(token.lemma_)
    res = ' '.join(output)
    return res

In [36]:
# Data Reading
data = pd.read_csv('export_articles_EGC_2004_2018.csv', sep='\t', header=0)

In [37]:
# Let's process our corpus, and determine a limit to split it in partitions

# usable[] correspond to our corpus processed
# limits[] let us know when to delimit partitions
limits = []
usable = []

prev_year = data['year'][0]
numArti = 0
for i in range(0, len(data['abstract']), 1):
    #if not null, empty, or whatever (so if there is a abstract):
    if not isinstance(data['abstract'][i], float) and not isEnglish(data['abstract'][i]):
        text = data['abstract'][i]
        if not isinstance(data['title'][i], float):
            text += " "+data['title'][i]

        numArti+=1
        usable.append(re.sub(" [0-z][0-z] ", " ", stemmer.stem(lemmatize(text))))
        year = data['year'][i]
        if year != prev_year:
            prev_year = year
            if year in limit_years:
                limits.append(numArti)
limits.append(numArti)



In [38]:
# Post-process word removal
post_words = [x.replace('\n', '') for x in open('post_process_words-fr.txt', mode="r", encoding="utf-8").readlines()]

for i in range(0, len(usable)):
    arti = usable[i].split()
    res = []
    for word in arti:
        if word not in post_words:
            res.append(word)
    usable[i] = ' '.join(res)

In [39]:
# Display pre-processed datas

vectorizer = TfidfVectorizer(stop_words=sourceFST, use_idf=True, ngram_range=(min_gram, max_gram), max_df=max_frequ, min_df=min_appear)
tfidf = vectorizer.fit_transform(usable)

print("nombre d'articles =", len(usable))
print("nombre de mots =", len(tfidf.toarray()[0]))
print("limits =", limits)

usable[0]

nombre d'articles = 991
nombre de mots = 2179
limits = [114, 301, 468, 694, 991]


'plateforme objectif permettre citoyen euxmemer tweet politique devenement specifiqu francepour lelection presidentiell ideo2017 analyser quasitemps reel message candidat fournir principal caracteristiqueslusage lexiqu politique comparaison candidat ideo2017 plateforme citoyen dediee lanalyse tweet evenement polit'

In [40]:
# Creation of partitions_tfidf[], which give us the TFIDF of each cluster of each partition
# partitions_tfidf[num_partition][num_doc][num_word]
# Beware, num_doc can't be equals to 1091 (max). You have partitions, so every doc aren't in every partitions
# num_word can be found via vectorizer.get_feature_name()
partitions_tfidf = []
beg = 0
for l in limits:
    last = l
    partitions_tfidf.append([list(x) for x in list(tfidf.toarray())[beg:last]])
    beg = l

In [41]:
vectorizer.get_feature_names()

['acce',
 'accessible',
 'achat',
 'acquisition',
 'acquérir',
 'acteur',
 'actif',
 'action',
 'activite',
 'actuel',
 'actuellement',
 'adapt',
 'adaptatif',
 'adaptation',
 'adapte',
 'adaptee',
 'adapter',
 'adequat',
 'adn',
 'adopter',
 'afc',
 'affiner',
 'agent',
 'agregation',
 'aid',
 'aider',
 'ainsiqu',
 'ajoutee',
 'ajouter',
 'ala',
 'aleatoir',
 'algebriqu',
 'algorithme',
 'algorithme dapprentissage',
 'algorithme dextraction',
 'algorithme efficace',
 'algorithme fouiller',
 'algorithme incremental',
 'algorithmique',
 'alignement',
 'alternatif',
 'amelior',
 'amelioration',
 'ameliore',
 'amelioree',
 'ameliorer',
 'ameliorer qualite',
 'amene',
 'amont',
 'analys',
 'analyse',
 'analyser',
 'analyser factoriel',
 'analyser semantiqu',
 'analytique',
 'anne',
 'annees',
 'annot',
 'annotation',
 'annotation semantiqu',
 'annoter',
 'anormal',
 'anr',
 'apartir',
 'apparer',
 'appartenir',
 'appel',
 'appele',
 'appelee',
 'appeler',
 'applicable',
 'applicatif',
 'ap

# KMeans & Silhouette Score

In [42]:
# Applying KMeans on tfidf
# the labels_ give assignment of doc to the cluster number 


In [43]:
# doc_clustering is a dictionnary 
# it looks like -> { doc_number : [partition_number, cluster_number] }
# This is used to reassign doc number to their respective partition and and cluster

def kmeans(nb_clusters):
    doc_clustering = {}
    
    km = KMeans(n_clusters=nb_clusters, max_iter=max_iter)

    # Silhouette score mean
    silhouette_mean = 0

    numDoc = 0
    for i in range(0, len(limits)):
        dash = km.fit(partitions_tfidf[i])

        # Silhouette
        silhouette_mean += silhouette_score(partitions_tfidf[i], dash.labels_)

        previousBound = 0
        if i > 0:
            previousBound = limits[i-1]
        for numDocItern in range(0, limits[i]-previousBound):
            doc_clustering[numDoc] = [i, dash.labels_[numDocItern]]
            numDoc+=1

    silhouette_mean = silhouette_mean / len(limits)
    res = {}
    res["silhouette"] = silhouette_mean
    res["clustering"] = doc_clustering
    return res


In [45]:
# Compute Silhouette Score for each number of cluster

silhouette_by_cluster_nb = {}

for nbClusters in cluster_ranges:
    print("Computing for", nbClusters, "clusters...")
    silhouette_avg = 0
    for trial in range(0, nb_trial_by_test):
        km = kmeans(nbClusters)
        silhouette_avg += km["silhouette"]
    silhouette_avg = silhouette_avg / nb_trial_by_test
    silhouette_by_cluster_nb[nb_clusters] = silhouette_avg

Computing for  2 clusters
Computing for  3 clusters
Computing for  4 clusters
Computing for  5 clusters
Computing for  6 clusters
Computing for  7 clusters
Computing for  8 clusters
Computing for  9 clusters
Computing for  10 clusters
Computing for  11 clusters
Computing for  12 clusters
Computing for  13 clusters
Computing for  14 clusters
Computing for  15 clusters
Computing for  16 clusters
Computing for  17 clusters
Computing for  18 clusters
Computing for  19 clusters
Computing for  20 clusters
Computing for  21 clusters
Computing for  22 clusters
Computing for  23 clusters
Computing for  24 clusters
Computing for  25 clusters
Computing for  26 clusters
Computing for  27 clusters
Computing for  28 clusters
Computing for  29 clusters


In [46]:
# We want silhouette scores to be high
silhouette_by_cluster_nb

{2: 0.005410570209443134,
 3: 0.006187564487044067,
 4: 0.00685720147411483,
 5: 0.008150191380431104,
 6: 0.008704565403188164,
 7: 0.009000500668185118,
 8: 0.010777420429916355,
 9: 0.011451913110778053,
 10: 0.012022667276738048,
 11: 0.013108222172617624,
 12: 0.013522613422691176,
 13: 0.014588560600138986,
 14: 0.015610055690953343,
 15: 0.016555568462439354,
 16: 0.01636134801724312,
 17: 0.017897108298325512,
 18: 0.017655434085125255,
 19: 0.01885256852553832,
 20: 0.019543648720926968,
 21: 0.02065440013808222,
 22: 0.021778124560471887,
 23: 0.023321975917530446,
 24: 0.022329278939527696,
 25: 0.022648310093930774,
 26: 0.023402410092019568,
 27: 0.024736841405944784,
 28: 0.025736564437768523,
 29: 0.02577226177039991}

In [15]:
doc_clustering = kmeans(nb_cluster)["clustering"]

In [16]:
# Allows to get list of documents number
# return [dou numbers]
# params : partition_number , cluster number
def get_doc(part, clust):
    docs = []
    for i in range(0,len(doc_clustering)):
        if doc_clustering[i][0] == part and doc_clustering[i][1] == clust:
            docs.append(i)
    return docs

In [17]:
# Get the partitions variable
# Here partitions[part][cluster] = list of docs numbe
partitions = []
for i in range(0, len(limits)):
    clusters = []
    for j in range(0, nb_clusters):
        clusters.append(get_doc(i,j))
    partitions.append(clusters)

In [18]:
partitions

[[[17, 106],
  [2, 29, 51, 95, 104],
  [7, 62, 67, 97, 112],
  [33, 39, 40, 45, 52, 72, 91, 101],
  [6, 12, 41, 92, 108],
  [49, 74, 78, 103, 109],
  [22, 53, 79, 96, 113],
  [3, 13, 28, 42, 47, 64, 75, 83, 90],
  [14, 48, 54, 66, 82, 84, 100],
  [59, 60],
  [36, 37, 71],
  [0, 25, 26, 38, 73, 76, 77, 81],
  [15, 63, 70, 87],
  [24, 27, 30, 35, 44, 89, 110],
  [8, 19, 43, 57, 61, 107, 111],
  [9, 32, 34, 94, 98],
  [4, 23, 56, 58, 65],
  [5, 10, 16, 18, 86, 105],
  [11, 20, 21, 31, 46, 55, 80, 93, 99],
  [1, 50, 68, 69, 85, 88, 102],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  []],
 [[133, 134, 155, 156, 199, 209, 217, 225, 268, 273, 281],
  [114,
   136,
   139,
   143,
   158,
   163,
   165,
   168,
   174,
   184,
   194,
   196,
   212,
   224,
   226,
   230,
   296],
  [125,
   128,
   146,
   162,
   171,
   185,
   192,
   204,
   231,
   255,
   257,
   259,
   262,
   274,
   286,
   300],
  [116, 117, 126, 127, 131, 144, 161, 164, 166, 221, 294],
  [129, 154, 172, 180

# Khi²

In [19]:
# tf_of_your_word = tf[numDoc][strWord]
tf = []
for doc in usable:
    tf_doc = {}
    for word in vectorizer.get_feature_names():
        tf_doc[word] = doc.count(word)
    tf.append(tf_doc)

In [20]:
# Number total of words
# nb_total_word[numPartition]
nb_total_word = []
nb = 0

for numDoc in range(0, len(usable)):
    for word in vectorizer.get_feature_names():
        nb += tf[numDoc][word]
    if numDoc+1 in limits:
        nb_total_word.append(nb)
        nb=0
    

In [21]:
nb_total_word

[10917, 17349, 16649, 21289, 28637]

In [22]:
tf[0]

{'noyau': 0,
 'danalyse': 0,
 'apparer': 0,
 'reell issu': 0,
 'tou': 0,
 'frequente': 0,
 'dexprimer': 0,
 'prediction': 0,
 'systeme dinformation': 0,
 'decrivons': 0,
 'evaluer': 0,
 'algorithme': 0,
 'tenter': 0,
 'article classification': 0,
 'programmer': 0,
 'contribuer': 0,
 'proposon': 0,
 'segmentation': 0,
 'classement': 0,
 'jeu': 0,
 'scor': 0,
 'lextraction motif': 0,
 'classe': 0,
 'transformer': 0,
 'organisation': 0,
 'trouver': 0,
 'plan': 0,
 'knowledge': 0,
 'papier presente': 0,
 'disposer': 0,
 'base donneer': 0,
 'lautomatisation': 0,
 'constant': 0,
 'nombre': 0,
 'rappel': 0,
 'traduire': 0,
 'induire': 0,
 'conference': 0,
 'determine': 0,
 'region': 0,
 'article methode original': 0,
 'rapport': 0,
 'bruitees': 0,
 'efficacite': 0,
 'detude': 0,
 'article decrit': 0,
 'dedonnee': 0,
 'comprendre': 0,
 'collaboratif': 0,
 'relatif': 0,
 'decouverte motif': 0,
 'convergence': 0,
 'proposees': 0,
 'dense': 0,
 'individu': 0,
 'graph voisinage': 0,
 'nommees': 0,

In [23]:
# nb_word[num_partition][word]
nb_word = []

word_in_this_parti = {}
for word in vectorizer.get_feature_names():
    word_in_this_parti[word] = 0

for numDoc in range(0, len(usable)):
    for word in vectorizer.get_feature_names():
        word_in_this_parti[word] += tf[numDoc][word]
    if numDoc+1 in limits:
        nb_word.append(word_in_this_parti)
        word_in_this_parti = {}
        for word in vectorizer.get_feature_names():
            word_in_this_parti[word] = 0

In [24]:
len(nb_word)

5

In [25]:
# nb_word_by_cluster[numPartition][numCluster]
nb_word_by_cluster = []
for parti in partitions:
    nb_word_clus = []
    for cluster in parti:
        nb = 0
        for numDoc in cluster:
            for word in vectorizer.get_feature_names():
                nb += tf[numDoc][word]
        nb_word_clus.append(nb)
    nb_word_by_cluster.append(nb_word_clus)

In [26]:
# value_of_khi2 = khi2[numPartition][numCluster][word]
khi2 = []

for numParti in range(0, len(partitions)):
    khi2parti = []
    for numCluster in range(0, len(partitions[numParti])):
        khi2cluster = {}
        
        for word in vectorizer.get_feature_names():
            if nb_word_by_cluster[numParti][numCluster] == 0:
                khi2cluster[word] = 0
            else:
                word_in_this_parti[word] = 0
                E = nb_word[numParti][word]
                E =+ nb_word_by_cluster[numParti][numCluster]
                E = E/ nb_total_word[numParti]
                N = 0
                for numDoc in partitions[numParti][numCluster]:
                    N += tf[numDoc][word]
                khi2cluster[word] = (pow(N - E, 2)/E)        
        khi2parti.append(khi2cluster)
    khi2.append(khi2parti)

In [27]:
# list of your labels = labels[numPartition][numCluster]
labels = []

for numPartition in range(0, len(nb_word_by_cluster)):
    label_clus = []
    for numCluster in range(0, len(nb_word_by_cluster[numPartition])):
        label_clus.append(Counter(khi2[numPartition][numCluster]).most_common(5))
    labels.append(label_clus)

In [52]:
# Some clusters can be empty
len(labels[0])

29

# Diachronic Analysis

In [55]:

def inter(listA, listB):
    return np.intersect1d(listA, listB)
    
# cluster_t and cluster_s must be in two different partitions
def proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S):
    total_inter = 0
    total_t = 0
    for f in range(0, len(labels[num_partition_T][num_cluster_t])):
        for f_s in labels[num_partition_S][num_cluster_s]:
            if labels[num_partition_T][num_cluster_t][f][0] == f_s[0]:
                total_inter += labels[num_partition_T][num_cluster_t][f][1]
                break
        total_t += labels[num_partition_T][num_cluster_t][f][1]
    if total_t == 0:
        return 0
    return total_inter / total_t
    

def P_A(num_cluster_s, num_partition_T, num_partition_S):
    # first, we have to know what are the cluster which got the label
    total = 0
    nb_computation = 0
    for label_s in labels[num_partition_S][num_cluster_s]:
        for num_cluster_t in range(0, len(partitions[num_partition_T])):
            if label_s in labels[num_partition_T][num_cluster_t]:
                total += proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S)
                nb_computation += 1
    if nb_computation == 0:
        return 0
    return total / nb_computation

# Define a coeficient for the activity 
def activity(num_partition_S, num_partition_T):
    res = 0
    for num_cluster_s in range(0, len(partitions[num_partition_S])):
        res += P_A(num_cluster_s, num_partition_T, num_partition_S)
    return res / len(partitions[num_partition_S])

# Ecart-type, but it isn't very usefull xD
sigma_t = 0.01
sigma_s = 0.01

# Our Graal
def similar(num_cluster_t, num_partition_T, num_cluster_s, num_partition_S):
    cond1 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > P_A(num_cluster_s, num_partition_T, num_partition_S)
    cond2 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > activity(num_partition_S, num_partition_T) + sigma_s
    
    cond3 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > P_A(num_cluster_s, num_partition_T, num_partition_S)
    cond4 = proba(num_cluster_t, num_cluster_s, num_partition_T, num_partition_S) > activity(num_partition_T, num_partition_S) + sigma_t
    return cond1 and cond2 and cond3 and cond4
    

In [57]:
for numParti in range(0, len(partitions)-1):
    for num_cluster_t in range(0, nb_cluster):
        for num_cluster_s in range(0, nb_cluster):
            if similar(num_cluster_t, numParti, num_cluster_s, numParti+1):
                print("("+str(num_cluster_t)+","+str(numParti)+") est similaire à ("+str(num_cluster_s)+","+str(numParti+1)+")")

(0,0) est similaire à (11,1)
(0,0) est similaire à (15,1)
(1,0) est similaire à (5,1)
(1,0) est similaire à (6,1)
(1,0) est similaire à (9,1)
(1,0) est similaire à (10,1)
(1,0) est similaire à (19,1)
(2,0) est similaire à (4,1)
(2,0) est similaire à (9,1)
(3,0) est similaire à (9,1)
(3,0) est similaire à (10,1)
(3,0) est similaire à (19,1)
(4,0) est similaire à (1,1)
(4,0) est similaire à (5,1)
(4,0) est similaire à (6,1)
(4,0) est similaire à (9,1)
(4,0) est similaire à (11,1)
(4,0) est similaire à (16,1)
(5,0) est similaire à (2,1)
(5,0) est similaire à (7,1)
(5,0) est similaire à (12,1)
(5,0) est similaire à (15,1)
(5,0) est similaire à (18,1)
(6,0) est similaire à (1,1)
(6,0) est similaire à (2,1)
(6,0) est similaire à (5,1)
(6,0) est similaire à (6,1)
(6,0) est similaire à (9,1)
(6,0) est similaire à (10,1)
(6,0) est similaire à (11,1)
(6,0) est similaire à (16,1)
(6,0) est similaire à (19,1)
(7,0) est similaire à (5,1)
(7,0) est similaire à (8,1)
(7,0) est similaire à (17,1)
(8,0

(12,3) est similaire à (7,4)
(13,3) est similaire à (1,4)
(13,3) est similaire à (2,4)
(14,3) est similaire à (1,4)
(14,3) est similaire à (2,4)
(14,3) est similaire à (16,4)
(15,3) est similaire à (3,4)
(15,3) est similaire à (4,4)
(15,3) est similaire à (5,4)
(15,3) est similaire à (10,4)
(16,3) est similaire à (0,4)
(16,3) est similaire à (1,4)
(16,3) est similaire à (4,4)
(16,3) est similaire à (8,4)
(16,3) est similaire à (9,4)
(16,3) est similaire à (13,4)
(16,3) est similaire à (14,4)
(16,3) est similaire à (16,4)
(16,3) est similaire à (17,4)
(16,3) est similaire à (18,4)
(17,3) est similaire à (0,4)
(17,3) est similaire à (1,4)
(17,3) est similaire à (4,4)
(17,3) est similaire à (5,4)
(17,3) est similaire à (8,4)
(17,3) est similaire à (9,4)
(17,3) est similaire à (13,4)
(17,3) est similaire à (14,4)
(17,3) est similaire à (15,4)
(17,3) est similaire à (17,4)
(17,3) est similaire à (18,4)
(18,3) est similaire à (0,4)
(18,3) est similaire à (4,4)
(18,3) est similaire à (11,4)
(

In [31]:
print(labels[0][1])
labels[0][1][0][1]

[('recommandation', 12379.303266725825), ('utilisateur', 1043.2261916270124), ('model', 764.7499070420321), ('article', 764.7499070420321), ('interet', 764.7499070420321)]


12379.303266725825