In [1]:
import nltk
import pandas as pd
import random
import numpy as np
import string
import math

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from nltk.tokenize import word_tokenize

from numpy import array
from collections import Counter
from scipy.sparse import csr_matrix

from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
from gensim import corpora, models, utils
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
from gensim.models import LsiModel
from gensim.corpora import Dictionary

import re

In [2]:
# Use spacy lib
# On https://spacy.io/

import spacy
nlp = spacy.load('fr')

In [3]:
##############
# Parameters #
##############

min_gram = 1
max_gram = 3

# To create ours partitions, we must first know the years which will be the limits
limit_years = [2007, 2010, 2014]

# Ignore words that appear at a frequency less than max_frequ in the corpus
max_frequ = 0.8

# Ignore words appearing less than min_appear in the whole corpus
min_appear = 5

# Number of clusters by partitions
nb_clusters = 5

In [4]:
# Datas preprocessing methods.

# Lemmatisation without poncutations

stemmer = nltk.stem.snowball.FrenchStemmer()
fstw = stopwords.words('french')

# French Stop Words, extraits depuis le fichier stopwords-fr.txt + stopwords french de nltk
sourceFST = [x.replace('\n', '') for x in open('stopwords-fr.txt', mode="r", encoding="utf-8").readlines()]+fstw
sourceFST += [x.replace('\n', '') for x in open('perso_words-fr.txt', mode="r", encoding="utf-8").readlines()]

# Based on ration of french and english stopwords
def isEnglish(article):
    total_fsw = len([x for x in article.split() if x in sourceFST])
    total_esw = len([x for x in article.split() if x in stopwords.words('english')])
    ratio = 100
    if total_fsw != 0:
        ratio = total_esw/total_fsw
    return ratio > 1 and total_esw > 3

def lemmatize(article):
    arti_lower = article.lower()
    arti_2words = re.sub(" [0-z][0-z] ", " ", arti_lower) # word of length < 2
    arti_e = re.sub("(é|è|ê)", "e", arti_2words)
    arti_o = re.sub("à", "a", arti_e)
    arti_i = re.sub("ô", "o", arti_o)
    artiregex = re.sub("î", "i", arti_i)
    output = []
    outPonc = artiregex.translate(artiregex.maketrans("","", string.punctuation))
    outLem = nlp(outPonc)
    for token in outLem:
        if token.lemma_ not in sourceFST and [x for x in token.lemma_ if x not in "0123456789"] != []:
            output.append(token.lemma_)
    res = ' '.join(output)
    return res

In [5]:
# Data Reading
data = pd.read_csv('export_articles_EGC_2004_2018.csv', sep='\t', header=0)

In [6]:
# Let's process our corpus, and determine a limit to split it in partitions

# usable[] correspond to our corpus processed
# limits[] let us know when to delimit partitions
limits = []
usable = []

prev_year = data['year'][0]
numArti = 0
for i in range(0, len(data['abstract']), 1):
    #if not null, empty, or whatever (so if there is a abstract):
    if not isinstance(data['abstract'][i], float) and not isEnglish(data['abstract'][i]):
        text = data['abstract'][i]
        if not isinstance(data['title'][i], float):
            text += " "+data['title'][i]

        numArti+=1
        usable.append(re.sub(" [0-z][0-z] ", " ", stemmer.stem(lemmatize(text))))
        year = data['year'][i]
        if year != prev_year:
            prev_year = year
            if year in limit_years:
                limits.append(numArti)
limits.append(numArti)



In [7]:
# Display pre-processed datas

vectorizer = TfidfVectorizer(stop_words=sourceFST, use_idf=True, ngram_range=(min_gram, max_gram), max_df=max_frequ, min_df=min_appear)
tfidf = vectorizer.fit_transform(usable)

print("nombre d'articles =", len(usable))
print("nombre de mots =", len(tfidf.toarray()[0]))
print("limits =", limits)

usable[1]

  sorted(inconsistent))


nombre d'articles = 991
nombre de mots = 2397
limits = [223, 468, 694, 991]


'classification croise coclustering technique permettre dextraire structuresousjacente existant entrer ligne colonne tabler donneer sou former bloc application utiliser technique algorithme coclustering actuel passer lechelle approche utilise succe method modl optimiser critere vraisemblance regularisee cependent taille plaire important methode atteindre limiter article presenter nouvel algorithme coclustering niveau compter critere modl permettre traiter efficacement donnee grand tailler pouvoir memoir experience montrer lapproche propose gagn temps calcul produire solution qualite two level coclustering algorithm for very large dater set'

In [8]:
# Creation of partitions_tfidf[], which give us the TFIDF of each cluster of each partition
# partitions_tfidf[num_partition][num_doc][num_word]
# Beware, num_doc can't be equals to 1091 (max). You have partitions, so every doc aren't in every partitions
# num_word can be found via vectorizer.get_feature_name()
partitions_tfidf = []
beg = 0
for l in limits:
    last = l
    partitions_tfidf.append([list(x) for x in list(tfidf.toarray())[beg:last]])
    beg = l

In [9]:
vectorizer.get_feature_names()

['acce',
 'accessible',
 'achat',
 'acquisition',
 'acquérir',
 'acteur',
 'actif',
 'action',
 'activite',
 'actuel',
 'actuellement',
 'adapt',
 'adaptatif',
 'adaptation',
 'adapte',
 'adaptee',
 'adapter',
 'adequat',
 'adn',
 'adopter',
 'afc',
 'affiner',
 'agent',
 'agregation',
 'aid',
 'aider',
 'ainsiqu',
 'ajoutee',
 'ajouter',
 'ala',
 'aleatoir',
 'algebriqu',
 'algorithm',
 'algorithm classification',
 'algorithm dapprentissage',
 'algorithm dextraction',
 'algorithm efficace',
 'algorithme',
 'algorithme dapprentissage',
 'algorithme dextraction',
 'algorithme ete',
 'algorithme fouiller',
 'algorithme incremental',
 'algorithmique',
 'alignement',
 'alternatif',
 'amelior',
 'amelioration',
 'ameliore',
 'amelioree',
 'ameliorer',
 'ameliorer qualite',
 'amene',
 'amont',
 'analys',
 'analyse',
 'analyser',
 'analyser donnee',
 'analyser factoriel',
 'analyser semantiqu',
 'analytique',
 'anne',
 'annees',
 'annot',
 'annotation',
 'annotation semantiqu',
 'annoter',
 '

# KMeans

In [10]:
# Applying KMeans on tfidf
# the labels_ give assignment of doc to the cluster number 


In [12]:
# doc_clustering is a dictionnary 
# it looks like -> { doc_number : [partition_number, cluster_number] }
# This is used to reassign doc number to their respective partition and and cluster
doc_clustering = {}

km = KMeans(n_clusters=nb_clusters)

numDoc = 0
for i in range(0, len(limits)):
    dash = km.fit(partitions_tfidf[i])
    previousBound = 0
    if i > 0:
        previousBound = limits[i-1]
    for numDocItern in range(0, limits[i]-previousBound):
        doc_clustering[numDoc] = [i, dash.labels_[numDocItern]]
        numDoc+=1

In [13]:
doc_clustering

{0: [0, 3],
 1: [0, 4],
 2: [0, 0],
 3: [0, 1],
 4: [0, 1],
 5: [0, 2],
 6: [0, 3],
 7: [0, 3],
 8: [0, 0],
 9: [0, 3],
 10: [0, 4],
 11: [0, 3],
 12: [0, 2],
 13: [0, 3],
 14: [0, 2],
 15: [0, 1],
 16: [0, 2],
 17: [0, 2],
 18: [0, 2],
 19: [0, 3],
 20: [0, 1],
 21: [0, 3],
 22: [0, 3],
 23: [0, 4],
 24: [0, 4],
 25: [0, 3],
 26: [0, 2],
 27: [0, 4],
 28: [0, 0],
 29: [0, 0],
 30: [0, 4],
 31: [0, 2],
 32: [0, 1],
 33: [0, 1],
 34: [0, 4],
 35: [0, 4],
 36: [0, 1],
 37: [0, 1],
 38: [0, 3],
 39: [0, 4],
 40: [0, 4],
 41: [0, 3],
 42: [0, 0],
 43: [0, 0],
 44: [0, 4],
 45: [0, 4],
 46: [0, 4],
 47: [0, 4],
 48: [0, 4],
 49: [0, 3],
 50: [0, 4],
 51: [0, 0],
 52: [0, 4],
 53: [0, 0],
 54: [0, 4],
 55: [0, 3],
 56: [0, 3],
 57: [0, 1],
 58: [0, 1],
 59: [0, 4],
 60: [0, 4],
 61: [0, 0],
 62: [0, 4],
 63: [0, 3],
 64: [0, 0],
 65: [0, 4],
 66: [0, 0],
 67: [0, 3],
 68: [0, 3],
 69: [0, 4],
 70: [0, 1],
 71: [0, 1],
 72: [0, 4],
 73: [0, 2],
 74: [0, 3],
 75: [0, 2],
 76: [0, 2],
 77: [0, 

In [14]:
# Allows to get list of documents number
# return [dou numbers]
# params : partition_number , cluster number
def get_doc(part, clust):
    docs = []
    for i in range(0,len(doc_clustering)):
        if doc_clustering[i][0] == part and doc_clustering[i][1] == clust:
            docs.append(i)
    return docs

In [15]:
# Get the partitions variable
# Here partitions[part][cluster] = list of docs numbe
partitions = []
for i in range(0, len(limits)):
    clusters = []
    for j in range(0, nb_clusters):
        clusters.append(get_doc(i,j))
    partitions.append(clusters)

In [16]:
partitions

[[[2,
   8,
   28,
   29,
   42,
   43,
   51,
   53,
   61,
   64,
   66,
   95,
   104,
   107,
   110,
   112,
   113,
   132,
   135,
   141,
   148,
   150,
   161,
   168,
   181,
   193,
   198,
   202],
  [3,
   4,
   15,
   20,
   32,
   33,
   36,
   37,
   57,
   58,
   70,
   71,
   79,
   85,
   87,
   99,
   102,
   109,
   117,
   118,
   120,
   121,
   123,
   124,
   130,
   134,
   136,
   139,
   149,
   151,
   153,
   154,
   155,
   159,
   160,
   165,
   169,
   170,
   175,
   179,
   184,
   185,
   186,
   189,
   191,
   205,
   209,
   210,
   212,
   216,
   218],
  [5,
   12,
   14,
   16,
   17,
   18,
   26,
   31,
   73,
   75,
   76,
   78,
   82,
   84,
   105,
   106,
   108,
   125,
   126,
   128,
   138,
   140,
   143,
   146,
   147,
   157,
   162,
   171,
   180,
   192,
   201,
   203,
   204,
   207,
   208,
   222],
  [0,
   6,
   7,
   9,
   11,
   13,
   19,
   21,
   22,
   25,
   38,
   41,
   49,
   55,
   56,
   63,
   67,
   68,
  

# Quality Measure

In [17]:
# INSERT QUALITY MEASURE HERE

# Khi²

In [18]:
# tf_of_your_word = tf[numDoc][strWord]
tf = []
for doc in usable:
    tf_doc = {}
    for word in vectorizer.get_feature_names():
        tf_doc[word] = doc.count(word)
    tf.append(tf_doc)

In [19]:
# Number total of words
# nb_total_word[numPartition]
nb_total_word = []
nb = 0

for numDoc in range(0, len(usable)):
    for word in vectorizer.get_feature_names():
        nb += tf[numDoc][word]
    if numDoc+1 in limits:
        nb_total_word.append(nb)
        nb=0
    

In [20]:
nb_total_word

[26871, 29619, 26638, 36296]

In [21]:
tf[0]

{'tâcher': 0,
 'miser': 0,
 'dissimilarit': 0,
 'sembler': 0,
 'caracteriseer': 0,
 'premiere': 0,
 'traiter donnee': 0,
 'daider': 0,
 'depender': 0,
 'extrait': 0,
 'similarite': 0,
 'outil': 0,
 'exploitable': 0,
 'etablie': 0,
 'topolog': 0,
 'communautaire': 0,
 'apprentissage automatique': 0,
 'travailler': 0,
 'dabord': 0,
 'densite': 0,
 'frequence': 0,
 'ensemble regl': 0,
 'ete implemente': 0,
 'lineaire': 0,
 'historique': 0,
 'extraite': 0,
 'faisabilite': 0,
 'qualite': 0,
 'dontologie partir': 0,
 'algorithme fouiller': 0,
 'corpus': 0,
 'quell': 0,
 'cle': 0,
 'acteur': 0,
 'formation': 0,
 'detablir': 0,
 'article proposer methode': 0,
 'analyser semantiqu': 0,
 'grand': 0,
 'standard': 0,
 'reconnaître': 0,
 'predicteur': 0,
 'image': 0,
 'nouvel': 0,
 'system dinformation': 0,
 'xquery': 0,
 'presentent': 0,
 'caracterise': 0,
 'articlenou': 0,
 'associ': 0,
 'heterogen': 0,
 'connu': 0,
 'structurer donnee': 0,
 'gestion': 0,
 'technologi': 0,
 'dimager': 0,
 'classi

In [22]:
# nb_word[num_partition][word]
nb_word = []

word_in_this_parti = {}
for word in vectorizer.get_feature_names():
    word_in_this_parti[word] = 0

for numDoc in range(0, len(usable)):
    for word in vectorizer.get_feature_names():
        word_in_this_parti[word] += tf[numDoc][word]
    if numDoc+1 in limits:
        nb_word.append(word_in_this_parti)
        word_in_this_parti = {}
        for word in vectorizer.get_feature_names():
            word_in_this_parti[word] = 0

In [23]:
len(nb_word)

4

In [24]:
# nb_word_by_cluster[numPartition][numCluster]
nb_word_by_cluster = []
for parti in partitions:
    nb_word_clus = []
    for cluster in parti:
        nb = 0
        for numDoc in cluster:
            for word in vectorizer.get_feature_names():
                nb += tf[numDoc][word]
        nb_word_clus.append(nb)
    nb_word_by_cluster.append(nb_word_clus)

In [25]:
# Expected values, if nothing were dependant
# exp[numPartition][numCluster][numWord]
#exp = []
#for numParti in range(0, len(partitions)):
#    exp_clus = []
#    for numCluster in range(0, len(partitions[numParti])):
#        exp_word = []
#        for numWord in range(0, vectorizer.get_feature_names()):
#            exp_word.append((nb_word[numParti][numWord] + nb_word_by_cluster[numPart][numCluster]) / nb_total_word[numParti])
#        exp_cluster.append(exp_word)
#    exp.append(exp_clus)


In [26]:
# value_of_khi2 = khi2[numPartition][numCluster][word]
khi2 = []

for numParti in range(0, len(partitions)):
    khi2parti = []
    for numCluster in range(0, len(partitions[numParti])):
        khi2cluster = {}
        
        for word in vectorizer.get_feature_names():
            word_in_this_parti[word] = 0
            E = nb_word[numParti][word]
            E =+ nb_word_by_cluster[numParti][numCluster]
            E = E/ nb_total_word[numParti]
            N = 0
            for numDoc in partitions[numParti][numCluster]:
                N += tf[numDoc][word]
            khi2cluster[word] = (pow(N - E, 2)/E)        
        khi2parti.append(khi2cluster)
    khi2.append(khi2parti)

In [27]:
# list of your labels = labels[numPartition][numCluster]
labels = []

for numPartition in range(0, len(nb_word_by_cluster)):
    label_clus = []
    for numCluster in range(0, len(nb_word_by_cluster[numPartition])):
        label_clus.append(Counter(khi2[numPartition][numCluster]).most_common(5))
    labels.append(label_clus)

In [28]:
labels

[[[('utilis', 32114.449285655046),
   ('tre', 27210.013070419256),
   ('recommandation', 27210.013070419256),
   ('don', 24462.304182873908),
   ('donne', 18619.595899923486)],
  [('tre', 97510.57116399668),
   ('for', 33357.007117990695),
   ('don', 18310.03169653174),
   ('donne', 15121.389503369615),
   ('ete', 13641.377056544343)],
  [('don', 44287.64083212384),
   ('donne', 36586.840832123846),
   ('donnee', 27994.34083212385),
   ('tre', 24878.874165457182),
   ('motif', 23390.040832123854)],
  [('tre', 49712.56476088993),
   ('don', 19441.678684940565),
   ('for', 18863.69767228234),
   ('ete', 17733.90020392791),
   ('donne', 17182.08374823171)],
  [('don', 151807.7332852144),
   ('donne', 145475.29023532424),
   ('donnee', 110313.59585189154),
   ('tre', 50790.45057420095),
   ('for', 31007.7031628423)]],
 [[('don', 133461.81364922848),
   ('tre', 121893.25208421123),
   ('donne', 114107.25872314192),
   ('donnee', 89363.07814422715),
   ('for', 49654.82630745634)],
  [('tre',