### Imports et initialisations

In [3]:
import numpy as np
# Pour l'échantillonnage aléatoire
import random
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from clustertools import *
import os
import gc

In [38]:
# Text-cleaning variant and first-level clustering run whose clusters get re-clustered.
clean_method, clust_method = "Lem", "som_225"

# Directory in which the clusters are stored; it must end with "/" because
# file names are built from it by plain string concatenation.
clusters_dir = "clusters/1M_{}_clusters/{}/".format(clean_method, clust_method)
# Sub-directory receiving the SOM re-clustering output.
clusters_save_dir = "".join([clusters_dir, "a_reclustering_som/"])

### Chargement des données en mémoire

In [65]:
# Load the first-level clusters from clusters_dir. load_clusters (from clustertools)
# returns three collections that are iterated in parallel further down.
# NOTE(review): presumably cleaned/tokenised messages, raw messages and a 2-D
# projection of each message, judging by the names -- confirm in clustertools.
clean_clusters, raw_clusters, two_dim_clusters = load_clusters(clusters_dir)

In [13]:
# Sanity check: number of messages in the cluster at index 1.
len(clean_clusters[1])

82496

### Encodage du jeu de données

On charge le modèle depuis le disque

In [39]:
from gensim.models import KeyedVectors

In [40]:
# Load the saved word-vector model matching the selected cleaning method.
model = KeyedVectors.load("models/w2vec_model_d300_1M_{}".format(clean_method))
# Word-vector dimension handed to encode().
# NOTE(review): presumably mirrors the "d300" in the model file name -- confirm.
dim = 300

In [41]:
def encode(msg, model, dim):
    """Encode a message as the average of its word vectors.

    Args:
        msg: list of strings, the words of the message.
        model: the model used for encoding; it must support ``word in model``
            and ``model[word]``.
        dim: dimension of the word vectors in this model.

    Returns:
        The element-wise mean of the vectors of all message words known to
        the model. If no word of the message is in the model, a vector of
        ``dim`` zeros is returned.
    """
    known_vectors = []
    for token in msg:
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        # Nothing to average: fall back to a single all-zero vector.
        known_vectors = [np.zeros(dim)]
    return np.mean(known_vectors, axis=0)

In [66]:
%%time
# Encode the clusters: one numpy array per cluster, built from the encoded
# vector of each of its messages.
encoded_clusters = [np.array([encode(msg, model, dim) for msg in cluster]) for cluster in clean_clusters]

CPU times: user 20.2 s, sys: 0 ns, total: 20.2 s
Wall time: 20.2 s


### SOM

In [9]:
from minisom import MiniSom

In [67]:
# Create the output directory together with any missing parent directory.
# exist_ok=True makes re-running the cell harmless and avoids the race between
# a separate existence check and the creation.
os.makedirs(clusters_save_dir, exist_ok=True)

In [68]:
%%time
# Re-cluster every first-level cluster with a small SOM and report the
# sub-clusters whose hate-keyword percentage exceeds 5.
# This is the inline variant of recluster(); unlike that function it writes
# file names prefixed with "_" and only saves the raw messages of the
# reported sub-clusters.
for i, (enc_cluster, clean_cluster, raw_cluster, two_dim_cluster) in enumerate(zip(encoded_clusters, clean_clusters, raw_clusters, two_dim_clusters)):
    # NOTE(review): init_globals presumably stores these views as clustertools
    # module state that parse() reads below -- confirm in clustertools.
    init_globals(clean_cluster, raw_cluster, two_dim_cluster, clusters_save_dir + "_{}".format(i))
    # 3 x 4 map (12 units) over 300-dimensional inputs, with a fixed seed.
    som_model = MiniSom(3, 4, 300, neighborhood_function='gaussian', random_seed=0)
    som_model.pca_weights_init(enc_cluster)
    som_model.train(enc_cluster, 10000, verbose=True)    
    # Label each message with the flattened index of its winning unit
    # (row * 4 + column, 4 being the second grid dimension above).
    som_pred = []
    for msg in enc_cluster:
        winner = som_model.winner(msg)
        som_pred.append(winner[0]*4+winner[1])  
    # Flatten the weight grid in the same row-major order as the labels.
    centroids = []
    for row in som_model.get_weights():
        for w in row:
            centroids.append(w)
    # For every unit, keep the words returned by similar_by_vector (first item of each result).
    som_centers = [[r[0] for r in model.similar_by_vector(c)] for c in centroids]
    # Earlier reporting calls, kept for reference.
    #parse_result(som_pred, 'som_{}'.format(n_clust), som_centers)
    #show_hate_clusters(som_pred, som_centers)
    wlists, mlists, cmlists, e2dmlists, mfw, hkw, clust_n_msg = parse(som_pred)
    # Print the "Cluster i:" header only once, and only if something is reported.
    cluster_number_printed = False
    for j in range(len(clust_n_msg)):
        # hkw[j][0] is reported as a message count and hkw[j][1] as a percentage.
        # NOTE(review): som_centers[j] assumes index j from parse() equals the
        # SOM unit index -- confirm against clustertools.parse.
        if (hkw[j][1] > 5):
            if not cluster_number_printed:
                cluster_number_printed = True
                print("Cluster {}:".format(i))
            print("Sous-cluster {} : {} messages".format(j, clust_n_msg[j]))
            print("{} messages contiennent un mot-clé haineux (soit {:.2f}%)".format(hkw[j][0], hkw[j][1]))
            print("Les {} mots les plus fréquents :".format(len(mfw[j])), end=' ')
            for word in mfw[j]:
                print(word, end=' ')
            print("\nLes mots les plus proches du centre :", end = ' ')
            for w in som_centers[j]:
                print(w, end=' ')
            print('\n')
            save_cluster_raw_msg(mlists[j], clusters_save_dir + "_cluster_{}.{}_raw_msg_{:.2f}pch".format(i,j, hkw[j][1]))
            # Append mode: the summary file accumulates across clusters and across runs.
            with open(clusters_save_dir + "_subclusters_info.txt", "a") as f:
                f.write("Cluster {}, sous-cluster {} : {} messages\n".format(i, j, clust_n_msg[j]))
                f.write("{} messages contiennent un mot-clé haineux (soit {:.2f}%)\n".format(hkw[j][0], hkw[j][1]))
                f.write("Les {} mots les plus fréquents : ".format(len(mfw[j])))
                for word in mfw[j]:
                    f.write(word + ' ')
                f.write("\nLes mots les plus proches du centre : ")
                for w in som_centers[j]:
                    f.write(w + ' ')
                f.write('\n\n')
    # Blank line between clusters that produced a report.
    if cluster_number_printed:
        print()    

In [10]:
def _subcluster_details(hate_stats, frequent_words, center_words):
    """Return the report text shared by the console output and the summary file.

    Args:
        hate_stats: pair (number of messages containing a hate keyword, percentage).
        frequent_words: the most frequent words of the sub-cluster (strings).
        center_words: the words closest to the sub-cluster centre (strings).
    """
    return (
        "{} messages contiennent un mot-clé haineux (soit {:.2f}%)\n".format(hate_stats[0], hate_stats[1])
        + "Les {} mots les plus fréquents : ".format(len(frequent_words))
        + "".join(word + ' ' for word in frequent_words)
        + "\nLes mots les plus proches du centre : "
        + "".join(word + ' ' for word in center_words)
        + "\n\n"
    )


def recluster(som_rows=3, som_cols=4, n_iterations=10000, hate_pct_threshold=5):
    """Re-cluster every loaded cluster with a SOM and report hateful sub-clusters.

    Reads the notebook globals ``encoded_clusters``, ``clean_clusters``,
    ``raw_clusters``, ``two_dim_clusters``, ``clusters_save_dir``, ``model``
    and ``dim``. For each cluster a ``som_rows`` x ``som_cols`` MiniSom is
    trained, every message is labelled with the flattened index of its winning
    unit, and the raw messages of every sub-cluster are saved under
    ``clusters_save_dir``. Sub-clusters whose hate-keyword percentage is
    strictly above ``hate_pct_threshold`` are printed and appended to
    ``subclusters_info.txt``.

    Args:
        som_rows: first dimension of the SOM grid.
        som_cols: second dimension of the SOM grid; also used to flatten
            (row, column) winners, so the two can no longer drift apart.
        n_iterations: number of SOM training iterations.
        hate_pct_threshold: minimum percentage (exclusive) for a sub-cluster
            to be reported.
    """
    info_path = clusters_save_dir + "subclusters_info.txt"
    cluster_views = zip(encoded_clusters, clean_clusters, raw_clusters, two_dim_clusters)
    for i, (enc_cluster, clean_cluster, raw_cluster, two_dim_cluster) in enumerate(cluster_views):
        # NOTE(review): init_globals presumably stores these views as clustertools
        # module state that parse() reads below -- confirm in clustertools.
        init_globals(clean_cluster, raw_cluster, two_dim_cluster, clusters_save_dir + "_{}".format(i))
        # Input length follows the encoding dimension instead of a literal 300.
        som_model = MiniSom(som_rows, som_cols, dim, neighborhood_function='gaussian', random_seed=0)
        som_model.pca_weights_init(enc_cluster)
        som_model.train(enc_cluster, n_iterations, verbose=True)

        # Label each message with the row-major index of its winning unit.
        som_pred = []
        for msg in enc_cluster:
            winner = som_model.winner(msg)
            som_pred.append(winner[0] * som_cols + winner[1])

        # Flatten the weight grid in the same row-major order as the labels.
        centroids = [unit for grid_row in som_model.get_weights() for unit in grid_row]
        som_centers = [[r[0] for r in model.similar_by_vector(c)] for c in centroids]

        _, mlists, _, _, mfw, hkw, clust_n_msg = parse(som_pred)
        # Print the "Cluster i:" header only once, and only if something is reported.
        cluster_number_printed = False
        for j in range(len(clust_n_msg)):
            save_cluster_raw_msg(mlists[j], clusters_save_dir + "cluster_{}.{}_raw_msg_{:.2f}pch".format(i, j, hkw[j][1]))
            if hkw[j][1] > hate_pct_threshold:
                if not cluster_number_printed:
                    cluster_number_printed = True
                    print("Cluster {}:".format(i))
                # NOTE(review): som_centers[j] assumes index j from parse() equals
                # the SOM unit index -- confirm against clustertools.parse.
                details = _subcluster_details(hkw[j], mfw[j], som_centers[j])
                print("Sous-cluster {} : {} messages\n".format(j, clust_n_msg[j]) + details, end='')
                # Append mode: the summary accumulates across clusters and runs.
                with open(info_path, "a") as f:
                    f.write("Cluster {}, sous-cluster {} : {} messages\n".format(i, j, clust_n_msg[j]) + details)
        if cluster_number_printed:
            print()
        # Release the per-cluster SOM and intermediate lists before the next cluster.
        gc.collect()

In [42]:
%%time
# NOTE: this rebinds the notebook-level clusters_dir to the parent directory
# of all first-level clustering runs (it previously pointed at a single run).
clusters_dir = "clusters/1M_Lem_clusters/"

for method in ["km_25", "km_50", "km_100", "som_100", "som_225", "som_400"]:
    print("Clustering des clusters {}\nChargement des clusters en mémoire.".format(method))
    # recluster() reads clean_clusters, raw_clusters, two_dim_clusters,
    # encoded_clusters and clusters_save_dir as globals, so they are rebound
    # here before each call.
    clean_clusters, raw_clusters, two_dim_clusters = load_clusters(clusters_dir + method + '/')
    print("Encodage des clusters")
    encoded_clusters = [np.array([encode(msg, model, dim) for msg in cluster]) for cluster in clean_clusters]
    clusters_save_dir = clusters_dir + method + "/a_reclustering_som/"
    # Creates missing parents too and is safe to re-run, unlike a separate
    # existence check followed by os.mkdir.
    os.makedirs(clusters_save_dir, exist_ok=True)
    recluster()
    print()

Clustering des clusters km_25
Chargement des clusters en mémoire.
Encodage des clusters
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.1165662920495003
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.395496645925121
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.9664969084258203
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.6282865153648953


  self._weights[i, j] = c1*pc[pc_order[0]] + c2*pc[pc_order[1]]


 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.7270758893941067
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.3123805338472496
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.4408674805139372
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.7659576236009624
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.7941994788648851
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.6018742675873123
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.3318916102642067
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.4243316806203878
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.7319796595246781
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.343476732928684
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.4618377382924728
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.8915912110168807
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 

  return sqrt(-2 * cross_term + input_data_sq + weights_flat_sq.T)


 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.563559618929798
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.5198528748453457
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.1025173882069383
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.5554101854738638
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.9370673984734575
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.5672241400018763
Cluster 17:
Sous-cluster 4 : 1315 messages
68 messages contiennent un mot-clé haineux (soit 5.17%)
Les 20 mots les plus fréquents : merde pute énorme tête cul con coup chien problème ferme rat petit porc gueul prendre mort fdp malade putain coeur 
Les mots les plus proches du centre : gros salope, mouv gargantuesque ptt renifler desole lstb miett incroyable!!!!!!!!!! 


 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 0.2845137864625587
 [ 10000 / 10000 ] 100% - 0:00:00 left 
 quantization error: 1.70853

In [12]:
# Reload and re-encode the "km_25" clusters for the manual inspection below.
# NOTE(review): relies on clusters_dir being the parent directory
# "clusters/1M_Lem_clusters/" set by the batch cell, not the value from the
# configuration cell -- confirm the execution order.
clean_clusters, raw_clusters, two_dim_clusters = load_clusters(clusters_dir + "km_25" + '/')
encoded_clusters = [np.array([encode(msg, model, dim) for msg in cluster]) for cluster in clean_clusters]
clusters_save_dir = clusters_dir + "km_25" + "/a_reclustering_som/"


In [14]:
# Manual replay of one re-clustering step, for the cluster at index 1 only.
enc_cluster = encoded_clusters[1]
clean_cluster = clean_clusters[1]
raw_cluster = raw_clusters[1]
two_dim_cluster = two_dim_clusters[1]
# NOTE(review): init_globals presumably stores these views as clustertools
# module state used by the helper calls in later cells -- confirm.
init_globals(clean_cluster, raw_cluster, two_dim_cluster, clusters_save_dir + "_{}".format(1))

# 3 x 4 map (12 units) over 300-dimensional inputs, with a fixed seed.
som_model = MiniSom(3, 4, 300, neighborhood_function='gaussian', random_seed=0)
som_model.pca_weights_init(enc_cluster)
som_model.train(enc_cluster, 10000, verbose=True)    
# Label each message with the flattened index of its winning unit (row * 4 + column).
som_pred = []
for msg in enc_cluster:
    winner = som_model.winner(msg)
    som_pred.append(winner[0]*4+winner[1])  
# Flatten the weight grid in the same row-major order as the labels.
centroids = []
for row in som_model.get_weights():
    for w in row:
        centroids.append(w)
# For every unit, keep the words returned by similar_by_vector (first item of each result).
som_centers = [[r[0] for r in model.similar_by_vector(c)] for c in centroids]

In [23]:
# Keep the first element of each of the first n_most_freq_ignored entries
# returned by create_sorted_wlist; the result is passed to most_freq_words()
# as the words to ignore.
# NOTE(review): n_most_freq_ignored and create_sorted_wlist presumably come
# from the clustertools star import -- confirm.
ignored = [tup[0] for tup in create_sorted_wlist(clean_cluster)[:n_most_freq_ignored]]

In [35]:
# Step-by-step computation of the values that the loop cells obtain from a
# single parse(som_pred) call.
# NOTE(review): presumably mirrors the body of clustertools.parse;
# n_clust_info and kw_file are not defined in this notebook and presumably
# come from clustertools -- confirm.
pred = som_pred
res = build_res_dict(pred)
wlists, mlists, cmlists, e2dmlists = create_word_and_msg_lists(res)
mfw = most_freq_words(n_clust_info, wlists, ignored)
hkw = search_hate_words(mlists, kw_file)
clust_n_msg = n_msg_by_clust(res)

In [32]:
import importlib
import clustertools

In [34]:
# Re-execute the clustertools module after editing it on disk, then repeat the
# star import so the notebook-level names point at the reloaded definitions.
importlib.reload(clustertools)
from clustertools import *