### Importamos las principales librerías de NLP

In [1]:
import spacy
from gensim.models import Word2Vec
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter

# Number of clusters requested from K-means.
NUM_CLUSTERS = 25

# Preprocesamiento del Corpus
A cada oración del corpus se la procesa con los siguientes pasos:

* Se sacan stopwords y signos de puntuación.
* Se conservan solo los tokens alfabéticos de más de 3 caracteres.
* Se remueven pronombres

In [9]:
def normalize_sentence(span):
    """Return the lowercased lemmas of the content words in ``span``.

    A token is kept only when it passes every filter below, in order:

    * it is neither punctuation nor a stopword,
    * its surface form is longer than 3 characters and purely alphabetic,
    * its part-of-speech tag is not ``'PRON'``.

    Args:
        span: An iterable of token objects exposing ``is_punct``, ``is_stop``,
            ``orth_``, ``lemma_`` and ``pos_``.

    Returns:
        A list of lowercased lemma strings, in the order the tokens appear.
    """
    cleaned_lemmas = []
    for token in span:
        # Drop punctuation and stopwords first.
        if token.is_punct or token.is_stop:
            continue

        # The length/alphabetic checks apply to the surface form, not the lemma.
        surface = token.orth_
        if len(surface) <= 3 or not surface.isalpha():
            continue

        # Pronouns are removed even when they survived the stopword filter.
        if token.pos_ == 'PRON':
            continue

        cleaned_lemmas.append(token.lemma_.lower())

    return cleaned_lemmas

### Se utiliza el corpus Spanish Billion Words de Cristian Cardellino. Solo se procesa una parte del corpus debido a limitaciones de memoria

In [3]:
# Read the corpus with an explicit encoding so that decoding the accented
# Spanish text does not depend on the platform's default locale.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open("spanish_billion_words_00", encoding="utf-8") as f:
    raw_text = f.read()
# Keep only the first 1/75 of the characters that were read.
raw_text = raw_text[:len(raw_text) // 75]


# Spanish spaCy pipeline used to split the text into sentences and tokens.
nlp = spacy.load("es_core_news_sm")

### Procesamiento de los spans

In [4]:

# Run the spaCy pipeline over the truncated corpus, then reduce every
# detected sentence to its list of cleaned lemmas.
doc = nlp(raw_text)
lemmatized_sentences = [normalize_sentence(span) for span in doc.sents]

## Aplicamos Word2Vec

In [11]:
# Train Word2Vec on the lemmatized sentences with a context window of 5.
model = Word2Vec(lemmatized_sentences, window=5, min_count=1)
vocabulary = model.wv.key_to_index

# Collect one embedding per vocabulary entry, following the vocabulary's
# iteration order, so row i of the matrix belongs to the i-th key.
vectors = [model.wv[word] for word in vocabulary]

matrix = np.array(vectors)
print("Matrix shape:", matrix.shape)

Matrix shape: (9351, 100)


## Normalizamos la matriz de Word2Vec y eliminamos las dimensiones con poca varianza

In [6]:
# Scale every column by its own maximum value.
# NOTE(review): this assumes each column's maximum is positive; a zero or
# negative maximum would divide by zero or flip signs — confirm.
column_max = matrix.max(axis=0)
matrix_normed = matrix / column_max

# Per-column variance, computed as E[x^2] - (E[x])^2.
mean_of_squares = np.square(matrix_normed).mean(axis=0)
squared_mean = np.square(matrix_normed.mean(axis=0))
variances = mean_of_squares - squared_mean

# Drop the columns whose variance falls below the threshold.
VarianzaMin = 0.001
low_variance = variances < VarianzaMin
red_matrix = matrix_normed[:, ~low_variance]

### Aplicamos el algoritmo de K-means

In [7]:
# Fix the random seed so that centroid initialisation, and therefore the
# cluster numbering and membership, is the same for a given input matrix
# every time this cell is run.
k_means_model = KMeans(n_clusters=NUM_CLUSTERS, random_state=0)
k_means_model.fit(red_matrix)

KMeans(n_clusters=25)

In [8]:
def show_results(vocabulary, model):
    """Print the size and the member words of every cluster.

    Clusters are reported in ascending label order. Only labels that actually
    occur in ``model.labels_`` are listed, so the labels do not need to be the
    contiguous range ``0..k-1``.

    Args:
        vocabulary: Mapping whose keys are the words; the i-th key (in
            iteration order) is the word that received ``model.labels_[i]``.
        model: Fitted clustering object exposing a ``labels_`` sequence with
            one integer label per word.

    Returns:
        None. Everything is written to standard output.
    """
    words = list(vocabulary.keys())

    # Group word positions by label in a single pass instead of rescanning
    # the whole label sequence once per cluster.
    members = {}
    for position, label in enumerate(model.labels_):
        members.setdefault(label, []).append(position)
    cluster_ids = sorted(members)

    # Show results
    print("\nTotal clusters:", len(cluster_ids))
    for cluster in cluster_ids:
        print("Cluster#", cluster, " - Total words:", len(members[cluster]))

    # Show top terms and words per cluster
    print("Top words per cluster:")
    print()

    for cluster in cluster_ids:
        print("Cluster %d" % cluster)
        print("Words:", end='')
        for position in members[cluster]:
            print(' %s' % words[position], end=',')
        print()
        print()

    print()

# Report cluster sizes and member words for the fitted K-means model.
show_results(vocabulary=vocabulary, model=k_means_model)



Total clusters: 25
Cluster# 0  - Total words: 1627
Cluster# 1  - Total words: 25
Cluster# 2  - Total words: 128
Cluster# 3  - Total words: 10
Cluster# 4  - Total words: 278
Cluster# 5  - Total words: 60
Cluster# 6  - Total words: 100
Cluster# 7  - Total words: 165
Cluster# 8  - Total words: 17
Cluster# 9  - Total words: 71
Cluster# 10  - Total words: 459
Cluster# 11  - Total words: 30
Cluster# 12  - Total words: 776
Cluster# 13  - Total words: 1185
Cluster# 14  - Total words: 161
Cluster# 15  - Total words: 22
Cluster# 16  - Total words: 52
Cluster# 17  - Total words: 1081
Cluster# 18  - Total words: 1948
Cluster# 19  - Total words: 11
Cluster# 20  - Total words: 2
Cluster# 21  - Total words: 203
Cluster# 22  - Total words: 349
Cluster# 23  - Total words: 567
Cluster# 24  - Total words: 24
Top words per cluster:

Cluster 0
Words: mediodía, perdóname, bárbara, apresuré, visitarlas, dimir, créanmir, bendiga, vedado, mencioné, retornar, jurar, vieja, opción, fastidio, desdichar, dej, ata