In [None]:
import string
import collections

from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import spacy
# Load the spaCy pipeline registered under the name 'pt' into the module-level
# `nlp` object that pre_process_text calls to tokenize each document.
# NOTE(review): 'pt' looks like a spaCy 2.x shortcut-link name; spaCy 3 removed
# shortcut links, so this presumably needs a full package name such as
# 'pt_core_news_sm' there - confirm against the installed spaCy version.
nlp = spacy.load('pt')

## Text Preprocessing
Common tasks:  
* Tokenization
* Removing stopwords and punctuation
* Stemming and/or lemmatization

In [None]:
def pre_process_text(text):
    """Tokenize ``text`` with spaCy and keep only the alphabetic tokens.

    This function is passed as the ``tokenizer`` of the ``TfidfVectorizer``
    built in ``cluster_texts``. Stop-word removal and lowercasing are
    configured on that vectorizer, so neither is done here.

    Args:
        text: The document to tokenize, as a string.

    Returns:
        A list of ``str``: the text of every token whose ``is_alpha`` flag is
        set, in document order.
    """
    # Return plain strings rather than spaCy Token objects: the vectorizer
    # filters stop words and builds its vocabulary by comparing tokens with
    # strings, which only works when the tokenizer yields strings.
    return [token.text for token in nlp(text) if token.is_alpha]

In [None]:
def cluster_texts(texts, clusters=3, random_state=None):
    """Cluster documents with K-Means over their TF-IDF representation.

    Args:
        texts: Iterable of document strings to cluster.
        clusters: Number of clusters to fit (``n_clusters`` of ``KMeans``).
        random_state: Seed forwarded to ``KMeans``. ``None`` (the default)
            leaves the estimator unseeded, as before; pass an int to get the
            same clustering on every run.

    Returns:
        A ``collections.defaultdict(list)`` mapping each cluster label (a
        plain ``int``) to the list of positions in ``texts`` assigned to that
        cluster. Looking up a label that received no documents yields an
        empty list.
    """
    vectorizer = TfidfVectorizer(tokenizer=pre_process_text,
                                 stop_words=stopwords.words('portuguese'),
                                 lowercase=True)
    tfidf_model = vectorizer.fit_transform(texts)

    km_model = KMeans(n_clusters=clusters, random_state=random_state)
    km_model.fit(tfidf_model)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        # labels_ holds NumPy integer scalars; cast so the keys are plain ints.
        clustering[int(label)].append(idx)

    return clustering

In [None]:
# Load the corpus: a semicolon-delimited CSV with a header row.
# quoting=1 is the value of csv.QUOTE_ALL.
train = pd.read_csv(
    "~/corpus/menor.csv",
    header=0,
    delimiter=";",
    quoting=1,
    quotechar='"',
)
cluster_size = 5  # number of clusters requested from cluster_texts

# Cluster the 'mensagem' column; `clusters` maps a label to row positions.
articles = train['mensagem'].values
clusters = cluster_texts(articles, cluster_size)

# Print every cluster framed by separator lines, one message per line.
separator = "================================================================================================"
for cluster_id in range(cluster_size):
    print(separator)
    print("Cluster número %d" % cluster_id)
    for doc_idx in clusters[cluster_id]:
        print("- " + articles[doc_idx])
    print(separator)