In [21]:
import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import json
import string

In [22]:
from utilities import *

In [23]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('portuguese')

In [24]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("portuguese")

In [25]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

def tokenize(arquivos):
    np_arquivos = np.array(arquivos)
    print np_arquivos
    np_arquivos_lower = np.char.lower(np_arquivos.tolist())
    print np_arquivos_lower
    no_punctuation = np.char.translate(np_arquivos_lower, None, string.punctuation)
    print no_punctuation
    print no_punctuation.tolist()
    tokens = nltk.word_tokenize(no_punctuation.tolist())
    return tokens

In [26]:
#teste = ["Testando método de tokenização.", "Outra frase para testar método?", "Mais uma frase para teste!"]
#tokenize(teste)

In [27]:
nome_arquivo = 'arquivo_subset.json'
#nome_arquivo = 'arquivo.json'

with open(nome_arquivo) as json_file:
    arquivos_json = json.load(json_file)

In [28]:
arquivos_ids = []
arquivos_nomes = []
arquivos_conteudos = []

print len(arquivos_json)

for idx in xrange(len(arquivos_json)):
    if (idx <= 100 and idx % 10 == 0) or (idx <= 1000 and idx % 100 == 0) \
        or (idx <= 10000 and idx % 1000 == 0) or idx % 10000 == 0:
            print("Arquivo %s" % idx)
    arquivo = arquivos_json[idx]
    arquivos_ids.append(arquivo['ctr_id'])
    arquivos_nomes.append(arquivo['api_Nome_do_arquivo_tg'])
    arquivos_conteudos.append(arquivo['api_Arquivo_bn'])
print("Arquivo %s" % len(arquivos_json))

1000
Arquivo 0
Arquivo 10
Arquivo 20
Arquivo 30
Arquivo 40
Arquivo 50
Arquivo 60
Arquivo 70
Arquivo 80
Arquivo 90
Arquivo 100
Arquivo 200
Arquivo 300
Arquivo 400
Arquivo 500
Arquivo 600
Arquivo 700
Arquivo 800
Arquivo 900
Arquivo 1000


In [29]:
#totalvocab_tokenized = tokenize(arquivos_conteudos)
#totalvocab_tokenized

In [30]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in arquivos_conteudos:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [31]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(arquivos_conteudos)

print(tfidf_matrix.shape)

CPU times: user 5.99 s, sys: 0 ns, total: 5.99 s
Wall time: 5.84 s
(1000, 157)


In [33]:
terms = tfidf_vectorizer.get_feature_names()

In [42]:
terms

[u'_h',
 u'ag',
 u'agetop',
 u'ap\xf3s tentat',
 u'ap\xf3s tentat coloc',
 u'ap\xf3s tr\xeas',
 u'ap\xf3s tr\xeas tentat',
 u'ar ag',
 u'ar centr',
 u'ar centraliz',
 u'ar centraliz regional',
 u'assinatur',
 u'assinatur recebedor',
 u'aten\xe7\xe3 ap\xf3s',
 u'aten\xe7\xe3 ap\xf3s tentat',
 u'aten\xe7\xe3 ap\xf3s tr\xeas',
 u'ausent',
 u'ausent desconhec',
 u'autua\xe7\xe3',
 u'autua\xe7\xe3 aut',
 u'av',
 u'avis',
 u'avis cheg',
 u'avis cheg coloc',
 u'c',
 u'carimb',
 u'carimb unidad',
 u'carimb unidad entreg',
 u'centr',
 u'centraliz',
 u'centraliz regional',
 u'cheg',
 u'cheg coloc',
 u'cheg coloc objet',
 u'coloc objet',
 u'coloc objet post',
 u'conte\xfad',
 u'conte\xfad opcional',
 u'control',
 u'correi',
 u'd',
 u'dat entreg',
 u'declar',
 u'declar conte\xfad',
 u'declar conte\xfad opcional',
 u'desconhec',
 u'devolu ar',
 u'devolu ar centr',
 u'devolu ar centraliz',
 u'devolu mudou-s',
 u'dias',
 u'dias motiv',
 u'dias motiv devolu',
 u'digitaliz',
 u'digitaliz dr',
 u'docume

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [35]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 508 ms


In [36]:
clusters

[4,
 1,
 1,
 1,
 1,
 4,
 2,
 0,
 2,
 1,
 0,
 0,
 0,
 3,
 1,
 3,
 1,
 1,
 0,
 4,
 1,
 0,
 1,
 1,
 4,
 0,
 2,
 0,
 1,
 2,
 1,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 1,
 3,
 3,
 0,
 2,
 1,
 0,
 1,
 3,
 3,
 1,
 1,
 2,
 0,
 1,
 1,
 3,
 3,
 1,
 0,
 1,
 3,
 2,
 1,
 3,
 2,
 0,
 1,
 0,
 0,
 4,
 2,
 0,
 0,
 0,
 0,
 3,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 3,
 0,
 1,
 3,
 2,
 3,
 0,
 3,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 3,
 2,
 2,
 3,
 2,
 3,
 2,
 1,
 1,
 3,
 1,
 2,
 4,
 2,
 2,
 3,
 2,
 2,
 1,
 2,
 4,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 3,
 3,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 4,
 2,
 2,
 1,
 3,
 2,
 2,
 2,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 1,
 4,
 1,
 1,
 4,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 4,
 1,
 2,
 4,
 2,
 2,
 2,
 2,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 3,
 2,
 3,
 1,
 1,
 3,
 1,
 2,
 3,
 2,
 2,
 1,
 3,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 3,
 2,
 1,
 1,
 1,
 3,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 1,
 4,
 2,
 1,
 1,
 2,
 1,
 2,
 3,
 2,
 2,
 2,
 1,


In [37]:
from sklearn.externals import joblib

joblib.dump(km,  'agetop_cluster.pkl')
km = joblib.load('agetop_cluster.pkl')
clusters = km.labels_.tolist()

In [38]:
arquivos = { 'id': arquivos_ids, 'nome': arquivos_nomes, 'conteudo': arquivos_conteudos, 'cluster': clusters }

frame = pd.DataFrame(arquivos, index = [clusters] , columns = ['id', 'nome', 'conteudo', 'cluster'])

In [39]:
frame

Unnamed: 0,id,nome,conteudo,cluster
4,e7b1caef1a4214b070b10194d35725caaa093e6d,14010345000000042014022013.PDF,89700-000 CONCORDIA /SC\nII II\nII\nli II II I...,4
1,2c8f7e2b456f7f5716132b00548430fee887db7a,14010345000000052014022013.PDF,NA ITSEEO/GGRAN/DR/GT\nDATA DE POTAGEM\n27/01/...,1
1,31a127b1b09a5ffae8c9ea921040049922d8348e,14010345000000072014022013.PDF,"CORREIO<\nDESTINATÁRIO:\nkg, a\nDANI E RINALDI...",1
1,2de0586a4d8d03afa8a1f975a16f73955016731e,14010345000000112014022013.PDF,NA IT SEED/GGRANtDR/GT\nCORREIOS\nDESTINATÁRIO...,1
1,d0f55a2299497200428c9e8d9594d56afb0509c7,14010345000000142014022013.PDF,89560-000 VIDEIRA / SC\nIIII\nAR 340405364 AG\...,1
4,4b356b47c2a2cdc8df3438c06ed4f4e4f99e4979,14010345000000172014022013.PDF,igi a CONTROLE\nNAP\nCORREIO< SEED/GGRAN/DR/GT...,4
2,c12a4e69ad3802b0a02a0afaafe81094ffcd7fb8,14010345000000182014022013.PDF,SEED/GGRAN/DR/G0\nDATA DE POSTAGEM\n10102/2014...,2
0,7dfca19151197265b1de79d8f111a864e394a201,14010345000000202014022013.PDF,CORREIO< mar /gr a ia«\n11\nNOTIFICAÇÃO DE\nAU...,0
2,c41b61f5d2058daf55e7d11a814acc967510f360,14010345000000192014022013.PDF,SEED/GGRAN/DR/GO\nDATA DE POSTAGEM\nCORREIO< 1...,2
1,de97a9f8ad2cd983dc43a2fdc526b2ce8e933800,14010345000000272014022013.PDF,"DATA DE ENTREGA\n7.Y y\nN° DOCUM ENT,0 10E IDE...",1


In [40]:
frame['cluster'].value_counts()

0    389
1    262
3    134
2    134
4     81
Name: cluster, dtype: int64

In [41]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
    #print("Cluster %d nomes:" % i, end='')
    #for title in frame.ix[i]['nome'].values.tolist():
    #    print(' %s,' % title, end='')
    #print()
    #print()

Top terms per cluster:

Cluster 0 words: notificação, i, n°, go, el, _h,

Cluster 1 words: agetop, dr, sete, três, três, chegada,

Cluster 2 words: sp, notificação, i, r, autuação, notificação,

Cluster 3 words: ii, ii, notificação, autuação, notificação, el,

Cluster 4 words: ii, ii, agetop, ar, digitalização, sete,

