In [1]:
import pandas as pd
import sklearn 
import numpy as np
import nltk
import json
import re
import time
import codecs

In [2]:
#nome_arquivo = 'arquivo_subset.json'
nome_arquivo = 'arquivo.json'

with open(nome_arquivo) as json_file:
    arquivos_json = json.load(json_file)

In [3]:
arquivos_pandas = pd.read_json(nome_arquivo)

In [4]:
def clean_text(text):
    list_of_cleaning_signs = ['\x0c', '\n']
    for sign in list_of_cleaning_signs:
        text = text.replace(sign, ' ')
    #text = unicode(text, errors='ignore')
    clean_text = re.sub('[^a-zA-Z]+', ' ', text)
    return clean_text.lower()

In [5]:
arquivos_pandas['arquivo_clean'] = arquivos_pandas['api_Arquivo_bn'].apply(lambda x: clean_text(x))

In [6]:
# defines a tokenizer and stemmer which returns the set of stems in the text that it is passed
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("portuguese")
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [7]:
# load nltk's portuguese stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('portuguese')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Producing tf_idf matrix separately based on text
tfidf_vectorizer_text = TfidfVectorizer(max_df=0.8, max_features=500,
                                 min_df=0.2, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix_text = tfidf_vectorizer_text.fit_transform(arquivos_pandas['arquivo_clean'])

CPU times: user 11min 10s, sys: 3.64 s, total: 11min 14s
Wall time: 12min 8s


In [9]:
terms_text = tfidf_vectorizer_text.get_feature_names()

In [10]:
def given_id_give_index(arquivo_id, arquivos):
    return arquivos[arquivos['ctr_id']==arquivo_id].index[0]
#
def given_index_give_id(index, arquivos):
    return arquivos.iloc[index]['ctr_id']

In [11]:
def top_tfidf_feats(row, terms, top_n=25):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(terms[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df['feature']
def given_id_give_keywords(arquivos, tfidfMatrix, terms, arquivo_id, top_n=20):
    row_id = given_id_give_index(arquivo_id, arquivos)
    row = np.squeeze(tfidfMatrix[row_id].toarray())
    return top_tfidf_feats(row, terms, top_n)

In [12]:
Ex_arquivo_id = '2c8f7e2b456f7f5716132b00548430fee887db7a'
Ex_arquivo_index = given_id_give_index(Ex_arquivo_id, arquivos_pandas)
arquivos_pandas.iloc[Ex_arquivo_index]['arquivo_clean'][0:1000]

u'na itseeo ggran dr gt data de potagem p de oliveira transportes me r ver joao vitor costa sala a bro boa vista erizmiirim do agetop correio agetop tentativas de entrega siqueira campos pr for kis o di carimbo rega o fev z ar ag iii hi liii fli h aten o ap s tr s tentativas de entrega emitir aviso de chegada e colocar a objeto em posta restante por sete dias motivos da devolu o o ii dere o para devolu o do ar ntro de digitaliza o da dr auto v nopmento eimudou se el recusado endere o insuficiente e n o procurado n o existe o n mero e ausente f desconhecido fateutdo outros ta pr a ist krums poo i edor yy jr legivel do recebedor data triz il ecume tope identidade ci ttanz olas tr stinat rio rl sricae matricula do cart eirc'

In [13]:
arquivo_id_example = '2c8f7e2b456f7f5716132b00548430fee887db7a'
print ("Keywords based on id:")
print (given_id_give_keywords(arquivos_pandas, tfidf_matrix_text,
                                  terms_text, arquivo_id_example, top_n = 10))

Keywords based on id:
0                         tr
1                     agetop
2                         gt
3          dias motiv devolu
4             set dias motiv
5                 dias motiv
6               digitaliz dr
7                     h aten
8           cheg coloc objet
9    recus ender insuficient
Name: feature, dtype: object


In [14]:
from sklearn.neighbors import NearestNeighbors
num_neighbors = 4
# Based on text
nbrs_text = NearestNeighbors(n_neighbors=num_neighbors,
                                  algorithm='auto').fit(tfidf_matrix_text)
distances_text, indices_text = nbrs_text.kneighbors(tfidf_matrix_text)

MemoryError: 

In [None]:
print ("Nbrs of the example based on text similarity: %r" % indices_text[1])

In [None]:
Ex_arquivo_id = '2c8f7e2b456f7f5716132b00548430fee887db7a'
Ex_index = given_id_give_index(Ex_arquivo_id, arquivos_pandas)
print ("The text of the example is:\n")
print (arquivos_pandas.iloc[indices_text[Ex_index][0]]['arquivo_clean'])
print ("The text of the similar are:\n")
for i in range(1, len(indices_text[Ex_index])):
    print ("Neighbor No. %r has following text: \n" % i)
    print (arquivos_pandas.iloc[indices_text[Ex_index][i]]['arquivo_clean'])
    print ("\n")