In [180]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import time
import sys
import math
import re
import regex
import nltk 
from  nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

%matplotlib inline
pal = sns.color_palette()
corpus = []
word_set = set()

lics = pd.read_csv('./input/licitacoes/licitacoes_2016_2017.csv', dtype={'FornecedorDocumento': object})
#lc_recent.head()

In [2]:
def preprocessing(raw):
    if str(raw) == 'nan':
        return ""
    try:
        # 1. Remove non-letters
        raw = regex.sub("[^\p{Latin}]"," ", raw, re.UNICODE)

        # 2. Convert words to lower case and split them
        raw = [(re.sub(r'((?<!\d)\.(?!\d))|((?<!\d)\:(?!\d))', '', word)).replace(',','') for word in raw.lower().split() if len(word) > 1]
        #raw = raw.lower().split()
        #raw = re.findall(r'\w+|[.]+', raw.lower())

        # 3. Remove stop words (false by default)
        stops = set(stopwords.words("portuguese"))
        #wordlist = nltk.word_tokenize(raw)        
        raw = [w for w in raw if not w in stops]
    except:
        print("Unexpected error:", sys.exc_info())
        
    return raw

In [3]:
# Doc2Vec
def custom_Doc2Vec(taggeddocs):
    # Building the model
    model = Doc2Vec(taggeddocs, dm=0, alpha=0.025, size=20, min_alpha=0.025, min_count=0)
    start = time.perf_counter()
    print('Started Training')

    # Training
    total_epochs = 50
    total_docs = len(taggeddocs)
    for epoch in range(total_epochs):
        started_epoch = time.time()
        t0 = time.perf_counter()
        if epoch % 10 == 0:
            print("Started Epoch %d" % (epoch + 1))
        model.train(taggeddocs, total_examples=total_epochs, epochs=total_epochs)
        model.alpha -= 0.002 # decrease the learning rate
        model.min_alpha = model.alpha # fix the learning rate, no decay
        t1 = time.perf_counter()
        print("Epoch %d in: " % (epoch + 1), "%.3f seconds" % (t1-t0))
    
    end = time.perf_counter()
    print('Finished Training in: ', "%.3f seconds" % (end-start))
    
    return model

In [4]:
taggeddocs = []
for lic in lics.values:
    licx = preprocessing(lic[7])
    if len(licx) > 0:
        doc = TaggedDocument(words=licx, tags=[lic[0]])
        taggeddocs.append(doc)

In [5]:
model = custom_Doc2Vec(taggeddocs)

Started Training
Started Epoch 1
Epoch 1 in:  39.297 seconds
Epoch 2 in:  37.323 seconds
Epoch 3 in:  34.578 seconds
Epoch 4 in:  35.671 seconds
Epoch 5 in:  37.297 seconds
Epoch 6 in:  33.139 seconds
Epoch 7 in:  31.139 seconds
Epoch 8 in:  32.175 seconds
Epoch 9 in:  31.711 seconds
Epoch 10 in:  33.254 seconds
Started Epoch 11
Epoch 11 in:  37.824 seconds
Epoch 12 in:  39.381 seconds
Epoch 13 in:  38.173 seconds
Epoch 14 in:  39.419 seconds
Epoch 15 in:  43.309 seconds
Epoch 16 in:  40.349 seconds
Epoch 17 in:  43.636 seconds
Epoch 18 in:  37.656 seconds
Epoch 19 in:  47.222 seconds
Epoch 20 in:  42.227 seconds
Started Epoch 21
Epoch 21 in:  46.720 seconds
Epoch 22 in:  38.624 seconds
Epoch 23 in:  39.711 seconds
Epoch 24 in:  42.739 seconds
Epoch 25 in:  42.945 seconds
Epoch 26 in:  37.648 seconds
Epoch 27 in:  37.582 seconds
Epoch 28 in:  42.287 seconds
Epoch 29 in:  36.542 seconds
Epoch 30 in:  33.751 seconds
Started Epoch 31
Epoch 31 in:  43.966 seconds
Epoch 32 in:  31.942 secon

In [None]:
text1 = preprocessing(lics['objeto'].tolist()[0])
text2 = preprocessing(lics['objeto'].tolist()[9080])
# Comput Cosine distances
similarity_vec = model.n_similarity(text1, text2)
print('Similarity Index: {:4.2f} %'.format(similarity_vec*100))

In [None]:
model.save('./output/lics_model.doc2vec')

In [78]:
t0 = time.perf_counter()
licitacoes = {}
for lic1 in lics.values:
    lic1x = preprocessing(lic1[7])
    if len(lic1x) > 0:
        for sim in model.docvecs.most_similar([lic1[0]], topn=50):
            if sim[1] >= 0.9:
                if str(lic1[0]) not in licitacoes:
                    licitacoes[str(lic1[0])] = []
                licitacoes[str(lic1[0])].append({'id': sim[0], 'sim': sim[1]})
    if lic1[0] % 1000 == 0:
        t1 = time.perf_counter()
        print("Done %d in: " % (lic1[0] + 1), "%.3f seconds" % (t1-t0))
        t0 = time.perf_counter()

Done 1 in:  0.266 seconds
Done 1001 in:  1.038 seconds
Done 2001 in:  1.070 seconds
Done 3001 in:  1.082 seconds
Done 4001 in:  1.329 seconds
Done 5001 in:  1.176 seconds
Done 6001 in:  1.054 seconds
Done 7001 in:  1.073 seconds
Done 8001 in:  1.084 seconds
Done 9001 in:  2.601 seconds
Done 10001 in:  1.221 seconds
Done 11001 in:  1.073 seconds


In [188]:
lics['sim'] = ''
lics['user_id'] = '1234'
for index, row in lics.iterrows():
    if str(index) in licitacoes:
        lics.set_value(index,'sim',licitacoes[str(index)])

lics.to_csv('./output/licitacoes_2016_2017.csv', index=False, encoding='utf-8')
lics.head()

Unnamed: 0,index,Orgao,Retranca,Modalidade,textbox17,textbox19,Número_Licitação,objeto,DataPublicaçãoExtrato,Fornecedor,FornecedorTipo,FornecedorDocumento,DataAssinaturaExtrato,ValidadeExtrato,TipoValidadeExtrato,ValorContrato,NúmeroContrato,sim,user_id
0,0,EDUCAÇÃO,EGAAADM,PREGÃO ELETRÔNICO,14/SME/2015,2014-0.286.506-2,EXTRATO DE CONTRATO / NOTA DE EMPENHO,REGISTRO DE PREÇOS PARA AQUISIÇÃO DE FÓRMULA I...,4/3/2016,MARTINUCI COMÉRCIO E REPRESENTAÇÕES DE PRODUTO...,PJ,18097272000117,24/02/2016,4.0,Meses,"942.955,20",10/SME/CODAE/2016,"[{'id': 2594, 'sim': 0.9812979698181152}, {'id...",1234
1,1,HOSPITAL DO SERVIDOR PÚBLICO MUNICIPAL,ELAAADM,PREGÃO ELETRÔNICO,145/2015,2015-0.223.234-7,EXTRATO DE CONTRATO / NOTA DE EMPENHO,FORMULAÇÕES MAGISTRAIS E OFICINAIS,4/3/2016,PIRES DE CAMPOS & CIA LTDA - EPP,PJ,45516507000130,29/02/2016,12.0,Meses,"55.944,00",078/2016,"[{'id': 7685, 'sim': 0.9862677454948425}, {'id...",1234
2,2,HOSPITAL DO SERVIDOR PÚBLICO MUNICIPAL,ELBCADM,INEXIGIBILIDADE,2015-0.296.519-0,2015-0.296.519-0,EXTRATO DE CONTRATO / NOTA DE EMPENHO,MANUTENÇAO,4/3/2016,LASER LAB COM. ASSISTÊNCIA TÉCNICA LTDA,PJ,72682263000139,22/02/2016,12.0,Meses,"27.948,00",077/2016,"[{'id': 8584, 'sim': 0.9657570123672485}, {'id...",1234
3,3,TRABALHO E EMPREENDEDORISMO,EBNAADM,PREGÃO PRESENCIAL,11/SDTE/2013,2013-0.258.474-6,EXTRATO DE ADITAMENTO,CONTRATAÇAO DE EMPRESA ESPECIALIZADA NA PRESTA...,4/3/2016,ARK TEC GUARDA DE DOCUMENTOS LTDA,PJ,65689895000169,11/2/2016,12.0,Meses,"43.973,48",001/2014/PMSP/SDTE,,1234
4,4,SAÚDE,EPAAADM,PREGÃO ELETRÔNICO,349/2015-SMS.G,2015-0.265.139-0,EXTRATO DE ATA DE REGISTRO DE PREÇO,Registro de preços para o fornecimento de CAIX...,4/3/2016,PONTUAL COMERCIAL EIRELI,PJ,1854654000145,26/02/2016,12.0,Meses,"886.002,00",084/2016-SMS.G,,1234


In [None]:
import boto
from csv import reader

MY_ACCESS_KEY_ID = 'AKIAJNUAMOKDMGZFQK6A'
MY_SECRET_ACCESS_KEY = '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'


def do_batch_write(items, table_name, dynamodb_table, dynamodb_conn):
    batch_list = dynamodb_conn.new_batch_write_list()
    batch_list.add_batch(dynamodb_table, puts=items)
    while True:
        response = dynamodb_conn.batch_write_item(batch_list)
        unprocessed = response.get('UnprocessedItems', None)
        if not unprocessed:
            break
        batch_list = dynamodb_conn.new_batch_write_list()
        unprocessed_list = unprocessed[table_name]
        items = []
        for u in unprocessed_list:
            item_attr = u['PutRequest']['Item']
            item = dynamodb_table.new_item(
                    attrs=item_attr
            )
            items.append(item)
        batch_list.add_batch(dynamodb_table, puts=items)


def import_csv_to_dynamodb(table_name, csv_file_name, colunm_names,     column_types):
    dynamodb_conn =     boto.connect_dynamodb(aws_access_key_id=MY_ACCESS_KEY_ID, aws_secret_access_key=MY_SECRET_ACCESS_KEY)
    dynamodb_table = dynamodb_conn.get_table(table_name)     
    BATCH_COUNT = 25 # 25 is the maximum batch size for Amazon DynamoDB

    items = []

    count = 0
    csv_file = open(csv_file_name, 'r')
    for cur_line in reader(csv_file):
        count += 1

        row = {}
        for colunm_number, colunm_name in enumerate(colunm_names):
            row[colunm_name] = column_types[colunm_number]    (cur_line[colunm_number])
        
        if row['contract_id'] != 'index':

            if len(row['sim']) == 0:
                del row['sim']
            if len(row['NúmeroContrato']) == 0:
                del row['NúmeroContrato']
            if len(row['Fornecedor']) == 0:
                del row['Fornecedor']
            if len(row['FornecedorDocumento']) == 0:
                del row['FornecedorDocumento']
            if len(row['ValidadeExtrato']) == 0:
                del row['ValidadeExtrato']
            if len(row['TipoValidadeExtrato']) == 0:
                del row['TipoValidadeExtrato']
            if len(row['ValorContrato']) == 0:
                del row['ValorContrato']
            #print(row)    
            item = dynamodb_table.new_item(attrs=row)
            items.append(item)
            if count % BATCH_COUNT == 0:
                #print ('batch write start ... ', 
                do_batch_write(items, table_name, dynamodb_table, dynamodb_conn))
                items = []
                #print ('batch done! (row number: ' + str(count) + ')')

    # flush remaining items, if any
    if len(items) > 0: 
        do_batch_write(items, table_name, dynamodb_table, dynamodb_conn)


    csv_file.close() 


def main():
    '''
    Demonstration of the use of import_csv_to_dynamodb()
    We assume the existence of a table named `test_persons`, with
    - Last_name as primary hash key (type: string)
    - First_name as primary range key (type: string)
    '''
    colunm_names = 'contract_id Orgao Retranca Modalidade textbox17 textbox19 Número_Licitação objeto DataPublicaçãoExtrato Fornecedor FornecedorTipo FornecedorDocumento DataAssinaturaExtrato ValidadeExtrato TipoValidadeExtrato ValorContrato NúmeroContrato sim user_id'.split()
    table_name = 'contracts'
    csv_file_name = './output/licitacoes_2016_2017.csv'
    column_types = [str, str, str, str, str, str, str, str, str, str, str, str, str, str, str, str, str, str, str]
    import_csv_to_dynamodb(table_name, csv_file_name, colunm_names, column_types)


if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling