In [0]:
import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import nltk
import heapq
import time

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/rafaelsguerra/information_retrieval/master/lab02/results.csv?token=AEGSL3ECKRA2WO4RA4CWPEC44SMKW')

In [0]:
data_text = data['text'].tolist()

In [0]:
# Para a estratégia de tokenização, foi optado incluir números palavras compostas por hífen (sexta-feira é considerada como uma só palavra).
# Ainda como parte da estratégia, foi decidido retirar stopwords e palavras com tamanho 2 ou maior.
# Todas as palavras estão escritas em caixa baixa.

def parse(document):  
  tokenizer = RegexpTokenizer(r'\b[A-zÀ-ú\d\-\']+') 
  tokens = []

  tokens.extend(tokenizer.tokenize(document.lower()))
    
  stop_words = stopwords.words('portuguese')

  filtered_tokens = []

  for token in tokens:
    if token not in stop_words  and len(token) >= 2: # and len(token) > 2
      filtered_tokens.append(token)
  
  return filtered_tokens

In [0]:
def build_index(data):
  inverted_list = {}
  n = 0
  
  for document in data:
    n = n + 1
    tokens = parse(document)
    
    tokens_no_duplicates = collections.Counter(tokens)
    
    for token in tokens_no_duplicates.keys():
      if token not in inverted_list:
        inverted_list[token] = []
      inverted_list[token].append((n, tokens_no_duplicates[token]))
    
  return inverted_list


In [0]:
inverted_index = build_index(data_text)

In [0]:
def document_at_a_time(query, inverted_index, k):
  inverted_lists = {}
  r = []
  
  for word in query.split(" "):
    if word in inverted_index.keys():
      inverted_lists[word] = inverted_index[word]
      
  documents = list(range(1, len(data.url) + 1))
  
  for document in documents:
    score = 0
    
    for key in inverted_lists.keys():
      for tuple in inverted_lists[key]:
        if tuple[0] == document:
          score += tuple[1]
          break
        
    if score != 0:
      heapq.heappush(r, (score, document))
      
  return heapq.nlargest(k, r)   
  
    

In [0]:
def term_at_a_time(query, inverted_index, k):
  accumulators = {}
  inverted_lists = {}
  r = []
  
  for word in query.split(" "):
    if word in inverted_index.keys():
      inverted_lists[word] = inverted_index[word]
      
  for key in inverted_lists.keys():
    for tuple in inverted_lists[key]:
      document = tuple[0]
      freq = tuple[1]
      if document in accumulators.keys():
        accumulators[document] += freq
      else:
        accumulators[document] = freq
        
  for tuple in accumulators.items():
    score = tuple[1]
    heapq.heappush(r, (score, tuple[0]))
    
  return heapq.nlargest(k, r)

In [0]:
def query_results(queries):
  results_document = []
  results_term = []
  time_results_document = []
  time_results_term = []
  k = 10

  for query in queries:
    init_document = time.time()
    result_document = document_at_a_time(query, inverted_index, k)
    end_document = time.time()
    results_document.append(result_document)
    time_results_document.append(end_document - init_document)
  
    init_term = time.time()
    result_term = term_at_a_time(query, inverted_index, k)
    end_term = time.time()
    results_term.append(result_term)
    time_results_term.append(end_term - init_term)

  queries_df = pd.DataFrame()
  queries_df['query'] = queries
  queries_df['document_at_a_time'] = results_document
  queries_df['term_at_a_time'] = results_term
  queries_df['compare'] = queries_df.document_at_a_time == queries_df.term_at_a_time
  queries_df['tempo_médio_document_at_a_time'] = time_results_document
  queries_df['tempo_médio_term_at_a_time'] = time_results_term
  
  return queries_df

In [0]:
queries = ["bolsonaro", "corrupção", "economia", "brasil", "governo"]
df = query_results(queries)

df

Unnamed: 0,query,document_at_a_time,term_at_a_time,compare,tempo_médio_document_at_a_time,tempo_médio_term_at_a_time
0,bolsonaro,"[(46, 151), (37, 166), (35, 207), (26, 19), (1...","[(46, 151), (37, 166), (35, 207), (26, 19), (1...",True,0.001195,0.000101
1,corrupção,"[(7, 171), (4, 204), (3, 207), (3, 144), (3, 8...","[(7, 171), (4, 204), (3, 207), (3, 144), (3, 8...",True,0.000425,3e-05
2,economia,"[(10, 138), (8, 125), (6, 127), (6, 69), (6, 3...","[(10, 138), (8, 125), (6, 127), (6, 69), (6, 3...",True,0.00084,6.2e-05
3,brasil,"[(47, 151), (15, 166), (11, 19), (10, 26), (9,...","[(47, 151), (15, 166), (11, 19), (10, 26), (9,...",True,0.001878,0.000125
4,governo,"[(15, 173), (13, 166), (10, 248), (10, 115), (...","[(15, 173), (13, 166), (10, 248), (10, 115), (...",True,0.001697,0.000124


Descrição das colunas do dataframe:


*   **query**: o termo na busca
*   **document_at_a_time**: lista com os scores e documentos da busca utilizando o algoritmo document_at_a_time
*   **term_at_a_time**: lista com os scores e documentos da busca utilizando o algoritmo term_at_a_time
*   **compare**: contém ```True``` se os resultados dos dois algoritmos forem iguais
*   **mean_time_document_at_a_time**: tempo médio que o algoritmo document_at_a_time levou para processar a busca
*   **mean_time_term_at_a_time**: tempo médio que o algoritmo term_at_a_time levou para processar a busca

In [0]:
def document_at_a_time_conjunctive(query, inverted_index, k):
  inverted_lists = {}
  r = []
  
  for word in query.split(" "):
    if word in inverted_index.keys():
       inverted_lists[word] = inverted_index[word]
        
  first_key = list(inverted_lists.keys())[0]
   
  for post in inverted_lists[first_key]:
    score = 0
    document = -1
    
    for inverted_list in inverted_lists.values():
      if post in inverted_list:
        document = post[0]
        score += post[1]
      else:
        document = -1
        break
    
    if (document > -1):
      if (score != 0):
        heapq.heappush(r, (score, document))
        
  return heapq.nlargest(k, r)

In [0]:
results_document = []
results_term = []
k = 10

queries = ["bolsonaro corrupção", "governo federal", "futebol brasil", "política mulheres", "tempo hoje"]

for query in queries:
  result_document = document_at_a_time_conjunctive(query, inverted_index, k)
  results_document.append(result_document)

queries_df = pd.DataFrame()
queries_df['query'] = queries
queries_df['document_at_a_time_conjunctive'] = results_document
  
queries_df

Unnamed: 0,query,document_at_a_time_conjunctive
0,bolsonaro corrupção,"[(2, 210), (2, 7)]"
1,governo federal,"[(4, 213), (4, 203), (2, 223), (2, 126), (2, 3..."
2,futebol brasil,"[(4, 121), (4, 76)]"
3,política mulheres,"[(2, 113), (2, 66), (2, 25)]"
4,tempo hoje,"[(4, 229), (4, 203), (4, 195), (4, 156), (4, 1..."
