# Processamento de Consulta

limpando o input da consulta

In [34]:
import re
import unidecode
from nltk.corpus import stopwords

def clear_query_input(query, field=None, usestopwords=False):

    # separa as palavras da query
    if type(query) == str:
        substr = re.findall(r'"(.*?)"', query)
        words = re.sub(r'"(.*?)"', '', query)
        words = words.lower().strip().split(' ')
        words = words + substr
   
    # remove simbolos da query
    toRemove = r'[.*,;\(\)\'\"\?\!%\$]'
    for i in range(len(words)):
        words[i] = re.sub(toRemove, '', words[i])
        words[i] = unidecode.unidecode(words[i])
    
    # coloca os stop words em caso decidirmos usa-los 
    if usestopwords:
        set_sw = set(stopwords.words('english'))
        newWords = []
        for word in words:
            if not word in set_sw:
                if (field):
                    newWords.append(word + '-' + field)
                else:
                    newWords.append(word)
        return list(set(newWords))
    
    # caso ele seja a consulta seja de um campo especifico, ele é atribuido as palavras
    if (field):
        for i in range(len(words)):
            words[i] = words[i] + '-' + field
    return list(set(words))

In [36]:
exemplo = 'Uma pequena consulta'
print(clear_query_input(exemplo))
print(clear_query_input(exemplo, field='title'))

['pequena', 'uma', 'consulta']
['uma-title', 'pequena-title', 'consulta-title']


# Ranqueamento

usando os pesos com tfidf: tf * idf
    para o termo-documento = f(i,j) * log(N/ni)
    para a termo-consulta = (0.5 * 0.5(f(i,q)/max(i)f(i,q)) * log(N/ni)
   

In [4]:
from numpy import log

def get_idf(posting, num_docs):

    idf = log(num_docs/len(posting))
    return idf

In [30]:
def cosine_Score(terms, tfidf, invlist, num_docs, doc_lengths, n):
    scores = {i: 0 for i in range(1, num_docs + 1)}
    # term-at-a-time
    for term in terms:

        w_term_query = terms.count(term)
        if (tfidf):
            w_term_query = 0.5 + 0.5 * \
                (w_term_query /
                 (max([i for i, val in enumerate(terms) if val == term])+1))
            w_term_query *= get_idf(invlist[term], num_docs) 
        for post in invlist[term]:
            # print(post)
            w_doc_term = post[1]  # tf
            if tfidf:
                w_doc_term *= get_idf(invlist[term], num_docs)
            scores[post[0]] += w_term_query * w_doc_term
    for d in range(num_docs):
        # print(doc_lengths)
        scores[d+1] /= doc_lengths[f'{d+1}']
    keys_scores = sorted(scores, key=scores.get, reverse=True)
    # print(scores)
    return {keys_scores[i]: scores[keys_scores[i]] for i in range(5)}

In [6]:
import json
from bs4 import BeautifulSoup

def get_docs_len(docs_path_map='./data/data_map.json'):
    lengths = {}
    with open(docs_path_map, 'r') as map:
        map_docs = map.read()
        map_docs = json.loads(map_docs)
    for id in map_docs:
        with open(f"../data/htmls/{map_docs[id]['arquivo']}.html") as html:
            lengths[id] = len(BeautifulSoup(
                html.read(), 'html.parser').text.split(' '))
    return lengths

In [7]:
# pegando tamanho dos arquivos
docs_len = get_docs_len(docs_path_map='../data/data_map.json')

# query

In [8]:
def get_invindex(path='../inverted_index/geral_invindex.json'):
    with open(path, 'r') as file:
        text = file.read()
        invindex = json.loads(text)
    return invindex

In [9]:
def run_query(query, tfidf, num_docs, docs_lenght, n, field=None, invlist_path='../inverted_index/geral_invindex.json'):
    terms = clear_query_input(query, field)
    invindex = get_invindex(invlist_path)
    return cosine_Score(terms, tfidf, invindex, num_docs, docs_lenght, n)

In [31]:
def execute_query(query_string, title, gen, dev, plat, price, num_docs, docs_lenght, tfidf=True, n=5):

    games_list = []
    if query_string != '':

        games = run_query(query_string, tfidf, num_docs, docs_lenght, n)
        games_list.append(games)

    if title != '':

        games = run_query(title, tfidf, num_docs,
                          docs_lenght, n, field='title', invlist_path='../inverted_index/fields_invindex.json')
        games_list.append(games)

    if gen != '':

        games = run_query(
            gen, tfidf, num_docs, docs_lenght, n, field='genre', invlist_path='../inverted_index/fields_invindex.json')
        games_list.append(games)

    if dev != '':

        games = run_query(
            dev, tfidf, num_docs, docs_lenght, n, field='dev', invlist_path='../inverted_index/fields_invindex.json')
        games_list.append(games)

    if plat != '':

        games = run_query(
            plat, tfidf, num_docs, docs_lenght, n, field='plataforma', invlist_path='../inverted_index/fields_invindex.json')
        games_list.append(games)

    if price != '':

        games = run_query(
            price, tfidf, num_docs, docs_lenght, n, field='price', invlist_path='../inverted_index/fields_invindex.json')
        games_list.append(games)

    # print(games_list)
    all_games = {}
    for ranks_list in games_list:
        for game in ranks_list:
            if not str(game) in all_games:
                all_games[str(game)] = float(ranks_list[game])
            else:
                all_games[str(game)] += float(ranks_list[game])
    for doc in all_games:
        all_games[doc] /= 6
    print(all_games)
    return sorted(all_games)



In [11]:
data_map = {}
with open('../data/data_map.json') as infile:
    text = infile.read()
    data_map = json.loads(text)
qt_docs = len(data_map)

In [37]:
print(execute_query(query_string='mario', title='', gen='', dev='', plat='', price='', num_docs=qt_docs, docs_lenght=docs_len))

print(execute_query(query_string='', title='ps4', gen='esportes', dev='', plat='', price='', num_docs=qt_docs, docs_lenght=docs_len))

{'3': 0.00022263975667812265, '37': 0.0002224151402972932, '281': 0.00021516483219078373, '276': 0.00021089721338904052, '263': 0.00021028031959431228}
['263', '276', '281', '3', '37']
{'73': 1.2361432721097838e-05, '1': 1.2351395016898815e-05, '69': 1.2350559277334778e-05, '52': 1.2333868204590222e-05, '55': 1.2322211271162538e-05, '132': 0.00017867587854341054, '38': 0.00017851915589169577, '37': 0.0001779548298002975, '76': 0.00017794286162289483, '39': 0.00017789500500711513}
['1', '132', '37', '38', '39', '52', '55', '69', '73', '76']
