# Trabalho de Recuperação de Informação
## Desenvolvido por Ronald Albert

In [None]:
import regex as re
import pandas as pd

In [None]:
def normalize(doc):
    """Return ``doc`` converted to lowercase.

    Args:
        doc: Text to normalize.

    Returns:
        The result of calling ``lower()`` on ``doc``.
    """
    lowered = doc.lower()
    return lowered

In [None]:
def tokenize(doc, sep_list):
    """Split ``doc`` into tokens on any of the separators in ``sep_list``.

    Every separator after the first is rewritten to ``sep_list[0]``, runs of
    whitespace are collapsed to a single space, the text is stripped, and the
    result is split on ``sep_list[0]``.

    Args:
        doc: Text to split.
        sep_list: Non-empty sequence of separator strings. The first entry is
            the pivot separator that all the others are rewritten to.

    Returns:
        The non-empty tokens, in order of appearance. Text that holds no
        token at all (empty or made only of separators) yields ``[]``.

    Raises:
        IndexError: If ``sep_list`` is empty.
    """
    default_sep = sep_list[0]

    for sep in sep_list[1:]:
        doc = doc.replace(sep, default_sep)

    doc = re.sub(r"\s+", " ", doc).strip()

    # str.split(sep) yields '' for empty input and between adjacent
    # separators; those are not real tokens and would inflate the document
    # length used by later frequency computations, so drop them.
    return [token for token in doc.split(default_sep) if token]


In [None]:
def eliminate_stopwords(tokens, stopwords):
    """Return the tokens that are not stopwords, preserving order.

    Args:
        tokens: Iterable of hashable tokens (strings in this notebook).
        stopwords: Collection of tokens to discard.

    Returns:
        A new list with every token of ``tokens`` that is not in
        ``stopwords``; duplicates are kept.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan of the stopword list for every token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)

    return [token for token in tokens if token not in stopwords]

In [None]:
def clean_doc(doc, sep_list, stopwords):
    """Turn raw text into a list of cleaned tokens.

    Applies, in order: ``normalize`` to the text, ``tokenize`` with
    ``sep_list``, and ``eliminate_stopwords`` with ``stopwords``.

    Args:
        doc: Raw document text.
        sep_list: Separators forwarded to ``tokenize``.
        stopwords: Stopwords forwarded to ``eliminate_stopwords``.

    Returns:
        The list returned by ``eliminate_stopwords``.
    """
    return eliminate_stopwords(tokenize(normalize(doc), sep_list), stopwords)

In [None]:
def incidence_matrix(docs):
    """Build a token-by-document table of relative term frequencies.

    Despite the name, each cell is not a 0/1 incidence flag: it holds the
    number of occurrences of the token in the document divided by the
    document's length in tokens.

    Args:
        docs: Sequence of documents, each one a list of hashable tokens.

    Returns:
        A ``pandas.DataFrame`` of floats with one row per distinct token
        (in order of first appearance across ``docs``) and one column per
        document, labelled ``0 .. len(docs) - 1``. A document with no tokens
        gets a column of zeros.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # row order is reproducible between runs (set iteration order is not).
    vocabulary = list(dict.fromkeys(token for doc in docs for token in doc))
    doc_ids = list(range(len(docs)))

    frequencies = {}
    for doc_id, doc in zip(doc_ids, docs):
        counts = Counter(doc)  # one pass per document instead of one per token
        size = len(doc)
        # An empty document contains no token at all; report 0.0 rather than
        # dividing by zero.
        frequencies[doc_id] = [
            counts[token] / size if size else 0.0 for token in vocabulary
        ]

    # Build the frame in one shot: DataFrame.append and Series.iteritems were
    # removed in pandas 2.0, and row-by-row appending is quadratic anyway.
    return pd.DataFrame(
        frequencies, index=vocabulary, columns=doc_ids, dtype=float
    )

In [None]:
# Sample corpus: each document is a one-element list holding a headline string.
M=[['Parasita é o grande vencedor do Oscar 2020, com quatro prêmios'],
['Green Book, Roma e Bohemian Rhapsody são os principais vencedores do Oscar 2019'],
['Oscar 2020: Confira lista completa de vencedores. Parasita e 1917 foram os grandes vencedores da noite'],
['Em boa fase, Oscar sonha em jogar a Copa do Mundo da Rússia'],
['Conheça os indicados ao Oscar 2020; Cerimônia de premiação acontece em fevereiro'],
['Oscar Schmidt receberá Troféu no Prêmio Brasil Olímpico 2019. Jogador de basquete com mais pontos em Jogos Olímpicos.'],
['Seleção brasileira vai observar de 35 a 40 jogadores para definir lista da Copa América'],
['Oscar 2020: saiba como é a escolha dos jurados e como eles votam'],
['Bem, Amigos! discute lista da Seleção, e Galvão dá recado a Tite: Cadê o Luan?'],
['IFAL-Maceió convoca aprovados em lista de espera do SISU para chamada oral'],
['Arrascaeta e Matías Viña são convocados pelo Uruguai para eliminatórias da Copa. Além deles, há outros destaques na lista.'],
['Oscar do Vinho: confira os rótulos de destaque da safra 2018'],
['Parasita é o vencedor da Palma de Ouro no Festival de Cannes'],
['Estatísticas. Brasileirão Série A: Os artilheiros e garçons da temporada 2020'],
['Setembro chegou! Confira o calendário da temporada 2020/2021 do futebol europeu']] # document collection

stopwords=['a', 'o', 'e', 'é', 'de', 'do', 'da', 'no', 'na', 'são', 'dos', 'com','como',
'eles', 'em', 'os', 'ao', 'para', 'pelo'] # stopword list

q='oscar 2020' # query

separadores=[' ',',','.','!','?',':',';','/'] # separators used for tokenization; the first one (space) is the pivot

In [None]:
# Run the cleaning pipeline on the headline string stored in each corpus entry.
M_cleaned = [
    clean_doc(entry[0], separadores, stopwords)
    for entry in M
]
M_cleaned

[['parasita', 'grande', 'vencedor', 'oscar', '2020', 'quatro', 'prêmios'],
 ['green',
  'book',
  'roma',
  'bohemian',
  'rhapsody',
  'principais',
  'vencedores',
  'oscar',
  '2019'],
 ['oscar',
  '2020',
  'confira',
  'lista',
  'completa',
  'vencedores',
  'parasita',
  '1917',
  'foram',
  'grandes',
  'vencedores',
  'noite'],
 ['boa', 'fase', 'oscar', 'sonha', 'jogar', 'copa', 'mundo', 'rússia'],
 ['conheça',
  'indicados',
  'oscar',
  '2020',
  'cerimônia',
  'premiação',
  'acontece',
  'fevereiro'],
 ['oscar',
  'schmidt',
  'receberá',
  'troféu',
  'prêmio',
  'brasil',
  'olímpico',
  '2019',
  'jogador',
  'basquete',
  'mais',
  'pontos',
  'jogos',
  'olímpicos'],
 ['seleção',
  'brasileira',
  'vai',
  'observar',
  '35',
  '40',
  'jogadores',
  'definir',
  'lista',
  'copa',
  'américa'],
 ['oscar', '2020', 'saiba', 'escolha', 'jurados', 'votam'],
 ['bem',
  'amigos',
  'discute',
  'lista',
  'seleção',
  'galvão',
  'dá',
  'recado',
  'tite',
  'cadê',
  'lu

In [None]:
# Build the token-by-document relative-frequency table for the cleaned corpus;
# as the last expression of the cell, the notebook displays the result.
incidence_matrix(M_cleaned)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
40,0.000000,0.000000,0.0,0.000,0.000,0.000000,0.090909,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
luan,0.000000,0.000000,0.0,0.000,0.000,0.000000,0.000000,0.0,0.090909,0.0,0.0,0.0,0.0,0.000000,0.000000
brasileirão,0.000000,0.000000,0.0,0.000,0.000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.142857,0.000000
bohemian,0.000000,0.111111,0.0,0.000,0.000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
pontos,0.000000,0.000000,0.0,0.000,0.000,0.071429,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,0.000000,0.000000,0.0,0.000,0.000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.111111
grande,0.142857,0.000000,0.0,0.000,0.000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
indicados,0.000000,0.000000,0.0,0.000,0.125,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000
europeu,0.000000,0.000000,0.0,0.000,0.000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.111111
