In [83]:
import heapq as hq
from porter import *
from collections import Counter
import re
import numpy as np

## Exercice 1 - indexation d’un petit jeu de données

In [50]:
doc1 = "the new home has been saled on top forecasts"
doc2 = "the home sales rise in july"
doc3 = "there is an increase in home sales in july"
doc4 = "july encounter a new home sales rise"

corpus = [doc1, doc2, doc3, doc4]

mots_vides = ["the", "a", "an", "on", "behind", "under", "there", "in", "on"]

In [51]:
def preprocess(doc):
    # Séparer les mots
    mots = [m.lower() for m in re.split(r"\W+", doc) if m not in mots_vides] # \W tout caractere non alphanumerique
    return dict(Counter([stem(m) for m in mots]))
##    
preprocess(doc1)

{'new': 1, 'home': 1, 'ha': 1, 'been': 1, 'sale': 1, 'top': 1, 'forecast': 1}

In [68]:
def index_simple(corpus):
    d = {}
    for i,doc in enumerate(corpus):
        d[i] = preprocess(doc)
    return d

index_simple(corpus)

{0: {'new': 1,
  'home': 1,
  'ha': 1,
  'been': 1,
  'sale': 1,
  'top': 1,
  'forecast': 1},
 1: {'home': 1, 'sale': 1, 'rise': 1, 'juli': 1},
 2: {'is': 1, 'increas': 1, 'home': 1, 'sale': 1, 'juli': 1},
 3: {'juli': 1, 'encount': 1, 'new': 1, 'home': 1, 'sale': 1, 'rise': 1}}

In [69]:
def index_inverse(corpus):
    d = {}
    ind = index_simple(corpus)
    for k1,v1 in ind.items():
        for k2,v2 in ind[k1].items():
            if k2 in d.keys():
                d[k2][k1] = v2
            else:
                d[k2] = {k1 : v2}
    return d

index_inverse(corpus)

{'new': {0: 1, 3: 1},
 'home': {0: 1, 1: 1, 2: 1, 3: 1},
 'ha': {0: 1},
 'been': {0: 1},
 'sale': {0: 1, 1: 1, 2: 1, 3: 1},
 'top': {0: 1},
 'forecast': {0: 1},
 'rise': {1: 1, 3: 1},
 'juli': {1: 1, 2: 1, 3: 1},
 'is': {2: 1},
 'increas': {2: 1},
 'encount': {3: 1}}

In [70]:
## Pondération tf-idf
def index_inverse_tfidf(corpus):
    ind_inv = index_inverse(corpus)
    n = len(corpus)
    for k,v in ind_inv.items():
        for k2,v2 in v.items():  
            ind_inv[k][k2] = v2*np.log(n/len(v))
    return ind_inv

index_inverse_tfidf(corpus)

{'new': {0: 0.6931471805599453, 3: 0.6931471805599453},
 'home': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'ha': {0: 1.3862943611198906},
 'been': {0: 1.3862943611198906},
 'sale': {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0},
 'top': {0: 1.3862943611198906},
 'forecast': {0: 1.3862943611198906},
 'rise': {1: 0.6931471805599453, 3: 0.6931471805599453},
 'juli': {1: 0.28768207245178085,
  2: 0.28768207245178085,
  3: 0.28768207245178085},
 'is': {2: 1.3862943611198906},
 'increas': {2: 1.3862943611198906},
 'encount': {3: 1.3862943611198906}}

 ## Exercice 2 – Rechercher avec des index

### TAAT Algorithm

In [105]:
# pip install ir_datasets

In [107]:
import ir_datasets

dataset = ir_datasets.load('cord19/trec-covid')
for query in dataset.queries_iter():
    print(query)

TrecQuery(query_id='1', title='coronavirus origin', description='what is the origin of COVID-19', narrative="seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans")
TrecQuery(query_id='2', title='coronavirus response to weather changes', description='how does the coronavirus respond to changes in the weather', narrative='seeking range of information about the SARS-CoV-2 virus viability in different weather/climate conditions as well as information related to transmission of the virus in different climate conditions')
TrecQuery(query_id='3', title='coronavirus immunity', description='will SARS-CoV2 infected people develop immunity? Is cross protection possible?', narrative='seeking studies of immunity developed due to infection with SARS-CoV2 or cross protection gained due to infection with other coronavirus types')
TrecQuery(query_id='4', title='how do people die from the coronavirus', description='w

In [106]:
def add_if(doc_score, heap, k):
    if len(heap)<k:
        heapq.push(heap, ds)
    elif heap[0][1]<ds.score:
        heapq.heapreplace(heap, ds)

Dataset(id='cord19/trec-covid', provides=['docs', 'queries', 'qrels'])

In [101]:
def taat(query,index, k):
    # Appliquer l'algorithme de TAAT
    vocabulary = index.keys()
    terms = dict((k,index[k]) for k in preprocess(query) if k in index)
    print(terms)
    res = {}
    h = []
        
    for k,v in terms.items():
        for k2,v2 in v.items():
            if k2 in res.keys():
                res[k2] += v2
            else:
                res[k2] = v2
            hq.heappush(h, (k2, res[k2]))
    return sorted(res.items(), key=lambda x:x[1], reverse=True)
        
index = index_inverse(corpus)
query = "new home"
taat(query, index,3)

{'new': {0: 1, 3: 1}, 'home': {0: 1, 1: 1, 2: 1, 3: 1}}


[(0, 2), (3, 2), (1, 1), (2, 1)]

### DAAT Algorithm