# Projet: Passage Ranking

In [1]:
from pyserini.search.lucene import LuceneSearcher
LuceneSearcher.list_prebuilt_indexes()

from pyserini.index.lucene import IndexReader
IndexReader.list_prebuilt_indexes()

  from .autonotebook import tqdm as notebook_tqdm


                        cacm                                                                                                    \
description              Lucene index of the CACM corpus. (Lucene 9)                                                             
filename                 lucene-index.cacm.tar.gz                                                                                
urls                     [https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz]   
md5                      cfe14d543c6a27f4d742fb2d0099b8e0                                                                        
size compressed (bytes)  2347197                                                                                                 
total_terms              320968                                                                                                  
documents                3204                                                             

## Téléchargement de l'index msmarco-v1-passage-full

In [2]:
# The downloaded index will be in ~/.cache/pyserini/indexes/

index_reader = IndexReader.from_prebuilt_index('msmarco-v1-passage-full')
index_reader.stats()

{'total_terms': 352316036,
 'documents': 8841823,
 'non_empty_documents': 8841823,
 'unique_terms': 2660824}

## L'index téléchargé est stocké dans le dossier ~/.cache/pyserini/indexes/

In [2]:
!ls ~/.cache/pyserini/indexes/

## Affichage des 10 premiers terms avec leurs statistiques

In [4]:
import itertools

for term in itertools.islice(index_reader.terms(), 10):
  print(f'{term.term} (df={term.df}, cf={term.cf})')

0 (df=73630, cf=103199)
0'2.56n (df=2, cf=2)
0'26 (df=1, cf=1)
0,0 (df=122, cf=163)
0,0,0 (df=31, cf=32)
0,0,0,0 (df=4, cf=4)
0,0,0,0,0,0,0,8,7,0,7,5,8,8,8,6,7,2 (df=3, cf=4)
0,0,0,0.26 (df=1, cf=1)
0,0,0,0.5 (df=1, cf=1)
0,0,0,1 (df=1, cf=1)


## Statistiques d'un terme non analysé

In [5]:
term = 'cities'

# Look up its document frequency (df) and collection frequency (cf).
# Note, we use the unanalyzed form:
df, cf = index_reader.get_term_counts(term)
print(f'term "{term}": df={df}, cf={cf}')

term "cities": df=239653, cf=361842


## Statistiques d'un terme analysé

In [6]:
term = 'cities'

# Analyze the term.
analyzed = index_reader.analyze(term)
print(f'La forme analysé de "{term}" est "{analyzed[0]}"')

# Skip term analysis:
df, cf = index_reader.get_term_counts(analyzed[0], analyzer=None)
print(f'term "{term}": df={df}, cf={cf}')

La forme analysé de "cities" est "citi"
term "cities": df=239653, cf=361842


## Traverser les posting d'un terme

In [9]:
# Chercher les posting d'un terme analysé
term = "0,0,0,0"
postings_list = index_reader.get_postings_list(term, analyzer=None)
print(f"taille {len(postings_list)}")
for posting in postings_list:
  print(f'docid={posting.docid}, tf={posting.tf}, pos={posting.positions}')

taille 4
docid=3959740, tf=1, pos=[50]
docid=6565175, tf=1, pos=[8]
docid=6565183, tf=1, pos=[16]
docid=7453459, tf=1, pos=[49]


## Construction d'un nouveau document sur la base du score de chaque terme

In [17]:
# fonction kernel avec programmation dynamique
dyn_dict = dict()


In [22]:
import numpy as np
import scipy.special

alpha = 1
sig  = 2_000
sig2 = 4_000_000
def kernel_skewed(j,i):
  # programmation dynamique
  clef = f"{i}-{j}" if i<=j else f"{j}-{i}"
  if clef in dyn_dict:
    return dyn_dict[clef]
  
  ij2 = (i-j) ** 2
  a = np.exp(-ij2 / (2*sig2))
  b = 1 + scipy.special.erf(alpha*(i-j) / np.sqrt(2))

  v = a*b
  dyn_dict[clef] = v
  
  return v

In [23]:
import itertools

N = 8841823 # nb passages
new_data = dict() # dictionnaire du nouveau corpus généré

it = 0
for term in itertools.islice(index_reader.terms(), None):
  it += 1
  idf = np.log((N+1)/(term.df+1))
  # print(f'{term.term} (df={term.df}, cf={term.cf})')
  
  postings_list = index_reader.get_postings_list(term.term, analyzer=None)
  l_pos = []
  for posting in postings_list:
    if posting.docid not in new_data:
      new_data[posting.docid] = {term.term: 0}
    else:
      new_data[posting.docid][term.term] = 0
    l_pos.extend(posting.positions)
  
  ip = 0
  for posting in postings_list:
    ip += 1
    print(f"term {it} sur {N}  -->  {(100*((it)/N)):>0.2f}% | pos {ip} sur {len(postings_list)}  -->  {(100*((ip)/len(postings_list))):>0.2f}%",end='\r')
    # print(f'  docid={posting.docid}, tf={posting.tf}, pos={posting.positions}')
    tf = sum([kernel_skewed(pos, j) for j in posting.positions for pos in l_pos])
    score = tf*idf
    new_data[posting.docid][term.term] = score

term 1 sur 8841823  -->  0.00% | pos 50592 sur 73630  -->  68.71%

KeyboardInterrupt: 

0.5204998778130465
