<a href="https://colab.research.google.com/github/sadra-barikbin/persian-information-retrieval-example/blob/main/Persian-IR-example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
!pip install hazm transformers ir_measures
!pip install -q clean-text[gpl]

Collecting nltk==3.3
  Using cached nltk-3.3-py3-none-any.whl
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.6.5
    Uninstalling nltk-3.6.5:
      Successfully uninstalled nltk-3.6.5
Successfully installed nltk-3.3


In [1]:
import torch
import yaml
import hazm
import tqdm
import numpy as np
import pandas as pd
import ir_measures as IRm
from typing import List, Tuple
from pathlib import Path
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Loading & Preparing Data

## Corpus

In [None]:
!wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip

In [None]:
!unzip doc_collection.zip

In [None]:
!cat IR_dataset/1000.txt

In [5]:
# corpus = [(int(path.stem), path.open().read()) for path in Path('dataset/IR_dataset').iterdir()]
corpus = [(int(path.stem), path.open().read()) for path in Path('dataset/IR_dataset').iterdir()]
corpus = pd.DataFrame(corpus, columns=['docId','text']).set_index('docId').sort_index()

In [None]:
ccorpus = [(int(path.stem), path.open().read()) for path in Path('IR_dataset').iterdir()]

In [6]:
corpus.head()

Unnamed: 0_level_0,text
docId,Unnamed: 1_level_1
0,برخی از هواداران مصدق یا اعضای جبهه ملی که در ...
1,جبهه ملی ایران که به اختصار جبهه ملی نیز خواند...
2,سرلشکر زاهدی در سال ۱۳۲۸ و پس از آن‌که دخالت‌ه...
3,نمایندگان طرفدار مصدق در حمایت از ابقای دولت و...
4,نمایندگان طرفدار مصدق در حمایت از ابقای دولت و...


## Qrels

In [None]:
!wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml

In [7]:
# query_raw_data = yaml.safe_load(open('evaluation_IR.yml'))
query_raw_data = yaml.safe_load(open('dataset/evaluation_IR.yml'))

In [8]:
query = pd.Series(query_raw_data.keys())
qrels = [{'query_id':idx, 'doc_id':d,
          'relevance':3} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_high']]
qrels.extend([{'query_id':idx, 'doc_id':d,
          'relevance':2} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_med']])
qrels.extend([{'query_id':idx, 'doc_id':d,
          'relevance':1} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_low']])
qrels.extend([{'query_id':idx, 'doc_id':query_raw_data[q]['relevant'][0],
          'relevance':4} for idx,q in query.to_dict().items()])
qrels = pd.DataFrame(qrels)

In [9]:
query[147],query_raw_data[query[147]]

('گرجستان  تاریخ',
 {'relevant': [388],
  'similar_high': [389, 390, 391, 392, 393, 394],
  'similar_low': [404, 405, 406, 407, 408, 409, 410, 411, 412, 413],
  'similar_med': [395, 364, 396, 397, 398, 399, 400, 401, 402, 403]})

In [10]:
qrels.sample(n=5).reset_index(drop=True)

Unnamed: 0,query_id,doc_id,relevance
0,126,464,2
1,129,1083,2
2,122,2137,3
3,48,2613,2
4,119,617,1


## Normaliztion

In [11]:
normalize = hazm.Normalizer().normalize
corpus.text = corpus.text.transform(normalize)
query = query.transform(normalize)

# Embedding the documents

## Method 1 : Tfidf

In [13]:
vectorizer = TfidfVectorizer(max_features=500,ngram_range=(1,2))
vectorizer.fit(corpus.text)

TfidfVectorizer(max_features=500, ngram_range=(1, 2))

## Method 2 : ParsBert

In [2]:
from transformers import AutoConfig, AutoTokenizer, AutoModel, TFAutoModel
import tensorflow

model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [3]:
model = AutoModel.from_pretrained(model_name_or_path, from_tf = True)
model = model.cuda()

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [14]:
text = "ما در قرن ۲۱ زندگی می‌کنیم" 
encoding = tokenizer.encode_plus(
      text,
      add_special_tokens=True, # Add '[CLS]' and '[SEP]'
      return_token_type_ids=False,
      max_length = 500,
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',  # Return PyTorch tensors
    )
out = model(
            input_ids = encoding['input_ids'].cuda(), 
            attention_mask= encoding['attention_mask'].cuda())
out['pooler_output'][0]

tensor([-2.9803e-01, -7.1409e-01, -9.8238e-02,  1.3373e-01, -7.0057e-01,
        -4.0727e-01,  2.9717e-01,  2.9900e-01,  2.6092e-01, -6.0040e-01,
         1.1803e-02, -1.8664e-01,  2.2774e-01, -2.8943e-01, -2.1663e-01,
        -5.2125e-02, -5.7990e-01, -4.9832e-01, -4.4152e-01,  6.4462e-01,
        -1.1495e-01,  6.1037e-01, -4.1411e-01,  4.8718e-01,  8.9476e-01,
         4.6958e-01,  5.5911e-01,  7.4270e-03, -2.6307e-01,  5.9849e-01,
        -4.6554e-01,  7.7377e-02, -2.6185e-01, -7.6583e-02,  1.3258e-01,
        -8.5721e-01,  3.3935e-01, -7.0128e-01,  7.2777e-02,  2.5269e-01,
        -4.5089e-02,  1.3263e-01, -5.2308e-01,  3.7634e-01, -3.9391e-01,
         2.8489e-01, -2.7325e-01, -1.5413e-01,  4.2223e-01, -6.4387e-02,
        -6.7197e-01, -5.1075e-01,  2.0961e-01, -2.2505e-01,  2.6400e-01,
        -7.0605e-01,  4.5489e-02,  2.4439e-01, -1.1642e-01, -3.9023e-02,
        -3.8892e-01,  2.2926e-01, -5.1469e-02,  3.9726e-01,  4.9048e-01,
        -5.7769e-01, -4.2485e-01,  2.0493e-01,  5.2

In [15]:
def get_embed(part):
  encoding = tokenizer.encode_plus(
    part,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    max_length = 500,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
  )
  out = model(
      input_ids = encoding['input_ids'].cuda(), 
      attention_mask= encoding['attention_mask'].cuda())
  return out['pooler_output'].cpu().detach().numpy()

In [16]:
doc_vec = np.zeros((1, 768))
doc_map = np.zeros(1)

for index, doc in tqdm.tqdm(corpus.iterrows()):
  doc_split = doc['text'].split()
  doc_parts = [' '.join(doc_split[i:i + 300]) for i in range(0, len(doc_split) - 150, 150)]
  for part in doc_parts:
    doc_vec = np.append(doc_vec, get_embed(part), axis = 0)
    doc_map = np.append(doc_map, index)

3258it [06:04,  8.95it/s]


# Document Retrieval

In [17]:
class KNN_based_IR(BaseEstimator):
  def __init__(self,n_neighbors=1+10+10+10) -> None:
    super().__init__()
    self.nn = NearestNeighbors(n_neighbors=n_neighbors)
  def set_params(self,**kwargs):
    self.nn.set_params(**kwargs)
  def fit(self, X: np.array):
    self.nn.fit(X)
  def predict(self, X: np.array):
    distances, docIds = self.nn.kneighbors(X)
    scores = np.max(distances)-distances
    return scores, docIds

In [18]:
IR_system = KNN_based_IR()
IR_system.fit(vectorizer.transform(corpus.text))

In [19]:
bert_knn = KNN_based_IR(80)
bert_knn.fit(doc_vec[1:])

# IR Evaluation
Tailored for our multi-level Test Collection.

In [21]:
def tf_knn_pred(knn):
  return knn.predict(vectorizer.transform(query))


preds = tf_knn_pred(IR_system)

In [22]:
def bert_knn_pred(knn):
  bert_score = []
  bert_id = []
  mn = 100
  for q in tqdm.tqdm(query):
    score, oid = knn.predict(get_embed(q))
    score = score[0]
    doc_id = [doc_map[i] for i in oid[0]]
    n_score = []
    n_id = []
    for sc, id in zip(score, doc_id):
      if id not in n_id:
        n_id.append(id)
        n_score.append(sc)
    mn = min(mn, len(n_score[:31]))
    bert_score.append(n_score[:31])
    bert_id.append(n_id[:31])
  
  return (np.array(bert_score), np.array(bert_id).astype(int))


bert_pred = bert_knn_pred(bert_knn)

100%|██████████| 150/150 [00:06<00:00, 24.19it/s]


## Adapting IR output to our Test Collection

In [23]:
def adapt_IR_output_to_measure_input(IR_output: Tuple[np.array, np.array]):
  scores, docIds = IR_output
  return pd.DataFrame({'query_id': np.tile(query.index,(31,1)).flatten(order='F').astype(str),
                       'doc_id':   docIds.flatten().astype(str),
                       'score':    scores.flatten()})

In [24]:
def knn_tuning(n, param, embed, pred_f, measure):
  score = -1
  best_p = -1
  for p in param:
    knn = KNN_based_IR(n)
    knn.set_params(metric = p)
    knn.fit(embed)
    val = measure(qrels.astype({'query_id':str,'doc_id':str}),pred_f(knn))
    if val > score:
      score = val
      best_p = p
  return best_p, score


param = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] # Metrics for sparse input

## MRR (Mean Reciprocal Rank)

In [25]:
MRR = IRm.measures.MRR()
def mrr_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return MRR.calc_aggregate(qrels[qrels.relevance == 4], ret)
# mrr_scorer = make_scorer(mrr)

In [26]:
mrr_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.11047063325391496

In [27]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, mrr_measure)

('cosine', 0.11664736491005835)

In [28]:
mrr_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.06931617823372208

In [29]:
knn_tuning(80, param, doc_vec[1:], bert_knn_pred, mrr_measure)

100%|██████████| 150/150 [00:05<00:00, 25.60it/s]
100%|██████████| 150/150 [00:11<00:00, 12.60it/s]
100%|██████████| 150/150 [00:05<00:00, 25.01it/s]
100%|██████████| 150/150 [00:05<00:00, 26.23it/s]
100%|██████████| 150/150 [00:06<00:00, 24.50it/s]
100%|██████████| 150/150 [00:05<00:00, 25.57it/s]


('cosine', 0.06982707206877474)

## MAP (Mean Average Precision)

In [30]:
def map_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return np.mean([IRm.measures.AP(rel=level).\
                    calc_aggregate(qrels[qrels.relevance == level], ret) for level in range(1,4+1)])

# map_scorer = make_scorer(map)

In [31]:
map_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.06630537139767945

In [32]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, map_measure)

('cosine', 0.06931719832816999)

In [33]:
map_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.048315030270106925

In [34]:
knn_tuning(80, param, doc_vec[1:], bert_knn_pred, map_measure)

100%|██████████| 150/150 [00:06<00:00, 24.61it/s]
100%|██████████| 150/150 [00:11<00:00, 13.24it/s]
100%|██████████| 150/150 [00:06<00:00, 24.08it/s]
100%|██████████| 150/150 [00:05<00:00, 25.18it/s]
100%|██████████| 150/150 [00:06<00:00, 24.42it/s]
100%|██████████| 150/150 [00:05<00:00, 25.91it/s]


('cityblock', 0.04838834768432319)

## P@K

In [35]:
def p_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return np.mean([IRm.measures.P(cutoff=k, rel=level).\
                    calc_aggregate(qrels[qrels.relevance == level], ret)\
                  for k,level in zip([1,11,21,31],range(1,4+1))])

In [36]:
p_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.033872596937113045

In [37]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, p_measure)

('cosine', 0.03418284224735836)

In [38]:
p_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.026597076758367057

In [39]:
knn_tuning(80, param, doc_vec[1:], bert_knn_pred, p_measure)

100%|██████████| 150/150 [00:06<00:00, 24.51it/s]
100%|██████████| 150/150 [00:11<00:00, 12.68it/s]
100%|██████████| 150/150 [00:06<00:00, 24.47it/s]
100%|██████████| 150/150 [00:05<00:00, 25.32it/s]
100%|██████████| 150/150 [00:06<00:00, 24.64it/s]
100%|██████████| 150/150 [00:05<00:00, 26.06it/s]


('cosine', 0.02688893543732251)

# Pipeline Definition

In [None]:
pipeline = Pipeline([('embedding','passthrough'),
                     ('retrieval','passthrough')])