<a href="https://colab.research.google.com/github/sadra-barikbin/persian-information-retrieval-example/blob/main/Persian-IR-example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [17]:
!pip install hazm transformers ir_measures
!pip install -q clean-text[gpl]



In [18]:
import torch
import yaml
import hazm
import tqdm
import numpy as np
import pandas as pd
import ir_measures as IRm
from typing import List, Tuple
from pathlib import Path
from sklearn.metrics import make_scorer, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Loading & Preparing Data

## Corpus

In [19]:
!wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip

--2021-12-31 06:57:38--  https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/doc_collection.zip [following]
--2021-12-31 06:57:38--  https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/doc_collection.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6083582 (5.8M) [application/zip]
Saving to: ‘doc_collection.zip.1’


2021-12-31 06:57:38 (114 MB/s) - ‘doc_collection.zip.1’ saved [6083582/6083582]



In [20]:
!unzip doc_collection.zip

Archive:  doc_collection.zip
replace IR_dataset/559.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [21]:
!cat IR_dataset/1000.txt

ببر سیبری که با نام‌های ببر آلتایی، ببر منچوری، ببر کره‌ای، ببر آمور و ببر اوسوری نیز شناخته می‌شود، یکی از زیرگونه‌های ببر است که در گذشته در بخش‌های وسیعی از شرق آسیا می‌زیست اما امروزه تنها در منطقهٔ حفاظت شده‌ای در شرق سیبری زندگی می‌کند. ببر سیبری بزرگترین زیرگونهٔ ببر و بزرگترین گربه‌سان زندهٔ جهان است. ببر منقرض شده مازندران نزدیک‌ترین زیرگونه ببر به ببر سیبری است و مطالعات ژنتیکی جدید حکایت از آن دارد که این دو را حتی می‌توان یک زیرگونه محسوب کرد.

ببر سیبری در دهه ۱۹۳۰ در آستانه انقراض قرار داشت و تعداد آن‌ها تنها به بیست تا سی ببر کاهش یافته بود. اما این حیوان به طرزی باورنکردنی از انقراض قریب‌الوقوع رهایی جست و جمعیت آن تا سال ۲۰۱۰ به حدود ۳۶۰ ببر رسید. ببر سیبری با توجه به همین افزایش جمعیت از سال ۲۰۱۰ از بالاترین ردهٔ حفاظتی یعنی «به شدت در معرض خطر» خارج شده و در یک رده پایین‌تر یعنی «در خطر انقراض» قرار گرفته است. ببرهای سیبری تنوع ژنتیکی بسیار پائینی دارند که این به دلیل کاهش شدید جمعیت این حیوان در دهه ۱۹۴۰ و تعداد اندک توله ببرهایی است که به بلوغ می‌رسند. ضمن اینکه بی

In [22]:
corpus = [(int(path.stem), path.open().read()) for path in Path('IR_dataset').iterdir()]
corpus = pd.DataFrame(corpus, columns=['docId','text']).set_index('docId').sort_index()

In [23]:
ccorpus = [(int(path.stem), path.open().read()) for path in Path('IR_dataset').iterdir()]

In [24]:
corpus.head()

Unnamed: 0_level_0,text
docId,Unnamed: 1_level_1
0,برخی از هواداران مصدق یا اعضای جبهه ملی که در ...
1,جبهه ملی ایران که به اختصار جبهه ملی نیز خواند...
2,سرلشکر زاهدی در سال ۱۳۲۸ و پس از آن‌که دخالت‌ه...
3,نمایندگان طرفدار مصدق در حمایت از ابقای دولت و...
4,نمایندگان طرفدار مصدق در حمایت از ابقای دولت و...


## Qrels

In [25]:
!wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml

--2021-12-31 06:59:28--  https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50854 (50K) [text/plain]
Saving to: ‘evaluation_IR.yml.1’


2021-12-31 06:59:28 (6.99 MB/s) - ‘evaluation_IR.yml.1’ saved [50854/50854]



In [26]:
query_raw_data = yaml.safe_load(open('evaluation_IR.yml'))

In [27]:
query = pd.Series(query_raw_data.keys())
qrels = [{'query_id':idx, 'doc_id':d,
          'relevance':3} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_high']]
qrels.extend([{'query_id':idx, 'doc_id':d,
          'relevance':2} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_med']])
qrels.extend([{'query_id':idx, 'doc_id':d,
          'relevance':1} for idx,q in query.to_dict().items() for d in query_raw_data[q]['similar_low']])
qrels.extend([{'query_id':idx, 'doc_id':query_raw_data[q]['relevant'][0],
          'relevance':4} for idx,q in query.to_dict().items()])
qrels = pd.DataFrame(qrels)

In [28]:
query[147],query_raw_data[query[147]]

('گرجستان  تاریخ',
 {'relevant': [388],
  'similar_high': [389, 390, 391, 392, 393, 394],
  'similar_low': [404, 405, 406, 407, 408, 409, 410, 411, 412, 413],
  'similar_med': [395, 364, 396, 397, 398, 399, 400, 401, 402, 403]})

In [29]:
qrels.sample(n=5).reset_index(drop=True)

Unnamed: 0,query_id,doc_id,relevance
0,82,243,2
1,20,640,3
2,66,1921,2
3,99,1235,3
4,32,2071,1


## Normaliztion

In [30]:
normalize = hazm.Normalizer().normalize
corpus.text = corpus.text.transform(normalize)
query = query.transform(normalize)

# Embedding the documents

## Method 1 : Tfidf

In [31]:
vectorizer = TfidfVectorizer(max_features=500,ngram_range=(1,2))
vectorizer.fit(corpus.text)

TfidfVectorizer(max_features=500, ngram_range=(1, 2))

## Method 2 : ParsBert

In [16]:
from transformers import AutoConfig, AutoTokenizer, AutoModel, TFAutoModel

model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

model = AutoModel.from_pretrained(model_name_or_path)
model = model.cuda()

Downloading:   0%|          | 0.00/565 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-base and are newly initialized: ['bert.pooler.dense.weight', 'bert.p

RuntimeError: ignored

In [None]:
text = "ما در قرن ۲۱ زندگی می‌کنیم" 
encoding = tokenizer.encode_plus(
      text,
      add_special_tokens=True, # Add '[CLS]' and '[SEP]'
      return_token_type_ids=False,
      max_length = 500,
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',  # Return PyTorch tensors
    )
out = model(
            input_ids = encoding['input_ids'].cuda(), 
            attention_mask= encoding['attention_mask'].cuda())
out['pooler_output'][0]

In [None]:
def get_embed(part):
  encoding = tokenizer.encode_plus(
    part,
    add_special_tokens=True, # Add '[CLS]' and '[SEP]'
    return_token_type_ids=False,
    max_length = 500,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',  # Return PyTorch tensors
  )
  out = model(
      input_ids = encoding['input_ids'].cuda(), 
      attention_mask= encoding['attention_mask'].cuda())
  return out['pooler_output'].cpu().detach().numpy()

In [105]:
doc_vec = np.zeros((1, 768))
doc_map = np.zeros(1)

for index, doc in tqdm.tqdm(corpus.iterrows()):
  doc_split = doc['text'].split()
  doc_parts = [' '.join(doc_split[i:i + 300]) for i in range(0, len(doc_split) - 150, 150)]
  for part in doc_parts:
    doc_vec = np.append(doc_vec, get_embed(part), axis = 0)
    doc_map = np.append(doc_map, index)


3258it [13:06,  4.14it/s]


# Document Retrieval

In [33]:
class KNN_based_IR(BaseEstimator):
  def __init__(self,n_neighbors=1+10+10+10) -> None:
    super().__init__()
    self.nn = NearestNeighbors(n_neighbors=n_neighbors)
  def set_params(self,**kwargs):
    self.nn.set_params(**kwargs)
  def fit(self, X: np.array):
    self.nn.fit(X)
  def predict(self, X: np.array):
    distances, docIds = self.nn.kneighbors(X)
    scores = np.max(distances)-distances
    return scores, docIds

In [34]:
IR_system = KNN_based_IR()
IR_system.fit(vectorizer.transform(corpus.text))

In [108]:
bert_knn = KNN_based_IR(80)
bert_knn.fit(doc_vec[1:])

# IR Evaluation
Tailored for our multi-level Test Collection.

In [46]:
def tf_knn_pred(knn):
  return knn.predict(vectorizer.transform(query))


preds = tf_idf_pred(IR_system)

In [41]:
def bert_knn_pred(knn):
  bert_score = []
  bert_id = []
  mn = 100
  for q in tqdm.tqdm(query):
    score, oid = knn.predict(get_embed(q))
    score = score[0]
    doc_id = [doc_map[i] for i in oid[0]]
    n_score = []
    n_id = []
    for sc, id in zip(score, doc_id):
      if id not in n_id:
        n_id.append(id)
        n_score.append(sc)
    mn = min(mn, len(n_score[:31]))
    bert_score.append(n_score[:31])
    bert_id.append(n_id[:31])
  
  return (np.array(bert_score), np.array(bert_id).astype(int))


bert_pred = bert_knn_pred(bert_knn)

NameError: ignored

## Adapting IR output to our Test Collection

In [42]:
def adapt_IR_output_to_measure_input(IR_output: Tuple[np.array, np.array]):
  scores, docIds = IR_output
  return pd.DataFrame({'query_id': np.tile(query.index,(31,1)).flatten(order='F').astype(str),
                       'doc_id':   docIds.flatten().astype(str),
                       'score':    scores.flatten()})

In [60]:
def knn_tuning(n, param, embed, pred_f, measure):
  score = -1
  best_p = -1
  for p in param:
    knn = KNN_based_IR(n)
    knn.set_params(metric = p)
    knn.fit(embed)
    val = measure(qrels.astype({'query_id':str,'doc_id':str}),pred_f(knn))
    if val > score:
      score = val
      best_p = p
  return best_p, score


param = ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] # Metrics for sparse input

## MRR (Mean Reciprocal Rank)

In [44]:
MRR = IRm.measures.MRR()
def mrr_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return MRR.calc_aggregate(qrels[qrels.relevance == 4], ret)
# mrr_scorer = make_scorer(mrr)

In [45]:
mrr_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.11047063325391496

In [61]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, mrr_measure)

('cosine', 0.11664736491005835)

In [114]:
mrr_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.07376145695263342

## MAP (Mean Average Precision)

In [63]:
def map_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return np.mean([IRm.measures.AP(rel=level).\
                    calc_aggregate(qrels[qrels.relevance == level], ret) for level in range(1,4+1)])

# map_scorer = make_scorer(map)

In [64]:
map_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.06630537139767945

In [65]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, map_measure)

('cosine', 0.06931719832816999)

In [117]:
map_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.048246473334093494

## P@K

In [66]:
def p_measure(qrels, ret):
  ret = adapt_IR_output_to_measure_input(ret)
  return np.mean([IRm.measures.P(cutoff=k, rel=level).\
                    calc_aggregate(qrels[qrels.relevance == level], ret)\
                  for k,level in zip([1,11,21,31],range(1,4+1))])

In [67]:
p_measure(qrels.astype({'query_id':str,'doc_id':str}),preds)

0.033872596937113045

In [68]:
knn_tuning(31, param, vectorizer.transform(corpus.text), tf_knn_pred, p_measure)

('cosine', 0.03418284224735836)

In [120]:
p_measure(qrels.astype({'query_id':str,'doc_id':str}),bert_pred)

0.027153330540427296

# Pipeline Definition

In [None]:
pipeline = Pipeline([('embedding','passthrough'),
                     ('retrieval','passthrough')])