## *مجموعه داده*

### الف

In [None]:
import tarfile
import re

folder_path = './'
# Step 1: Extract the dataset archive.
# Mode "r" lets tarfile detect the compression transparently. The previous "r:" mode
# opens *uncompressed* archives only and raises ReadError on a gzip-compressed file.
# NOTE(review): extractall() trusts the member paths stored in the archive; only run
# this on an archive from a trusted source.
with tarfile.open(folder_path+"cran.tar.gz", "r") as tar:
    tar.extractall(folder_path+"cranfield_dataset")

# Paths to the extracted files
docs_path = folder_path+"cranfield_dataset/cran.all.1400"
queries_path = folder_path+"cranfield_dataset/cran.qry"
qrels_path = folder_path+"cranfield_dataset/cranqrel"

# Step 1: Load documents from 'cran.all.1400'
# Result: documents maps the id string found right after ".I " to the record text.
documents = {}
with open(docs_path, 'r') as file:
    raw_docs = file.read()

# Splitting on the ".I " prefix yields one chunk per record; the chunk before the
# first prefix is discarded.
for record in re.split(r"\.I ", raw_docs)[1:]:
    record_lines = record.splitlines()
    # record_lines[0] holds the id. The next line is dropped (presumably the ".T"
    # field marker - confirm against the file); everything after it is kept verbatim.
    documents[record_lines[0].strip()] = "\n".join(record_lines[2:])

# Step 2: Load queries from 'cran.qry'
# Result: queries maps a running position (1, 2, 3, ...) to the query text; the id
# written in the file itself is ignored.
queries = {}
with open(queries_path, 'r') as file:
    raw_queries = file.read()

for position, section in enumerate(re.split(r"\.I ", raw_queries)[1:], start=1):
    # The first two lines of every section are dropped; the rest is the query text.
    queries[position] = "\n".join(section.splitlines()[2:])

# Step 3: Load relevance judgments from 'cranqrel'
# relevance_judgments maps each query position (1..len(queries)) to the set of
# document ids (as strings) judged relevant for it.
relevance_judgments = {i + 1: set() for i in range(len(queries))}

with open(qrels_path, 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            # A blank line would otherwise break the three-way unpacking below.
            continue
        query_id, doc_id, relevance = map(int, fields)
        # NOTE(review): only judgments with a code >= 2 are kept; confirm against the
        # cranqrel documentation that larger codes mean "more relevant".
        # A query_id outside 1..len(queries) raises KeyError here.
        if query_id and relevance >= 2:
            relevance_judgments[query_id].add(str(doc_id))

# Step 4: Define qrels_defs() and metadata() equivalents
def qrels_defs():
    """Print the legend of relevance labels (0-3), one label per line."""
    legend = (
        "Qrels Definitions:",
        "0: Not Relevant",
        "1: Marginally Relevant",
        "2: Relevant",
        "3: Highly Relevant",
    )
    print("\n".join(legend))

def metadata(documents, queries, relevance_judgments):
    """Print summary counts for the loaded collection.

    Args:
        documents: sized collection of documents (only its length is used).
        queries: sized collection of queries (only its length is used).
        relevance_judgments: mapping whose values are collections of judged
            document ids; their sizes are summed.

    Prints the number of documents, queries and judgments, and the average number
    of judgments per query (reported as 0.00 when there are no queries).
    """
    print("Dataset Metadata:")
    print(f"Total number of documents: {len(documents)}")
    print(f"Total number of queries: {len(queries)}")
    total_rels = sum(len(docs) for docs in relevance_judgments.values())
    print(f"Total number of relevance judgments: {total_rels}")
    # Guard the division: an empty query collection used to raise ZeroDivisionError.
    average_rels = total_rels / len(queries) if len(queries) else 0.0
    print(f"Average judgments per query: {average_rels:.2f}")

# Print the relevance-label legend, then the summary counts for the collections
# loaded above (documents, queries, relevance_judgments).
qrels_defs()
metadata(documents, queries, relevance_judgments)

Qrels Definitions:
0: Not Relevant
1: Marginally Relevant
2: Relevant
3: Highly Relevant
Dataset Metadata:
Total number of documents: 1400
Total number of queries: 225
Total number of relevance judgments: 1484
Average judgments per query: 6.60


## *بخش اول*

### الف

In this section, we train a WordPiece tokenizer on the documents and build a vocabulary with it.

In [None]:
from tokenizers import BertWordPieceTokenizer
import os

# Train a WordPiece tokenizer on the document texts and save its vocabulary to disk.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    strip_accents=False,
)

document_values = documents.values()

vocab_size = 20000  # requested vocabulary size passed to the trainer
tokenizer.train_from_iterator(document_values, vocab_size=vocab_size)

# makedirs(exist_ok=True) keeps the cell re-runnable: os.mkdir raised
# FileExistsError as soon as './tokenizer' existed from a previous run.
os.makedirs('./tokenizer', exist_ok=True)
tokenizer.save_model('./tokenizer')


['./tokenizer\\vocab.txt']

In [None]:
import numpy as np
from numpy.linalg import norm
import math
from collections import Counter, defaultdict
from transformers import BertTokenizer

# Reload the tokenizer from the files saved under './tokenizer'
tokenizer = BertTokenizer.from_pretrained('./tokenizer')

# Tokenize every document once: all_tokens keeps the token list per document id,
# doc_lengths the number of tokens in that list.
all_tokens = {doc_id: tokenizer.tokenize(text) for doc_id, text in documents.items()}
doc_lengths = {doc_id: len(doc_tokens) for doc_id, doc_tokens in all_tokens.items()}

vocab = tokenizer.get_vocab()


### ب

Using the created vocabulary, we build Term Frequency embedding vectors for the documents.

In [None]:

# Position of every vocabulary token in the TF vector (iteration order of vocab)
token_to_index = {}
for position, vocab_token in enumerate(vocab):
    token_to_index[vocab_token] = position

# doc id -> term-frequency vector (list of counts, one slot per vocabulary token).
# A defaultdict without a factory behaves like a plain dict.
tf_matrix = defaultdict()

for doc_id, text in documents.items():
    doc_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    # How often each token id occurs in this document
    id_counts = Counter(doc_token_ids)

    tf_vector = [0] * len(vocab)
    for token_id, count in id_counts.items():
        # Map the id back to its token, then to the token's slot in the vector
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        idx = token_to_index.get(token)
        if idx is not None:
            tf_vector[idx] = count

    tf_matrix[doc_id] = tf_vector


## 1-4-a:
After creating the TF vectors, we evaluate them using the provided relevance judgments. To do so, we:

        1: Create Term Frequency vectors for queries

        2: Calculate the similarity between each query and document

        3: Find the 5 most similar documents
        
        4: Using the relevance judgments and the 5 most similar documents, we calculate the MP@5 evaluation metric

This method has the lowest accuracy compared with the other two, with an MP@5 of about 12%.

In [None]:
# Build the same kind of term-frequency vector for every query.
# A defaultdict without a factory behaves like a plain dict.
query_embeddings = defaultdict()

for query_key, query_text in queries.items():
    query_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(query_text))

    # How often each token id occurs in this query
    id_counts = Counter(query_token_ids)

    tf_vector = [0] * len(vocab)
    for token_id, count in id_counts.items():
        # Map the id back to its token, then to the token's slot in the vector
        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        idx = token_to_index.get(token)
        if idx is not None:
            tf_vector[idx] = count

    query_embeddings[query_key] = tf_vector




In [None]:
# Rank all documents for every query by cosine similarity of their TF vectors.
# cosins[i] is a dict {doc_id: similarity} for the i-th query, ordered from the most
# to the least similar document.

# Convert each document vector to an array and compute its norm once, instead of
# repeating both for every (query, document) pair.
doc_arrays = {doc_id: np.asarray(doc_vector) for doc_id, doc_vector in tf_matrix.items()}
doc_norms = {doc_id: norm(doc_array) for doc_id, doc_array in doc_arrays.items()}

cosins = []
for quer_id, query_vector in query_embeddings.items():
    query_array = np.asarray(query_vector)
    query_norm = norm(query_array)

    similarity = defaultdict(float)
    for doc_id, doc_array in doc_arrays.items():
        denominator = query_norm * doc_norms[doc_id]
        # The cosine is undefined for an all-zero vector; score it 0.0 instead of
        # letting a NaN leak into the ranking.
        similarity[doc_id] = np.dot(query_array, doc_array) / denominator if denominator else 0.0

    # Stable sort, highest similarity first
    cosins.append(dict(sorted(similarity.items(), key=lambda item: item[1], reverse=True)))
    

In [None]:
# Mean precision over all queries, looking at the first 5 ranked documents of each:
# precision for one query = (relevant documents among its first 5) / 5.
precision_sum = 0
for query_number in range(1, len(queries) + 1):
    ranking = iter(cosins[query_number - 1].items())
    hits = 0
    for _ in range(5):
        ranked_doc_id, _score = next(ranking)
        if f"{ranked_doc_id}" in relevance_judgments[query_number]:
            hits += 1
    precision_sum += hits / 5

# Average over the queries and express it as a percentage
mp_at_k = precision_sum / len(queries)
mp_at_k *= 100
print(f"MP@K criteria for TF embedding: {mp_at_k}%")

MP@K criteria for TF embedding: 12.088888888888874%


## *بخش دوم*

After the TF embeddings, we create TF-IDF embedding vectors.
The main difference lies in their objectives: relying on raw frequencies alone (the objective of TF) is not very accurate, because frequent tokens such as '.' receive the most weight even though they carry little information.

The TF-IDF method uses the Inverse Document Frequency, which reduces the weight of such tokens according to how many of the documents they appear in.
(For convenience and to reduce computation, all document tokens are stored in all_tokens. I have also avoided building a sparse matrix and keep only the non-zero tokens with their values in a dictionary; this reduces the amount of processing significantly.)

In [None]:
# Calculate TF for each term in each document
tfs = {}
for doc_id, tokens in all_tokens.items():
    term_counts = Counter(tokens)
    doc_len = doc_lengths[doc_id]
    tfs[doc_id] = {term: count / doc_len for term, count in term_counts.items()}
    tfs[doc_id] = dict(sorted(tfs[doc_id].items(), key=lambda item:item[1], reverse=True))
    

In [None]:
# Document frequency: number of documents in which each term occurs at least once
dfs = defaultdict(int)
for tokens in all_tokens.values():
    for term in set(tokens):
        dfs[term] += 1

# Inverse document frequency: log(N / df) + 1, with N the number of documents
docs_num = len(documents)
idfs = {term: math.log(docs_num/df)+1 for term, df in dfs.items()}

# TF-IDF weight of every term of every document, each inner dict ordered from the
# highest to the lowest weight.
# A plain dict is used on purpose: the previous defaultdict(int) would silently
# insert the integer 0 for an unknown doc_id although every value here is a dict.
tf_idfs = {}
for doc_id, tf_values in tfs.items():
    weights = {term: tf * idfs[term] for term, tf in tf_values.items()}
    tf_idfs[doc_id] = dict(sorted(weights.items(), key=lambda item: item[1], reverse=True))


## 1-4-b:
In this section we evaluate the MP@5 metric for the TF-IDF embedding vectors. This method is more accurate than the other two, with an MP@5 of about 25%.

In [None]:
# Relative term frequency per query (count of the term / number of query tokens),
# each inner dict ordered from the most to the least frequent term.
# Plain dicts are used on purpose: the previous defaultdict(int) containers would
# silently insert the integer 0 for an unknown query id although every value is a dict.
query_tfs = {}

for quer_id, text in queries.items():
    tokens = tokenizer.tokenize(text)
    term_counts = Counter(tokens)
    query_len = len(tokens)
    relative = {term: count / query_len for term, count in term_counts.items()}
    query_tfs[quer_id] = dict(sorted(relative.items(), key=lambda item: item[1], reverse=True))

# TF-IDF weight of every query term. Terms without an IDF value (never seen in any
# document) are dropped, so a query's dict can end up empty.
query_tf_idfs = {}
for quer_id, tf_values in query_tfs.items():
    weights = {term: tf * idfs[term] for term, tf in tf_values.items() if term in idfs}
    query_tf_idfs[quer_id] = dict(sorted(weights.items(), key=lambda item: item[1], reverse=True))
    

In [None]:
# Rank all documents for every query by cosine similarity of their sparse TF-IDF
# vectors (dicts term -> weight). cosins[i] is a dict {doc_id: similarity} for the
# i-th query, ordered from the most to the least similar document.

# Each document norm is computed once instead of once per (query, document) pair.
doc_norms = {doc_id: norm(list(doc_tf_idf.values())) for doc_id, doc_tf_idf in tf_idfs.items()}

cosins = []
for quer_id, query_tf_idf in query_tf_idfs.items():
    # The query norm does not depend on the document, so it is hoisted out of the loop.
    query_norm = norm(list(query_tf_idf.values()))

    similarity = defaultdict(float)
    for doc_id, doc_tf_idf in tf_idfs.items():
        # Only terms present in both vectors contribute to the dot product
        dot_product = 0
        for token in query_tf_idf:
            if token in doc_tf_idf:
                dot_product += query_tf_idf[token] * doc_tf_idf[token]

        denominator = query_norm * doc_norms[doc_id]
        # An empty query or document vector has norm 0; score it 0.0 instead of
        # dividing by zero and ranking on NaN.
        similarity[doc_id] = dot_product / denominator if denominator else 0.0

    # Stable sort, highest similarity first
    cosins.append(dict(sorted(similarity.items(), key=lambda item: item[1], reverse=True)))


In [None]:
# Mean precision over all queries, looking at the first 5 ranked documents of each:
# precision for one query = (relevant documents among its first 5) / 5.
precision_sum = 0
for query_number in range(1, len(queries) + 1):
    ranking = iter(cosins[query_number - 1].items())
    hits = 0
    for _ in range(5):
        ranked_doc_id, _score = next(ranking)
        if f"{ranked_doc_id}" in relevance_judgments[query_number]:
            hits += 1
    precision_sum += hits / 5

# Average over the queries and express it as a percentage
mp_at_k = precision_sum / len(queries)
mp_at_k *= 100
print(f"MP@5 criteria for TF-IDF embedding: {mp_at_k}%")


MP@5 criteria for TF-IDF embedding: 25.422222222222242%


## *بخش سوم*

### الف

In this section we create the PPMI Embedding vectors. PPMI method emphasizes meaningful relationships between words by assigning higher scores to pairs that appear together more often than expected by chance.

In [None]:
vocab_size = len(tokenizer.vocab)
context_window = 5  # number of neighbours considered on each side of a token

# term_freq[id]: occurrences of the token id over all documents
# co_occurrence[a][b]: how often b appears within context_window positions of a
term_freq = defaultdict(int)
co_occurrence = defaultdict(lambda: defaultdict(int))
total_tokens_num = 0

for doc_id, text in documents.items():
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    tokens_num = len(token_ids)
    total_tokens_num += tokens_num

    for position, center_id in enumerate(token_ids):
        term_freq[center_id] += 1

        # Window clipped to the document boundaries, without the centre token itself
        left = max(position - context_window, 0)
        right = min(position + context_window + 1, tokens_num)
        neighbour_ids = token_ids[left:position] + token_ids[position + 1:right]

        for neighbour_id in neighbour_ids:
            co_occurrence[center_id][neighbour_id] += 1
                

In [None]:
# ppmis[x][y]: positive pointwise mutual information of the token ids x and y,
# max(log2(p(x, y) / (p(x) * p(y))), 0), with every probability estimated as a
# count divided by total_tokens_num.
ppmis = defaultdict(lambda: defaultdict(float))

for term_x, neighbour_counts in co_occurrence.items():
    p_x = term_freq[term_x] / total_tokens_num

    for term_y, pair_count in neighbour_counts.items():
        p_xy = pair_count / total_tokens_num
        p_y = term_freq[term_y] / total_tokens_num

        # PMI of the pair
        if p_xy > 0:
            pmi = np.log2(p_xy / (p_x * p_y))
        else:
            pmi = 0

        # PPMI: negative PMI values are clipped to zero
        ppmis[term_x][term_y] = max(pmi, 0)
        

### ب

After creating the embedding vectors for each token, we now calculate the document and query embeddings by averaging the embedding vectors of all tokens in that document or query.

In [None]:
# Build Document Embeddings using PPMI values
doc_embeddings = {}

for doc_id, text in documents.items():
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)  

    doc_vector = np.zeros(vocab_size)

    for token_id in token_ids:
        ppmi_values = np.array([ppmis[token_id].get(other_id, 0) for other_id in ppmis[token_id]])
        doc_vector[token_id] = np.mean(ppmi_values)

    doc_embeddings[doc_id] = doc_vector
    

In [None]:
# Build query embeddings from the PPMI values: the slot of every token id that
# occurs in the query holds that token's mean PPMI over its co-occurring terms;
# all other slots stay 0.
query_embeddings = {}

# token id -> mean PPMI; computed once per token instead of once per occurrence
query_token_mean_ppmi = {}

for query_id, text in queries.items():
    query_tokens = tokenizer.tokenize(text)
    query_token_ids = tokenizer.convert_tokens_to_ids(query_tokens)  # Convert to token IDs

    # Initialize a zero vector for the query with the size of the vocabulary
    query_vector = np.zeros(vocab_size)

    for token_id in set(query_token_ids):
        if token_id not in query_token_mean_ppmi:
            # .get() avoids inserting an empty entry into the ppmis defaultdict
            neighbour_scores = ppmis.get(token_id)
            # A query token with no recorded co-occurrence has nothing to average:
            # np.mean of an empty array is NaN (with a RuntimeWarning), and a single
            # NaN slot turns every cosine similarity of the query into NaN.
            query_token_mean_ppmi[token_id] = (
                np.mean(np.array(list(neighbour_scores.values()))) if neighbour_scores else 0.0
            )
        query_vector[token_id] = query_token_mean_ppmi[token_id]

    query_embeddings[query_id] = query_vector


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## 1-4-c:
In this section we calculate the MP@5 metric for the PPMI method. This method reaches an MP@5 of around 16%.

In [None]:
# Rank all documents for every query by cosine similarity of their PPMI embeddings.
# cosins[i] is a dict {doc_id: similarity} for the i-th query, ordered from the most
# to the least similar document.

# Each document norm is computed once instead of once per (query, document) pair.
doc_norms = {doc_id: norm(doc_vector) for doc_id, doc_vector in doc_embeddings.items()}

cosins = []
for quer_id, query_vector in query_embeddings.items():
    query_norm = norm(query_vector)

    similarity = defaultdict(float)
    for doc_id, doc_vector in doc_embeddings.items():
        denominator = query_norm * doc_norms[doc_id]
        # The cosine is undefined for an all-zero vector; score it 0.0 instead of
        # letting a NaN leak into the ranking.
        similarity[doc_id] = np.dot(query_vector, doc_vector) / denominator if denominator else 0.0

    # Stable sort, highest similarity first
    cosins.append(dict(sorted(similarity.items(), key=lambda item: item[1], reverse=True)))

In [None]:
# Mean precision over all queries, looking at the first 5 ranked documents of each:
# precision for one query = (relevant documents among its first 5) / 5.
precision_sum = 0
for query_number in range(1, len(queries) + 1):
    ranking = iter(cosins[query_number - 1].items())
    hits = 0
    for _ in range(5):
        ranked_doc_id, _score = next(ranking)
        if f"{ranked_doc_id}" in relevance_judgments[query_number]:
            hits += 1
    precision_sum += hits / 5

# Average over the queries and express it as a percentage
mp_at_k = precision_sum / len(queries)
mp_at_k *= 100
print(f"MP@5 criteria for PPMI embedding: {mp_at_k}%")

MP@5 criteria for PPMI embedding: 25.422222222222242%


## *بخش چهارم*

### الف

1-4-a has been provided after بخش اول

### ب

1-4-b has been provided after بخش دوم

### ج

1-4-c has been provided after بخش سوم