## Smart using Rocchio's algorithm

#### Imports

In [6]:
import numpy as np
import nltk
import string
import math
import json
import ast
import pandas as pd
from gensim.models import Word2Vec
from nltk.stem.snowball import FrenchStemmer, GermanStemmer, ItalianStemmer, SpanishStemmer, EnglishStemmer, ArabicStemmer
from kiwipiepy import Kiwi
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from spacy.lang.ko.stop_words import STOP_WORDS as ko_stop
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liamg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liamg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Access corpus and queries of respective languages when already tokenized and preprocessed previously

In [3]:
# Language code selecting which pair of pre-processed CSV files is loaded.
lang = 'fr'
# The two reads are independent; each file name embeds the language code.
corpus_df = pd.read_csv(f'Data_query_exp/{lang}_corpus.csv')
queries_df = pd.read_csv(f'Data_query_exp/{lang}_query.csv')

If the corpus and queries have not been tokenized and saved yet, run the cells below instead to load the raw text and tokenize it:

In [4]:
# Language code used to filter both the corpus and the development queries.
lang = 'fr'
with open("Data/corpus.json/corpus.json", 'r', encoding='utf-8') as f:
    corpus_data = json.load(f)

corpus_df = pd.DataFrame(corpus_data)
queries_df = pd.read_csv('Data/dev.csv')
# Keep only the rows of the selected language. .copy() makes each result an
# independent frame, so adding the token columns later does not raise pandas'
# SettingWithCopyWarning.
corpus_df = corpus_df[corpus_df['lang'] == lang].copy()
queries_df = queries_df[queries_df['lang'] == lang].copy()

In [None]:
# Maps each supported language code to (stemmer/analyzer class, NLTK stop-word
# corpus name). Korean has no NLTK stop-word list, so its corpus name is None
# and the spaCy list imported as `ko_stop` is used instead.
_LANGUAGE_RESOURCES = {
    'en': (EnglishStemmer, "english"),
    'fr': (FrenchStemmer, "french"),
    'de': (GermanStemmer, "german"),
    'it': (ItalianStemmer, "italian"),
    'es': (SpanishStemmer, "spanish"),
    'ar': (ArabicStemmer, "arabic"),
    'ko': (Kiwi, None),
}

# Fail loudly on an unknown code instead of silently keeping a stemmer and
# stop-word list left over from a previous run of this cell.
if lang not in _LANGUAGE_RESOURCES:
    raise ValueError(f"Unsupported language code: {lang!r}")

_stemmer_class, _stopword_corpus = _LANGUAGE_RESOURCES[lang]
stemmer = _stemmer_class()

# `stopwords` is rebound to a plain list of words below, which shadows the
# corpus reader imported at the top of the notebook. The reader is therefore
# reached through `nltk.corpus` so that this cell can be re-run safely.
if _stopword_corpus is None:
    stopwords = list(ko_stop)
else:
    stopwords = nltk.corpus.stopwords.words(_stopword_corpus)

def tokenize(text):
    """Split a raw text into normalized tokens for the language in `lang`.

    Every character of ``string.punctuation`` is first replaced by a space.
    For all languages except Korean the text is split with ``word_tokenize``,
    lower-cased, filtered against the stop-word list and stemmed with the
    global ``stemmer``. For Korean the first analysis returned by
    ``stemmer.analyze`` is used and only morphemes whose tag starts with
    ``'N'`` (presumably noun tags - verify against the Kiwi tag set) and whose
    form is not a stop word are kept.

    Parameters
    ----------
    text : str
        Raw document or query text. ``None`` and NaN yield an empty list;
        any other non-string value is converted with ``str``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Missing values (e.g. NaN read from a CSV) carry no tokens.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = str(text).translate(punctuation_to_space)

    # A set gives constant-time membership tests; the global list would be
    # scanned once per token otherwise.
    stop_set = set(stopwords)

    if lang != 'ko':
        lowered_words = (word.lower() for word in word_tokenize(text))
        return [stemmer.stem(word) for word in lowered_words if word not in stop_set]

    # analyze() returns candidate analyses; [0][0] is the token list of the first one.
    morphemes = stemmer.analyze(text)[0][0]
    return [
        morpheme.form
        for morpheme in morphemes
        if morpheme.tag.startswith('N') and morpheme.form not in stop_set
    ]

# Add the token columns through .assign(), which returns new frames. Writing a
# column into a frame obtained by boolean filtering raises pandas'
# SettingWithCopyWarning; building a new frame avoids that.
corpus_df = corpus_df.assign(text_token=corpus_df['text'].apply(tokenize))
queries_df = queries_df.assign(query_token=queries_df['query'].apply(tokenize))

#### Vectorize the documents and the queries

In [None]:
def _as_text(tokens):
    """Return `tokens` unchanged if it is a string, otherwise join its items with spaces."""
    return tokens if isinstance(tokens, str) else ' '.join(tokens)

# CountVectorizer's default analyzer expects each document to be a string and
# calls .lower() on it, so the token lists produced by tokenize() must be joined
# first. Token columns read back from CSV are already strings and pass through
# untouched.
vectorizer = CountVectorizer(max_features=7500)
corpus_data_matrix = vectorizer.fit_transform(corpus_df['text_token'].map(_as_text))
# The queries are projected onto the vocabulary learned from the corpus.
query_matrix = vectorizer.transform(queries_df['query_token'].map(_as_text))

#### Calculate term frequency and inverse document frequency matrices

In [None]:
# The raw term counts are used directly as term-frequency weights.
tf_matrix = corpus_data_matrix
tf_query_matrix = query_matrix

# Per-term document frequency: how many rows of tf_matrix have a non-zero count.
doc_freq = (tf_matrix > 0).sum(axis=0)

# Smoothed inverse document frequency: log((N + 1) / (df + 1)) + 1.
numb_docs = tf_matrix.shape[0]
idf = 1 + np.log((1 + numb_docs) / (1 + doc_freq))

# Weight every document's term counts by the idf of the corresponding term.
tf_idf_matrix = tf_matrix.multiply(idf)

# Dense, column-labelled frames. Only the documents carry idf weights; the
# query frame holds the plain counts.
feature_names = vectorizer.get_feature_names_out()
tf_idf_df = pd.DataFrame(tf_idf_matrix.toarray(), columns=feature_names)
tf_query_df = pd.DataFrame(tf_query_matrix.toarray(), columns=feature_names)

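Normalize every row of the document and query matrices to unit length, so that the dot product of a query row and a document row equals their cosine similarity.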

In [None]:
# Apply sklearn's normalize() with its default settings to both frames, in the
# same order as before: documents first, then queries.
tf_idf_norm, tf_query_norm = (
    normalize(weights) for weights in (tf_idf_df, tf_query_df)
)

#### Rankings

In [None]:
def initial_rankings(document_matrix, query_matrix, corpus_data_df, k=10):
    """Rank the documents for every query by dot-product similarity.

    The similarity is the dot product of a query row with a document row, which
    is the cosine similarity when both matrices have unit-length rows.

    Parameters
    ----------
    document_matrix : 2-D array, shape (n_docs, n_features)
        Dense document vectors.
    query_matrix : 2-D array, shape (n_queries, n_features)
        Dense query vectors.
    corpus_data_df : pandas.DataFrame
        Frame with a 'docid' column whose row order matches `document_matrix`.
    k : int, default 10
        Number of top-scoring documents to return per query.

    Returns
    -------
    list of list of (doc_id, similarity)
        One list per query, ordered from the most to the least similar document.
    """
    # Similarity between each query (rows) and each document (columns).
    cosine_similarity_matrix = query_matrix @ document_matrix.T

    # Look the column up once instead of once per ranked document.
    doc_id_column = corpus_data_df['docid']

    rankings = []
    for query_scores in cosine_similarity_matrix:
        # Negate to sort in descending order; a stable sort makes the order of
        # equally scored documents deterministic (lowest row position first).
        top_indices = np.argsort(-query_scores, kind="stable")[:k]
        top_doc_ids = [doc_id_column.iloc[j] for j in top_indices]
        rankings.append(list(zip(top_doc_ids, query_scores[top_indices])))

    return rankings

#### Rocchio's algorithm

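This method of query expansion uses pseudo-relevance feedback: instead of asking a user for relevance judgements, the top-ranked documents of an initial retrieval are assumed to be relevant and the lowest-ranked ones among the retrieved set are assumed to be non-relevant. The query vector is then moved towards the centroid of the relevant documents and away from the centroid of the non-relevant ones, $\vec{q}_{new} = \alpha\,\vec{q} + \beta\,\frac{1}{|D_r|}\sum_{\vec{d} \in D_r}\vec{d} - \gamma\,\frac{1}{|D_{nr}|}\sum_{\vec{d} \in D_{nr}}\vec{d}$, and any negative weight in the result is set to zero.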

In [None]:
def expand_query(tf_idf_norm_mat, tf_query_norm_mat, corpus_data_df, alpha, beta, gamma,
                 k=10,p=5,q=5):
    """Expand every query with Rocchio's formula using pseudo-relevance feedback.

    For each query the `k` most similar documents (by dot product) are
    retrieved. The first `p` of them are treated as relevant and the last `q`
    as non-relevant. The new query is

        alpha * query + beta * mean(relevant) - gamma * mean(non_relevant)

    with every negative weight replaced by 0.

    Parameters
    ----------
    tf_idf_norm_mat : 2-D array, shape (n_docs, n_features)
        Dense document vectors.
    tf_query_norm_mat : 2-D array, shape (n_queries, n_features)
        Dense query vectors.
    corpus_data_df : pandas.DataFrame
        Not used; kept so existing callers keep working.
    alpha, beta, gamma : float
        Weights of the original query, the relevant centroid and the
        non-relevant centroid.
    k : int, default 10
        Number of top-ranked documents considered as feedback.
    p : int, default 5
        Number of documents at the top of the feedback set assumed relevant.
        With p <= 0 the relevant term is dropped.
    q : int, default 5
        Number of documents at the bottom of the feedback set assumed
        non-relevant. With q <= 0 the non-relevant term is dropped.

    Returns
    -------
    list of list of float
        One expanded (not re-normalized) query vector per input query.
    """
    doc_vectors = np.asarray(tf_idf_norm_mat, dtype=float)
    query_vectors = np.asarray(tf_query_norm_mat, dtype=float)

    expanded_query_matrix = []
    if query_vectors.size == 0:
        return expanded_query_matrix

    cos_sim_mat = query_vectors @ doc_vectors.T

    for idx, query_vector in enumerate(query_vectors):
        # Feedback set: the k best documents, best first. A stable sort keeps
        # the choice deterministic when scores tie.
        top_k_indices = np.argsort(-cos_sim_mat[idx], kind="stable")[:k]

        # Guard the p/q <= 0 cases explicitly: a plain [-0:] slice would select
        # the whole feedback set instead of nothing.
        relevant_indices = top_k_indices[:p] if p > 0 else top_k_indices[:0]
        non_relevant_indices = top_k_indices[-q:] if q > 0 else top_k_indices[:0]

        modified_query_vector = alpha * query_vector
        if relevant_indices.size:
            modified_query_vector = (
                modified_query_vector + beta * doc_vectors[relevant_indices].mean(axis=0)
            )
        if non_relevant_indices.size:
            modified_query_vector = (
                modified_query_vector - gamma * doc_vectors[non_relevant_indices].mean(axis=0)
            )

        # Term weights must not be negative; anything that is not > 0 becomes 0.
        modified_query_vector = np.where(modified_query_vector > 0, modified_query_vector, 0.0)

        expanded_query_matrix.append(modified_query_vector.tolist())

    return expanded_query_matrix

#### Ranking the documents and evaluation

The cell below gets the initial rankings of cosine similarity between queries and documents prior to query expansion

In [None]:
# Baseline ranking: score the documents against the unexpanded queries.
top_10_results = initial_rankings(tf_idf_norm, tf_query_norm, corpus_df)

# Report each query's ranked documents; the i-th result list belongs to the
# i-th row of queries_df.
query_ids = queries_df['query_id']
for position, ranked_docs in enumerate(top_10_results):
    print(f"Query {query_ids.iloc[position]}:")
    for ranked_doc_id, score in ranked_docs:
        print(f"  Document ID: {ranked_doc_id}, Cosine Similarity: {score:.4f}")

Query expansion

In [None]:
# Rocchio weights: alpha scales the original query, beta the centroid of the
# assumed-relevant documents, gamma the centroid of the assumed non-relevant ones.
modified_query = expand_query(
    tf_idf_norm,
    tf_query_norm,
    corpus_df,
    alpha=1,
    beta=0.75,
    gamma=0.15,
)
# Turn the expanded vectors into a 2-D array and normalize its rows again so
# they can be scored the same way as the original queries.
modified_query_matrix = np.array(modified_query)
mod_query_matrix_norm = normalize(modified_query_matrix)

Re-ranking the documents after the query expansion

In [None]:
# Rank the documents again, this time against the expanded, re-normalized queries.
new_top_10_doc_ids = initial_rankings(tf_idf_norm, mod_query_matrix_norm, corpus_df)

# Each entry holds (document id, similarity) pairs for the query at the same
# position in queries_df.
query_ids = queries_df['query_id']
for position, ranked_docs in enumerate(new_top_10_doc_ids):
    print(f"Query {query_ids.iloc[position]}:")
    for ranked_doc_id, score in ranked_docs:
        print(f"  Document ID: {ranked_doc_id}, Cosine Similarity: {score:.4f}")

The cell below computes the accuracy: the fraction of queries for which the actual positive document appears among the top 10 ranked documents.

In [None]:
# NOTE(review): the similarities below are computed from `tf_query_norm`, i.e.
# the queries *before* Rocchio expansion, so this accuracy is the baseline.
# Use `mod_query_matrix_norm` here to evaluate the expanded queries - confirm
# which one is intended.
cosine_similarity_matrix2 = tf_query_norm @ tf_idf_norm.T

# Look the columns up once instead of on every loop iteration.
docid_column = corpus_df['docid']
query_id_column = queries_df['query_id']
positive_column = queries_df['positive_docs']

correct_count = 0  # number of queries whose positive document is in the top 10
for i in range(cosine_similarity_matrix2.shape[0]):
    top_10_indices = np.argsort(-cosine_similarity_matrix2[i])[:10]  # best 10 document positions
    top_10_doc_ids = [docid_column.iloc[j] for j in top_10_indices]  # their document ids
    positive_doc = positive_column.iloc[i]
    print(f"Query {query_id_column.iloc[i]}: {positive_doc}")
    print("Top 10 Document IDs:", top_10_doc_ids)

    # A query counts as correct when its positive document id is among the top 10.
    if positive_doc in top_10_doc_ids:
        correct_count += 1

# Fraction of correct queries; an empty query set yields 0.0 instead of a
# ZeroDivisionError.
num_queries = len(positive_column)
accuracy = correct_count / num_queries if num_queries else 0.0
print()
print()
print(f"Accuracy: {accuracy:.4f}")