# 1. Building the TREC collection

## Importing the tokenizer with its different capabilities

In [1]:
from collections import Counter
from nltk.corpus import stopwords


def std_tokenizer_build_standard_vocabulary(queries,documents,min_occ = 2,limit_docs = None, limit_queries = None):
    """Function that builds a two-way vocabulary from a dataframe of queries and a dataframe of documents.

    The text of a row is its first column; it is split on single spaces and every token is counted
    (documents first, then queries). Tokens seen at least `min_occ` times receive an integer id that
    follows decreasing frequency, and the returned dict maps id -> token as well as token -> id.

    `limit_docs` / `limit_queries` cap the number of rows read from each dataframe: None (or a negative
    value) means no limit and 0 means that no row is read at all."""

    def limited_rows(frame, limit):
        # Check the limit *before* consuming a row so that a limit of 0 really reads nothing.
        for position, (_, row) in enumerate(frame.iterrows()):
            if limit is not None and 0 <= limit <= position:
                break
            yield row

    vocabulary = Counter()
    for frame, limit in ((documents, limit_docs), (queries, limit_queries)):
        for row in limited_rows(frame, limit):
            # .iloc[0] is explicit positional access; row[0] on a label-indexed Series is deprecated.
            vocabulary.update(row.iloc[0].split(" "))

    # most_common() is sorted by decreasing count, so the ids that pass the filter are contiguous.
    id_to_token = {i: token for i, (token, occurrences) in enumerate(vocabulary.most_common())
                   if occurrences >= min_occ}

    # Same layout as before: all id -> token entries first, then the token -> id entries.
    two_way = dict(id_to_token)
    for token_id, token in id_to_token.items():
        two_way[token] = token_id

    return two_way


def std_tokenizer_index(pdDataFrame,vocabulary,stemmer=None):
    """Function that indexes a dataframe (documents or queries for example) according to a vocabulary.

    The text of a row is its first column. Every space-separated word is lower-cased, stemmed with
    `stemmer.stem` when a stemmer is given, and replaced by `vocabulary[token]`; tokens that are not
    in the vocabulary are dropped.

    Returns `(index, indexed_elements)` where `indexed_elements[i]` is the list of token ids of the
    i-th row and `index` maps both str(row label) -> i and i -> str(row label)."""
    if stemmer is None:
        normalise = str.lower
    else:
        def normalise(word):
            return stemmer.stem(word.lower())

    indexed_elements = []
    index = dict()
    for position, (key, element) in enumerate(pdDataFrame.iterrows()):
        token_ids = []
        # .iloc[0] is explicit positional access; element[0] on a label-indexed Series is deprecated.
        for word in element.iloc[0].split(" "):
            token = normalise(word)  # normalised (and stemmed) once, reused for the test and the lookup
            if token in vocabulary:
                token_ids.append(vocabulary[token])
        indexed_elements.append(token_ids)
        label = str(key)
        index[label] = position
        index[position] = label

    return index,indexed_elements



def std_tokenizer_index_dict(pdDataFrame,vocabulary):
    """Function that indexes a dict of texts (documents or queries for example) according to a vocabulary.

    Each value is split on single spaces and the words found in `vocabulary` are replaced by their
    vocabulary entry, the others are dropped. Returns `(index, indexed_elements)`: the list of indexed
    texts in iteration order, and a dict mapping str(key) -> position and position -> str(key)."""
    index = {}
    indexed_elements = []
    for position, (key, text) in enumerate(pdDataFrame.items()):
        token_ids = []
        for word in text.split(" "):
            if word in vocabulary:
                token_ids.append(vocabulary[word])
        indexed_elements.append(token_ids)

        label = str(key)
        index[label] = position
        index[position] = label
    return index,indexed_elements


def std_tokenizer_preprocess(queries,documents,min_occ = 5):
    """Function that preprocesses queries and documents in one go.

    The vocabulary is built from both dataframes with `std_tokenizer_build_standard_vocabulary`
    (keeping tokens seen at least `min_occ` times), then the documents and the queries are indexed
    with `std_tokenizer_index` (no stemmer).

    Returns the tuple (vocabulary, query_index, indexed_queries, doc_index, indexed_docs)."""
    vocab = std_tokenizer_build_standard_vocabulary(queries, documents, min_occ=min_occ)

    # Documents are indexed before the queries, both against the same vocabulary.
    docs_result = std_tokenizer_index(documents, vocab)
    queries_result = std_tokenizer_index(queries, vocab)

    return (vocab, *queries_result, *docs_result)
    

## Loading read_qrels and read_trec_train_qrels from utils

In [2]:
import os
import json
import pickle
import random
import collections
import numpy as np
import pytrec_eval
from collections import Counter

def read_qrels(path):
    """Function that reads a qrels file and returns the list of [query id, document id] pairs.

    Every non-blank line is split on whitespace; the first field is taken as the query id and the
    third one as the document id. The relevance label is not looked at, so every judged pair is
    returned. Blank lines are skipped; a line with fewer than three fields raises IndexError."""
    qrels = []
    with open(path,'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (typically a trailing one at the end of the file): nothing to read.
                continue
            qrels.append([fields[0],fields[2]])
    return qrels

def read_trec_train_qrels(path):
    """Function that reads a qrels file and separates positive from negative judgments.

    Every non-blank line is split on whitespace into (query id, unused, document id, label).
    Returns a dict with
      'pos': list of [query id, document id] for the lines whose label is exactly '1',
      'neg': dict mapping a query id to the list of document ids with any other label.
    Blank lines are skipped; a line with fewer than four fields raises IndexError."""
    pos_qrels = []
    neg_qrels = dict()
    with open(path,'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (typically a trailing one at the end of the file): nothing to read.
                continue
            query_id, doc_id, label = fields[0], fields[2], fields[3]
            # NOTE(review): only the label '1' counts as positive, so a graded label such as '2'
            # ends up with the negatives -- confirm this is intended for the collections in use.
            if label == '1':
                pos_qrels.append([query_id,doc_id])
            else:
                neg_qrels.setdefault(query_id, []).append(doc_id)
    return {'pos':pos_qrels,'neg':neg_qrels}


## The Trec Collections class and its methods
This section defines the `TrecCollection` class, which provides the methods used to load, preprocess and index the TREC collections.

In [1]:
import os
import pickle
import random
import string
import fasttext
import numpy as np
import pytrec_eval
import pandas as pd
from nltk.stem import snowball
from collections import Counter
from nltk.corpus import stopwords


def build_folds(queries_ids,k=5):
    """Builds folds for the K-fold cross validation.

    The query ids are shuffled (on a copy, the list given by the caller is left untouched) and split
    into `k` consecutive folds. Every query ends up in exactly one fold: when the number of queries is
    not a multiple of `k`, the first `len(queries_ids) % k` folds receive one extra query instead of
    the leftover queries being dropped.

    Returns a list of `k` lists of query ids. Raises ZeroDivisionError when k is 0."""
    shuffled = list(queries_ids)
    random.shuffle(shuffled)

    fold_size, leftover = divmod(len(shuffled), k)
    folds = []
    start = 0
    for i in range(k):
        end = start + fold_size + (1 if i < leftover else 0)
        folds.append(shuffled[start:end])
        start = end
    return folds



def read_queries(queries_path):
    """Function that reads the queries of a topics file and returns a dict {query id: query text}.

    Lines starting with "<num>" give the ids: the text after the first ':' (minus the last character
    of the line) is converted to int. Lines starting with "<title>" give the texts: the text that
    starts two characters after the last of the first ':' and the first '>' (minus the last character
    of the line), with the characters of `string.punctuation` removed.

    Ids and texts are paired in order of appearance."""
    without_punctuation = str.maketrans('', '', string.punctuation)
    ids = []
    texts = []
    with open(queries_path,'r') as topics_file:
        for raw_line in topics_file:
            if raw_line.startswith("<num>"):
                colon = raw_line.find(':')
                ids.append(int(raw_line[colon + 1:-1]))
            elif raw_line.startswith("<title>"):
                start = max(raw_line.find(':'), raw_line.find('>')) + 2
                texts.append(raw_line[start:-1].translate(without_punctuation))

    return {query_id: text for query_id, text in zip(ids, texts)}


def read_documents(documents_path):
    """Function that reads the documents of a TREC-style file and returns a dict {document id: text}.

    The file is read line by line (latin1):
      * a line containing "<DOCNO>" gives a document id: the text between "<DOCNO>" and "</DOCNO>",
        stripped of surrounding whitespace;
      * a line containing "</DOC>" closes the current document, also when it shares its line with
        "</TEXT>";
      * lines containing "<TEXT>" or "</TEXT>" are skipped, and text collection starts at the first
        "<TEXT>" of a document; every other line seen from there up to "</DOC>" is kept.
    A document that is not closed by "</DOC>" is dropped. The text of each document has its runs of
    whitespace collapsed to one space and the characters of `string.punctuation` removed.

    Ids and texts are paired in order of appearance."""
    open_tag, close_tag = "<DOCNO>", "</DOCNO>"
    without_punctuation = str.maketrans('', '', string.punctuation)

    doc_ids = []
    doc_text = []
    current_lines = []
    fill_text = False
    with open(documents_path,'r',encoding='latin1') as f:
        for line in f:
            if open_tag in line:
                # Slice between the tags: str.strip("<DOCNO> ") would strip a *set of characters*
                # and eat the ids that start or end with one of D, O, C or N.
                start = line.find(open_tag) + len(open_tag)
                end = line.find(close_tag, start)
                doc_ids.append((line[start:end] if end != -1 else line[start:]).strip())
            if "</DOC>" in line:
                # Tested before the <TEXT> tags so that "</TEXT></DOC>" on one line ends the document.
                doc_text.append("".join(current_lines))
                current_lines = []
                fill_text = False
                continue
            if "<TEXT>" in line:
                fill_text = True
                continue
            if "</TEXT>" in line:
                # fill_text is left on: the lines between </TEXT> and </DOC> are still collected.
                continue
            if fill_text:
                current_lines.append(line)

    cleaned = [" ".join(text.split()).translate(without_punctuation) for text in doc_text]

    return dict(zip(doc_ids, cleaned))


def save_qrel(path,qrels,subset):
    """Function that saves the query-document relevance judgments of a subset of queries into a file.

    `qrels` maps str(query id) -> {document id: relevance}. One line
    "<query>\\t0\\t<doc>\\t<relevance>" is written for every judgment of every query of `subset`.
    A query of `subset` that has no entry in `qrels` simply contributes no line."""
    with open(path,'w') as f:
        for query in subset:
            query_id = str(query)
            # .get(): a topic without any judgment must not abort the export halfway through.
            for doc,rel in qrels.get(query_id, {}).items():
                f.write(query_id + '\t0\t' + doc + '\t' + str(rel) + '\n')
                
                
def save_queries_csv(coll_path,queries,folds):
    """Function that saves the queries of each fold into `<coll_path>/fold<i>/queries.csv`.

    `queries` maps a query id to its text and `folds` is a list of lists of query ids. Each csv file
    has the query ids as index (named 'id_left') and one column 'text_left'. The fold directories
    must already exist."""
    for fold_number, fold_ids in enumerate(folds):
        ids = list(fold_ids)
        frame = pd.DataFrame(
            data={"text_left": [queries[query_id] for query_id in ids]},
            index=pd.Index(ids, name='id_left'),
        )
        frame.to_csv(coll_path + '/fold' + str(fold_number) + '/queries.csv')
        
    
def save_documents_csv(coll_path,documents):
    """Function that saves the documents into `<coll_path>/documents.csv`.

    `documents` maps a document id to its text. The csv file has the document ids as index
    (named 'id_right') and one column 'text_right'."""
    doc_ids = list(documents)
    frame = pd.DataFrame(
        data={"text_right": [documents[doc_id] for doc_id in doc_ids]},
        index=pd.Index(doc_ids, name='id_right'),
    )
    frame.to_csv(coll_path + '/documents.csv')

    
    
    
def read_collection(collection_paths = ('TREC/AP88-89','TREC/LA','TREC/FT91-94'),k=5):
    """Function that converts raw collections into the per-fold files used by TrecCollection.

    For every path of `collection_paths` (any iterable of directory paths) it:
      * reads the queries from `<path>/queries` and splits their ids into `k` folds,
      * parses `<path>/qrels` with pytrec_eval,
      * creates `<path>/fold<i>` when needed and writes the qrels of each fold to `<path>/fold<i>/qrels`,
      * writes the queries of each fold to `<path>/fold<i>/queries.csv`,
      * reads the documents from `<path>/documents.xml` and writes them to `<path>/documents.csv`."""
    for collection_path in collection_paths:

        queries = read_queries(collection_path + '/queries')

        folds = build_folds(list(queries.keys()),k=k)

        with open(collection_path + '/qrels', 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        for i,fold in enumerate(folds):
            fold_dir = collection_path + '/fold'  + str(i)
            # exist_ok avoids the separate "exists?" test and its check-then-create race.
            os.makedirs(fold_dir, exist_ok=True)
            save_qrel(fold_dir + '/qrels',qrel,fold)

        save_queries_csv(collection_path,queries,folds)

        documents = read_documents(collection_path + '/documents.xml')

        save_documents_csv(collection_path,documents)
    
class TrecCollection:
    """A TREC collection whose queries are split into `k` folds.

    Typical use: `load_collection`, then `standard_preprocess`, then `compute_info_retrieval`,
    optionally `compute_fasttext_embedding`, and finally `pickle_indexed_collection`.
    All the ids produced by the indexing are positions: a document is identified by its row number in
    documents.csv and a query by its position in the concatenation of the folds."""

    def __init__(self,k=5,language='english'):
        """Creates an empty collection expecting `k` folds of queries.

        NOTE(review): `language` is stored but the stemmer and the stop words are always the English
        ones -- confirm whether other languages are supposed to be supported."""
        self.documents = None
        self.k = k
        self.language = language
        self.stemmer = snowball.EnglishStemmer()
        self.stop_words = set(stopwords.words('english'))


    def load_collection(self,collection_path):
        """Function that loads the collection : it loads documents and the folds containing the queries per fold
        in Csv format,qrels per fold and the training qrels per fold . It is run after the function read_collection"""
        self.documents = pd.read_csv(collection_path + '/documents.csv',index_col='id_right',na_filter=False)

        self.folds_queries = []
        self.folds_qrels = []
        self.folds_training_qrels = []
        for i in range(self.k):
            fold_path = collection_path + '/fold' + str(i)
            self.folds_queries.append(pd.read_csv(fold_path + '/queries.csv',
                                                  index_col='id_left',
                                                  na_filter=False))
            # The same qrels file is read twice: once as plain pairs, once split into pos/neg.
            self.folds_qrels.append(read_qrels(fold_path + '/qrels'))
            self.folds_training_qrels.append(read_trec_train_qrels(fold_path + '/qrels'))


    def update_standard_vocabulary(self,sequences,remove_stopwords=True):
        """Function that updates the token counts of `self.vocabulary` using new sequences.

        The text of a row of `sequences` is its first column. Each space-separated word is
        lower-cased, skipped when it is a stop word (if `remove_stopwords`), then stemmed and counted."""
        for _,sequence in sequences.iterrows():
            # .iloc[0] is explicit positional access; sequence[0] on a label-indexed Series is deprecated.
            for word in sequence.iloc[0].split(" "):
                token = word.lower()
                if remove_stopwords and token in self.stop_words:
                    continue
                self.vocabulary[self.stemmer.stem(token)] += 1


    def build_standard_vocabulary(self,
                                  min_occ = 2,
                                  remove_stopwords = True):
        """Function that builds the two-way vocabulary from the documents and the queries of every fold.

        Stems seen at least `min_occ` times get an id starting at 1, by decreasing frequency;
        id 0 is reserved for '<PAD>'. `self.vocabulary` maps id -> stem and stem -> id."""
        self.vocabulary = Counter()

        self.update_standard_vocabulary(self.documents,remove_stopwords)

        for i in range(self.k):
            self.update_standard_vocabulary(self.folds_queries[i],remove_stopwords)

        # Consecutive spaces produce empty tokens; Counter's del does not raise when '' is absent.
        del self.vocabulary['']

        self.vocabulary = {i+1:elem[0] for i,elem in enumerate(self.vocabulary.most_common()) if elem[1] >= min_occ}

        for key in list(self.vocabulary):
            self.vocabulary[self.vocabulary[key]] = key

        self.vocabulary[0] = '<PAD>'
        self.vocabulary['<PAD>'] = 0


    def standard_preprocess(self,
                            remove_stopwords = True,
                            min_occ = 5):
        """General function that preprocesses the Trec collection by building vocabulary, using the tokenizer
        to index documents, the folds of queries and all the queries. It is run after the method
        load_collection"""
        self.build_standard_vocabulary(min_occ = min_occ,
                                       remove_stopwords = remove_stopwords)

        self.doc_index,self.indexed_docs = std_tokenizer_index(self.documents,
                                                               self.vocabulary,
                                                               self.stemmer)

        self.queries_index = []
        self.indexed_queries = []

        for i in range(self.k):
            queries_index,indexed_queries = std_tokenizer_index(self.folds_queries[i],
                                                                self.vocabulary,
                                                                self.stemmer)
            self.queries_index.append(queries_index)
            self.indexed_queries.append(indexed_queries)

        # Concatenation of the indexed queries of every fold, in fold order.
        self.all_indexed_queries = []
        for fold_queries in self.indexed_queries:
            self.all_indexed_queries += fold_queries

        # Two-way map between the position of a query in all_indexed_queries and its id (a string).
        self.all_queries_index = dict()
        counter = 0
        for fold_index in self.queries_index:
            # A fold index holds two entries per query (id -> position and position -> id).
            for j in range(int(len(fold_index)/2)):
                query_id = fold_index[j]
                self.all_queries_index[counter] = query_id
                self.all_queries_index[query_id] = counter
                counter +=1


    def build_inverted_index(self):
        """Function that builds the inverted index of documents: token id -> Counter {document position: count}"""
        self.inverted_index = dict()

        # The vocabulary is two-way; only its integer keys are token ids.
        for token in self.vocabulary:
            if isinstance(token, int):
                self.inverted_index[token] = Counter()

        for i,indexed_document in enumerate(self.indexed_docs):
            for token in indexed_document:
                self.inverted_index[token][i] += np.float32(1.0)


    def compute_idf(self):
        """Function that computes the idf of every term in the inverted index:
        log((nb_docs + 1) / (1 + number of documents containing the term))"""
        # doc_index is a two-way map (two entries per document), so its length is not the number
        # of documents; indexed_docs has exactly one entry per document.
        nb_docs = len(self.indexed_docs)
        self.idf = {token:np.log((nb_docs+1)/(1+len(self.inverted_index[token]))) for token in self.inverted_index }


    def compute_docs_length(self):
        """Function that computes the length (number of indexed tokens) of each document in the collection"""
        self.docs_length = {i:len(doc) for i,doc in enumerate(self.indexed_docs)}


    def compute_collection_frequencies(self):
        """Function that computes the frequency of each token in the collection: its number of
        occurrences divided by the total length of the documents. Needs compute_docs_length first."""
        coll_length = sum(self.docs_length.values())
        self.c_freq = {token:sum(postings.values())/coll_length
                       for token,postings in self.inverted_index.items()}

    def index_relations(self):
        """Function that translates the qrels of every fold from ids to positions.

        Pairs whose document is not in `self.doc_index` are dropped. For each fold it fills
          * `folds_training_indexed_qrels`: {'pos': [[query position, doc position], ...],
                                             'neg': {query id: [doc position, ...]}}
            (the negatives stay keyed by the query id read from the qrels file),
          * `folds_indexed_qrels`: [[query position, doc position], ...] for every judged pair."""
        self.folds_indexed_qrels = []
        self.folds_training_indexed_qrels = []

        for i in range(self.k):
            training_qrels = self.folds_training_qrels[i]

            training_indexed_qrels = dict()
            training_indexed_qrels['pos'] = [[self.all_queries_index[query_id], self.doc_index[doc_id]]
                                             for query_id,doc_id in training_qrels['pos']
                                             if doc_id in self.doc_index]
            training_indexed_qrels['neg'] = {query_id:[self.doc_index[doc_id]
                                                       for doc_id in doc_ids
                                                       if doc_id in self.doc_index]
                                             for query_id,doc_ids in training_qrels['neg'].items()}
            self.folds_training_indexed_qrels.append(training_indexed_qrels)

            indexed_qrels = [[self.all_queries_index[query_id], self.doc_index[doc_id]]
                             for query_id,doc_id in self.folds_qrels[i]
                             if doc_id in self.doc_index]
            self.folds_indexed_qrels.append(indexed_qrels)


    def compute_info_retrieval(self):
        """Function that builds the inverted index, the idf of the terms; documents length
        and frequencies of terms in the collection and indexes the relations"""
        self.build_inverted_index()
        self.compute_idf()
        self.compute_docs_length()
        self.compute_collection_frequencies()
        self.index_relations()


    def save_results(self,index_queries,results,path,top_k=1000):
        """Function that writes the `top_k` best documents of every query to `path`, one line
        "<query id> Q0 <doc id> <rank> <score> 0" per document, with ranks starting at 0.

        `results[q]` is the Counter {document position: score} of the q-th query and
        `index_queries[q]` gives its id. A document position missing from `self.doc_index` raises KeyError."""
        with open(path,'w') as f:
            for query,documents in enumerate(results):
                for rank,(doc,score) in enumerate(documents.most_common(top_k)):
                    f.write(index_queries[query] + ' Q0 ' + self.doc_index[doc] + ' ' + str(rank) + ' ' + str(score) + ' 0\n')


    def pickle_indexed_collection(self,path):
        """Function that writes the different indexed collection parts other than documents and fold queries
        into a pickle format. The raw documents and fold queries are removed from the object itself."""
        self.documents = None
        self.folds_queries = None
        with open(path,'wb') as f:
            pickle.dump(self,f)


    def compute_fasttext_embedding(self,model_path):
        """Function that computes the embedding matrix using the fasttext model stored at `model_path`:
        row i is the vector the model gives to the token of id i (the '<PAD>' entry at id 0 included)"""
        model = fasttext.load_model(model_path)
        dim = model.get_dimension()
        # The vocabulary is two-way, so it holds two entries per token.
        vocab_size = int(len(self.vocabulary)/2)
        self.embedding_matrix = np.zeros((vocab_size, dim))
        for token_id in range(vocab_size):
            self.embedding_matrix[token_id] = model[self.vocabulary[token_id]]


    def generate_training_batches(self,fold,batch_size=64):
        """Function that builds batches of queries and their corresponding negative and positive documents
        for training for a particular fold. These batches are picked from outside the fold we want to
        test or validate on.

        The positive pairs of the other folds are shuffled and cut into batches of exactly
        `batch_size` pairs (a trailing incomplete batch is dropped). For every query of a batch one
        negative document is drawn at random among the negatives of that query.

        Returns (query_batches, positive_doc_batches, negative_doc_batches), three lists of lists of
        positions. Raises ValueError when batch_size < 1, KeyError when a query has no negative entry
        and IndexError when its list of negatives is empty."""
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        positive_pairs = []
        negative_pairs = {}
        for i in range(self.k):
            if i != fold:
                positive_pairs += self.folds_training_indexed_qrels[i]['pos']
                negative_pairs.update(self.folds_training_indexed_qrels[i]['neg'])

        random.shuffle(positive_pairs)

        query_batches = []
        positive_doc_batches = []
        negative_doc_batches = []
        # The "+ 1" keeps the last batch when the number of pairs is an exact multiple of batch_size.
        for start in range(0, len(positive_pairs) - batch_size + 1, batch_size):
            batch = positive_pairs[start:start+batch_size]
            queries = [q for q,_ in batch]
            query_batches.append(queries)
            positive_doc_batches.append([d for _,d in batch])
            # Negatives are keyed by query id, hence the position -> id lookup.
            negative_doc_batches.append([random.choice(negative_pairs[self.all_queries_index[q]])
                                         for q in queries])
        return query_batches,positive_doc_batches,negative_doc_batches

## Building Trec Collection

In [4]:
trec_collections_path="/home/mrim/rezguiha/work/repro_chap7_res/TREC/"
fasttext_model_path="/home/mrim/rezguiha/work/repro_chap7_res/fastText/cc.en.300.bin"
# Uncomment the next line only if the collections have not been processed yet, i.e. if the folds
# (queries and qrels of each fold) and the csv version of the xml documents still have to be created.
# read_collection()

# Index every TREC collection and pickle the result next to its source files.
for collection in ["AP88-89","FT91-94","LA"]:
    # Named trec_collection (not Collection) so that it cannot clobber the Collection class
    # defined further down in this file.
    trec_collection = TrecCollection(k=5)
    trec_collection.load_collection(trec_collections_path+collection)
    trec_collection.standard_preprocess(remove_stopwords = True,
                                        min_occ = 5)
    trec_collection.compute_info_retrieval()
    trec_collection.compute_fasttext_embedding(fasttext_model_path)
    trec_collection.pickle_indexed_collection(trec_collections_path+collection + '/indexed_collection')
    print(collection, " finished")




FT91-94  finished




LA  finished


# 2. Building Wikir Collections

## Class Collection for Wikir

In [4]:
import pickle
import random
import fasttext
import numpy as np
import pandas as pd
from nltk.stem import snowball
from collections import Counter
from nltk.corpus import stopwords



class Collection:
    """A collection with a training / validation / test split of its queries (WikIR layout).

    Typical use: `load_collection`, then `standard_preprocess`, then `compute_info_retrieval`,
    optionally `compute_fasttext_embedding`, and finally `pickle_indexed_collection`.
    All the ids produced by the indexing are positions (row numbers in the csv files)."""

    def __init__(self,language='english'):
        """Creates an empty collection.

        NOTE(review): `language` is stored but the stemmer and the stop words are always the English
        ones -- confirm whether other languages are supposed to be supported."""
        self.documents = None
        self.training_queries = None
        self.validation_queries = None
        self.test_queries = None
        self.language = language
        self.stemmer = snowball.EnglishStemmer()
        self.stop_words = set(stopwords.words('english'))


    def load_collection(self,collection_path):
        """Function that loads an already processed collection and reads its csv files and its qrels"""
        self.documents = pd.read_csv(collection_path + '/documents.csv',index_col='id_right',na_filter=False)
        self.training_queries = pd.read_csv(collection_path + '/training/queries.csv',index_col='id_left',na_filter=False)
        self.validation_queries = pd.read_csv(collection_path + '/validation/queries.csv',index_col='id_left',na_filter=False)
        self.test_queries = pd.read_csv(collection_path + '/test/queries.csv',index_col='id_left',na_filter=False)

        self.training_relevance = read_qrels(collection_path + '/training/qrels')
        self.validation_relevance = read_qrels(collection_path + '/validation/qrels')
        self.test_relevance = read_qrels(collection_path + '/test/qrels')


    def save_xml(self,output_dir):
        """Function that saves the documents, the training queries ,validation queries and test queries
        into xml format (documents.xml, training_queries.xml, validation_queries.xml and
        test_queries.xml inside `output_dir`). The text of a row is its first column."""
        with open(output_dir + '/documents.xml','w') as f:
            for key,value in self.documents.iterrows():
                f.write('<DOC>\n<DOCNO>' + str(key) + '</DOCNO>\n<TEXT>\n' + value.iloc[0] + '\n</TEXT></DOC>\n')

        # The three query files share the same layout.
        for file_name,queries in (('/training_queries.xml',self.training_queries),
                                  ('/validation_queries.xml',self.validation_queries),
                                  ('/test_queries.xml',self.test_queries)):
            with open(output_dir + file_name,'w') as f:
                for key,value in queries.iterrows():
                    f.write('<top>\n<num>' + str(key) + '</num><title>\n' + value.iloc[0] + '\n</title>\n</top>\n')


    def update_standard_vocabulary(self,sequences,remove_stopwords=True):
        """Function that updates the token counts of `self.vocabulary` on the basis of new sequences.

        The text of a row of `sequences` is its first column. Each space-separated word is
        lower-cased, skipped when it is a stop word (if `remove_stopwords`), then stemmed and counted."""
        for _,sequence in sequences.iterrows():
            # .iloc[0] is explicit positional access; sequence[0] on a label-indexed Series is deprecated.
            for word in sequence.iloc[0].split(" "):
                token = word.lower()
                if remove_stopwords and token in self.stop_words:
                    continue
                self.vocabulary[self.stemmer.stem(token)] += 1


    def build_standard_vocabulary(self,
                                  min_occ = 2,
                                  remove_stopwords = True):
        """Function that builds the two-way vocabulary from documents and the different queries.

        Stems seen at least `min_occ` times get an id starting at 1, by decreasing frequency;
        id 0 is reserved for '<PAD>'. `self.vocabulary` maps id -> stem and stem -> id."""
        self.vocabulary = Counter()

        for sequences in (self.documents,
                          self.training_queries,
                          self.validation_queries,
                          self.test_queries):
            self.update_standard_vocabulary(sequences,remove_stopwords)

        # NOTE(review): unlike TrecCollection, the empty token '' (produced by consecutive spaces)
        # is not removed here and can therefore receive an id -- confirm this is intended.
        self.vocabulary = {i+1:elem[0] for i,elem in enumerate(self.vocabulary.most_common()) if elem[1] >= min_occ}

        for key in list(self.vocabulary):
            self.vocabulary[self.vocabulary[key]] = key

        self.vocabulary[0] = '<PAD>'
        self.vocabulary['<PAD>'] = 0


    def standard_preprocess(self,
                            remove_stopwords = True,
                            min_occ = 5):
        """Function that preprocesses the collection by building the vocabulary and indexing the documents
        and the different queries"""
        print('Build voc',flush=True)
        self.build_standard_vocabulary(min_occ = min_occ,
                                       remove_stopwords = remove_stopwords)

        print('Index documents',flush=True)
        self.doc_index,self.indexed_docs = std_tokenizer_index(self.documents,
                                                             self.vocabulary,
                                                             self.stemmer)

        print('Index queries',flush=True)
        self.training_queries_index,self.indexed_training_queries = std_tokenizer_index(self.training_queries,
                                                                                        self.vocabulary,
                                                                                        self.stemmer)

        self.validation_queries_index,self.indexed_validation_queries = std_tokenizer_index(self.validation_queries,
                                                                                            self.vocabulary,
                                                                                            self.stemmer)

        self.test_queries_index,self.indexed_test_queries = std_tokenizer_index(self.test_queries,
                                                                                self.vocabulary,
                                                                                self.stemmer)


    def build_inverted_index(self):
        """Function that builds the inverted index from the vocabulary and the indexed documents:
        token id -> Counter {document position: count}"""
        self.inverted_index = dict()

        # The vocabulary is two-way; only its integer keys are token ids.
        for token in self.vocabulary:
            if isinstance(token, int):
                self.inverted_index[token] = Counter()

        for i,indexed_document in enumerate(self.indexed_docs):
            for token in indexed_document:
                self.inverted_index[token][i] += np.float32(1.0)


    def compute_idf(self):
        """Function that computes the idf for every token:
        log((nb_docs + 1) / (1 + number of documents containing the token))"""
        # doc_index is a two-way map (two entries per document), so its length is not the number
        # of documents; indexed_docs has exactly one entry per document.
        nb_docs = len(self.indexed_docs)
        self.idf = {token:np.log((nb_docs+1)/(1+len(self.inverted_index[token]))) for token in self.inverted_index }


    def compute_docs_length(self):
        """Function that computes documents length (number of indexed tokens)"""
        self.docs_length = {i:len(doc) for i,doc in enumerate(self.indexed_docs)}


    def compute_collection_frequencies(self):
        """Function that computes the frequency of each token in the collection: its number of
        occurrences divided by the total length of the documents. Needs compute_docs_length first."""
        coll_length = sum(self.docs_length.values())
        self.c_freq = {token:sum(postings.values())/coll_length
                       for token,postings in self.inverted_index.items()}

    def index_relations(self):
        """Function that translates the training, validation and test qrels from ids to
        [query position, document position] pairs. An unknown query or document id raises KeyError."""
        self.training_indexed_relevance = [[self.training_queries_index[query_id],self.doc_index[doc_id]]
                                           for query_id,doc_id in self.training_relevance]

        self.validation_indexed_relevance = [[self.validation_queries_index[query_id],self.doc_index[doc_id]]
                                             for query_id,doc_id in self.validation_relevance]

        self.test_indexed_relevance = [[self.test_queries_index[query_id],self.doc_index[doc_id]]
                                       for query_id,doc_id in self.test_relevance]


    def compute_info_retrieval(self):
        """Function that builds inverted index , idf , document length, collection frequencies and indexed relations"""
        self.build_inverted_index()
        self.compute_idf()
        self.compute_docs_length()
        self.compute_collection_frequencies()
        self.index_relations()


    def save_results(self,index_queries,results,path,top_k=1000):
        """Function that saves the `top_k` best results of every query according to their score, one line
        "<query id> Q0 <doc id> <rank> <score> 0" per document, with ranks starting at 0.

        `results[q]` is the Counter {document position: score} of the q-th query and
        `index_queries[q]` gives its id. A document position missing from `self.doc_index` raises KeyError."""
        with open(path,'w') as f:
            for query,documents in enumerate(results):
                for rank,(doc,score) in enumerate(documents.most_common(top_k)):
                    f.write(index_queries[query] + ' Q0 ' + self.doc_index[doc] + ' ' + str(rank) + ' ' + str(score) + ' 0\n')


    def pickle_indexed_collection(self,path):
        """Function that saves the computed self elements into a pickle file. The raw documents and
        queries are removed from the object itself before it is written."""
        self.documents = None
        self.training_queries = None
        self.validation_queries = None
        self.test_queries = None
        with open(path,'wb') as f:
            pickle.dump(self,f)


    def compute_fasttext_embedding(self,model_path):
        """Function that computes the embedding matrix using the fasttext model stored at `model_path`:
        row i is the vector the model gives to the token of id i (the '<PAD>' entry at id 0 included)"""
        model = fasttext.load_model(model_path)
        dim = model.get_dimension()
        # The vocabulary is two-way, so it holds two entries per token.
        vocab_size = int(len(self.vocabulary)/2)
        self.embedding_matrix = np.zeros((vocab_size, dim))
        for token_id in range(vocab_size):
            self.embedding_matrix[token_id] = model[self.vocabulary[token_id]]


    def _generate_batches(self,pairs,batch_size):
        """Shuffles `pairs` in place and cuts them into batches of exactly `batch_size` pairs, each
        positive document being matched with a document position drawn uniformly at random."""
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        random.shuffle(pairs)
        nb_docs = len(self.indexed_docs)
        query_batches = []
        positive_doc_batches = []
        negative_doc_batches = []
        # The "+ 1" keeps the last batch when the number of pairs is an exact multiple of batch_size.
        for start in range(0, len(pairs) - batch_size + 1, batch_size):
            batch = pairs[start:start+batch_size]
            query_batches.append([q for q,_ in batch])
            positive_doc_batches.append([d for _,d in batch])
            # The negative is any document of the collection; it is not checked against the qrels.
            negative_doc_batches.append([random.randint(0, nb_docs-1) for _ in batch])
        return query_batches,positive_doc_batches,negative_doc_batches


    def generate_training_batches(self,batch_size=64):
        """Function that generates training batches.

        `self.training_indexed_relevance` is shuffled in place and cut into batches of exactly
        `batch_size` pairs (a trailing incomplete batch is dropped). Returns
        (query_batches, positive_doc_batches, negative_doc_batches) where the negative documents are
        random document positions. Raises ValueError when batch_size < 1."""
        return self._generate_batches(self.training_indexed_relevance,batch_size)


    def generate_test_batches(self,batch_size=64):
        """Function that generates test batches.

        `self.test_indexed_relevance` is shuffled in place and cut into batches of exactly
        `batch_size` pairs (a trailing incomplete batch is dropped). Returns
        (query_batches, positive_doc_batches, negative_doc_batches) where the negative documents are
        random document positions. Raises ValueError when batch_size < 1."""
        return self._generate_batches(self.test_indexed_relevance,batch_size)

In [5]:

    # Driver cell: index the WikIR collection and pickle the result.
    # coll_path is the processed collection to load, index_path the directory receiving the pickle.
    coll_path="/home/mrim/rezguiha/work/repro_chap7_res/wikIR_78"
    fasttext_model_path="/home/mrim/rezguiha/work/repro_chap7_res/fastText/cc.en.300.bin"
    index_path="/home/mrim/rezguiha/work/repro_chap7_res/enwikIR_indexed"
    print('Reading collection',flush=True)
    
    Collection_wikir = Collection('english')
    Collection_wikir.load_collection(coll_path)
    
    # Builds the vocabulary (stop words removed, stems seen at least 5 times) and indexes documents and queries.
    print('Standard Preprocess',flush=True)
    Collection_wikir.standard_preprocess(remove_stopwords = True,
                                   min_occ = 5)
    
    # Besides the inverted index this also computes idf, document lengths, collection frequencies
    # and the indexed relevance pairs.
    print('Compute inverted index',flush=True)
    Collection_wikir.compute_info_retrieval()
    
    Collection_wikir.compute_fasttext_embedding(fasttext_model_path)
    
    # pickle_indexed_collection drops the raw documents and queries from the object before writing it.
    Collection_wikir.pickle_indexed_collection(index_path+'/indexed_collection' )

Reading collection
Standard Preprocess
Build voc


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/mrim/rezguiha/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-5-363c95d3efe8>", line 10, in <module>
    Collection_wikir.standard_preprocess(remove_stopwords = True,
  File "<ipython-input-4-e7b98a2d87fb>", line 99, in standard_preprocess
    self.build_standard_vocabulary(min_occ = min_occ,
  File "<ipython-input-4-e7b98a2d87fb>", line 78, in build_standard_vocabulary
    self.update_standard_vocabulary(self.documents,remove_stopwords)
  File "<ipython-input-4-e7b98a2d87fb>", line 63, in update_standard_vocabulary
    self.vocabulary[self.stemmer.stem(temp)] += 1
  File "/home/mrim/rezguiha/anaconda3/lib/python3.8/site-packages/nltk/stem/snowball.py", line 1751, in stem
    if word.endswith(suffix):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call

TypeError: object of type 'NoneType' has no len()

# 3. Training models on TDV for the TREC collection

## Baseline models

In [None]:
import numpy as np
from collections import Counter

def simple_tf(indexed_queries,inverted_index):
    """Rank documents by the summed raw frequency of the query tokens.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                scores[doc_id] += term_freq
        if not scores:
            # Keep a dummy entry so every query has at least one result.
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]


def weighted_simple_tf(indexed_queries,inverted_index,weights):
    """Rank documents by term frequency scaled by a per-token weight.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        weights: per-token weight, indexed by token.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    all_scores = []
    for query_tokens in indexed_queries:
        scores = Counter()
        # Only tokens present in the index contribute to the score.
        for term in (t for t in query_tokens if t in inverted_index):
            for doc_id, term_freq in inverted_index[term].items():
                scores[doc_id] += weights[term]*term_freq
        if not scores:
            scores[-1] = 0
        all_scores.append(scores)
    return all_scores


def tf_idf(indexed_queries,inverted_index,idf):
    """Rank documents with the classic tf * idf score.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        idf: per-token inverse document frequency, indexed by token.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            posting = inverted_index[term]
            for doc_id, term_freq in posting.items():
                scores[doc_id] += term_freq*idf[term]
        if not scores:
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]

    
def dir_language_model(indexed_queries,inverted_index,docs_length,c_freq, mu = 2500):
    """Rank documents with a Dirichlet-smoothed language model.

    For every (query token, document) pair found in the index the score gains
    ``log(1 + freq / (mu * c_freq[token])) + log(mu / (docs_length[doc] + mu))``.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        docs_length: per-document length, indexed by document.
        c_freq: per-token collection frequency, indexed by token.
        mu: Dirichlet smoothing parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    all_scores = []
    for query_tokens in indexed_queries:
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                term_part = np.log(1 + (term_freq/(mu * c_freq[term])))
                length_part = np.log(mu/(docs_length[doc_id] + mu))
                scores[doc_id] += term_part + length_part
        if not scores:
            scores[-1] = 0
        all_scores.append(scores)
    return all_scores
    
    
def Okapi_BM25(indexed_queries,inverted_index,docs_length,idf, k1 = 1.2, b = 0.75):
    """Rank documents with Okapi BM25, averaging document length on the fly.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        docs_length: mapping document -> length; its mean is the BM25
            average document length.
        idf: per-token inverse document frequency, indexed by token.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    avg_docs_len = sum(docs_length.values())/len(docs_length)

    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                numerator = idf[term] * ((k1 + 1)*term_freq)
                denominator = term_freq + k1*((1-b) + b*docs_length[doc_id]/avg_docs_len)
                scores[doc_id] += numerator/denominator
        if not scores:
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]

    
def fast_Okapi_BM25(indexed_queries,inverted_index,docs_length,idf,avg_docs_len, k1 = 1.2, b = 0.75):
    """Rank documents with Okapi BM25 using a precomputed average length.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        docs_length: per-document length, indexed by document.
        idf: per-token inverse document frequency, indexed by token.
        avg_docs_len: average document length, supplied by the caller.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    all_scores = []
    for query_tokens in indexed_queries:
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                length_norm = (1-b) + b*docs_length[doc_id]/avg_docs_len
                scores[doc_id] += idf[term] * ((k1 + 1)*term_freq)/(term_freq + k1*length_norm)
        if not scores:
            scores[-1] = 0
        all_scores.append(scores)
    return all_scores


def weighted_tf_idf(indexed_queries,inverted_index,weights,idf):
    """Rank documents with tf * idf scaled by a per-token weight.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        weights: per-token weight, indexed by token.
        idf: per-token inverse document frequency, indexed by token.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                scores[doc_id] += weights[term]*term_freq*idf[term]
        if not scores:
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]


    
def weighted_dir_language_model(indexed_queries,inverted_index,weights,docs_length,c_freq, mu = 2500):
    """Rank documents with a Dirichlet language model scaled by token weights.

    Each (query token, document) contribution of the Dirichlet-smoothed score
    is multiplied by ``weights[token]`` before being accumulated.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        weights: per-token weight, indexed by token.
        docs_length: per-document length, indexed by document.
        c_freq: per-token collection frequency, indexed by token.
        mu: Dirichlet smoothing parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    all_scores = []
    for query_tokens in indexed_queries:
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                term_part = np.log(1 + (term_freq/(mu * c_freq[term])))
                length_part = np.log(mu/(docs_length[doc_id] + mu))
                scores[doc_id] += weights[term]*(term_part + length_part)
        if not scores:
            scores[-1] = 0
        all_scores.append(scores)
    return all_scores
    
    

def weighted_Okapi_BM25(indexed_queries,inverted_index,weights,docs_length,idf, k1 = 1.2, b = 0.75):
    """Rank documents with Okapi BM25 scaled by a per-token weight.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        weights: per-token weight, indexed by token.
        docs_length: mapping document -> length; its mean is the BM25
            average document length.
        idf: per-token inverse document frequency, indexed by token.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    avg_docs_len = sum(docs_length.values())/len(docs_length)

    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                numerator = weights[term]*idf[term]*((k1 + 1)*term_freq)
                denominator = term_freq + k1*((1-b) + b*docs_length[doc_id]/avg_docs_len)
                scores[doc_id] += numerator/denominator
        if not scores:
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]




def Lemur_tf_idf(indexed_queries,inverted_index,docs_length,idf,k1=1.2,b=0.75):
    """Rank documents with a Robertson-normalised tf times squared idf.

    The term-frequency part is
    ``k1*freq / (freq + k1*(1 - b + b*doc_len/avg_doc_len))`` and is multiplied
    by ``idf[token] ** 2``.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        docs_length: mapping document -> length; its mean is the average
            document length used for normalisation.
        idf: per-token inverse document frequency, indexed by token.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    avg_docs_len = sum(docs_length.values())/len(docs_length)

    all_scores = []
    for query_tokens in indexed_queries:
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                robertson_tf = k1*term_freq/(term_freq + k1 *(1-b + b*docs_length[doc_id]/avg_docs_len))
                scores[doc_id] += robertson_tf*np.power(idf[term],2)
        if not scores:
            scores[-1] = 0
        all_scores.append(scores)
    return all_scores



def JM_language_model(indexed_queries,inverted_index,docs_length,c_freq, Lambda = 0.15):
    """Rank documents with a Jelinek-Mercer smoothed language model.

    Each (query token, document) pair found in the index adds
    ``log(1 + (1/c_freq[token]) * (Lambda*freq) / ((1-Lambda)*docs_length[doc]))``.

    Args:
        indexed_queries: iterable of queries, each an iterable of tokens.
        inverted_index: mapping token -> mapping document -> frequency.
        docs_length: per-document length, indexed by document.
        c_freq: per-token collection frequency, indexed by token.
        Lambda: interpolation parameter of the smoothing.

    Returns:
        One Counter (document -> score) per query, in query order. A query
        that matches nothing in the index gets the placeholder ``{-1: 0}``.
    """
    def score(query_tokens):
        scores = Counter()
        for term in query_tokens:
            if term not in inverted_index:
                continue
            for doc_id, term_freq in inverted_index[term].items():
                ratio = (1/(c_freq[term]))*(Lambda * term_freq)/((1-Lambda)*docs_length[doc_id])
                scores[doc_id] += np.log(1 + ratio)
        if not scores:
            scores[-1] = 0
        return scores

    return [score(query_tokens) for query_tokens in indexed_queries]



## Differentiable models 

In [None]:
import time
import utils
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import layers



class diff_simple_TF(Model):
    """Differentiable term-frequency ranking model.

    Every token embedding goes through a one-unit ReLU dense layer; the
    resulting scalar is used as the weight of that token occurrence. The
    weighted occurrences are summed into a bag-of-words matrix and the
    relevance is the dot product between query and document bags.

    Args:
        embedding_matrix: initial embedding weights of shape
            (vocab_size, embedding_dim).
        dropout_rate: rate passed to ``tf.nn.dropout`` in ``call``.
    """
    def __init__(self,embedding_matrix,dropout_rate=0.1):
        # Zero-argument super(): the former explicit form named `simple_TF`,
        # which is not the name of this class and raised NameError.
        super().__init__()
        self.vocab_size,self.embedding_dim = embedding_matrix.shape
        # mask_zero=True: token id 0 is treated as padding by compute_mask.
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                                   self.embedding_dim, 
                                                   weights=[embedding_matrix],
                                                   mask_zero=True)
        # Maps one embedding to a single non-negative scalar (ReLU output).
        self.linear = tf.keras.layers.Dense(1,
                                            input_shape=(self.embedding_dim,),
                                            activation='relu',
                                            bias_initializer=tf.ones_initializer())
        self.dropout_rate=dropout_rate
    
    def make_BoW(self,seq,index,sparse_index):
        """Scatter per-position values into a dense bag-of-words matrix.

        Values landing on the same (row, column) coordinate are summed.

        Args:
            seq: one value per position of `index` (squeezed before use).
            index: token ids; positions equal to 0 are zeroed by the mask.
            sparse_index: SparseTensor coordinates, one per flattened position.
                NOTE(review): presumably (token id, column in batch) pairs -
                confirm against the caller.

        Returns:
            Dense tensor of shape [vocab_size, index.shape[0]].
        """
        # Zero out the values sitting on padding positions, then flatten.
        mask = tf.dtypes.cast(self.embedding.compute_mask(index),dtype=tf.float32)
        seq = tf.math.multiply(mask,tf.squeeze(seq))
        seq = tf.reshape(seq,[-1])
        
        seq = tf.SparseTensor(indices = sparse_index, values = seq , dense_shape=[self.vocab_size,index.shape[0]])
        # Row-major order makes equal coordinates adjacent, so the segment
        # ids produced below are sorted as segment_sum requires.
        seq = tf.sparse.reorder(seq)
        
        # Collapse each (row, col) pair into one integer: row * n_cols + col.
        linearized = tf.matmul(seq.indices, tf.constant([[index.shape[0]], [1]],dtype=tf.int64))
        y, idx = tf.unique(tf.squeeze(linearized))
        # Sum the values of duplicated coordinates.
        values = tf.math.segment_sum(seq.values, idx)
        y = tf.expand_dims(y, 1)
        # Recover the (row, col) pair from the linear index.
        indices = tf.concat([y//index.shape[0], y%index.shape[0]], axis=1)
        seq = tf.SparseTensor(indices = indices, values = values , dense_shape=[self.vocab_size,index.shape[0]])
        seq = tf.sparse.reorder(seq)
        
        return tf.sparse.to_dense(seq)
        
    
    def call(self, q_index_float_32, q_index, q_sparse_index, d_index, d_sparse_index):
        """Score each query/document pair of the batch.

        Returns:
            (rel, d): `rel` is the column-wise sum of the element-wise product
            of the query and document bags; `d` is the weighted document bag
            of shape [vocab_size, d_index.shape[0]].
        """
        # The query values are used as given; no learned weighting is applied.
        q = self.make_BoW(q_index_float_32,q_index,q_sparse_index)
        
        # NOTE(review): dropout runs on every call (there is no `training`
        # switch) - confirm this is intended at inference time.
        d = tf.nn.dropout(self.embedding(d_index),rate=self.dropout_rate)
        d = tf.nn.dropout(self.linear(d),rate=self.dropout_rate)
        d = self.make_BoW(d,d_index,d_sparse_index)
        
        rel = tf.math.reduce_sum(tf.math.multiply(q,d),axis=0)
    
        return rel,d

    def compute_index(self):
        """Return the learned scalar weight of every vocabulary id.

        Returns:
            NumPy array of shape (vocab_size,), computed without dropout.
        """
        all_embeddings = self.embedding(np.arange(self.vocab_size))

        return np.reshape(self.linear(all_embeddings).numpy(),(self.vocab_size,))
    
    
    
    
class diff_TF_IDF(Model):
    """Differentiable TF-IDF ranking model.

    Token occurrences are weighted by a one-unit ReLU dense layer applied to
    their embedding. An idf-like factor is derived from the weighted document
    bag of the current batch and applied before the query/document product.

    Args:
        embedding_matrix: initial embedding weights of shape
            (vocab_size, embedding_dim).
        dropout_rate: rate passed to ``tf.nn.dropout`` in ``call``.
    """
    def __init__(self,embedding_matrix,dropout_rate=0.1):
        # Zero-argument super(): the former explicit form named `TF_IDF`,
        # which is not the name of this class and raised NameError.
        super().__init__()
        self.vocab_size,self.embedding_dim = embedding_matrix.shape
        # mask_zero=True: token id 0 is treated as padding by compute_mask.
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                                   self.embedding_dim, 
                                                   weights=[embedding_matrix],
                                                   mask_zero=True)
        # Maps one embedding to a single non-negative scalar (ReLU output).
        self.linear = tf.keras.layers.Dense(1,
                                            input_shape=(self.embedding_dim,),
                                            activation='relu',
                                            bias_initializer=tf.ones_initializer())
        self.dropout_rate=dropout_rate
    
    def make_BoW(self,seq,index,sparse_index):
        """Scatter per-position values into a dense bag-of-words matrix.

        Values landing on the same (row, column) coordinate are summed.

        Args:
            seq: one value per position of `index` (squeezed before use).
            index: token ids; positions equal to 0 are zeroed by the mask.
            sparse_index: SparseTensor coordinates, one per flattened position.
                NOTE(review): presumably (token id, column in batch) pairs -
                confirm against the caller.

        Returns:
            Dense tensor of shape [vocab_size, index.shape[0]].
        """
        # Zero out the values sitting on padding positions, then flatten.
        mask = tf.dtypes.cast(self.embedding.compute_mask(index),dtype=tf.float32)
        seq = tf.math.multiply(mask,tf.squeeze(seq))
        seq = tf.reshape(seq,[-1])
        
        
        seq = tf.SparseTensor(indices = sparse_index, values = seq , dense_shape=[self.vocab_size,index.shape[0]])
        # Row-major order makes equal coordinates adjacent, so the segment
        # ids produced below are sorted as segment_sum requires.
        seq = tf.sparse.reorder(seq)
        
        # Collapse each (row, col) pair into one integer: row * n_cols + col.
        linearized = tf.matmul(seq.indices, tf.constant([[index.shape[0]], [1]],dtype=tf.int64))
        y, idx = tf.unique(tf.squeeze(linearized))
        # Sum the values of duplicated coordinates.
        values = tf.math.segment_sum(seq.values, idx)
        y = tf.expand_dims(y, 1)
        # Recover the (row, col) pair from the linear index.
        indices = tf.concat([y//index.shape[0], y%index.shape[0]], axis=1)
        seq = tf.SparseTensor(indices = indices, values = values , dense_shape=[self.vocab_size,index.shape[0]])
        seq = tf.sparse.reorder(seq)
        
        return tf.sparse.to_dense(seq)
        
            
    def call(self, q_index_float_32, q_index, q_sparse_index, d_index, d_sparse_index):
        """Score each query/document pair of the batch with TF-IDF.

        Returns:
            (rel, d): `rel` is the column-wise sum of the product of the query
            bag with the idf-scaled document bag; `d` is the weighted document
            bag (before idf scaling) of shape [vocab_size, d_index.shape[0]].
        """
        # The query values are used as given; no learned weighting is applied.
        q = self.make_BoW(q_index_float_32,q_index,q_sparse_index)
        
        # NOTE(review): dropout runs on every call (there is no `training`
        # switch) - confirm this is intended at inference time.
        d = tf.nn.dropout(self.embedding(d_index),rate=self.dropout_rate)
        d = tf.nn.dropout(self.linear(d),rate=self.dropout_rate)
        d = self.make_BoW(d,d_index,d_sparse_index)
        
        # Per-token total weight across the batch plays the role of the
        # document frequency; the largest total replaces the document count.
        maxdf = tf.keras.backend.max(tf.math.reduce_sum(d,axis = 1))
        
        idf = tf.math.log( (maxdf + 1) / (1+tf.math.reduce_sum(d,axis = 1)))
        
        # Scale each vocabulary row of the document bag by its idf.
        idf_d = tf.multiply(d, tf.reshape(idf, (-1, 1)))
        
        rel = tf.math.reduce_sum(tf.math.multiply(q,idf_d),axis=0)
        
        return rel,d

    def compute_index(self):
        """Return the learned scalar weight of every vocabulary id.

        Returns:
            NumPy array of shape (vocab_size,), computed without dropout.
        """
        all_embeddings = self.embedding(np.arange(self.vocab_size))

        return np.reshape(self.linear(all_embeddings).numpy(),(self.vocab_size,))
    
    
class diff_DIR(Model):
    """Differentiable Dirichlet language-model ranking model.

    Token occurrences are weighted by a one-unit ReLU dense layer applied to
    their embedding. Collection frequencies and document lengths are derived
    from the weighted document bag of the current batch, and the smoothing
    parameter `mu` is stored as a ``tf.Variable``.

    Args:
        embedding_matrix: initial embedding weights of shape
            (vocab_size, embedding_dim).
        mu: initial value of the Dirichlet smoothing parameter.
        dropout_rate: rate passed to ``tf.nn.dropout`` in ``call``.
    """
    def __init__(self,embedding_matrix,mu=2500.0,dropout_rate=0.1):
        # Zero-argument super(): the former explicit form named `DIR`,
        # which is not the name of this class and raised NameError.
        super().__init__()
        self.vocab_size,self.embedding_dim = embedding_matrix.shape
        # mask_zero=True: token id 0 is treated as padding by compute_mask.
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                                   self.embedding_dim, 
                                                   weights=[embedding_matrix],
                                                   mask_zero=True)
        # Maps one embedding to a single non-negative scalar (ReLU output).
        self.linear = tf.keras.layers.Dense(1,
                                            input_shape=(self.embedding_dim,),
                                            activation='relu',
                                            bias_initializer=tf.ones_initializer())
        self.mu = tf.Variable(mu)
        self.dropout_rate=dropout_rate
        
    def make_BoW(self,seq,index,sparse_index):
        """Scatter per-position values into a dense bag-of-words matrix.

        Values landing on the same (row, column) coordinate are summed.

        Args:
            seq: one value per position of `index` (squeezed before use).
            index: token ids; positions equal to 0 are zeroed by the mask.
            sparse_index: SparseTensor coordinates, one per flattened position.
                NOTE(review): presumably (token id, column in batch) pairs -
                confirm against the caller.

        Returns:
            Dense tensor of shape [vocab_size, index.shape[0]].
        """
        # Zero out the values sitting on padding positions, then flatten.
        mask = tf.dtypes.cast(self.embedding.compute_mask(index),dtype=tf.float32)
        seq = tf.math.multiply(mask,tf.squeeze(seq))
        seq = tf.reshape(seq,[-1])
        
        
        seq = tf.SparseTensor(indices = sparse_index, values = seq , dense_shape=[self.vocab_size,index.shape[0]])
        # Row-major order makes equal coordinates adjacent, so the segment
        # ids produced below are sorted as segment_sum requires.
        seq = tf.sparse.reorder(seq)
        
        # Collapse each (row, col) pair into one integer: row * n_cols + col.
        linearized = tf.matmul(seq.indices, tf.constant([[index.shape[0]], [1]],dtype=tf.int64))
        y, idx = tf.unique(tf.squeeze(linearized))
        # Sum the values of duplicated coordinates.
        values = tf.math.segment_sum(seq.values, idx)
        y = tf.expand_dims(y, 1)
        # Recover the (row, col) pair from the linear index.
        indices = tf.concat([y//index.shape[0], y%index.shape[0]], axis=1)
        seq = tf.SparseTensor(indices = indices, values = values , dense_shape=[self.vocab_size,index.shape[0]])
        seq = tf.sparse.reorder(seq)
        
        return tf.sparse.to_dense(seq)
        
            
    def call(self, q_index_float_32, q_index, q_sparse_index, d_index, d_sparse_index):
        """Score each query/document pair of the batch with the Dirichlet model.

        Returns:
            (rel, d): `rel` is the column-wise sum of the product of the query
            bag with the Dirichlet-scored document bag; `d` is the weighted
            document bag of shape [vocab_size, d_index.shape[0]].
        """
        # The query values are used as given; no learned weighting is applied.
        q = self.make_BoW(q_index_float_32,q_index,q_sparse_index)
        
        # NOTE(review): dropout runs on every call (there is no `training`
        # switch) - confirm this is intended at inference time.
        d = tf.nn.dropout(self.embedding(d_index),rate=self.dropout_rate)
        d = tf.nn.dropout(self.linear(d),rate=self.dropout_rate)
        d = self.make_BoW(d,d_index,d_sparse_index)
        
        # Share of each vocabulary row in the total weight of the batch.
        cfreq = tf.math.reduce_sum(d,axis=1)/tf.math.reduce_sum(d)
        
        # Column sums of `d` act as the document lengths.
        smoothing = tf.math.log(self.mu/(tf.math.reduce_sum(d,axis=0) + self.mu))
        
        # The extra `1 +` keeps the denominator non-zero for rows whose
        # batch frequency is 0; `smoothing` broadcasts over vocabulary rows.
        dir_d = tf.math.log(1+d/(1+self.mu*tf.reshape(cfreq, (-1, 1)))) + smoothing
        
        rel = tf.math.reduce_sum(tf.math.multiply(q,dir_d),axis=0)
        
        return rel,d

    def compute_index(self):
        """Return the learned scalar weight of every vocabulary id.

        Returns:
            NumPy array of shape (vocab_size,), computed without dropout.
        """
        all_embeddings = self.embedding(np.arange(self.vocab_size))

        return np.reshape(self.linear(all_embeddings).numpy(),(self.vocab_size,))
    
    
    
    
class diff_BM25(Model):
    """Differentiable Okapi BM25 ranking model.

    Token occurrences are weighted by a one-unit ReLU dense layer applied to
    their embedding. The idf, document lengths and average length are derived
    from the weighted document bag of the current batch; `k1` and `b` are
    stored as ``tf.Variable``.

    Args:
        embedding_matrix: initial embedding weights of shape
            (vocab_size, embedding_dim).
        k1: initial value of the term-frequency saturation parameter.
        b: initial value of the length-normalisation parameter.
        dropout_rate: rate passed to ``tf.nn.dropout`` in ``call``.
    """
    def __init__(self,embedding_matrix,k1=1.2,b=0.75,dropout_rate=0.1):
        # Zero-argument super(): the former explicit form named `BM25`,
        # which is not the name of this class and raised NameError.
        super().__init__()
        self.vocab_size,self.embedding_dim = embedding_matrix.shape
        # mask_zero=True: token id 0 is treated as padding by compute_mask.
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                                   self.embedding_dim, 
                                                   weights=[embedding_matrix],
                                                   mask_zero=True)
        # Maps one embedding to a single non-negative scalar (ReLU output).
        self.linear = tf.keras.layers.Dense(1,
                                            input_shape=(self.embedding_dim,),
                                            activation='relu',
                                            bias_initializer=tf.ones_initializer())
        self.k1 = tf.Variable(k1)
        self.b = tf.Variable(b)
        self.dropout_rate=dropout_rate
        
    def make_BoW(self,seq,index,sparse_index):
        """Scatter per-position values into a dense bag-of-words matrix.

        Values landing on the same (row, column) coordinate are summed.

        Args:
            seq: one value per position of `index` (squeezed before use).
            index: token ids; positions equal to 0 are zeroed by the mask.
            sparse_index: SparseTensor coordinates, one per flattened position.
                NOTE(review): presumably (token id, column in batch) pairs -
                confirm against the caller.

        Returns:
            Dense tensor of shape [vocab_size, index.shape[0]].
        """
        # Zero out the values sitting on padding positions, then flatten.
        mask = tf.dtypes.cast(self.embedding.compute_mask(index),dtype=tf.float32)
        seq = tf.math.multiply(mask,tf.squeeze(seq))
        seq = tf.reshape(seq,[-1])
        
        
        seq = tf.SparseTensor(indices = sparse_index, values = seq , dense_shape=[self.vocab_size,index.shape[0]])
        # Row-major order makes equal coordinates adjacent, so the segment
        # ids produced below are sorted as segment_sum requires.
        seq = tf.sparse.reorder(seq)
        
        # Collapse each (row, col) pair into one integer: row * n_cols + col.
        linearized = tf.matmul(seq.indices, tf.constant([[index.shape[0]], [1]],dtype=tf.int64))
        y, idx = tf.unique(tf.squeeze(linearized))
        # Sum the values of duplicated coordinates.
        values = tf.math.segment_sum(seq.values, idx)
        y = tf.expand_dims(y, 1)
        # Recover the (row, col) pair from the linear index.
        indices = tf.concat([y//index.shape[0], y%index.shape[0]], axis=1)
        seq = tf.SparseTensor(indices = indices, values = values , dense_shape=[self.vocab_size,index.shape[0]])
        seq = tf.sparse.reorder(seq)
        
        return tf.sparse.to_dense(seq)
        
        
    def call(self, q_index_float_32, q_index, q_sparse_index, d_index, d_sparse_index):
        """Score each query/document pair of the batch with BM25.

        Returns:
            (rel, d): `rel` is the column-wise sum of the product of the query
            bag with the BM25-scored document bag; `d` is the weighted
            document bag of shape [vocab_size, d_index.shape[0]].
        """
        # The query values are used as given; no learned weighting is applied.
        q = self.make_BoW(q_index_float_32,q_index,q_sparse_index)
        
        # NOTE(review): dropout runs on every call (there is no `training`
        # switch) - confirm this is intended at inference time.
        d = tf.nn.dropout(self.embedding(d_index),rate=self.dropout_rate)
        d = tf.nn.dropout(self.linear(d),rate=self.dropout_rate)
        d = self.make_BoW(d,d_index,d_sparse_index)
        
        
        # Per-token total weight across the batch plays the role of the
        # document frequency; the largest total replaces the document count.
        maxdf = tf.keras.backend.max(tf.math.reduce_sum(d,axis = 1))
        
        idf = tf.math.log( (maxdf + 1) / (1+tf.math.reduce_sum(d,axis = 1)))
        
        # Column sums of `d` act as the document lengths.
        d_length = tf.math.reduce_sum(d,axis=0)

        avg_d_length = tf.reduce_mean(d_length)
                
        # BM25 term score; `d_length` broadcasts over the vocabulary rows.
        bm25_d = tf.reshape(idf, (-1, 1))*((self.k1+1)*d)/(d + self.k1*((1-self.b) + self.b*d_length/avg_d_length))
        
        rel = tf.math.reduce_sum(tf.math.multiply(q,bm25_d),axis=0)
        
        return rel,d

    def compute_index(self):
        """Return the learned scalar weight of every vocabulary id.

        Returns:
            NumPy array of shape (vocab_size,), computed without dropout.
        """
        all_embeddings = self.embedding(np.arange(self.vocab_size))

        return np.reshape(self.linear(all_embeddings).numpy(),(self.vocab_size,))
    

## Useful functions from utils path

In [None]:
def build_inverted_index(Collection, weights):
    """Build a weighted copy of the collection's inverted index.

    Tokens whose weight equals 0 are dropped; for the others every stored
    frequency is multiplied by the token's weight.

    Args:
        Collection: object exposing `inverted_index`
            (token -> document -> frequency).
        weights: per-token weight, indexed by token.

    Returns:
        dict mapping each kept token to a Counter document -> weighted frequency.
    """
    weighted_index = dict()
    for token, posting in Collection.inverted_index.items():
        token_weight = weights[token]
        if token_weight == 0:
            continue
        weighted_posting = Counter()
        for doc_id in posting:
            weighted_posting[doc_id] += token_weight * posting[doc_id]
        weighted_index[token] = weighted_posting
    return weighted_index


def compute_idf(Collection, inverted_index, weights=None):
    """Compute the idf of every token of an inverted index.

    Without `weights` the idf is ``log((N + 1) / (1 + df))``, with N the number
    of entries of ``Collection.doc_index`` and df the posting size. With
    `weights` given, df becomes the sum of the posting values and N becomes
    the largest such sum.

    Args:
        Collection: object exposing `doc_index`.
        inverted_index: mapping token -> mapping document -> frequency.
        weights: only tested against None to select the variant.

    Returns:
        dict mapping token -> idf.
    """
    nb_docs = len(Collection.doc_index)

    if weights is not None:
        totals = {}
        for token, posting in inverted_index.items():
            totals[token] = sum(posting.values())
        maxdf = max(totals.values())
        return {token: np.log((maxdf + 1) / (1 + total)) for token, total in totals.items()}

    idf = {}
    for token, posting in inverted_index.items():
        idf[token] = np.log((nb_docs + 1) / (1 + len(posting)))
    return idf


# Here we give the weights when we want the document lengths to be the number of occurrences;
# the weights are here for regularization purposes.
def compute_docs_length(inverted_index, weights=None):
    """Sum, per document, the values stored in the inverted index.

    Args:
        inverted_index: mapping token -> mapping document -> value.
        weights: optional per-token weight; when given, every value is divided
            by its token's weight before being added.

    Returns:
        Counter mapping document -> length.
    """
    lengths = Counter()
    for token, posting in inverted_index.items():
        for doc_id, value in posting.items():
            if weights is None:
                lengths[doc_id] += value
            else:
                lengths[doc_id] += value / weights[token]
    return lengths


def compute_collection_frequencies(docs_length, inverted_index):
    """Compute each token's share of the total collection length.

    Args:
        docs_length: mapping document -> length; the sum is the collection length.
        inverted_index: mapping token -> mapping document -> frequency.

    Returns:
        dict mapping token -> (sum of its posting frequencies) / collection length.
    """
    coll_length = sum(docs_length.values())
    frequencies = {}
    for token, posting in inverted_index.items():
        frequencies[token] = sum(posting.values()) / coll_length
    return frequencies



def evaluate_inverted_index(inverted_index):
    """Measure the size of an inverted index.

    Returns:
        (vocab_size, tot_nb_elem): the number of tokens and the total number
        of entries over all postings.
    """
    posting_sizes = (len(posting) for posting in inverted_index.values())
    return len(inverted_index), sum(posting_sizes)



def compute_metrics(coll_path,Collection,queries_index,qrel,results,model_name,save_res=True):
    """Save a run, evaluate it with pytrec_eval and average the metrics.

    Args:
        coll_path: unused by this function.
        Collection: object whose `save_results` stores the top 1000 results of
            each query; the run is then read back from the path `model_name`.
        queries_index: forwarded to `Collection.save_results`.
        qrel: relevance judgements given to `pytrec_eval.RelevanceEvaluator`.
        results: per-query scores forwarded to `Collection.save_results`.
        model_name: path of the run file.
        save_res: when False the run file is deleted after being parsed.

    Returns:
        dict with the mean over evaluated queries of P@5/10/20,
        nDCG@5/10/20/1000, MAP and recall@1000.
    """
    Collection.save_results(queries_index,results,model_name,top_k=1000)

    with open(model_name, 'r') as run_file:
        run = pytrec_eval.parse_run(run_file)
    if not save_res:
        os.remove(model_name)

    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {"map","ndcg_cut","recall","P"})
    per_query = evaluator.evaluate(run)

    reported = ('P_5',
                'P_10',
                'P_20',
                'ndcg_cut_5',
                'ndcg_cut_10',
                'ndcg_cut_20',
                'ndcg_cut_1000',
                'map',
                'recall_1000')
    metrics = dict.fromkeys(reported, 0)
    nb_queries = len(per_query)
    # Accumulate the mean one query at a time.
    for query_scores in per_query.values():
        for name in reported:
            metrics[name] += query_scores[name]/nb_queries

    return metrics


def eval_baseline_index(coll_path,
                        Collection,
                        fold,
                        qrel,
                        plot_values,
                        results_path,
                        model_name,
                        epoch):
    """Evaluate the baseline term-matching models on one fold.

    Runs tf, tf_idf, DIR and BM25 (in that order) on the collection's own
    index, stores each run under
    ``<results_path>/fold<fold>/<model_name>/<baseline>/<epoch>`` and appends
    the outcome to `plot_values`.

    Args:
        coll_path: forwarded to `compute_metrics`.
        Collection: object exposing `indexed_queries`, `queries_index`,
            `inverted_index`, `idf`, `docs_length` and `c_freq`.
        fold: key of the query fold to evaluate.
        qrel: relevance judgements forwarded to `compute_metrics`.
        plot_values: dict baseline name -> [x values, metrics]; 1.0 is
            appended to the first list and the metrics dict to the second.
        results_path: root directory of the run files.
        model_name: sub-directory name under the fold directory.
        epoch: used as the run file name.
    """
    queries = Collection.indexed_queries[fold]

    # Lambdas keep each scorer lazy so it only runs after its label is printed.
    # `simple_tf` is called by its bare name like the other scorers of this file.
    baselines = (
        ('tf', lambda: simple_tf(queries,
                                 Collection.inverted_index)),
        ('tf_idf', lambda: tf_idf(queries,
                                  Collection.inverted_index,
                                  Collection.idf)),
        ('DIR', lambda: dir_language_model(queries,
                                           Collection.inverted_index,
                                           Collection.docs_length,
                                           Collection.c_freq)),
        ('BM25', lambda: Okapi_BM25(queries,
                                    Collection.inverted_index,
                                    Collection.docs_length,
                                    Collection.idf)),
    )

    for label, score in baselines:
        print(label)

        results = score()

        run_dir = results_path + '/fold' + str(fold) + '/' + model_name + '/' + label + '/'
        os.makedirs(run_dir, exist_ok=True)

        metrics = compute_metrics(coll_path,
                                  Collection,
                                  Collection.queries_index[fold],
                                  qrel,
                                  results,
                                  run_dir + str(epoch))

        # Baselines use the full index, hence the constant x value 1.0.
        plot_values[label][0].append(1.0)
        plot_values[label][1].append(metrics)
    
def utils_compute_info_retrieval(Collection,weights,weighted=True):
    """Build the TDV-weighted index and its retrieval statistics.

    Args:
        Collection: collection whose inverted index is re-weighted.
        weights: per-token weights (TDV).
        weighted: when True the idf is computed with the weights and the
            document lengths are summed straight from the weighted index; when
            False the idf ignores the weights and the document lengths are
            divided back by them.

    Returns:
        (inverted_index, idf, docs_length, c_freq).
    """
    inverted_index = build_inverted_index(Collection,weights)

    # Exactly one of idf / document length receives the weights.
    idf_weights = weights if weighted else None
    length_weights = None if weighted else weights

    idf = compute_idf(Collection,inverted_index,idf_weights)
    docs_length = compute_docs_length(inverted_index,length_weights)
    c_freq = compute_collection_frequencies(docs_length,inverted_index)

    return inverted_index,idf,docs_length,c_freq


def eval_learned_index(coll_path,
                       Collection,
                       IR_model,
                       model,
                       qrel,
                       plot_values,
                       plot_path,
                       fold,
                       inverted_index,
                       weights,
                       redefined_idf,
                       redefined_docs_length,
                       redefined_c_freq,
                       prop_elem_index,
                       results_path,
                       model_name,
                       epoch):
    """Evaluate a learned (TDV-weighted) index on one fold and save the plot data.

    Depending on `IR_model` ('tf', 'tf_idf', 'DIR' or 'BM25') the matching
    plain and weighted scorers are run on `inverted_index`. Each run is stored
    under ``<results_path>/fold<fold>/<model_name>/<variant>/<epoch>`` and its
    metrics are appended to `plot_values`, which is finally pickled to
    ``<plot_path>/fold<fold>/<model_name>``. Any other `IR_model` value only
    triggers the pickling.

    Args:
        coll_path: forwarded to `compute_metrics`.
        Collection: object exposing `indexed_queries` and `queries_index`.
        IR_model: name of the retrieval model family to evaluate.
        model: trained model; `model.mu` is read for 'DIR', `model.k1` and
            `model.b` for 'BM25' (through `.numpy()`).
        qrel: relevance judgements forwarded to `compute_metrics`.
        plot_values: dict variant name -> [x values, metrics].
        plot_path: root directory of the pickled plot data.
        fold: key of the query fold to evaluate.
        inverted_index: index used for scoring.
        weights: per-token weights used by the weighted scorers.
        redefined_idf: idf passed to the tf_idf and BM25 scorers.
        redefined_docs_length: document lengths passed to DIR and BM25.
        redefined_c_freq: collection frequencies passed to DIR.
        prop_elem_index: x value appended to `plot_values` for each variant.
        results_path: root directory of the run files.
        model_name: sub-directory / file name identifying the model.
        epoch: used as the run file name.
    """
    queries = Collection.indexed_queries[fold]

    def run_variant(label, scorer, *extra, **options):
        # Score the fold with one scorer, save and evaluate the run, record it.
        print(label)

        results = scorer(queries, inverted_index, *extra, **options)

        run_dir = results_path + '/fold' + str(fold) + '/' + model_name + '/' + label + '/'
        os.makedirs(run_dir, exist_ok=True)

        metrics = compute_metrics(coll_path,
                                  Collection,
                                  Collection.queries_index[fold],
                                  qrel,
                                  results,
                                  run_dir + str(epoch))

        plot_values[label][0].append(prop_elem_index)
        plot_values[label][1].append(metrics)

    if IR_model=='tf':
        run_variant('tf', simple_tf)
        run_variant('weighted_tf', weighted_simple_tf, weights)

    elif IR_model=='tf_idf':
        run_variant('tf_idf', tf_idf, redefined_idf)
        run_variant('weighted_tf_idf', weighted_tf_idf, weights, redefined_idf)

    elif IR_model=='DIR':
        # Reuse the smoothing parameter held by the trained model.
        mu = model.mu.numpy()

        run_variant('DIR', dir_language_model,
                    redefined_docs_length, redefined_c_freq, mu=mu)
        run_variant('weighted_DIR', weighted_dir_language_model,
                    weights, redefined_docs_length, redefined_c_freq, mu=mu)

    elif IR_model=='BM25':
        # Reuse the BM25 parameters held by the trained model.
        k1 = model.k1.numpy()
        b = model.b.numpy()

        run_variant('BM25', Okapi_BM25,
                    redefined_docs_length, redefined_idf, k1=k1, b=b)
        run_variant('weighted_BM25', weighted_Okapi_BM25,
                    weights, redefined_docs_length, redefined_idf, k1=k1, b=b)

    # Context manager so the pickle file is flushed and closed deterministically.
    with open(plot_path + '/fold' + str(fold) + '/' + model_name,'wb') as plot_file:
        pickle.dump(plot_values,plot_file)
    
    
    

## Training on TREC Collections

In [None]:
import os
import re
import bpe
import time
import models
import pickle
import timeit
import argparse
import importlib
import IR_models
import collections
import pytrec_eval
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import Counter
import tensorflow_ranking as tfr
from tensorflow.keras import Model
from tensorflow.keras import layers


def main():
    """Train the differentiable IR models on the TREC collections and evaluate the learned indexes.

    For every collection in AP88-89, FT91-94 and LA, every IR model in
    tf_idf, DIR and BM25, and every fold, this function:

    1. loads the pickled indexed collection and the relevance judgements,
    2. evaluates the baseline index and pickles the resulting plot values,
    3. trains the differentiable model with a hinge ranking loss plus an L1
       regularization term, and after each epoch saves the weights returned
       by ``model.compute_index()``, rebuilds the weighted inverted index and
       evaluates it.

    Training of a fold stops after ``nb_epoch`` epochs or as soon as the
    proportion of index entries kept drops to 5% or below.

    Command-line options ``--nb_epoch``, ``--l1_weight``, ``--lr``,
    ``--dropout_rate``, ``--folds`` and ``--update_embeddings`` are honoured;
    when a value is not supplied the built-in default is used (100 epochs,
    l1_weight 1e-5, lr 1e-3, dropout 0.0, 5 folds). The path options and
    ``--IR_model`` are still accepted so existing invocations keep parsing,
    but paths are derived per collection from the hard-coded TREC root and
    the IR model comes from the loop over models.

    Raises:
        ValueError: if an IR model name has no matching differentiable model.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c','--coll_path', nargs="?", type=str)
    parser.add_argument('-i','--indexed_path', nargs="?", type=str)
    parser.add_argument('-p','--plot_path', nargs="?", type=str)
    parser.add_argument('-r','--results_path', nargs="?", type=str)
    parser.add_argument('-w','--weights_path', nargs="?", type=str)
    parser.add_argument('-f','--folds', nargs="?", type=int,default = 5)
    parser.add_argument('-e','--nb_epoch', nargs="?", type=int)
    parser.add_argument('-l','--l1_weight', nargs="?", type=float)
    parser.add_argument('-d','--dropout_rate', nargs="?", type=float,default=0.0)
    parser.add_argument('--lr', nargs="?", type=float)
    parser.add_argument('-n','--model_name', nargs="?", type=str)
    parser.add_argument('--IR_model', nargs="?", type=str,default='tf')
    parser.add_argument('-u','--update_embeddings', action="store_true")

    args = parser.parse_args()

    print(args,flush=True)

    # Hyperparameters: a value given on the command line wins, otherwise the
    # built-in default applies. The same values are used everywhere below
    # (model name, optimizer, loss), so the output names match the training.
    nb_epoch = 100 if args.nb_epoch is None else args.nb_epoch
    l1_weight = 1e-5 if args.l1_weight is None else args.l1_weight
    dropout_rate = 0.0 if args.dropout_rate is None else args.dropout_rate
    folds = 5 if args.folds is None else args.folds
    lr = 1e-3 if args.lr is None else args.lr
    batch_size = 32
    # Training of a fold stops once the kept proportion of the index is at or below this value.
    min_index_proportion = 0.05

    trec_root = "/home/mrim/rezguiha/work/repro_chap7_res/TREC/"

    for collection in ["AP88-89","FT91-94","LA"]:
        coll_path = trec_root + collection
        indexed_path = coll_path + "/indexed_collection"
        results_path = coll_path + "/results"
        weights_path = coll_path + "/weights"
        plot_path = coll_path + "/plots"

        # Loading indexed collection
        with open(indexed_path,'rb') as f:
            Collection = pickle.load(f)

        # Register "-1" / -1 in both directions of the document index.
        Collection.doc_index[-1] = "-1"
        Collection.doc_index["-1"] = -1

        # Loading relevance judgements from collection
        with open(os.path.join(coll_path, 'qrels'), 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        # Drop a leading 'titl' token from every query that has more than one
        # token (presumably the stemmed "Title" field label left over from
        # topic parsing -- TODO confirm). The length is tested first so an
        # empty query is skipped instead of raising IndexError.
        id_titl = Collection.vocabulary['titl']
        for i in range(len(Collection.all_indexed_queries)):
            query = Collection.all_indexed_queries[i]
            if len(query) > 1 and query[0] == id_titl:
                del query[0]

        for i in range(len(Collection.indexed_queries)):
            for j in range(len(Collection.indexed_queries[i])):
                query = Collection.indexed_queries[i][j]
                if len(query) > 1 and query[0] == id_titl:
                    del query[0]

        print('start')
        # Getting collection vocabulary size and total number of elements in collection
        coll_vocab_size,coll_tot_nb_elem = evaluate_inverted_index(Collection.inverted_index)

        for IR_model in ['tf_idf','DIR','BM25']:
            id_model_name = collection+'_'+IR_model+'_'+str(l1_weight)+'_'+str(dropout_rate)

            for fold in range(folds):
                fold_dir = 'fold' + str(fold)

                plot_values = {name: [[],[]] for name in ['tf','tf_idf','DIR','BM25']}

                # Creating for this fold and model the directories for results, weights and plot data
                fold_weights_dir = os.path.join(weights_path, fold_dir, id_model_name)
                fold_plot_dir = os.path.join(plot_path, fold_dir)
                os.makedirs(os.path.join(results_path, fold_dir, id_model_name), exist_ok=True)
                os.makedirs(fold_weights_dir, exist_ok=True)
                os.makedirs(fold_plot_dir, exist_ok=True)

                # Computing baseline metrics for this fold and updating the plot_values dictionary
                eval_baseline_index(coll_path,
                                    Collection,
                                    fold,
                                    qrel,
                                    plot_values,
                                    results_path,
                                    id_model_name,
                                    0)

                # Saving the plot_values dict of this fold as a pickle
                with open(os.path.join(fold_plot_dir, id_model_name),'wb') as f_plot:
                    pickle.dump(plot_values,f_plot)

                # Initialization of the targets, the loss function, the optimizer and the model to train
                y_true = tf.ones(batch_size,)
                loss_function = tf.keras.losses.Hinge()
                optimizer = tf.keras.optimizers.Adam(lr)

                if IR_model=='tf':
                    model = diff_simple_TF(Collection.embedding_matrix,dropout_rate=dropout_rate)
                elif IR_model=='tf_idf':
                    model = diff_TF_IDF(Collection.embedding_matrix,dropout_rate=dropout_rate)
                elif IR_model=='DIR':
                    model = diff_DIR(Collection.embedding_matrix,dropout_rate=dropout_rate)
                elif IR_model=='BM25':
                    model = diff_BM25(Collection.embedding_matrix,dropout_rate=dropout_rate)
                else:
                    raise ValueError('Unknown IR model: ' + str(IR_model))

                # Training the model
                epoch = 0
                prop_elem_index = 1.0
                while epoch < nb_epoch and prop_elem_index > min_index_proportion:
                    # Generation of batches from the TREC collection for training
                    query_batches,positive_doc_batches,negative_doc_batches = Collection.generate_training_batches(fold,batch_size)

                    for query_ids,pos_doc_ids,neg_doc_ids in zip(query_batches,positive_doc_batches,negative_doc_batches):
                        # Padding queries, positive and negative documents into numpy arrays
                        queries = tf.keras.preprocessing.sequence.pad_sequences([Collection.all_indexed_queries[j] for j in query_ids],padding='post')
                        pos_documents = tf.keras.preprocessing.sequence.pad_sequences([Collection.indexed_docs[j] for j in pos_doc_ids],padding='post')
                        neg_documents = tf.keras.preprocessing.sequence.pad_sequences([Collection.indexed_docs[j] for j in neg_doc_ids],padding='post')

                        # Creating the [token id, row number] index pairs for queries and documents
                        q_sparse_index = [[column,j] for j,raw in enumerate(queries) for column in raw]
                        pos_d_sparse_index = [[column,j] for j,raw in enumerate(pos_documents) for column in raw]
                        neg_d_sparse_index = [[column,j] for j,raw in enumerate(neg_documents) for column in raw]

                        # 1.0 where the padded query holds a token id >= 1, 0.0 elsewhere
                        query_mask = np.clip(queries, 0, 1).astype(np.float32)

                        with tf.GradientTape() as tape:
                            # Computing relevance and dense documents for the positive and negative documents of the batch
                            pos_res,pos_d = model(query_mask,
                                                  queries,
                                                  q_sparse_index,
                                                  pos_documents,
                                                  pos_d_sparse_index)

                            neg_res,neg_d = model(query_mask,
                                                  queries,
                                                  q_sparse_index,
                                                  neg_documents,
                                                  neg_d_sparse_index)

                            # Computing the hinge loss and the L1 regularization loss
                            ranking_loss = loss_function(y_true=y_true,y_pred=pos_res-neg_res)
                            regularization_loss = tf.norm(pos_d+neg_d,ord=1)
                            loss = (1.0-l1_weight)*ranking_loss + l1_weight*regularization_loss

                        # Without --update_embeddings the first trainable variable is left
                        # out of the update (presumably the embedding matrix -- TODO confirm).
                        if args.update_embeddings:
                            trained_variables = model.trainable_variables
                        else:
                            trained_variables = model.trainable_variables[1:]

                        # Computing the gradients and applying them
                        gradients = tape.gradient(loss, trained_variables)
                        optimizer.apply_gradients(zip(gradients, trained_variables))

                    # Compute the TDVs after the training epoch and save them
                    weights = model.compute_index()

                    with open(os.path.join(fold_weights_dir, 'epoch_' + str(epoch)),'wb') as f_weights:
                        pickle.dump(weights,f_weights)

                    inverted_index,redefined_idf,redefined_docs_length,redefined_c_freq = utils_compute_info_retrieval(Collection,
                                                                                                                       weights,
                                                                                                                       weighted=True)

                    # Computing the new vocabulary size and total number of elements after introducing the TDVs
                    vocab_size,tot_nb_elem = evaluate_inverted_index(inverted_index)

                    print(str(100*vocab_size/coll_vocab_size)[0:5] + '% of the vocabulary is kept')
                    print(str(100*tot_nb_elem/coll_tot_nb_elem)[0:5] + '% of the index is kept',flush=True)

                    prop_elem_index = tot_nb_elem/coll_tot_nb_elem

                    # NOTE(review): no `utils` import is visible in this cell while the other
                    # helpers are called without a module prefix -- confirm `utils` is in scope
                    # or call the notebook-level definition instead.
                    utils.eval_learned_index(coll_path,
                                             Collection,
                                             IR_model,
                                             model,
                                             qrel,
                                             plot_values,
                                             plot_path,
                                             fold,
                                             inverted_index,
                                             weights,
                                             redefined_idf,
                                             redefined_docs_length,
                                             redefined_c_freq,
                                             prop_elem_index,
                                             results_path,
                                             id_model_name,
                                             epoch+1)
                    epoch += 1

# Run the training pipeline only when this code is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()