# Imports

In [None]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pyterrier as pt
if not pt.started():
    pt.init()
from pyterrier.measures import *

nltk.download('stopwords')
nltk.download('punkt')
import pickle
# es_stemer = SnowballStemmer('spanish')
# es_stopwords = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to /home/edwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/edwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# WikIR test sets (English, Spanish, French, Italian)


In [None]:
# custom_stemmer

In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index for ``dataset`` at ``filename``, or reuse a finished one.

    Args:
        dataset: object exposing ``get_corpus_iter()``; its documents are indexed.
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``; the default replaces the
            English-specific tokeniser.
        stemmer: forwarded to ``pt.IterDictIndexer``; ``None`` turns off the
            default English Porter stemmer.
        stopwords: forwarded to ``pt.IterDictIndexer``.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory: a directory left behind by an interrupted build would
    # otherwise be mistaken for a usable index and never be rebuilt.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Return ``text`` tokenised, stop-word filtered and stemmed, re-joined with spaces.

    Reads the module-level ``custom_stopwords`` and ``custom_stemmer``, which
    must be set before this is called.
    """
    stemmed = []
    for token in word_tokenize(text):
        # Stop words are matched case-insensitively.
        if token.lower() in custom_stopwords:
            continue
        stemmed.append(custom_stemmer.stem(token))
    return ' '.join(stemmed)



# Custom Preprocessing
# NB: This custom pre-processing ends up being considerably slower than using Terrier's built-in processor,
# so we use the multiprocessing package to parallelize (400 docs/s vs 2000 docs/s).
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new ``{'docno', 'text'}`` record whose text went through ``custom_preprocess``.

    The preprocessing function is bound as a default argument, so the version
    in scope when this ``def`` executes is the one that gets used.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return dict(docno=docno, text=processed_text)

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build (or reuse) a Terrier index over documents rewritten by ``mapper``.

    Args:
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``.
        stemmer: accepted for signature parity with ``IndexCreator`` but NOT
            forwarded; Terrier-side stemming is always disabled here because
            the default ``mapper`` already stems the text.
        stopwords: accepted but NOT forwarded, for the same reason.
        mapper: callable applied to every corpus document in worker processes
            (it has to be picklable for ``multiprocessing``).
        dataset: object exposing ``get_corpus_iter()``. When omitted, the
            module-level ``dataset`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.

    Raises:
        NameError: an index has to be built, no ``dataset`` was passed and no
            module-level ``dataset`` exists.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory, so an interrupted build is redone instead of reused.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Backward-compatible fallback to the implicit global the old code read.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError(
                "CustomIndexCreator needs a dataset: pass dataset=... or "
                "define a module-level 'dataset'"
            ) from None

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=None,  # text is already stemmed by the mapper
        stopwords=None,  # stop words are already removed by the mapper
        tokeniser=tokeniser,
    )
    # The custom preprocessing is much slower than Terrier's built-in one,
    # so documents are mapped in parallel while the indexer consumes them.
    with multiprocessing.Pool() as pool:
        return indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))




In [None]:
# Experiment settings per language, keyed by run number.
# "stemmer" is the name handed to the Terrier indexer for the built-in
# stemming run; "lang" selects the NLTK stemmer and stop-word list.
config = {
    1: dict(
        dataset="irds:wikir/en59k/test",
        lang="english",
        tokeniser="",
        stemmer="EnglishSnowballStemmer",
        stopwords=None,
    ),
    2: dict(
        dataset="irds:wikir/es13k/test",
        lang="spanish",
        tokeniser="",
        stemmer="SpanishSnowballStemmer",
        stopwords=None,
    ),
    3: dict(
        dataset="irds:wikir/fr14k/test",
        lang="french",
        tokeniser="",
        stemmer="FrenchSnowballStemmer",
        stopwords=None,
    ),
    4: dict(
        dataset="irds:wikir/it16k/test",
        lang="italian",
        tokeniser="",
        stemmer="ItalianSnowballStemmer",
        stopwords=None,
    ),
}

In [None]:
# Evaluation tables keyed by language name; the experiment loop fills this in and pickles it.
result={}

In [None]:
from tqdm import tqdm


In [None]:
result

{}

In [None]:

# Index, retrieve and evaluate once per configured language. Iterating over
# `config` itself (instead of a hard-coded range) keeps the loop in step with it.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]

    # These stay module-level names on purpose: custom_preprocess reads
    # custom_stemmer / custom_stopwords as globals, and CustomIndexCreator
    # reads the module-level `dataset`.
    dataset = pt.get_dataset(config_data["dataset"])
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text, language=lang):
        """Tokenise ``text``, drop stop words and stem, using this language's tools."""
        # word_tokenize defaults to the English sentence model; use the corpus language.
        # NOTE(review): custom indexes already cached on disk were built with the
        # English model -- delete ./wikir-*-custom_stem to rebuild them consistently.
        toks = word_tokenize(text, language=language)
        toks = [t for t in toks if t.lower() not in custom_stopwords]  # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks]  # stem
        return ' '.join(toks)  # combine toks back into a string

    def map_doc(document,custom_preprocess=custom_preprocess):
        """Replace the document text with its custom pre-processed version."""
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    # Three indexes per language: no stemming, Terrier's built-in Snowball
    # stemmer, and the NLTK-based custom pre-processing.
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Queries against the custom index must go through the same pre-processing
    # as its documents, hence the pt.apply.query stage in front of the retriever.
    query_preprocessor = pt.apply.query(lambda row: custom_preprocess(row.query))

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    bm25_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Shared by both experiments so the two tables cannot drift apart.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]
    system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                    'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                    'BB2 nostem', 'BB2 stem', 'BB2 custom']
    eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100",
                    "recip_rank", "map", NumQ]
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "title" evaluation: judgements with label < 2 are zeroed on a copy, so
    # only label >= 2 counts as relevant; `qrels` itself is left untouched.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # "doc" evaluation: the unmodified judgements.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewritten after every language so finished results survive a later crash.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]


# Imports

In [None]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pyterrier as pt
if not pt.started():
    pt.init()
from pyterrier.measures import *

nltk.download('stopwords')
nltk.download('punkt')
import pickle
# es_stemer = SnowballStemmer('spanish')
# es_stopwords = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to /home/edwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/edwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# WikIR test sets (English, Spanish, French, Italian)


In [None]:
# custom_stemmer

In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index for ``dataset`` at ``filename``, or reuse a finished one.

    Args:
        dataset: object exposing ``get_corpus_iter()``; its documents are indexed.
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``; the default replaces the
            English-specific tokeniser.
        stemmer: forwarded to ``pt.IterDictIndexer``; ``None`` turns off the
            default English Porter stemmer.
        stopwords: forwarded to ``pt.IterDictIndexer``.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory: a directory left behind by an interrupted build would
    # otherwise be mistaken for a usable index and never be rebuilt.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Return ``text`` tokenised, stop-word filtered and stemmed, re-joined with spaces.

    Reads the module-level ``custom_stopwords`` and ``custom_stemmer``, which
    must be set before this is called.
    """
    stemmed = []
    for token in word_tokenize(text):
        # Stop words are matched case-insensitively.
        if token.lower() in custom_stopwords:
            continue
        stemmed.append(custom_stemmer.stem(token))
    return ' '.join(stemmed)



# Custom Preprocessing
# NB: This custom pre-processing ends up being considerably slower than using Terrier's built-in processor,
# so we use the multiprocessing package to parallelize (400 docs/s vs 2000 docs/s).
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new ``{'docno', 'text'}`` record whose text went through ``custom_preprocess``.

    The preprocessing function is bound as a default argument, so the version
    in scope when this ``def`` executes is the one that gets used.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return dict(docno=docno, text=processed_text)

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build (or reuse) a Terrier index over documents rewritten by ``mapper``.

    Args:
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``.
        stemmer: accepted for signature parity with ``IndexCreator`` but NOT
            forwarded; Terrier-side stemming is always disabled here because
            the default ``mapper`` already stems the text.
        stopwords: accepted but NOT forwarded, for the same reason.
        mapper: callable applied to every corpus document in worker processes
            (it has to be picklable for ``multiprocessing``).
        dataset: object exposing ``get_corpus_iter()``. When omitted, the
            module-level ``dataset`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.

    Raises:
        NameError: an index has to be built, no ``dataset`` was passed and no
            module-level ``dataset`` exists.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory, so an interrupted build is redone instead of reused.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Backward-compatible fallback to the implicit global the old code read.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError(
                "CustomIndexCreator needs a dataset: pass dataset=... or "
                "define a module-level 'dataset'"
            ) from None

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=None,  # text is already stemmed by the mapper
        stopwords=None,  # stop words are already removed by the mapper
        tokeniser=tokeniser,
    )
    # The custom preprocessing is much slower than Terrier's built-in one,
    # so documents are mapped in parallel while the indexer consumes them.
    with multiprocessing.Pool() as pool:
        return indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))




In [None]:
# Experiment settings per language, keyed by run number.
# "stemmer" is the name handed to the Terrier indexer for the built-in
# stemming run; "lang" selects the NLTK stemmer and stop-word list.
config = {
    1: dict(
        dataset="irds:wikir/en59k/test",
        lang="english",
        tokeniser="",
        stemmer="EnglishSnowballStemmer",
        stopwords=None,
    ),
    2: dict(
        dataset="irds:wikir/es13k/test",
        lang="spanish",
        tokeniser="",
        stemmer="SpanishSnowballStemmer",
        stopwords=None,
    ),
    3: dict(
        dataset="irds:wikir/fr14k/test",
        lang="french",
        tokeniser="",
        stemmer="FrenchSnowballStemmer",
        stopwords=None,
    ),
    4: dict(
        dataset="irds:wikir/it16k/test",
        lang="italian",
        tokeniser="",
        stemmer="ItalianSnowballStemmer",
        stopwords=None,
    ),
}

In [None]:
# Evaluation tables keyed by language name; the experiment loop fills this in and pickles it.
result={}

In [None]:
from tqdm import tqdm


In [None]:
result

{}

In [None]:

# Index, retrieve and evaluate once per configured language. Iterating over
# `config` itself (instead of a hard-coded range) keeps the loop in step with it.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]

    # These stay module-level names on purpose: custom_preprocess reads
    # custom_stemmer / custom_stopwords as globals, and CustomIndexCreator
    # reads the module-level `dataset`.
    dataset = pt.get_dataset(config_data["dataset"])
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text, language=lang):
        """Tokenise ``text``, drop stop words and stem, using this language's tools."""
        # word_tokenize defaults to the English sentence model; use the corpus language.
        # NOTE(review): custom indexes already cached on disk were built with the
        # English model -- delete ./wikir-*-custom_stem to rebuild them consistently.
        toks = word_tokenize(text, language=language)
        toks = [t for t in toks if t.lower() not in custom_stopwords]  # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks]  # stem
        return ' '.join(toks)  # combine toks back into a string

    def map_doc(document,custom_preprocess=custom_preprocess):
        """Replace the document text with its custom pre-processed version."""
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    # Three indexes per language: no stemming, Terrier's built-in Snowball
    # stemmer, and the NLTK-based custom pre-processing.
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Queries against the custom index must go through the same pre-processing
    # as its documents, hence the pt.apply.query stage in front of the retriever.
    query_preprocessor = pt.apply.query(lambda row: custom_preprocess(row.query))

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    bm25_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Shared by both experiments so the two tables cannot drift apart.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]
    system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                    'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                    'BB2 nostem', 'BB2 stem', 'BB2 custom']
    eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100",
                    "recip_rank", "map", NumQ]
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "title" evaluation: judgements with label < 2 are zeroed on a copy, so
    # only label >= 2 counts as relevant; `qrels` itself is left untouched.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # "doc" evaluation: the unmodified judgements.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewritten after every language so finished results survive a later crash.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]


# Imports

In [None]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pyterrier as pt
if not pt.started():
    pt.init()
from pyterrier.measures import *

nltk.download('stopwords')
nltk.download('punkt')
import pickle
# es_stemer = SnowballStemmer('spanish')
# es_stopwords = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to /home/edwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/edwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# WikIR test sets (English, Spanish, French, Italian)


In [None]:
# custom_stemmer

In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index for ``dataset`` at ``filename``, or reuse a finished one.

    Args:
        dataset: object exposing ``get_corpus_iter()``; its documents are indexed.
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``; the default replaces the
            English-specific tokeniser.
        stemmer: forwarded to ``pt.IterDictIndexer``; ``None`` turns off the
            default English Porter stemmer.
        stopwords: forwarded to ``pt.IterDictIndexer``.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory: a directory left behind by an interrupted build would
    # otherwise be mistaken for a usable index and never be rebuilt.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Return ``text`` tokenised, stop-word filtered and stemmed, re-joined with spaces.

    Reads the module-level ``custom_stopwords`` and ``custom_stemmer``, which
    must be set before this is called.
    """
    stemmed = []
    for token in word_tokenize(text):
        # Stop words are matched case-insensitively.
        if token.lower() in custom_stopwords:
            continue
        stemmed.append(custom_stemmer.stem(token))
    return ' '.join(stemmed)



# Custom Preprocessing
# NB: This custom pre-processing ends up being considerably slower than using Terrier's built-in processor,
# so we use the multiprocessing package to parallelize (400 docs/s vs 2000 docs/s).
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new ``{'docno', 'text'}`` record whose text went through ``custom_preprocess``.

    The preprocessing function is bound as a default argument, so the version
    in scope when this ``def`` executes is the one that gets used.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return dict(docno=docno, text=processed_text)

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build (or reuse) a Terrier index over documents rewritten by ``mapper``.

    Args:
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``.
        stemmer: accepted for signature parity with ``IndexCreator`` but NOT
            forwarded; Terrier-side stemming is always disabled here because
            the default ``mapper`` already stems the text.
        stopwords: accepted but NOT forwarded, for the same reason.
        mapper: callable applied to every corpus document in worker processes
            (it has to be picklable for ``multiprocessing``).
        dataset: object exposing ``get_corpus_iter()``. When omitted, the
            module-level ``dataset`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.

    Raises:
        NameError: an index has to be built, no ``dataset`` was passed and no
            module-level ``dataset`` exists.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory, so an interrupted build is redone instead of reused.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Backward-compatible fallback to the implicit global the old code read.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError(
                "CustomIndexCreator needs a dataset: pass dataset=... or "
                "define a module-level 'dataset'"
            ) from None

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=None,  # text is already stemmed by the mapper
        stopwords=None,  # stop words are already removed by the mapper
        tokeniser=tokeniser,
    )
    # The custom preprocessing is much slower than Terrier's built-in one,
    # so documents are mapped in parallel while the indexer consumes them.
    with multiprocessing.Pool() as pool:
        return indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))




In [None]:
# Experiment settings per language, keyed by run number.
# "stemmer" is the name handed to the Terrier indexer for the built-in
# stemming run; "lang" selects the NLTK stemmer and stop-word list.
config = {
    1: dict(
        dataset="irds:wikir/en59k/test",
        lang="english",
        tokeniser="",
        stemmer="EnglishSnowballStemmer",
        stopwords=None,
    ),
    2: dict(
        dataset="irds:wikir/es13k/test",
        lang="spanish",
        tokeniser="",
        stemmer="SpanishSnowballStemmer",
        stopwords=None,
    ),
    3: dict(
        dataset="irds:wikir/fr14k/test",
        lang="french",
        tokeniser="",
        stemmer="FrenchSnowballStemmer",
        stopwords=None,
    ),
    4: dict(
        dataset="irds:wikir/it16k/test",
        lang="italian",
        tokeniser="",
        stemmer="ItalianSnowballStemmer",
        stopwords=None,
    ),
}

In [None]:
# Evaluation tables keyed by language name; the experiment loop fills this in and pickles it.
result={}

In [None]:
from tqdm import tqdm


In [None]:
result

{}

In [None]:

# Index, retrieve and evaluate once per configured language. Iterating over
# `config` itself (instead of a hard-coded range) keeps the loop in step with it.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]

    # These stay module-level names on purpose: custom_preprocess reads
    # custom_stemmer / custom_stopwords as globals, and CustomIndexCreator
    # reads the module-level `dataset`.
    dataset = pt.get_dataset(config_data["dataset"])
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text, language=lang):
        """Tokenise ``text``, drop stop words and stem, using this language's tools."""
        # word_tokenize defaults to the English sentence model; use the corpus language.
        # NOTE(review): custom indexes already cached on disk were built with the
        # English model -- delete ./wikir-*-custom_stem to rebuild them consistently.
        toks = word_tokenize(text, language=language)
        toks = [t for t in toks if t.lower() not in custom_stopwords]  # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks]  # stem
        return ' '.join(toks)  # combine toks back into a string

    def map_doc(document,custom_preprocess=custom_preprocess):
        """Replace the document text with its custom pre-processed version."""
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    # Three indexes per language: no stemming, Terrier's built-in Snowball
    # stemmer, and the NLTK-based custom pre-processing.
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Queries against the custom index must go through the same pre-processing
    # as its documents, hence the pt.apply.query stage in front of the retriever.
    query_preprocessor = pt.apply.query(lambda row: custom_preprocess(row.query))

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    bm25_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Shared by both experiments so the two tables cannot drift apart.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]
    system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                    'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                    'BB2 nostem', 'BB2 stem', 'BB2 custom']
    eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100",
                    "recip_rank", "map", NumQ]
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "title" evaluation: judgements with label < 2 are zeroed on a copy, so
    # only label >= 2 counts as relevant; `qrels` itself is left untouched.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # "doc" evaluation: the unmodified judgements.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewritten after every language so finished results survive a later crash.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]


# Imports

In [None]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pyterrier as pt
if not pt.started():
    pt.init()
from pyterrier.measures import *

nltk.download('stopwords')
nltk.download('punkt')
import pickle
# es_stemer = SnowballStemmer('spanish')
# es_stopwords = set(stopwords.words('spanish'))

[nltk_data] Downloading package stopwords to /home/edwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/edwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index for ``dataset`` at ``filename``, or reuse a finished one.

    Args:
        dataset: object exposing ``get_corpus_iter()``; its documents are indexed.
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``; the default replaces the
            English-specific tokeniser.
        stemmer: forwarded to ``pt.IterDictIndexer``; ``None`` turns off the
            default English Porter stemmer.
        stopwords: forwarded to ``pt.IterDictIndexer``.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory: a directory left behind by an interrupted build would
    # otherwise be mistaken for a usable index and never be rebuilt.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Return ``text`` tokenised, stop-word filtered and stemmed, re-joined with spaces.

    Reads the module-level ``custom_stopwords`` and ``custom_stemmer``, which
    must be set before this is called.
    """
    stemmed = []
    for token in word_tokenize(text):
        # Stop words are matched case-insensitively.
        if token.lower() in custom_stopwords:
            continue
        stemmed.append(custom_stemmer.stem(token))
    return ' '.join(stemmed)



# Custom Preprocessing
# NB: This custom pre-processing ends up being considerably slower than using Terrier's built-in processor,
# so we use the multiprocessing package to parallelize (400 docs/s vs 2000 docs/s).
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new ``{'docno', 'text'}`` record whose text went through ``custom_preprocess``.

    The preprocessing function is bound as a default argument, so the version
    in scope when this ``def`` executes is the one that gets used.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return dict(docno=docno, text=processed_text)

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build (or reuse) a Terrier index over documents rewritten by ``mapper``.

    Args:
        filename: directory that holds (or will hold) the index.
        tokeniser: forwarded to ``pt.IterDictIndexer``.
        stemmer: accepted for signature parity with ``IndexCreator`` but NOT
            forwarded; Terrier-side stemming is always disabled here because
            the default ``mapper`` already stems the text.
        stopwords: accepted but NOT forwarded, for the same reason.
        mapper: callable applied to every corpus document in worker processes
            (it has to be picklable for ``multiprocessing``).
        dataset: object exposing ``get_corpus_iter()``. When omitted, the
            module-level ``dataset`` variable is used, which is what this
            function always did before the parameter existed.

    Returns:
        Whatever ``indexer.index(...)`` returns for a freshly built index, or
        ``pt.IndexRef.of(filename)`` when a finished index is already on disk.

    Raises:
        NameError: an index has to be built, no ``dataset`` was passed and no
            module-level ``dataset`` exists.
    """
    # PyTerrier's documented idiom is to look for data.properties rather than
    # the bare directory, so an interrupted build is redone instead of reused.
    if os.path.exists(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Backward-compatible fallback to the implicit global the old code read.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError(
                "CustomIndexCreator needs a dataset: pass dataset=... or "
                "define a module-level 'dataset'"
            ) from None

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=None,  # text is already stemmed by the mapper
        stopwords=None,  # stop words are already removed by the mapper
        tokeniser=tokeniser,
    )
    # The custom preprocessing is much slower than Terrier's built-in one,
    # so documents are mapped in parallel while the indexer consumes them.
    with multiprocessing.Pool() as pool:
        return indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))




In [None]:
# Experiment settings per language, keyed by run number.
# "stemmer" is the name handed to the Terrier indexer for the built-in
# stemming run; "lang" selects the NLTK stemmer and stop-word list.
config = {
    1: dict(
        dataset="irds:wikir/en59k/test",
        lang="english",
        tokeniser="",
        stemmer="EnglishSnowballStemmer",
        stopwords=None,
    ),
    2: dict(
        dataset="irds:wikir/es13k/test",
        lang="spanish",
        tokeniser="",
        stemmer="SpanishSnowballStemmer",
        stopwords=None,
    ),
    3: dict(
        dataset="irds:wikir/fr14k/test",
        lang="french",
        tokeniser="",
        stemmer="FrenchSnowballStemmer",
        stopwords=None,
    ),
    4: dict(
        dataset="irds:wikir/it16k/test",
        lang="italian",
        tokeniser="",
        stemmer="ItalianSnowballStemmer",
        stopwords=None,
    ),
}

In [None]:
# Evaluation tables keyed by language name; the experiment loop fills this in and pickles it.
result={}

In [None]:
from tqdm import tqdm


In [None]:

# Index, retrieve and evaluate once per configured language. Iterating over
# `config` itself (instead of a hard-coded range) keeps the loop in step with it.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]

    # These stay module-level names on purpose: custom_preprocess reads
    # custom_stemmer / custom_stopwords as globals, and CustomIndexCreator
    # reads the module-level `dataset`.
    dataset = pt.get_dataset(config_data["dataset"])
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text, language=lang):
        """Tokenise ``text``, drop stop words and stem, using this language's tools."""
        # word_tokenize defaults to the English sentence model; use the corpus language.
        # NOTE(review): custom indexes already cached on disk were built with the
        # English model -- delete ./wikir-*-custom_stem to rebuild them consistently.
        toks = word_tokenize(text, language=language)
        toks = [t for t in toks if t.lower() not in custom_stopwords]  # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks]  # stem
        return ' '.join(toks)  # combine toks back into a string

    def map_doc(document,custom_preprocess=custom_preprocess):
        """Replace the document text with its custom pre-processed version."""
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    # Three indexes per language: no stemming, Terrier's built-in Snowball
    # stemmer, and the NLTK-based custom pre-processing.
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Queries against the custom index must go through the same pre-processing
    # as its documents, hence the pt.apply.query stage in front of the retriever.
    query_preprocessor = pt.apply.query(lambda row: custom_preprocess(row.query))

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    bm25_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = query_preprocessor >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Shared by both experiments so the two tables cannot drift apart.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]
    system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                    'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                    'BB2 nostem', 'BB2 stem', 'BB2 custom']
    eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100",
                    "recip_rank", "map", NumQ]
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "title" evaluation: judgements with label < 2 are zeroed on a copy, so
    # only label >= 2 counts as relevant; `qrels` itself is left untouched.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # "doc" evaluation: the unmodified judgements.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewritten after every language so finished results survive a later crash.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]


In [None]:
# Experiment settings per language, keyed by run number.
# "stemmer" is the name handed to the Terrier indexer for the built-in
# stemming run; "lang" selects the NLTK stemmer and stop-word list.
config = {
    1: dict(
        dataset="irds:wikir/en59k/test",
        lang="english",
        tokeniser="",
        stemmer="EnglishSnowballStemmer",
        stopwords=None,
    ),
    2: dict(
        dataset="irds:wikir/es13k/test",
        lang="spanish",
        tokeniser="",
        stemmer="SpanishSnowballStemmer",
        stopwords=None,
    ),
    3: dict(
        dataset="irds:wikir/fr14k/test",
        lang="french",
        tokeniser="",
        stemmer="FrenchSnowballStemmer",
        stopwords=None,
    ),
    4: dict(
        dataset="irds:wikir/it16k/test",
        lang="italian",
        tokeniser="",
        stemmer="ItalianSnowballStemmer",
        stopwords=None,
    ),
}

In [None]:
# Evaluation tables keyed by language name; the experiment loop fills this in and pickles it.
result={}

In [None]:
from tqdm import tqdm


In [None]:
result

{}

In [None]:

# Evaluate every configured language: build (or reuse) three indexes per
# corpus, run BM25 / TF-IDF / BB2 over each index, and checkpoint the scores.

# Metrics and system labels are shared by both experiments of every language;
# defining them once keeps the two pt.Experiment calls from drifting apart.
eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100", "recip_rank", "map", NumQ]
system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                'BB2 nostem', 'BB2 stem', 'BB2 custom']

# Iterate over the configuration itself rather than a hard-coded range, so
# adding or removing a language in `config` needs no change here.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]
    # Placeholder entry; replaced by the real scores at the end of the iteration.
    result[lang] = {}

    dataset = pt.get_dataset(config_data["dataset"])
    # custom_preprocess (defined just below) looks these two names up at call
    # time, so they must be rebound for each language before it is used.
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text):
        toks = word_tokenize(text) # tokenize
        toks = [t for t in toks if t.lower() not in custom_stopwords] # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks] # stem
        return ' '.join(toks) # combine toks back into a string

    def map_doc(document, custom_preprocess=custom_preprocess):
        # this function replaces the document text with the version that uses our custom pre-processing
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    def preprocess_query(row):
        # Queries must go through the same pre-processing as the documents of
        # the custom index, otherwise query terms would not match its stems.
        return custom_preprocess(row.query)

    # Three indexes per language: raw tokens, Terrier's built-in Snowball
    # stemmer, and text pre-processed in Python with NLTK (custom).
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    # to apply the custom_preprocess function to the query text, use a pt.apply.query transformer
    bm25_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Order must match `system_names`.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]

    # Load topics and judgments once and reuse them for both experiments.
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "Title" judgments: every label below 2 is zeroed on a copy, so only
    # labels >= 2 still count as relevant.
    # NOTE(review): presumably label 2 marks the query's own article in WikIR - confirm.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # Same systems against the unmodified judgments.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewrite the pickle after every language so finished results survive a
    # failure in a later iteration.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]


In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index of ``dataset`` at ``filename``, or reuse a finished one.

    Args:
        dataset: object exposing ``get_corpus_iter()`` (e.g. the result of
            ``pt.get_dataset``); only consulted when the index has to be built.
        filename: directory of the index.
        tokeniser: Terrier tokeniser name; the default replaces the
            EnglishTokeniser, which makes assumptions specific to English.
        stemmer: Terrier stemmer name; ``None`` removes the default
            (English) PorterStemmer.
        stopwords: stopword setting forwarded to the indexer.

    Returns:
        The reference returned by ``indexer.index(...)`` for a fresh build,
        or ``pt.IndexRef.of(filename)`` when an existing index is reused.
    """
    # A bare directory-exists check also matches an empty or half-written
    # directory left behind by an interrupted build, which would then be
    # "reused" and fail at retrieval time. Terrier writes data.properties
    # into the index directory, so its presence marks a usable index.
    if os.path.isfile(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Tokenise ``text``, drop stop words, stem the rest and re-join with spaces.

    Reads the module-level ``custom_stopwords`` and ``custom_stemmer`` at call
    time, so both must be bound before this function is used.
    """
    # Stop-word membership is tested on the lower-cased token, but the
    # original token is what gets stemmed.
    kept_tokens = (
        token for token in word_tokenize(text)
        if token.lower() not in custom_stopwords
    )
    return ' '.join(custom_stemmer.stem(token) for token in kept_tokens)



# Custom pre-processing of documents.
# NB: doing the pre-processing in Python is considerably slower than Terrier's
# built-in processor (400 docs/s vs 2000 docs/s), so the indexing step runs
# this function through the multiprocessing package to parallelize it.
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new ``{'docno', 'text'}`` record whose text has been pre-processed.

    ``custom_preprocess`` defaults to the function of that name as it existed
    when ``map_doc`` was defined; the input ``document`` is not modified.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return {'docno': docno, 'text': processed_text}

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build a Terrier index over Python-pre-processed documents, or reuse a finished one.

    Every document from the corpus is passed through ``mapper`` in a
    ``multiprocessing.Pool`` before it reaches the indexer.

    Args:
        filename: directory of the index.
        tokeniser: Terrier tokeniser name; the default replaces the
            EnglishTokeniser, which makes assumptions specific to English.
        stemmer: Terrier stemmer name. Keep ``None`` (the default) to disable
            the default (English) PorterStemmer, since ``mapper`` is expected
            to have stemmed the text already. Previously this argument was
            accepted but ignored; it is now forwarded to the indexer.
        stopwords: stopword setting, forwarded to the indexer like ``stemmer``.
        mapper: function applied to each document dict before indexing. It is
            handed to ``Pool.imap``, so it has to be usable from worker
            processes.
        dataset: object exposing ``get_corpus_iter()``. When omitted, the
            module-level ``dataset`` variable is used, which is how existing
            callers supply the corpus.

    Returns:
        The reference returned by ``indexer.index(...)`` for a fresh build,
        or ``pt.IndexRef.of(filename)`` when an existing index is reused.

    Raises:
        NameError: if the index has to be built, ``dataset`` was not passed
            and no module-level ``dataset`` exists.
    """
    # A bare directory-exists check also matches an empty or half-written
    # directory left behind by an interrupted build. Terrier writes
    # data.properties into the index directory, so its presence marks a
    # usable index.
    if os.path.isfile(os.path.join(filename, "data.properties")):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Backward compatibility: older call sites rely on a global `dataset`.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError(
                "name 'dataset' is not defined; pass dataset=... to CustomIndexCreator"
            ) from None

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    # The pool must stay open while the indexer consumes the lazy imap
    # iterator, hence the call to index() inside the `with` block.
    with multiprocessing.Pool() as pool:
        return indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))




In [None]:
# Per-language experiment settings, keyed by a 1-based run number.
# Each entry carries the ir_datasets id of the WikIR test split ("dataset"),
# the language name handed to NLTK ("lang"), and the Terrier stemmer name
# used for the built-in stemmed index ("stemmer").
# "tokeniser" and "stopwords" are kept as placeholders with their original
# values ("" and None).
config = {
    run_id: {
        "dataset": dataset_id,
        "lang": language,
        "tokeniser": "",
        "stemmer": terrier_stemmer,
        "stopwords": None,
    }
    for run_id, (dataset_id, language, terrier_stemmer) in enumerate(
        [
            ("irds:wikir/en59k/test", "english", "EnglishSnowballStemmer"),
            ("irds:wikir/es13k/test", "spanish", "SpanishSnowballStemmer"),
            ("irds:wikir/fr14k/test", "french", "FrenchSnowballStemmer"),
            ("irds:wikir/it16k/test", "italian", "ItalianSnowballStemmer"),
        ],
        start=1,
    )
}

In [None]:
# Accumulator for evaluation scores; the evaluation loop stores one entry per
# language name, holding that language's "title_result" and "doc_result" tables.
result = {}

In [None]:
from tqdm import tqdm


In [None]:
result

{}

In [None]:

# Evaluate every configured language: build (or reuse) three indexes per
# corpus, run BM25 / TF-IDF / BB2 over each index, and checkpoint the scores.

# Metrics and system labels are shared by both experiments of every language;
# defining them once keeps the two pt.Experiment calls from drifting apart.
eval_metrics = [nDCG@5, nDCG@10, nDCG@20, "P_5", "P_10", "P_15", "P_20", "P_100", "recip_rank", "map", NumQ]
system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom',
                'TFIDF nostem', 'TFIDF stem', 'TFIDF custom',
                'BB2 nostem', 'BB2 stem', 'BB2 custom']

# Iterate over the configuration itself rather than a hard-coded range, so
# adding or removing a language in `config` needs no change here.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]
    # Placeholder entry; replaced by the real scores at the end of the iteration.
    result[lang] = {}

    dataset = pt.get_dataset(config_data["dataset"])
    # custom_preprocess (defined just below) looks these two names up at call
    # time, so they must be rebound for each language before it is used.
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    def custom_preprocess(text):
        toks = word_tokenize(text) # tokenize
        toks = [t for t in toks if t.lower() not in custom_stopwords] # remove stop words
        toks = [custom_stemmer.stem(t) for t in toks] # stem
        return ' '.join(toks) # combine toks back into a string

    def map_doc(document, custom_preprocess=custom_preprocess):
        # this function replaces the document text with the version that uses our custom pre-processing
        return {
            'docno': document['docno'],
            'text': custom_preprocess(document['text'])
        }

    def preprocess_query(row):
        # Queries must go through the same pre-processing as the documents of
        # the custom index, otherwise query terms would not match its stems.
        return custom_preprocess(row.query)

    # Three indexes per language: raw tokens, Terrier's built-in Snowball
    # stemmer, and text pre-processed in Python with NLTK (custom).
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    # to apply the custom_preprocess function to the query text, use a pt.apply.query transformer
    bm25_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Order must match `system_names`.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]

    # Load topics and judgments once and reuse them for both experiments.
    topics = dataset.get_topics()
    qrels = dataset.get_qrels()

    # "Title" judgments: every label below 2 is zeroed on a copy, so only
    # labels >= 2 still count as relevant.
    # NOTE(review): presumably label 2 marks the query's own article in WikIR - confirm.
    title_qrels = qrels.copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(systems, topics, title_qrels, eval_metrics,
                                 names=system_names, round=4)

    # Same systems against the unmodified judgments.
    doc_result = pt.Experiment(systems, topics, qrels, eval_metrics,
                               names=system_names, round=4)

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewrite the pickle after every language so finished results survive a
    # failure in a later iteration.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

  0%|                                       | 0/4 [00:00<?, ?it/s]

1


 25%|██████▊                    | 1/4 [20:29<1:01:28, 1229.55s/it]

2


 50%|███████████████               | 2/4 [34:13<33:01, 990.69s/it]

3


 75%|██████████████████████▌       | 3/4 [47:49<15:11, 911.36s/it]

4


100%|████████████████████████████| 4/4 [1:01:54<00:00, 928.68s/it]
