# Imports

In [None]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pyterrier as pt
if not pt.started():
    pt.init()
from pyterrier.measures import *

nltk.download('stopwords')
nltk.download('punkt')
import pickle
# es_stemer = SnowballStemmer('spanish')
# es_stopwords = set(stopwords.words('spanish'))

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
[nltk_data] Downloading package stopwords to /home/edwin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/edwin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# custom_stemmer

In [None]:

def IndexCreator(dataset,filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None):
    """Build a Terrier index for `dataset` at `filename`, or reuse an existing one.

    Parameters
    ----------
    dataset : object exposing ``get_corpus_iter()``; its documents are indexed.
    filename : path of the index location. If the path already exists, nothing
        is indexed and a reference to it is returned instead.
    tokeniser : tokeniser name handed to ``pt.IterDictIndexer``; the default
        replaces the English-specific tokeniser.
    stemmer : stemmer handed to ``pt.IterDictIndexer``; ``None`` disables the
        default (English) Porter stemmer.
    stopwords : stopword setting handed to ``pt.IterDictIndexer``.

    Returns
    -------
    Whatever ``indexer.index(...)`` returns for a fresh build, or
    ``pt.IndexRef.of(filename)`` when the path already exists.
    """
    # NOTE: a stray `set_trace()` debugging call used to sit here. The name is
    # never imported in this notebook, so every call raised NameError.

    # Only path existence is checked; the index contents are not validated.
    if os.path.exists(filename):
        return pt.IndexRef.of(filename)

    indexer = pt.IterDictIndexer(
        filename,
        stemmer=stemmer,
        stopwords=stopwords,
        tokeniser=tokeniser,
    )
    return indexer.index(dataset.get_corpus_iter())


def custom_preprocess(text):
    """Tokenise `text`, drop stopwords, stem what is left, and re-join with spaces.

    Reads the notebook globals `custom_stopwords` (membership is tested on the
    lower-cased token) and `custom_stemmer` (its `.stem` is applied to each
    surviving token) at call time, so both must be set before calling.
    """
    stemmed_tokens = (
        custom_stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in custom_stopwords
    )
    # Return a plain string so the result can be indexed or used as a query.
    return ' '.join(stemmed_tokens)



# Custom Preprocessing
# NB: This custom pre-processing ends up being considerably slower than using Terrier's built-in processor,
# so we use the multiprocessing package to parallelize (400 docs/s vs 2000 docs/s).
def map_doc(document,custom_preprocess=custom_preprocess):
    """Return a new {'docno', 'text'} record whose text has been pre-processed.

    `document` must provide 'docno' and 'text'; any other keys are dropped.
    `custom_preprocess` is bound as a default argument when this function is
    defined, and is applied to the document text.
    """
    docno = document['docno']
    processed_text = custom_preprocess(document['text'])
    return dict(docno=docno, text=processed_text)

def CustomIndexCreator(filename, tokeniser="UTFTokeniser",stemmer=None,stopwords=None,mapper=map_doc,dataset=None):
    """Build (or reuse) an index whose documents are pre-processed in Python.

    Parameters
    ----------
    filename : path of the index location. If the path already exists, nothing
        is indexed and ``pt.IndexRef.of(filename)`` is returned.
    tokeniser : tokeniser name handed to ``pt.IterDictIndexer``.
    stemmer, stopwords : accepted for signature parity with ``IndexCreator`` but
        NOT used. The indexer is always created with ``stemmer=None`` and
        ``stopwords=None``, presumably because `mapper` already did that work.
    mapper : function applied to every corpus document (in worker processes)
        before it reaches the indexer.
    dataset : object exposing ``get_corpus_iter()``. When omitted, the
        notebook-global `dataset` is used, which keeps older calls that relied
        on that global working.

    Returns
    -------
    Whatever ``indexer.index(...)`` returns for a fresh build, or an index
    reference when the path already exists.

    Raises
    ------
    NameError
        If a build is needed, `dataset` was not passed, and no global
        `dataset` exists.
    """
    if os.path.exists(filename):
        return pt.IndexRef.of(filename)

    if dataset is None:
        # Resolve the fallback only when a build is actually required, so
        # reusing an existing index never needs a loaded dataset.
        try:
            dataset = globals()["dataset"]
        except KeyError:
            raise NameError("name 'dataset' is not defined") from None

    indexer = pt.IterDictIndexer(filename,
        stemmer=None, stopwords=None,  # Disable the default PorterStemmer (English)
        tokeniser=tokeniser) # Replaces the default EnglishTokeniser, which makes assumptions specific to English

    # Indexing must finish inside the `with` block: the lazy imap iterator is
    # consumed by the indexer while the worker pool is still alive.
    with multiprocessing.Pool() as pool:
        index_custom = indexer.index(pool.imap(mapper, dataset.get_corpus_iter()))
    return index_custom




In [None]:
# Experiment configuration, keyed 1..4 in the order listed below.
# Each row is (WikIR collection id, language name given to the NLTK stemmer and
# stopword list, stemmer name given to pt.IterDictIndexer).
# The "tokeniser" and "stopwords" entries are not read by the experiment loop
# visible in this notebook.
config = {
    run_id: {
        "dataset": f"irds:wikir/{collection}/test",
        "lang": lang,
        "tokeniser": "",
        "stemmer": stemmer,
        "stopwords": None,
    }
    for run_id, (collection, lang, stemmer) in enumerate(
        [
            ("en59k", "english", "EnglishSnowballStemmer"),
            ("es13k", "spanish", "SpanishSnowballStemmer"),
            ("fr14k", "french", "FrenchSnowballStemmer"),
            ("it16k", "italian", "ItalianSnowballStemmer"),
        ],
        start=1,
    )
}

In [None]:
# Collects the evaluation tables per language, keyed by each config entry's "lang" value.
result={}

In [None]:
from tqdm import tqdm


In [None]:

# Metrics and system labels shared by both experiments of every language.
# NumQ is reported because the number of queries scored can differ per system.
eval_metrics = [nDCG@5, nDCG@10, nDCG@20,"P_5","P_10","P_15","P_20","P_100","recip_rank","map", NumQ]
system_names = ['BM25 nostem', 'BM25 stem', 'BM25 custom','TFIDF nostem', 'TFIDF stem', 'TFIDF custom','BB2 nostem', 'BB2 stem', 'BB2 custom']

# Iterate over `config` itself instead of a hard-coded range(1, 5), so adding or
# removing a language only requires editing `config`.
for i, config_data in tqdm(config.items()):
    print(i)
    lang = config_data["lang"]

    dataset = pt.get_dataset(config_data["dataset"])

    # The notebook-level `custom_preprocess` (and `map_doc`, which wraps it)
    # look these two globals up at call time, so rebinding them per language is
    # enough; the functions themselves do not need to be redefined here.
    custom_stemmer = SnowballStemmer(lang)
    custom_stopwords = set(stopwords.words(lang))

    # Three indexes per collection: no stemming, Terrier's Snowball stemmer,
    # and NLTK-based pre-processing done in Python before indexing.
    index_nostem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-nostem',
                                tokeniser="UTFTokeniser", stemmer=None, stopwords=None)
    index_stem = IndexCreator(dataset=dataset, filename=f'./wikir-{lang}-stem',
                              tokeniser="UTFTokeniser", stemmer=config_data["stemmer"], stopwords=None)
    index_custom = CustomIndexCreator(filename=f'./wikir-{lang}-custom_stem', tokeniser="UTFTokeniser",
                                      stemmer=None, stopwords=None, mapper=map_doc)

    def preprocess_query(row):
        # Queries against the custom index must get the same pre-processing
        # that was applied to its documents.
        return custom_preprocess(row.query)

    # Weighting models: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/package-summary.html
    bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')
    bm25_stem = pt.BatchRetrieve(index_stem, wmodel='BM25')
    bm25_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BM25')

    tfidf_nostem = pt.BatchRetrieve(index_nostem, wmodel='TF_IDF')
    tfidf_stem = pt.BatchRetrieve(index_stem, wmodel='TF_IDF')
    tfidf_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='TF_IDF')

    bb2_nostem = pt.BatchRetrieve(index_nostem, wmodel='BB2')
    bb2_stem = pt.BatchRetrieve(index_stem, wmodel='BB2')
    bb2_custom = pt.apply.query(preprocess_query) >> pt.BatchRetrieve(index_custom, wmodel='BB2')

    # Order must match `system_names`.
    systems = [bm25_nostem, bm25_stem, bm25_custom,
               tfidf_nostem, tfidf_stem, tfidf_custom,
               bb2_nostem, bb2_stem, bb2_custom]
    topics = dataset.get_topics()

    # "Title" evaluation: every judgement with label < 2 is treated as non-relevant.
    title_qrels = dataset.get_qrels().copy()
    title_qrels.loc[title_qrels.label < 2, 'label'] = 0
    title_result = pt.Experiment(
        systems,
        topics,
        title_qrels,
        eval_metrics,
        names=system_names,
        round=4
    )

    # "Doc" evaluation: the qrels exactly as shipped with the dataset.
    doc_result = pt.Experiment(
        systems,
        topics,
        dataset.get_qrels(),
        eval_metrics,
        names=system_names,
        round=4
    )

    result[lang] = {"title_result": title_result.to_dict(), "doc_result": doc_result.to_dict()}

    # Rewrite the checkpoint after every language so finished languages
    # survive a failure in a later one.
    with open('./data_result.pkl', 'wb') as f:
        pickle.dump(result, f)

In [None]:
import pandas as pd
# Reload the results checkpoint from disk so the tables can be rendered without
# re-running the experiments.
# NOTE(review): pickle.load can execute arbitrary code; only load a file that
# this notebook itself produced.
with open('./data_result.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [None]:
# English: results scored against the qrels variant in which labels below 2 were set to 0.
pd.DataFrame.from_dict(loaded_dict["english"]["title_result"])

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,P_5,P_10,P_15,P_20,P_100,recip_rank,map,NumQ
0,BM25 nostem,0.3457,0.3789,0.4024,0.094,0.0572,0.0418,0.0333,0.0081,0.3291,0.3291,1000.0
1,BM25 stem,0.3456,0.3844,0.4077,0.0926,0.0582,0.0418,0.0338,0.0082,0.3333,0.3333,1000.0
2,BM25 custom,0.3708,0.4026,0.4279,0.1002,0.0599,0.0439,0.035,0.0084,0.352,0.352,1000.0
3,TFIDF nostem,0.3655,0.3998,0.4217,0.098,0.0595,0.0429,0.0341,0.0083,0.3488,0.3488,1000.0
4,TFIDF stem,0.3667,0.4052,0.4303,0.0976,0.0606,0.0439,0.0353,0.0084,0.3531,0.3531,1000.0
5,TFIDF custom,0.3715,0.4035,0.4288,0.1004,0.0601,0.0439,0.0351,0.0084,0.3525,0.3525,1000.0
6,BB2 nostem,0.3382,0.3696,0.3932,0.092,0.0556,0.0407,0.0325,0.008,0.3218,0.3218,1000.0
7,BB2 stem,0.3394,0.3761,0.3996,0.0912,0.0569,0.0415,0.0331,0.0081,0.3266,0.3266,1000.0
8,BB2 custom,0.3684,0.4016,0.4273,0.099,0.0598,0.0437,0.035,0.0084,0.3512,0.3512,1000.0


In [None]:
# Spanish: results scored against the qrels variant in which labels below 2 were set to 0.
pd.DataFrame.from_dict(loaded_dict["spanish"]["title_result"])

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,P_5,P_10,P_15,P_20,P_100,recip_rank,map,NumQ
0,BM25 nostem,0.2785,0.3197,0.345,0.0758,0.0505,0.0372,0.0303,0.0077,0.274,0.274,1298.0
1,BM25 stem,0.2654,0.3047,0.3304,0.0723,0.0482,0.0361,0.0292,0.0076,0.2625,0.2625,1299.0
2,BM25 custom,0.3005,0.3367,0.361,0.0826,0.0525,0.0387,0.0311,0.0078,0.2895,0.2895,1296.0
3,TFIDF nostem,0.2998,0.3374,0.3621,0.0811,0.052,0.0382,0.0309,0.0079,0.2921,0.2921,1298.0
4,TFIDF stem,0.2925,0.3313,0.355,0.0791,0.0514,0.0378,0.0304,0.0078,0.2863,0.2863,1299.0
5,TFIDF custom,0.3023,0.3377,0.3619,0.0829,0.0524,0.0387,0.031,0.0078,0.2912,0.2912,1296.0
6,BB2 nostem,0.242,0.2742,0.2956,0.0658,0.0428,0.0313,0.0257,0.0066,0.2368,0.2368,1298.0
7,BB2 stem,0.2363,0.267,0.2885,0.0645,0.0416,0.031,0.0251,0.0065,0.2312,0.2312,1299.0
8,BB2 custom,0.3017,0.3361,0.3612,0.0828,0.052,0.0386,0.031,0.0079,0.2906,0.2906,1296.0


In [None]:
# French: results scored against the qrels variant in which labels below 2 were set to 0.
pd.DataFrame.from_dict(loaded_dict["french"]["title_result"])

Unnamed: 0,name,nDCG@5,nDCG@10,nDCG@20,P_5,P_10,P_15,P_20,P_100,recip_rank,map,NumQ
0,BM25 nostem,0.3587,0.3911,0.4094,0.0949,0.0574,0.0413,0.0323,0.008,0.3434,0.3434,1398.0
1,BM25 stem,0.3539,0.386,0.4083,0.094,0.0569,0.0413,0.0329,0.0081,0.339,0.339,1398.0
2,BM25 custom,0.3807,0.4134,0.4369,0.1006,0.0603,0.0439,0.0348,0.0083,0.3643,0.3643,1397.0
3,TFIDF nostem,0.3746,0.407,0.4251,0.0987,0.0594,0.0425,0.0332,0.0081,0.3579,0.3579,1398.0
4,TFIDF stem,0.3754,0.4071,0.4285,0.0994,0.0595,0.0432,0.034,0.0082,0.358,0.358,1398.0
5,TFIDF custom,0.3808,0.4135,0.437,0.1006,0.0603,0.0439,0.0348,0.0083,0.3644,0.3644,1397.0
6,BB2 nostem,0.3485,0.3794,0.3972,0.0916,0.0553,0.0399,0.0311,0.0077,0.3345,0.3345,1398.0
7,BB2 stem,0.3406,0.3687,0.389,0.0909,0.0541,0.0393,0.0311,0.0076,0.3238,0.3238,1398.0
8,BB2 custom,0.3803,0.4127,0.4361,0.1004,0.0601,0.0439,0.0347,0.0083,0.3638,0.3638,1397.0


In [None]:
# pd.DataFrame.from_dict(loaded_dict["italian"]["title_result"])

In [None]:
# loaded_dict.keys()

dict_keys(['english', 'spanish', 'french'])