In [None]:
from matplotlib import pyplot as plt
%matplotlib notebook

import sys 
sys.path.append('..')

from ParserQuery import *
from Index import Index
from Weighter import WeighterVector, WeighterSchema3
from IRModel import Vectoriel, LanguageModel, BM25Model
from Evaluation import *
from EvalIRModel import *

from diversity.Clustering import ClusteringDiversifier
from diversity.GreedyRanking import GreedyRankingMMR, calculate_vector_presentation
from diversity.RandomRanking import RandomDiversifier
from sklearn.metrics.pairwise import cosine_similarity

import time
import pandas as pd

%load_ext autoreload
%autoreload 2

In [None]:
# Active dataset layout: easyCLEF08 sits directly under the parent directory.
data_path = '../easyCLEF08/'
index_path = '../indexes/easyCLEF08/'
index_name = 'easyCLEF08'

# Alternative layout (disabled): dataset stored under ../data/ instead.
# data_path = '../data/easyCLEF08/'
# index_path = '../indexes/easyCLEF08/'
# index_name = 'easyCLEF08'

# Query file and judgement ("gt") file, both located inside the dataset directory.
filename_queries = ''.join((data_path, '/easyCLEF08_query.txt'))
filename_jugements = ''.join((data_path, '/easyCLEF08_gt.txt'))

# Evaluation object used by the evaluation cell further down.
eval_ir = EvalIRModel(filename_queries, filename_jugements)

In [None]:
# --- Experiment settings ---
train_prop = 1  # no test split
seed = 42
results = dict()  # evaluation output per method, filled in by the evaluation cell

# Baseline (BM25) parameters, passed to getRanking
k1, b = 2, 0.95

# Clustering diversifier: no parameter defined in this cell

# Greedy diversifier parameter
alpha = 0.75

In [None]:
# Index over the collection text file. The build step below is left disabled
# (presumably the index already exists under index_path -- TODO confirm).
index = Index(
    index_name,
    ''.join((data_path, '/easyCLEF08_text.txt')),
    index_path=index_path,
)
# index.indexation()

# Two weighting schemes built on the same index; norm computation is left disabled.
weighter2 = WeighterVector(index)
# weighter2.calculeNorms()
weighter3 = WeighterSchema3(index)
# weighter3.calculeNorms()

# BM25 over the vector weighter is the reference ranking model.
baseline = BM25Model(weighter2)

# Diversification strategies compared against the baseline.
cluster = ClusteringDiversifier(index)
greedyMMR = GreedyRankingMMR(index)
randomRank = RandomDiversifier(seed)

In [None]:
# Number of documents the greedy and random diversifiers are asked to order.
documents_to_order = 20


def _evaluate_with(ranking_call):
    """Evaluate the baseline model, using `ranking_call(model, query)` to produce each ranking."""
    return eval_ir.evalModel(baseline, ranking_call=ranking_call,
                             train_prop=train_prop, seed=seed, mode='train')


def _bm25_ranking(m, q):
    """Rank query `q` with model `m`, reading k1 and b from the notebook namespace at call time."""
    return m.getRanking(q, k1=k1, b=b)


# Plain baseline ranking, no diversification.
results['baseline'] = _evaluate_with(_bm25_ranking)


def wrap_cluster(m, q, cluster=cluster):
    """Diversify the baseline ranking with the clustering diversifier (bound at definition time)."""
    diversified = cluster.diversify(_bm25_ranking(m, q), by_top_n=100, n_clusters=20)
    # Only the first element of the result is used, as an array of values.
    return diversified[0].values


results['cluster'] = _evaluate_with(wrap_cluster)


def wrap_greedy(m, q, greedy=greedyMMR):
    """Diversify the baseline ranking with the greedy MMR diversifier (bound at definition time)."""
    return greedy.diversify(q, _bm25_ranking(m, q),
                            doc_limit=100, order_n=documents_to_order, alpha=alpha)


results['greedy'] = _evaluate_with(wrap_greedy)


def wrap_random(m, q, random_=randomRank):
    """Diversify the baseline ranking with the random diversifier (bound at definition time)."""
    return random_.diversify(_bm25_ranking(m, q),
                             doc_limit=100, order_n=documents_to_order)


results['random'] = _evaluate_with(wrap_random)

In [None]:
# Bar chart comparing precision @ 20 (with its std as error bar) across the four methods.
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_title('Precision @ 20')

# One bar per method at x = 1..4; this order must match the legend labels below.
for bar_x, method_key in enumerate(('baseline', 'random', 'greedy', 'cluster'), start=1):
    method_scores = results[method_key]
    ax1.bar(bar_x, method_scores['precision_at_20'],
            yerr=method_scores['precision_at_20_std'], capsize=4)

fig.legend(['Baseline', 'Mélange aléatoire', 'Algorithme glouton', 'Clustering'])
ax1.set_ylim(0.25, 0.75)
ax1.set_ylabel('Precision')
plt.savefig('comparaison_des_methods_100_doc_precision')

In [None]:
# Bar chart comparing cluster recall @ 20 (with its std as error bar) across the four methods.
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
ax1.set_title('Cluster Recall @ 20')

# One bar per method at x = 1..4; this order must match the legend labels below.
for bar_x, method_key in enumerate(('baseline', 'random', 'greedy', 'cluster'), start=1):
    method_scores = results[method_key]
    ax1.bar(bar_x, method_scores['cluster_recall_at_20'],
            yerr=method_scores['cluster_recall_at_20_std'], capsize=4)

fig.legend(['Baseline', 'Mélange aléatoire', 'Algorithme glouton', 'Clustering'])
ax1.set_ylim(0.25, 0.75)
ax1.set_ylabel('CR')
plt.savefig('comparaison_des_methods_100_doc_cluster_recall')

In [None]:
# Display the cluster recall @ 20 value stored for the clustering diversifier.
results['cluster']['cluster_recall_at_20']