# TREC 2018 Precision Medicine

In [1]:
import json
from json2html import *
from IPython.display import HTML
import pandas

import os, sys
# Put the parent of the current working directory on the import path
# (added once only), so the import that follows can be resolved from there.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from trec_utils import utils, running, evaluation

In [2]:
# Load the project configuration through the trec_utils helper.
# NOTE(review): the source and schema of the configuration are not visible in this notebook.
config = utils.load_config()

## Load and split TOPICS and GOLD STANDARD from last year (30 topics)
Also, split them into training, test, and development sets.

In [3]:
# Read the 2017 topics file and the 2017 abstracts qrels (gold standard) file.
# Paths are relative to the notebook's working directory.
topics_all = utils.get_topics('./topics/topics2017.xml')
qrels_all = utils.get_qrels('./gold-standard/abstracts.2017.qrels')

In [4]:
# Split the topics into training / test / development subsets, then split the
# qrels along the same three topic subsets.
# NOTE(review): the split ratio and whether it is deterministic are decided inside
# utils.split_topics — confirm there.
topics_train, topics_test, topics_dev = utils.split_topics(topics_all)
qrels_train, qrels_test, qrels_dev = utils.split_qrels(qrels_all, topics_train, topics_test, topics_dev)

In [5]:
# Sanity check: every split must cover exactly the same topic ids in the
# topics and in the qrels.
assert set(topics_train['topic']) == set(qrels_train['topic'])
assert set(topics_test['topic']) == set(qrels_test['topic'])
assert set(topics_dev['topic']) == set(qrels_dev['topic'])

In [6]:
# Baseline run over all topics using the 'baseline.json' query template.
full_run_params = running.get_default_run_params()
full_run_params['query_template'] = 'baseline.json'
# Hand the configured parameters to the run explicitly. The original call
# omitted them, so the template override only took effect if
# get_default_run_params() exposes shared mutable state.
# NOTE(review): confirm that behavior in trec_utils.running.
full_run_df, full_run_params = running.run(topics_all, full_run_params)
results, aggregated = evaluation.evaluate(qrels_all, full_run_df)
aggregated
# Disabled check against the ndcg value recorded in this cell's output.
#assert(aggregated['ndcg'] == 0.5338)

RUN: FIXME TOPICS: 30 {'gene_tie_breaker': 0.5, 'disease_tie_breaker': 0.5, 'query_template': 'baseline.json', 'gene_boost': 1, 'disease_boost': 1.5}


{'P_10': 0.48, 'Rprec': 0.2847, 'ndcg': 0.5338, 'recall_1000': 0.6139}

In [7]:
# Same run over all topics, this time with the 'baseline2.json' query template.
# Note: this overwrites full_run_df / results / aggregated from the previous run.
run_params = running.get_default_run_params()
run_params['query_template'] = 'baseline2.json'
# Hand the configured parameters to the run explicitly. The original call
# omitted them, so the template override only took effect if
# get_default_run_params() exposes shared mutable state.
# NOTE(review): confirm that behavior in trec_utils.running.
full_run_df, run_params = running.run(topics_all, run_params)
results, aggregated = evaluation.evaluate(qrels_all, full_run_df)
aggregated
# Disabled check against the ndcg value recorded in this cell's output.
#assert(aggregated['ndcg'] == 0.5338)

RUN: FIXME TOPICS: 30 {'gene_tie_breaker': 0.5, 'disease_tie_breaker': 0.5, 'query_template': 'baseline2.json', 'gene_boost': 1, 'disease_boost': 1.5}


{'P_10': 0.48, 'Rprec': 0.2847, 'ndcg': 0.5338, 'recall_1000': 0.6139}

In [8]:
#full_run_df

Test run with default_params

In [9]:
# Run the training topics without passing parameters, i.e. with whatever
# running.run uses by default; keep the returned parameters for the test run.
# NOTE(review): the recorded output shows query_template 'baseline2.json', the value
# set in an earlier cell — presumably the defaults were mutated there; confirm.
training_run_df, training_run_params = running.run(topics_train)

RUN: FIXME TOPICS: 12 {'gene_tie_breaker': 0.5, 'disease_tie_breaker': 0.5, 'query_template': 'baseline2.json', 'gene_boost': 1, 'disease_boost': 1.5}


In [10]:
# Evaluate the training run against the training qrels and keep the aggregated
# metrics as a named Series, labelled with the number of training topics.
training_results, training_aggregated = evaluation.evaluate(qrels_train, training_run_df)
training_score = pandas.Series(
    training_aggregated,
    name='training ({} topics)'.format(len(topics_train)),
)

In [11]:
# Run the test topics with the parameters returned by the training run, so both
# runs use the same settings.
test_run_df, test_run_params = running.run(topics_test, training_run_params)

RUN: FIXME TOPICS: 9 {'gene_tie_breaker': 0.5, 'disease_tie_breaker': 0.5, 'query_template': 'baseline2.json', 'gene_boost': 1, 'disease_boost': 1.5}


In [12]:
# Display the aggregated metrics of the training run.
training_score

P_10           0.5917
Rprec          0.2626
ndcg           0.5026
recall_1000    0.4999
Name: training (12 topics), dtype: float64

In [13]:
# Evaluate the test run against the test qrels and keep the aggregated metrics
# as a named Series, labelled with the number of test topics.
test_results, test_aggregated = evaluation.evaluate(qrels_test, test_run_df)
test_score = pandas.Series(
    test_aggregated,
    name='test ({} topics)'.format(len(topics_test)),
)

In [14]:
# Show training and test metrics side by side: one row per Series, one column per metric.
pandas.DataFrame([training_score, test_score])

Unnamed: 0,P_10,Rprec,ndcg,recall_1000
training (12 topics),0.5917,0.2626,0.5026,0.4999
test (9 topics),0.4444,0.3171,0.5561,0.6852


In [16]:
# Evaluate a run over all topics against the complete qrels.
# NOTE(review): the original passed `abcd`, a name never defined in this notebook
# (NameError on Restart & Run All). full_run_df is the only run over all topics
# defined above and is used in its place — confirm this is the intended run.
results, aggregated = evaluation.evaluate(qrels_all, full_run_df)

In [17]:
# Run the parameter experiment over all topics, evaluated against all qrels.
# Per the recorded output, it prints the parameter grid, then each run's
# parameters followed by its aggregated metrics.
running.experiment(topics_all, qrels_all)

{'gene_tie_breaker': [0.1, 0.5], 'disease_tie_breaker': [0.1, 0.5], 'query_template': ['variable.json'], 'gene_boost': [1, 2, 5], 'disease_boost': [1, 2, 5]}
RUN: FIXME TOPICS: 30 {'gene_tie_breaker': '0.1', 'disease_tie_breaker': '0.1', 'query_template': 'variable.json', 'gene_boost': '1', 'disease_boost': '1'}
{'P_10': 0.3067, 'Rprec': 0.1952, 'ndcg': 0.4139, 'recall_1000': 0.5343}
RUN: FIXME TOPICS: 30 {'gene_tie_breaker': '0.1', 'disease_tie_breaker': '0.1', 'query_template': 'variable.json', 'gene_boost': '2', 'disease_boost': '1'}
{'P_10': 0.3033, 'Rprec': 0.1852, 'ndcg': 0.4003, 'recall_1000': 0.5224}
RUN: FIXME TOPICS: 30 {'gene_tie_breaker': '0.1', 'disease_tie_breaker': '0.1', 'query_template': 'variable.json', 'gene_boost': '5', 'disease_boost': '1'}
{'P_10': 0.2933, 'Rprec': 0.181, 'ndcg': 0.3936, 'recall_1000': 0.5175}
RUN: FIXME TOPICS: 30 {'gene_tie_breaker': '0.5', 'disease_tie_breaker': '0.1', 'query_template': 'variable.json', 'gene_boost': '1', 'disease_boost': '1'}
