In [344]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from matplotlib import pyplot

%matplotlib inline

from collections import defaultdict
from datetime import timedelta, date, datetime
import json


In [345]:
data_file_names = ['grossveranstaltungen', 'sport-fussball',
              'unfaelle', 'demonstrationen',
              'immigration', 'straftaten-anschlaege'
                  ]

In [346]:
data = {}
for name in data_file_names:
    with open(f"{name}.json") as data_file:
        topic_data = json.load(data_file)
        data[name] = topic_data

In [347]:
client = Elasticsearch(hosts='robin-in.space', timeout=20)

In [348]:
class ArticleQueryBuilder():
    def __init__(self, article):
        self.article = article

        
    def simple_and_query(self):
        return Q("match", title=self.article['title']) & Q("match", body=self.article['body'])
        
        
    def simple_or_query(self):
        return Q("match", title=self.article['title']) | Q("match", body=self.article['body'])
    
    
    def date_filter(self, before=timedelta(weeks=2),
                            after=timedelta(weeks=8)):
        date = datetime.strptime(self.article['date'], '%Y-%m-%d')
        return Q("bool", filter=[Q('range', 
                            published={
                                'gte': date - before,
                                'lte': date + after
                            })])
    
    def location_query(self):
        return Q("bool", should=[
            Q("match", body=self.article['location']), 
            Q("match", title=self.article['location']),
            Q("match", officeName=self.article['location'])])

In [349]:
query = ab.simple_and_query() & ab.location_query() & ab.date_filter()

In [350]:
s = Search(using=client)
res = s.query(query).execute()

In [351]:
def construct_queries(article):
    builder = ArticleQueryBuilder(article)
    return {
        'and': builder.simple_and_query(),
        'or':builder.simple_or_query(),
        'and & date':builder.simple_and_query() & builder.date_filter(),
        'or & date':builder.simple_or_query() & builder.date_filter(),
        'and & loc':builder.simple_and_query() & builder.location_query(),
        'or & loc':builder.simple_or_query() & builder.location_query(),
        'and | loc':builder.simple_and_query() | builder.location_query(),
        'or | loc':builder.simple_or_query() | builder.location_query(),
        'and & loc & date':builder.simple_and_query() & builder.location_query() & builder.date_filter(),
        'or & loc & date':builder.simple_or_query() & builder.location_query() & builder.date_filter(),
        '(and | loc) & date':(builder.simple_and_query() | builder.location_query()) & builder.date_filter(),
        '(or | loc) & date':(builder.simple_or_query() | builder.location_query()) & builder.date_filter()
    }

In [352]:
def eval_query(query, pr_ids, client):
    res = Search(using=client).query(query).execute()
    return [(index, hit.meta.id) for index, hit in enumerate(res) if hit.meta.id in pr_ids]

In [353]:
def eval_event(event, client):
    results = defaultdict(dict)
    for article in event['articles']:
        queries = construct_queries(article)
        for name, query in queries.items(): 
            res = eval_query(query, event['pr-ids'], client)
            results[article['title']][name] = res
    return results

In [363]:
def eval_topic(topic, client):
    return [eval_event(event, client) for event in topic]

In [364]:
topic_results_2 = {topic_name: eval_topic(topic['data'], client) for 
                 topic_name, topic in data.items()}

GET http://robin-in.space:9200/_search [status:N/A request:20.021s]
Traceback (most recent call last):
  File "/home/robin/code/police-pr-search/.venv/lib/python3.6/site-packages/urllib3/connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "/home/robin/code/police-pr-search/.venv/lib/python3.6/site-packages/urllib3/connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "/usr/local/lib/python3.6/http/client.py", line 1331, in getresponse
    response.begin()
  File "/usr/local/lib/python3.6/http/client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "/usr/local/lib/python3.6/http/client.py", line 258, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/usr/local/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
socket.timeout: timed out

During handling of the above exception, an

ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='robin-in.space', port=9200): Read timed out. (read timeout=20))

In [356]:
topic_results2

NameError: name 'topic_results2' is not defined

In [330]:
def precision_at_k(query_result, k):
    precision = 0
    for result in query_result:
        rank, _ = result
        if rank + 1 <= k:
            precision += 1
    return precision / len(query_result) if len(query_result) > 0 else 0

In [335]:
def calc_precision_at_k_by_topic(topic_result, k):
    query_results = []
    for event_result in topic_result:
        for title, query_dict in event_result.items():
            for query_type, results in query_dict.items():
                query_results.append(results)
    precision = 0
    for result in query_results:
        for hit in result:
            rank, _ = hit
            if rank + 1 <= k:
                precision += 1
                
    return precision / (len(query_results) * k) # We assume that at least k elements are retrieved per query

In [340]:
for topic in data_file_names:
    calc_precision_at_k_by_topic(topic_results[topic], 1)

KeyError: 'straftaten-anschlaege'

In [342]:
topic_results.keys()

dict_keys(['grossveranstaltungen', 'sport-fussball', 'unfaelle', 'demonstrationen', 'immigration'])

In [358]:
def split_topic_results_into_query_types(topic_results):
    query_type_results = defaultdict(list)
    for topic_name, topic_result in topic_results.items():
        for event_result in topic_result:
            for title, query_dict in event_result.items():
                for query_type, results in query_dict.items():
                    query_type_results[query_type].append(results)
    return query_type_results

In [359]:
def calc_precision_at_k_by_query_type(topic_results, k):
    query_type_results = split_topic_results_into_query_types(topic_results)
    precisions = {}
    for query_type, results in query_type_results.items():
        precision = 0
        for result in results:
            for hit in result:
                rank, _ = hit
                if rank + 1 <= k:
                    precision += 1
        precisions[query_type] = precision / (len(results) * k)
    return precisions

In [361]:
calc_precision_at_k_by_query_type(topic_results, 1)

{'and': 0.6,
 'or': 0.7714285714285715,
 'and & date': 0.6,
 'or & date': 0.8,
 'and & loc': 0.6,
 'or & loc': 0.8,
 'and | loc': 0.6,
 'or | loc': 0.8,
 'and & loc & date': 0.6,
 'or & loc & date': 0.8,
 '(and | loc) & date': 0.6,
 '(or | loc) & date': 0.8}

In [None]:
def calc_mean_mrr_by_query(topic_results):
    pass

In [362]:
topic_results

{'grossveranstaltungen': [defaultdict(dict,
              {'Rhein in Flammen: Polizei ermittelt wegen sexueller Übergriffe': {'and': [(0,
                  'NfDKVmUB_NHj735hjcZN')],
                'or': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (2, 'jPDKVmUB_NHj735hjb5L')],
                'or & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (3, 'jPDKVmUB_NHj735hjb5L')],
                'and & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (7, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (8, 'jPDKVmUB_NHj735hjb5L')],
                'and | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'or | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')