In [224]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

from collections import defaultdict
from datetime import timedelta, date, datetime
import json

from matplo

In [225]:
# Topic names; each one corresponds to a "<name>.json" file that is loaded below.
data_file_names = [
    'grossveranstaltungen',
    'sport-fussball',
    'unfaelle',
    'demonstrationen',
    'immigration',
    # 'straftaten-anschlaege',  # currently disabled
]

In [226]:
# Load every topic's JSON export into `data`, keyed by topic name.
# The encoding is pinned to UTF-8: the files hold German text with umlauts,
# and without an explicit encoding open() falls back to the platform locale,
# which garbles or rejects those characters on non-UTF-8 systems.
data = {}
for name in data_file_names:
    with open(f"{name}.json", encoding="utf-8") as data_file:
        data[name] = json.load(data_file)

In [227]:
# Elasticsearch client used by every search below; it targets the host
# 'robin-in.space'.
client = Elasticsearch(hosts='robin-in.space')

In [228]:
class ArticleQueryBuilder:
    """Builds elasticsearch-dsl query objects from one article.

    The article is a mapping read by key: 'title' and 'body' feed the text
    queries, 'date' (formatted as YYYY-MM-DD) feeds the date filter and
    'location' feeds the location query.
    """

    def __init__(self, article):
        self.article = article

    def _match_own_field(self, field):
        """Match query comparing index field `field` with the article's own value."""
        return Q("match", **{field: self.article[field]})

    def simple_and_query(self):
        """Title match AND body match."""
        return self._match_own_field('title') & self._match_own_field('body')

    def simple_or_query(self):
        """Title match OR body match."""
        return self._match_own_field('title') | self._match_own_field('body')

    def date_filter(self, before=timedelta(weeks=2),
                    after=timedelta(weeks=8)):
        """Bool filter restricting `published` to a window around the article date.

        The window runs from `before` ahead of the article's date to `after`
        past it, both bounds inclusive ('gte' / 'lte').
        """
        # Named `article_day` so the imported `datetime.date` is not shadowed.
        article_day = datetime.strptime(self.article['date'], '%Y-%m-%d')
        window = {
            'gte': article_day - before,
            'lte': article_day + after
        }
        return Q("bool", filter=[Q('range', published=window)])

    def location_query(self):
        """Bool `should` over body, title and officeName matching the location."""
        place = self.article['location']
        alternatives = [
            Q("match", body=place),
            Q("match", title=place),
            Q("match", officeName=place),
        ]
        return Q("bool", should=alternatives)

In [229]:
# Build one example query to try the builder interactively.
# `ab` is created here because no cell in the notebook defines it: it only
# existed as leftover kernel state, so this cell raised NameError after
# Restart & Run All. The first article of the first event of the first loaded
# topic is an arbitrary but deterministic choice of example.
example_article = data[data_file_names[0]]['data'][0]['articles'][0]
ab = ArticleQueryBuilder(example_article)
query = ab.simple_and_query() & ab.location_query() & ab.date_filter()

In [230]:
# Run the example `query` once through the shared client; `res` holds the
# executed response for interactive inspection.
s = Search(using=client)
res = s.query(query).execute()

In [231]:
def construct_queries(article):
    """Return the 12 query variants evaluated for one article.

    The list is ordered as six combination schemes, each applied first to the
    AND text query and then to the OR text query:

        0-1   text only
        2-3   text & date filter
        4-5   text & location
        6-7   text | location
        8-9   text & location & date filter
        10-11 (text | location) & date filter
    """
    builder = ArticleQueryBuilder(article)
    location = builder.location_query
    dates = builder.date_filter

    # Each scheme takes the text-query factory and builds fresh query objects
    # on every call, so no query object is shared between variants.
    schemes = (
        lambda text: text(),
        lambda text: text() & dates(),
        lambda text: text() & location(),
        lambda text: text() | location(),
        lambda text: text() & location() & dates(),
        lambda text: (text() | location()) & dates(),
    )
    text_queries = (builder.simple_and_query, builder.simple_or_query)
    return [scheme(text) for scheme in schemes for text in text_queries]

In [232]:
def eval_query(query, pr_ids, client):
    """Execute `query` and report which returned hits are in `pr_ids`.

    Returns a list of (position, id) tuples, where position is the hit's
    zero-based index in the executed response and id is `hit.meta.id`.
    Hits whose id is not in `pr_ids` are left out.
    """
    response = Search(using=client).query(query).execute()
    ranked_matches = []
    for position, hit in enumerate(response):
        hit_id = hit.meta.id
        if hit_id in pr_ids:
            ranked_matches.append((position, hit_id))
    return ranked_matches

In [233]:
def eval_event(event, client):
    """Evaluate every query variant of every article belonging to one event.

    Returns a defaultdict(list) keyed by article title. Each value holds one
    `eval_query` result per query variant, in the order `construct_queries`
    produced them; articles sharing a title append to the same list.
    """
    hits_by_title = defaultdict(list)
    # Lazy pairing keeps the original call order: the queries for an article
    # are only constructed once the previous article has been fully evaluated.
    article_query_pairs = (
        (article, query)
        for article in event['articles']
        for query in construct_queries(article)
    )
    for article, query in article_query_pairs:
        matches = eval_query(query, event['pr-ids'], client)
        hits_by_title[article['title']].append(matches)
    return hits_by_title

In [234]:
def eval_topic(topic, client):
    """Evaluate each event in `topic`; returns one `eval_event` result per event, in order."""
    event_results = []
    for event in topic:
        event_results.append(eval_event(event, client))
    return event_results

In [235]:
# Evaluate all loaded topics: topic name -> list of per-event results.
# Each loaded topic file keeps its events under the 'data' key.
topic_results = {
    name: eval_topic(payload['data'], client)
    for name, payload in data.items()
}

In [203]:
# Show the nested results: topic name -> per-event mapping of article title to
# one list of (position, id) matches per query variant.
# NOTE(review): this cell's execution count (203) is lower than the cells above
# it (224-235), so the stored output may predate the current definitions;
# re-run after Restart & Run All before relying on it.
topic_results

{'grossveranstaltungen': [defaultdict(list,
              {'Rhein in Flammen: Polizei ermittelt wegen sexueller Übergriffe': [[(0,
                  'NfDKVmUB_NHj735hjcZN')],
                [(0, 'NfDKVmUB_NHj735hjcZN')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (2, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (3, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (7, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (8, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN')],
                [(0, 'NfDKVmUB_NHj735hjcZN')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (1, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (1, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (1, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN'), (1, 'jPDKVmUB_NHj735hjb5L')],
                [(0, 'NfDKVmUB_NHj735hjcZN')],
                [(0, 'NfDKVmU