In [533]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from matplotlib import pyplot
from newspaper import Article

%matplotlib inline

from collections import defaultdict
from datetime import timedelta, date, datetime
import json


In [534]:
# Topic identifiers; each one is the stem of a "<name>.json" file that the
# loading cell opens.
data_file_names = [
    'grossveranstaltungen',
    'sport-fussball',
    'unfaelle',
    'demonstrationen',
    'immigration',
    'straftaten-anschlaege',
]

In [535]:
# Load every topic file into memory, keyed by topic name.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because the
# article data contains German umlauts and the platform default encoding is
# not guaranteed to be UTF-8.
data = {}
for name in data_file_names:
    with open(f"{name}.json", encoding="utf-8") as data_file:
        data[name] = json.load(data_file)

In [536]:
# Elasticsearch client that the search and evaluation cells pass to Search().
client = Elasticsearch(hosts='robin-in.space', timeout=30)

In [537]:
class ArticleQueryBuilder:
    """Builds elasticsearch_dsl query fragments from one article record.

    The article is read by key; the methods use its 'title', 'body', 'date'
    and 'location' entries.
    """

    def __init__(self, article):
        self.article = article

    def simple_and_query(self):
        """Match the article title against `title` AND its body against `body`."""
        title_clause = Q("match", title=self.article['title'])
        body_clause = Q("match", body=self.article['body'])
        return title_clause & body_clause

    def simple_or_query(self):
        """Match the article title against `title` OR its body against `body`."""
        title_clause = Q("match", title=self.article['title'])
        body_clause = Q("match", body=self.article['body'])
        return title_clause | body_clause

    def date_filter(self, before=timedelta(weeks=2),
                    after=timedelta(weeks=8)):
        """Bool filter restricting `published` to a window around the article date.

        The article's 'date' entry is parsed as '%Y-%m-%d'; the window runs
        from that date minus `before` to that date plus `after`, inclusive.
        """
        reference = datetime.strptime(self.article['date'], '%Y-%m-%d')
        window = {
            'gte': reference - before,
            'lte': reference + after,
        }
        return Q("bool", filter=[Q('range', published=window)])

    def location_query(self):
        """Bool `should` query matching the article location in any of three fields."""
        clauses = [
            Q("match", **{field: self.article['location']})
            for field in ('body', 'title', 'officeName')
        ]
        return Q("bool", should=clauses)

In [538]:
# NOTE(review): `ab` is not defined in any cell shown here; presumably an
# ArticleQueryBuilder built from one article — confirm and define it explicitly
# so this cell survives Restart & Run All.
query = ab.simple_and_query() & ab.location_query() & ab.date_filter()

In [539]:
# Execute the combined query through the shared client and keep the response.
s = Search(using=client)
res = s.query(query).execute()

In [541]:
res[0].URL

'https://www.presseportal.de/blaulicht/pm/117683/3856357'

In [381]:
def construct_queries(article):
    """Build every query variant that is evaluated for one article.

    Returns a dict mapping a variant label to an elasticsearch_dsl query.
    The labels spell out the combination: 'and' / 'or' are the two title/body
    text queries, 'loc' is the location query and 'date' is the date filter,
    joined with & or |.
    """
    builder = ArticleQueryBuilder(article)

    # Bound-method shorthands; every call below builds a fresh query object.
    text_and = builder.simple_and_query
    text_or = builder.simple_or_query
    at_location = builder.location_query
    in_window = builder.date_filter

    variants = {}
    variants['and'] = text_and()
    variants['or'] = text_or()
    variants['and & date'] = text_and() & in_window()
    variants['or & date'] = text_or() & in_window()
    variants['and & loc'] = text_and() & at_location()
    variants['or & loc'] = text_or() & at_location()
    variants['and | loc'] = text_and() | at_location()
    variants['or | loc'] = text_or() | at_location()
    variants['and & loc & date'] = text_and() & at_location() & in_window()
    variants['or & loc & date'] = text_or() & at_location() & in_window()
    variants['(and | loc) & date'] = (text_and() | at_location()) & in_window()
    variants['(or | loc) & date'] = (text_or() | at_location()) & in_window()
    return variants

In [382]:
def eval_query(query, pr_ids, client):
    """Run `query` and report which of the returned hits are in `pr_ids`.

    Returns a list of (position, document id) tuples, where position is the
    0-based index of the hit within the executed response.
    """
    response = Search(using=client).query(query).execute()
    matches = []
    for position, hit in enumerate(response):
        doc_id = hit.meta.id
        if doc_id in pr_ids:
            matches.append((position, doc_id))
    return matches

In [383]:
def eval_event(event, client):
    """Evaluate all query variants for every article of one event.

    Returns a defaultdict keyed by article title; each value maps a query
    variant label to the eval_query() result for that variant, judged against
    the event's 'pr-ids'.
    """
    outcome = defaultdict(dict)
    for article in event['articles']:
        named_queries = construct_queries(article)
        outcome[article['title']].update({
            label: eval_query(candidate, event['pr-ids'], client)
            for label, candidate in named_queries.items()
        })
    return outcome

In [384]:
def eval_topic(topic, client):
    """Evaluate each event in `topic` and return the results in the same order."""
    per_event = []
    for event in topic:
        per_event.append(eval_event(event, client))
    return per_event

In [385]:
# Evaluate all loaded topics: topic name -> list of per-event results.
# Each loaded topic file keeps its list of events under the 'data' key.
topic_results_2 = {topic_name: eval_topic(topic['data'], client) for 
                 topic_name, topic in data.items()}

In [389]:
topic_results_2.keys()

dict_keys(['grossveranstaltungen', 'sport-fussball', 'unfaelle', 'demonstrationen', 'immigration', 'straftaten-anschlaege'])

In [394]:
def precision_at_k(query_result, k):
    """Precision@k for a single query result.

    `query_result` is an iterable of (rank, id) pairs with 0-based ranks, one
    pair per relevant hit. The number of pairs that fall within the first k
    positions is divided by k, which presumes that at least k documents were
    retrieved for the query.
    """
    relevant_in_top_k = sum(1 for rank, _ in query_result if rank + 1 <= k)
    return relevant_in_top_k / k

In [395]:
def calc_mean_precision_at_k_by_topic(topic_result, k):
    """Mean precision@k over every query result of one topic.

    Parameters:
        topic_result: list of per-event results; each maps an article title to
            a dict of {query type: list of (rank, id) hits}.
        k: cut-off passed on to precision_at_k().

    Returns:
        The average precision@k across all articles and query types of the
        topic, or 0.0 when the topic holds no query results at all.
    """
    query_results = [
        results
        for event_result in topic_result
        for query_dict in event_result.values()
        for results in query_dict.values()
    ]
    if not query_results:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    # k must be forwarded: precision_at_k() takes it as a required argument.
    return sum(precision_at_k(result, k) for result in query_results) \
        / len(query_results)

In [410]:
# Precision@1 per topic. The helper defined in this notebook is named
# calc_mean_precision_at_k_by_topic; no calc_precision_at_k_by_topic exists.
precision_at_1_by_topic = {
    topic: calc_mean_precision_at_k_by_topic(topic_results_2[topic], 1)
    for topic in data_file_names
}

In [411]:
precision_at_1_by_topic

{'grossveranstaltungen': 0.8333333333333334,
 'sport-fussball': 0.71875,
 'unfaelle': 0.8095238095238095,
 'demonstrationen': 0.6666666666666666,
 'immigration': 0.4,
 'straftaten-anschlaege': 0.7878787878787878}

In [398]:
def split_topic_results_into_query_types(topic_results):
    """Regroup nested evaluation results by query type.

    `topic_results` maps topic name -> list of event results, where each event
    result maps article title -> {query type: hits}. Topic names and titles
    are dropped; the returned defaultdict maps each query type to the list of
    its hit lists in traversal order.
    """
    grouped = defaultdict(list)
    for events in topic_results.values():
        for event in events:
            for per_query in event.values():
                for kind, hits in per_query.items():
                    grouped[kind].append(hits)
    return grouped

In [434]:
len([el for el in split_topic_results_into_query_types(topic_results_2)['or'] if len(el) > 0 and el[0][0]==0])

35

In [434]:
len([el for el in split_topic_results_into_query_types(topic_results_2)['or'] if len(el) > 0 and el[0][0]==0])

35

In [428]:
split_topic_results_into_query_types(topic_results_2)['or']

[[(0, 'NfDKVmUB_NHj735hjcZN')],
 [(0, 'NfDKVmUB_NHj735hjcZN')],
 [(0, 'NfDKVmUB_NHj735hjcZN')],
 [(0, 'vui9VmUB_NHj735h56bE')],
 [(0, 'T_LMVmUB_NHj735h9bsl')],
 [(0, 'T_LMVmUB_NHj735h9bsl')],
 [(0, 'OuW5VmUB_NHj735h9z8O')],
 [(0, 'gfHLVmUB_NHj735hCSQU')],
 [(0, 'dfHLVmUB_NHj735hiJ5E')],
 [],
 [(0, '4e3FVmUB_NHj735hFq4M')],
 [],
 [(0, 'y-a7VmUB_NHj735htL_I')],
 [(1, '8fTOVmUB_NHj735h2zuh')],
 [(0, '1vLMVmUB_NHj735hR0dY')],
 [(0, 'puvCVmUB_NHj735hPtV-')],
 [(0, 'zem-VmUB_NHj735hkDdN')],
 [(0, 'GPTOVmUB_NHj735hmham')],
 [(0, 'kPDKVmUB_NHj735hjbxL')],
 [(0, 'v_LMVmUB_NHj735hwKmH')],
 [(0, 'yfDJVmUB_NHj735hqEGf'), (2, 'ZvDJVmUB_NHj735h-GUG')],
 [(0, 'HOe9VmUB_NHj735hLOoi'), (5, 'z-W6VmUB_NHj735hUKfk')],
 [],
 [],
 [(0, 'tei9VmUB_NHj735hjjjS')],
 [(0, 'ceO4VmUB_NHj735hRc0c')],
 [(0, 'ceO4VmUB_NHj735hRc0c')],
 [(0, 'ceO4VmUB_NHj735hRc0c')],
 [(0, 'zvDJVmUB_NHj735h-HEI')],
 [(0, 'WO3DVmUB_NHj735h4Aq9')],
 [(6, 'kue8VmUB_NHj735h_8Ik')],
 [(0, 'CuO3VmUB_NHj735h8IvE')],
 [(1, 'zu7GVmUB_NHj735hoHP

In [441]:
def calc_precision_at_k_by_query_type(topic_results, k):
    """Mean precision@k per query type, pooled over all topics.

    Returns a dict mapping each query type to the average of precision_at_k()
    over every result list recorded for that type.
    """
    grouped = split_topic_results_into_query_types(topic_results)
    precisions = {}
    for query_type in grouped:
        result_lists = grouped[query_type]
        scores = [precision_at_k(hits, k) for hits in result_lists]
        # Plain left-to-right accumulation (deliberately not sum(), whose
        # float summation algorithm differs on newer Python versions).
        running_total = 0
        for score in scores:
            running_total += score
        precisions[query_type] = running_total / len(result_lists)
    return precisions

In [451]:
calc_precision_at_k_by_query_type(topic_results_2, 3)

{'and': 0.2753623188405797,
 'or': 0.31884057971014496,
 'and & date': 0.2971014492753623,
 'or & date': 0.33333333333333337,
 'and & loc': 0.3043478260869565,
 'or & loc': 0.34782608695652173,
 'and | loc': 0.29710144927536225,
 'or | loc': 0.34057971014492755,
 'and & loc & date': 0.32608695652173914,
 'or & loc & date': 0.3695652173913043,
 '(and | loc) & date': 0.31884057971014496,
 '(or | loc) & date': 0.36231884057971014}

In [None]:
def calc_mean_mrr_by_query(topic_results):
    """Placeholder: not implemented, so calling it returns None.

    TODO: presumably meant to compute the mean reciprocal rank per query
    type — confirm the intended definition before implementing.
    """
    pass

In [362]:
# NOTE(review): `topic_results` is not assigned in any cell shown here (only
# `topic_results_2` is) — this cell depends on leftover kernel state.
topic_results

{'grossveranstaltungen': [defaultdict(dict,
              {'Rhein in Flammen: Polizei ermittelt wegen sexueller Übergriffe': {'and': [(0,
                  'NfDKVmUB_NHj735hjcZN')],
                'or': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (2, 'jPDKVmUB_NHj735hjb5L')],
                'or & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (3, 'jPDKVmUB_NHj735hjb5L')],
                'and & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (7, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (8, 'jPDKVmUB_NHj735hjb5L')],
                'and | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'or | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')

In [593]:
art = Article('https://www.braunschweiger-zeitung.de/braunschweig/article152184288/45-Jaehriger-filmte-junge-Frauen-unter-den-Rock.html'
             ,language='de')

In [594]:
art.download()
art.parse()
art.nlp()

In [595]:
art.title

'45-Jähriger filmte junge Frauen unter den Rock'

In [598]:
art.publish_date 

In [596]:
art.summary

'„Viel Arbeit, aber nichts Besonderes“, so lautete am Montag die Bilanz der Polizei nach dem spektakulären Karnevalssonntag, an dem rund 200 000 Menschen in der Stadt unterwegs waren (wir berichteten).\nDie Beamten leiteten 37 Strafverfahren ein, darunter zwölf wegen Körperverletzungen und zehn wegen Taschendiebstahls.\nVor allem auf der Langen Straße waren die Taschendiebe aktiv.\nMit Messern, so Polizeisprecher...'

In [597]:
art.text

'„Viel Arbeit, aber nichts Besonderes“, so lautete am Montag die Bilanz der Polizei nach dem spektakulären Karnevalssonntag, an dem rund 200 000 Menschen in der Stadt unterwegs waren (wir berichteten). Die Beamten leiteten 37 Strafverfahren ein, darunter zwölf wegen Körperverletzungen und zehn wegen Taschendiebstahls. Vor allem auf der Langen Straße waren die Taschendiebe aktiv. Mit Messern, so Polizeisprecher...\n\n\n\n'

In [542]:
import spacy
nlp = spacy.load('de')

In [544]:
a = nlp('hallo ich wohne in Berlin')

In [610]:
doc = nlp(art.text)
art.meta_data

defaultdict(dict,
            {'description': 'Die Polizei hatte beim Schoduvel viel zu tun. Ärger gab es rund um einen Wagen der Eintracht-Fans.',
             'keywords': 'Frauen,Polizei,Fans,Ärger,Leipzig,Montag,Sprecher,Personen,Sonntag,Gegner',
             'author': 'Von Norbert Jonscher',
             'robots': 'index,follow,noodp',
             'viewport': 'width=device-width, shrink-to-fit=no, user-scalable=yes',
             'news_keywords': 'Frauen,Polizei,Fans,Ärger,Leipzig,Montag,Sprecher,Personen,Sonntag,Gegner',
             'DC.date.issued': '2016-02-08T15:29:04+01:00',
             'last-modified': '2016-09-22T18:51:33+02:00',
             'copyright': 'BZV Medienhaus GmbH, Braunschweig, Germany',
             'revisit-after': '1 hour',
             'og': {'title': '45-Jähriger filmte junge Frauen unter den Rock',
              'description': 'Die Polizei hatte beim Schoduvel viel zu tun. Ärger gab es rund um einen Wagen der Eintracht-Fans.',
              'url': 'http

In [600]:
locs =[ent for ent in doc.ents if ent.label_ == "LOC"]

In [582]:
[ent for ent in doc.ents if ent.label_ == "DATE"]

[]

In [601]:
locs

[Stadt, Langen Straße, Messern]

In [611]:
# NOTE(review): `res2` is never assigned in the cells shown here; this cell
# fails with the NameError recorded in its output. Remove it or define `res2`.
res2

NameError: name 'res2' is not defined

In [614]:
topic_results_2

{'grossveranstaltungen': [defaultdict(dict,
              {'Rhein in Flammen: Polizei ermittelt wegen sexueller Übergriffe': {'and': [(0,
                  'NfDKVmUB_NHj735hjcZN')],
                'or': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (2, 'jPDKVmUB_NHj735hjb5L')],
                'or & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (3, 'jPDKVmUB_NHj735hjb5L')],
                'and & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (7, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (8, 'jPDKVmUB_NHj735hjb5L')],
                'and | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'or | loc': [(0, 'NfDKVmUB_NHj735hjcZN')],
                'and & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')],
                'or & loc & date': [(0, 'NfDKVmUB_NHj735hjcZN'),
                 (1, 'jPDKVmUB_NHj735hjb5L')

In [615]:
s={1,2,3}
s

{1, 2, 3}

In [616]:
type(s)

set

In [617]:
s.add(4)

In [618]:
s

{1, 2, 3, 4}

In [619]:
s.add(2)

In [620]:
s

{1, 2, 3, 4}