In [1]:
import pymongo
from pymongo import MongoClient
from pprint import pprint, pformat, PrettyPrinter
from bson.son import SON
from os.path import exists
import json
from ipy_table import *

In [2]:
# Connect to database and create variables for the collections
client = MongoClient()
db = client.lobbyradar
entities = db.entities
relations = db.relations

# Print some statistics about our database dump
print("Collections in database: " + ", ".join(db.collection_names(include_system_collections=False)))
print("Entities: %s documents" % entities.count())
print("Relations: %s documents" % relations.count())

Collections in database: entities, relations
Entities: 26380 documents
Relations: 32137 documents


# The entities collection

Als erstes werfen wir einen Blick in das erste Dokument der Collection um eine Ahnung zu erhalten, womit wir es überhaupt zu tun haben. Die Dokumente in einer MongoDB werden im JSON-Format abgespeichert.

In [3]:
# Get a sample document to have a look at and print it
first_entity_document = entities.find_one()
pprint(first_entity_document)

{u'_id': ObjectId('54bd3c748b934da06340f4c1'),
 u'aliases': [u'DIE LINKE',
              u'Die Linke',
              u'Partei DIE LINKE',
              u'DIE LINKE.',
              u'Linkspartei',
              u'Linkspartei.PDS',
              u'PDS',
              u'WASG',
              u'Partei des Demokratischen Sozialismus',
              u'Arbeit & soziale Gerechtigkeit \u2013 Die Wahlalternative',
              u'Wahlalternative',
              u'Wahlalternative Arbeit und soziale Gerechtigkeit'],
 u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 807000),
 u'data': [{u'auto': True,
            u'created': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
            u'desc': u'Partei',
            u'format': u'string',
            u'id': u'2b1adb60a31d37cf9cc0fdccb75149456a425095c6bf5e77abac117ed1a69d0f',
            u'key': u'partei',
            u'updated': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
            u'value': u'Die Linke'},
           {u'auto': True,

In [4]:
# See what CDU looks like
#pprint(list(entities.find({"aliases": {"$eq": "CDU"}})))
pprint(list(entities.find({"name":"SPD"})))

[{u'_id': ObjectId('552ff9cdaf9ee96e1c1df7c0'),
  u'aliases': [],
  u'created': datetime.datetime(2015, 4, 16, 18, 4, 59, 967000),
  u'data': [{u'auto': True,
             u'created': datetime.datetime(2015, 5, 28, 18, 11, 10, 90000),
             u'desc': u'Partei',
             u'format': u'string',
             u'id': u'47f00e8c0c72caf8c1c4b3b5f0141df8755d5fb32f4b7f718857e45c58aff865',
             u'key': u'partei',
             u'updated': datetime.datetime(2015, 5, 28, 18, 11, 10, 90000),
             u'value': u'SPD'}],
  u'importer': u'seitenwechsler',
  u'name': u'SPD',
  u'search': [u'spd'],
  u'slug': u'spd',
  u'tags': [u'partei'],
  u'type': u'entity',
  u'updated': datetime.datetime(2015, 5, 28, 18, 11, 12, 860000)}]


### Gregor Gysi as example

In [5]:
gysi = entities.find_one({"name": "Gregor Gysi"})
gysi.get("_id")

ObjectId('54c2a4b4fe6a42c82bbab01c')

In [6]:
gysi_relations = relations.find({"entities": {"$eq": gysi.get("_id")}})
pprint(list(gysi_relations))

[{u'_id': ObjectId('54c2a4f6fe6a42c82bbabc0b'),
  u'created': datetime.datetime(2015, 1, 23, 19, 45, 58, 459000),
  u'data': [{u'auto': True,
             u'created': datetime.datetime(2015, 1, 23, 19, 45, 58, 433000),
             u'desc': u'Parteispende',
             u'format': u'donation',
             u'id': u'3321f46291fbf0dbe70a3450274aa1da3e5bcdfc75dded5358f3fe5e803e0f54',
             u'importer': u'Parteispenden',
             u'key': u'donation',
             u'updated': datetime.datetime(2015, 1, 23, 19, 45, 58, 433000),
             u'value': {u'amount': 11938.62,
                        u'sources': [u'http://apps.opendatacity.de/parteispenden-recherche/assets/data/parteispenden.json'],
                        u'year': 1994}},
            {u'auto': True,
             u'created': datetime.datetime(2015, 1, 23, 19, 48, 2, 138000),
             u'desc': u'Parteispende',
             u'format': u'donation',
             u'id': u'844d9989c48677087381316fc07428c57d45096f31daaaf7

Die direkten Felder des Dokuments sind (abgesehen von der Id):
 _aliases, created, data, importer, name, search, slug, tags, type, updated_
 
 Zur anschließenden Analyse wurde das Tool __Variety__ verwendet (https://github.com/variety/variety). Es hilft dabei, die Daten und ihr Struktur zu verstehen und einen Überblick über sie zu bekommen.

In [7]:
# We need to find the tool variety before we can use it
VARIETY = "../variety/variety.js"
if not exists(VARIETY):
     print("variety.js script NICHT in %s gefunden. Ausversehen verschoben?" % VARIETY)
        
# Save database analysis as JSON for processing 
shell_capture = ! mongo --quiet lobbyradar --eval "var collection = 'entities', outputFormat='json'" {VARIETY}
variety_schema_json = shell_capture.n

# And print as ascii
! mongo --quiet lobbyradar --eval "var collection = 'entities', outputFormat='ascii'" {VARIETY}

+------------------------------------------------------------------------------------------------+
| key                      | types                        | occurrences | percents               |
| ------------------------ | ---------------------------- | ----------- | ---------------------- |
| _id                      | ObjectId                     |       26380 | 100.000000000000000000 |
| aliases                  | Array                        |       26380 | 100.000000000000000000 |
| created                  | Date                         |       26380 | 100.000000000000000000 |
| data                     | Array                        |       26380 | 100.000000000000000000 |
| importer                 | String,null                  |       26380 | 100.000000000000000000 |
| name                     | String                       |       26380 | 100.000000000000000000 |
| search                   | Array                        |       26380 | 100.000000000000000000 |


## Tags

Da Tags ein Array ist, muss es in der Pipeline zunächst in seine Elemente aufgeteilt werden.

In [8]:
# Aggregation, die das "tags"-Array der Dokumente zerlegt und für jeden Wert einen neuen Eintrag im Resultset erzeugt.
# Diese werden anschließend gruppiert, die Gruppengrößen gezählt und entsprechend sortiert
pipeline = [
    {"$unwind": "$tags"},
    {"$group": {"_id": "$tags", "count": {"$sum": 1}}},
    {"$sort": {"count": -1} }
]

distinct_tags_count = list(db.entities.aggregate(pipeline, cursor={}))
pprint(distinct_tags_count)
print("Total: %s distinct tags." % len(distinct_tags_count))

[{u'_id': u'lobbyismus', u'count': 16998},
 {u'_id': u'lobbyist', u'count': 14740},
 {u'_id': u'executive', u'count': 13095},
 {u'_id': u'parteispenden', u'count': 3164},
 {u'_id': u'lobbyorganisation', u'count': 2222},
 {u'_id': u'representative', u'count': 1915},
 {u'_id': u'nebeneinkuenfte', u'count': 1874},
 {u'_id': u'parteispenden13', u'count': 979},
 {u'_id': u'dax', u'count': 636},
 {u'_id': u'bundestag', u'count': 630},
 {u'_id': u'mdb', u'count': 629},
 {u'_id': u'aufsichtsrat', u'count': 433},
 {u'_id': u'laender', u'count': 340},
 {u'_id': u'landesregierung', u'count': 332},
 {u'_id': u'thinktank', u'count': 330},
 {u'_id': u'seitenwechsler', u'count': 288},
 {u'_id': u'vorstand', u'count': 188},
 {u'_id': u'politik', u'count': 162},
 {u'_id': u'verwaltung', u'count': 150},
 {u'_id': u'kabinette', u'count': 150},
 {u'_id': u'Bau', u'count': 149},
 {u'_id': u'Banken / Versicherungen', u'count': 135},
 {u'_id': u'Finanzen', u'count': 133},
 {u'_id': u'Verwaltung / Politik', u

In [20]:
# Get an example of tag "lobbyismus"
pprint(list(db.entities.find({ "tags": {"$eq": "parteispenden"}}).limit(5)))

[{u'_id': ObjectId('54bd3c768b934da06340f50a'),
  u'aliases': [],
  u'created': datetime.datetime(2015, 1, 19, 17, 18, 46, 830000),
  u'data': [{u'auto': True,
             u'created': datetime.datetime(2015, 1, 23, 19, 44, 50, 892000),
             u'desc': u'Quelle',
             u'format': u'link',
             u'id': u'870f0255a688738d3f1cbac751e60f4f48e0cac85842ce881dbdbabc8e502d58',
             u'key': u'source',
             u'updated': datetime.datetime(2015, 1, 23, 19, 44, 50, 892000),
             u'value': {u'remark': u'created by parteispenden importer',
                        u'url': u'http://apps.opendatacity.de/parteispenden-recherche/assets/data/parteispenden.json'}},
            {u'auto': True,
             u'created': datetime.datetime(2015, 1, 23, 19, 44, 50, 892000),
             u'desc': u'Adresse',
             u'format': u'address',
             u'id': u'7e3ebbe597747c947566f51876b736521b64daf724df9e0542b7b1cc0b355488',
             u'key': u'address',
        

## Type

In [9]:
# Aggregation mit Gruppierung der Dokumente nach Feld "type", zählen der Gruppen und aufsteigend danach sortieren
pipeline = [ 
    { "$group": {"_id": "$type", "count": {"$sum": 1}} },
    { "$sort": {"count":1} }
]

distinct_types_count = list(db.entities.aggregate(pipeline, cursor={}))
pprint(distinct_types_count)
print("Total: %s distinct types." % len(distinct_types_count))

[{u'_id': u'entity', u'count': 6552}, {u'_id': u'person', u'count': 19828}]
Total: 2 distinct types.


# Importer

In [10]:
# Aggregation mit Gruppierung der Dokumente nach Feld "importer", zählen der Gruppen und aufsteigend danach sortieren
pipeline = [ 
    { "$group": {"_id": "$importer", "count": {"$sum": 1}} },
    { "$sort": {"count": -1} }
]

distinct_importer_count = list(db.entities.aggregate(pipeline, cursor={}))
pprint(distinct_importer_count)
print("Total: %s distinct importer." % len(distinct_importer_count))

[{u'_id': u'lobbyliste', u'count': 16804},
 {u'_id': u'parteispenden', u'count': 2914},
 {u'_id': u'bundestag', u'count': 2046},
 {u'_id': u'pr', u'count': 1767},
 {u'_id': None, u'count': 1063},
 {u'_id': u'dax', u'count': 541},
 {u'_id': u'laender', u'count': 370},
 {u'_id': u'parteispenden13', u'count': 311},
 {u'_id': u'thinktanks', u'count': 309},
 {u'_id': u'seitenwechsler', u'count': 214},
 {u'_id': u'kabinette', u'count': 34},
 {u'_id': u'parteispenden14', u'count': 6},
 {u'_id': u'parteien', u'count': 1}]
Total: 13 distinct importer.


## Data

In [11]:
# Werte von "Desc"
pipeline = [
    {"$unwind": "$data"},
    {"$group": {"_id": {"desc": "$data.desc", "key": "$data.key"}, "count": {"$sum": 1}}},
    {"$sort": {"count": -1} }
]

distinct_desc_types = list(db.entities.aggregate(pipeline, cursor={}))
pprint(distinct_desc_types)

[{u'_id': {u'desc': u'Quelle', u'key': u'source'}, u'count': 27828},
 {u'_id': {u'desc': u'Titel', u'key': u'titles'}, u'count': 14560},
 {u'_id': {u'desc': u'Adresse', u'key': u'address'}, u'count': 12712},
 {u'_id': {u'desc': u'Link', u'key': u'link'}, u'count': 5433},
 {u'_id': {u'desc': u'Nachname', u'key': u'surname'}, u'count': 3445},
 {u'_id': {u'desc': u'Vornamen', u'key': u'names'}, u'count': 3441},
 {u'_id': {u'desc': u'Foto', u'key': u'photo'}, u'count': 2433},
 {u'_id': {u'desc': u'Beschreibungstext', u'key': u'description'},
  u'count': 2370},
 {u'_id': {u'desc': u'Anzahl der Mitglieder', u'key': u'members'},
  u'count': 2118},
 {u'_id': {u'desc': u'Anzahl der Mitgliedsorganisationen',
           u'key': u'organisations'},
  u'count': 953},
 {u'_id': {u'desc': u'Thema', u'key': u'topic'}, u'count': 684},
 {u'_id': {u'desc': u'Bundesland', u'key': u'bundesland'}, u'count': 629},
 {u'_id': {u'desc': u'Benutzename BT-Cert', u'key': u'btcertuid'},
  u'count': 608},
 {u'_id': {

In [12]:
pprint(entities.find_one({ "data.desc": {"$eq": "Anzahl der Mitglieder"}}))

{u'_id': ObjectId('54bd3c768b934da06340f4c5'),
 u'aliases': [],
 u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
 u'data': [{u'auto': True,
            u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'desc': u'Quelle',
            u'format': u'link',
            u'id': u'9f8cdd62674e492bfbf99f1950696edf49aec861cee09b275fc227a3dd4c0d97',
            u'key': u'source',
            u'updated': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'value': {u'remark': u'created by lobbyliste importer',
                       u'url': u'http://bundestag.de/blob/189476/8989cc5f5f65426215d7e0233704b20a/lobbylisteaktuell-data.pdf'}},
           {u'auto': True,
            u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'desc': u'Adresse',
            u'format': u'address',
            u'id': u'0191e4e640931e690d1f700a4334dd24be8a57a5284883892b74f4462af77970',
            u'key': u'address',
            u'update

In [13]:
pprint(entities.find_one({ "data.desc": {"$eq": "Anzahl der Mitarbeiter"}}))

{u'_id': ObjectId('54bd3c808b934da063410df4'),
 u'aliases': [u'DGAP'],
 u'created': datetime.datetime(2015, 1, 19, 17, 18, 45, 552000),
 u'data': [{u'auto': True,
            u'created': datetime.datetime(2015, 1, 30, 13, 16, 43, 872000),
            u'desc': u'Quelle',
            u'format': u'link',
            u'id': u'9fa39ace3522476a01edc7803c68efd2bc3d99c7fe0b7c97ea24a02b2cf4a704',
            u'key': u'source',
            u'updated': datetime.datetime(2015, 1, 30, 13, 16, 43, 872000),
            u'value': {u'remark': u'created by bundestag importer',
                       u'url': u'http://www.bundestag.de/bundestag/abgeordnete18/biografien/M/missfelder_philipp/258790'}},
           {u'auto': True,
            u'created': datetime.datetime(2015, 1, 30, 13, 16, 43, 872000),
            u'desc': u'Adresse',
            u'format': u'address',
            u'id': u'b9083c019f8d8aa1a221b5318bac2a45e81736728a12aa5b353391ebff4666bc',
            u'key': u'address',
            u'updat

In [14]:
# Print an example value of each description type
for desc in distinct_desc_types:
    desc_type = desc['_id']['desc']
    entity_with_desc_type = db.entities.find_one({ "data.desc": {"$eq": desc_type} }, { "data": 1 })
    #entity_with_desc_type = db.entities.find_one({ "data.desc": {"$eq": desc_type} }, { "data": {"$elemMatch": {"data.desc": desc_type}} })
    pprint(entity_with_desc_type)
    
    

{u'_id': ObjectId('54bd3c768b934da06340f4c5'),
 u'data': [{u'auto': True,
            u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'desc': u'Quelle',
            u'format': u'link',
            u'id': u'9f8cdd62674e492bfbf99f1950696edf49aec861cee09b275fc227a3dd4c0d97',
            u'key': u'source',
            u'updated': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'value': {u'remark': u'created by lobbyliste importer',
                       u'url': u'http://bundestag.de/blob/189476/8989cc5f5f65426215d7e0233704b20a/lobbylisteaktuell-data.pdf'}},
           {u'auto': True,
            u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'desc': u'Adresse',
            u'format': u'address',
            u'id': u'0191e4e640931e690d1f700a4334dd24be8a57a5284883892b74f4462af77970',
            u'key': u'address',
            u'updated': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
            u'value': {u'a

In [15]:
db.entities.find_one({ "data.desc": {"$eq": "Quelle"} })

{u'_id': ObjectId('54bd3c768b934da06340f4c5'),
 u'aliases': [],
 u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
 u'data': [{u'auto': True,
   u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
   u'desc': u'Quelle',
   u'format': u'link',
   u'id': u'9f8cdd62674e492bfbf99f1950696edf49aec861cee09b275fc227a3dd4c0d97',
   u'key': u'source',
   u'updated': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
   u'value': {u'remark': u'created by lobbyliste importer',
    u'url': u'http://bundestag.de/blob/189476/8989cc5f5f65426215d7e0233704b20a/lobbylisteaktuell-data.pdf'}},
  {u'auto': True,
   u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
   u'desc': u'Adresse',
   u'format': u'address',
   u'id': u'0191e4e640931e690d1f700a4334dd24be8a57a5284883892b74f4462af77970',
   u'key': u'address',
   u'updated': datetime.datetime(2015, 1, 19, 17, 18, 44, 930000),
   u'value': {u'addr': u'Hinter der katholischen Kirche 3',
    u'city': u'Berlin',
    u

In [16]:
pprint(list(entities.find({ "name": "Angela Merkel" })))

[{u'_id': ObjectId('54c2a4b4fe6a42c82bbaafac'),
  u'aliases': [u'Dr. Angela Merkel', u'Angela Merkel'],
  u'created': datetime.datetime(2015, 1, 23, 19, 44, 50, 772000),
  u'data': [{u'auto': True,
             u'created': datetime.datetime(2015, 1, 30, 13, 16, 19, 123000),
             u'desc': u'Quelle',
             u'format': u'link',
             u'id': u'9dc681ff88ef78470f58dad0a22fde397147a9f46b416478d2fd87233fa728ab',
             u'key': u'source',
             u'updated': datetime.datetime(2015, 1, 30, 13, 16, 19, 123000),
             u'value': {u'remark': u'created by bundestag importer',
                        u'url': u'http://www.bundestag.de/bundestag/abgeordnete18/biografien/M/merkel_angela/258788'}},
            {u'auto': True,
             u'created': datetime.datetime(2015, 1, 30, 13, 16, 19, 123000),
             u'desc': u'Vornamen',
             u'format': u'string',
             u'id': u'dfdee4542498db7db194d7f76a2b7912d5de8efa322e97eb49c7bee72028fe7a',
        

## Data