In [1]:
# Standard library
import os
from pprint import pprint, pformat

# Third-party: MongoDB driver and the ordered-dict type used for $sort specs
import pymongo
from bson.son import SON
from pymongo import MongoClient

In [2]:
# Connect to MongoDB with the driver defaults (MongoClient() is called without
# arguments) and bind short names for the database and its two collections.
client = MongoClient()
db = client["lobbyradar"]
entities = db["entities"]
relations = db["relations"]

# Summarise the database dump: user collections and the size of each one.
# NOTE(review): Database.collection_names() and Collection.count() were
# deprecated in PyMongo 3.7 and removed in 4.0 -- confirm the driver version
# this notebook is meant to run against before upgrading.
collection_names = db.collection_names(include_system_collections=False)
print("Collections in database: " + ", ".join(collection_names))

entity_total = entities.count()
print("Entities: %s documents" % entity_total)

relation_total = relations.count()
print("Relations: %s documents" % relation_total)

Collections in database: entities, relations
Entities: 26380 documents
Relations: 32137 documents


# The entities collection

Für die erste Untersuchung der Collections wird das Tool Variety verwendet (https://github.com/variety/variety). Es hilft dabei, die Daten zu verstehen und einen Überblick über sie zu bekommen.

In [None]:
%%!

In [10]:
# Look at the first ten entity documents to get a feel for the record layout.
entities_documents = entities.find().limit(10)

# Drain the cursor into a list so pprint can render all documents at once.
sample_documents = list(entities_documents)
pprint(sample_documents)

[{u'_id': ObjectId('54bd3c748b934da06340f4c1'),
  u'aliases': [u'DIE LINKE',
               u'Die Linke',
               u'Partei DIE LINKE',
               u'DIE LINKE.',
               u'Linkspartei',
               u'Linkspartei.PDS',
               u'PDS',
               u'WASG',
               u'Partei des Demokratischen Sozialismus',
               u'Arbeit & soziale Gerechtigkeit \u2013 Die Wahlalternative',
               u'Wahlalternative',
               u'Wahlalternative Arbeit und soziale Gerechtigkeit'],
  u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 807000),
  u'data': [{u'auto': True,
             u'created': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
             u'desc': u'Partei',
             u'format': u'string',
             u'id': u'2b1adb60a31d37cf9cc0fdccb75149456a425095c6bf5e77abac117ed1a69d0f',
             u'key': u'partei',
             u'updated': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
             u'value': u'Die Linke'},
    

In [None]:
# List every distinct value stored in the "type" field of the entities collection.
print("Distinct entity types: " + pformat(db.entities.distinct("type")))

In [None]:
from bson.son import SON

# Aggregation pipeline shared by both collections:
#   1. group the documents by their "type" field and count the members of each
#      group in a field called "counter";
#   2. sort the groups by "counter" and then by "_id", both descending.
# SON is used for the sort spec because it keeps the key order as written.
pipeline = [
    {"$group": {"_id": "$type", "counter": {"$sum": 1}}},   # step 1: group + count
    {"$sort": SON([("counter", -1), ("_id", -1)])}          # step 2: sort
]

# Wrap each aggregate() call in list() to get all values at once instead of a cursor.
print("Entity types count:")
print(list(db.entities.aggregate(pipeline, cursor={})))
print("Relation types count:")
print(list(db.relations.aggregate(pipeline, cursor={})))

In [None]:
# Group the documents of the entities collection by their "importer" field,
# count the members of each group, and sort the groups by that count in
# ascending order.
pipeline = [
    {"$group": {"_id": "$importer", "count": {"$sum": 1}}},
    {"$sort": {"count": 1}},
]

# Bare last expression: the notebook displays the materialised result list.
list(db.entities.aggregate(pipeline, cursor={}))


In [None]:
# Entities of type "entity" whose "importer" attribute is "parteispenden14".
query = {'type': 'entity', 'importer': 'parteispenden14'}
projection = {'_id': 0, 'name': 1}  # return only the "name" field

# Fetch the matches once and reuse them for both the listing and the total,
# instead of sending the same query to the server a second time just to count.
matching_entities = list(db.entities.find(query, projection))
for i in matching_entities:
    pprint(i)

print("%s Entitäten mit importer parteispenden14 gefunden" % len(matching_entities))

In [None]:
# Group by "importer", count the elements of each group, then sort ascending
# by that count.
# NOTE(review): this cell builds and runs the same pipeline as the earlier
# "importer" aggregation cell -- consider keeping only one of them.
group_by_importer = {"$group": {"_id": "$importer", "count": {"$sum": 1}}}
sort_by_count_ascending = {"$sort": {"count": 1}}
pipeline = [group_by_importer, sort_by_count_ascending]

list(db.entities.aggregate(pipeline, cursor={}))

In [None]:
# Query on an array field (here: "tags"). A scalar value in the filter matches
# every document whose array contains that value.
query = {'tags': "representative"}
one_representative = db.entities.find_one(query)

# find_one() returns None when no document matches, so guard before calling
# .get() on the result instead of failing with an AttributeError.
if one_representative is None:
    print("No entity with tag \"representative\" found.")
else:
    print("\"%s\" is a representative." % one_representative.get("name"))

# Last expression of the cell: show the tags of the matched entity (displays
# nothing when there was no match).
one_representative.get("tags") if one_representative is not None else None

In [None]:
# For comparison: how many entities carry the tag "Medien" and how many carry
# the tag "lobbyismus".
# NOTE(review): Cursor.count() was deprecated in PyMongo 3.7 and removed in
# 4.0 -- confirm the driver version this notebook targets.
medien_count = db.entities.find({'tags': "Medien"}).count()
lobbyismus_count = db.entities.find({'tags': "lobbyismus"}).count()
print("Die Datenbank enthält %s Entitäten mit Tag Medien und %s Entitäten mit Tag lobbyismus" % (medien_count, lobbyismus_count))