### Make sure the MongoDB service is up and running
### _First time only_: Import entities and relations into the database

Bash:
```bash
# Restore each BSON dump file into the collection of the same name in the "lobbyradar" database.
mongorestore --db lobbyradar --collection entities dumps/entities.bson
mongorestore --db lobbyradar --collection relations dumps/relations.bson
```

In [14]:
from notebook.nbextensions import check_nbextension, install_nbextension

# Download the vim key-binding extension into the per-user nbextensions
# directory under a fixed file name, then confirm that the file is present.
vim_binding_url = 'https://goo.gl/5TK96v'
vim_binding_file = "vim_binding.js"
install_nbextension(vim_binding_url, user=True, destination=vim_binding_file)
# Bare last expression so the notebook displays the check result.
check_nbextension(vim_binding_file, user=True)

downloading https://goo.gl/5TK96v to /tmp/tmpYlEbLE/5TK96v
copying /tmp/tmpYlEbLE/5TK96v -> /home/reiscracker/.local/share/jupyter/nbextensions/vim_binding.js


True

In [15]:
%%javascript
// Ask the notebook front end to load the 'vim_binding' extension.
// NOTE(review): presumably this is the vim_binding.js file installed by the preceding cell — confirm.
Jupyter.utils.load_extensions('vim_binding')

<IPython.core.display.Javascript object>

## Connect to database

In [16]:
import pymongo
from pymongo import MongoClient
from pprint import pprint, pformat

# Called without arguments, MongoClient connects to the default address (localhost).
client = MongoClient()
# Handle to the "lobbyradar" database; item access is equivalent to client.lobbyradar.
db = client["lobbyradar"]

## The collections

### Numbers

In [17]:
# Overview of the database: list its non-system collections, then report
# how many documents each of the two collections holds.
# NOTE(review): collection_names() and count() are deprecated in newer PyMongo
# releases (list_collection_names() / count_documents()) — revisit when upgrading.
collection_list = ", ".join(db.collection_names(include_system_collections=False))
print("Collections in database: " + collection_list)

entity_total = db.entities.count()
print("Entities: %s documents" % entity_total)
relation_total = db.relations.count()
print("Relations: %s documents" % relation_total)

Collections in database: entities, relations
Entities: 26380 documents
Relations: 32137 documents


### Simple queries

In [18]:
# Collect every distinct value of the "type" field across the entities collection.
entity_types = db.entities.distinct("type")
print("Distict entity types:" + pformat(entity_types))

Distict entity types:[u'entity', u'person']


In [19]:
# Collect every distinct value of the "type" field across the relations collection.
relation_types = db.relations.distinct("type")
print("Distict relation types:" + pformat(relation_types))

Distict relation types:[u'general', u'publication']


In [20]:
# Fetch a single entity document and pretty-print it to see what fields it carries.
first_entity = db.entities.find_one()
print("First Entity document in database:" + pformat(first_entity) + "\n")
# To walk over all entities instead, obtain a cursor and advance it step by step:
#entity_cursor = db.entities.find() # Would find all entities
#entity_cursor.next() # Iterator for resultset

First Entity document in database:{u'_id': ObjectId('54bd3c748b934da06340f4c1'),
 u'aliases': [u'DIE LINKE',
              u'Die Linke',
              u'Partei DIE LINKE',
              u'DIE LINKE.',
              u'Linkspartei',
              u'Linkspartei.PDS',
              u'PDS',
              u'WASG',
              u'Partei des Demokratischen Sozialismus',
              u'Arbeit & soziale Gerechtigkeit \u2013 Die Wahlalternative',
              u'Wahlalternative',
              u'Wahlalternative Arbeit und soziale Gerechtigkeit'],
 u'created': datetime.datetime(2015, 1, 19, 17, 18, 44, 807000),
 u'data': [{u'auto': True,
            u'created': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
            u'desc': u'Partei',
            u'format': u'string',
            u'id': u'2b1adb60a31d37cf9cc0fdccb75149456a425095c6bf5e77abac117ed1a69d0f',
            u'key': u'partei',
            u'updated': datetime.datetime(2015, 5, 28, 18, 11, 9, 657000),
            u'value': u'Die L

In [21]:
# Fetch a single relation document and pretty-print it to see what fields it carries.
first_relation = db.relations.find_one()
print("First relation document in database:" + pformat(first_relation) + "\n")
# To walk over all relations instead, obtain a cursor and advance it step by step:
#relation_cursor = db.relations.find()
#relation_cursor.next()

First relation document in database:{u'_id': ObjectId('54bd3c968b934da063413717'),
 u'created': datetime.datetime(2015, 1, 19, 17, 19, 18, 521000),
 u'data': [{u'desc': u'Verbindung',
            u'format': u'association',
            u'importer': u'created by lobbyliste importer',
            u'key': u'association',
            u'value': {u'position': u'Vorstand',
                       u'sources': [u'http://bundestag.de/blob/189476/8989cc5f5f65426215d7e0233704b20a/lobbylisteaktuell-data.pdf'],
                       u'type': u'executive'}}],
 u'entities': [ObjectId('54bd3c768b934da06340f4c5'),
               ObjectId('54bd3c768b934da06340f4c7')],
 u'importer': u'lobbyliste',
 u'tags': [],
 u'type': u'general',
 u'updated': datetime.datetime(2015, 1, 19, 17, 19, 18, 521000),
 u'weight': 1}



### Aggregations

In [22]:
from bson.son import SON

# Two-stage aggregation pipeline:
#   1. $group - bucket the documents by their "type" field and count the
#      members of each bucket in a field named "counter".
#   2. $sort  - order the buckets by "counter", then by "_id", both descending.
#      SON keeps the sort keys in the order they are listed.
group_by_type = {"$group": {"_id": "$type", "counter": {"$sum": 1}}}
sort_by_counter = {"$sort": SON([("counter", -1), ("_id", -1)])}
pipeline = [group_by_type, sort_by_counter]

# Run the same pipeline against both collections. list() drains the cursor
# so that all groups are printed at once.
for heading, collection in (
    ("Entitiy types count:", db.entities),
    ("Relation types count:", db.relations),
):
    print(heading)
    print(list(collection.aggregate(pipeline, cursor={})))

Entitiy types count:
[{u'_id': u'person', u'counter': 19828}, {u'_id': u'entity', u'counter': 6552}]
Relation types count:
[{u'_id': u'general', u'counter': 32136}, {u'_id': u'publication', u'counter': 1}]


In [23]:
# Aggregation over the entities collection: group the documents by their
# "importer" field, count the documents per group, and sort the groups by
# that count in ascending order.
count_per_importer = {"$group": {"_id": "$importer", "count": {"$sum": 1}}}
ascending_by_count = {"$sort": {"count": 1}}
pipeline = [count_per_importer, ascending_by_count]
# Bare last expression so the notebook displays the full result list.
list(db.entities.aggregate(pipeline, cursor={}))

[{u'_id': u'parteien', u'count': 1},
 {u'_id': u'parteispenden14', u'count': 6},
 {u'_id': u'kabinette', u'count': 34},
 {u'_id': u'seitenwechsler', u'count': 214},
 {u'_id': u'thinktanks', u'count': 309},
 {u'_id': u'parteispenden13', u'count': 311},
 {u'_id': u'laender', u'count': 370},
 {u'_id': u'dax', u'count': 541},
 {u'_id': None, u'count': 1063},
 {u'_id': u'pr', u'count': 1767},
 {u'_id': u'bundestag', u'count': 2046},
 {u'_id': u'parteispenden', u'count': 2914},
 {u'_id': u'lobbyliste', u'count': 16804}]

In [28]:
# Select the documents of type 'entity' whose importer is 'parteispenden14'.
query = {'type': 'entity', 'importer': 'parteispenden14'}
# Return only the "name" field; "_id" is included by default and has to be
# switched off explicitly.
projection = {'_id': 0, 'name': 1}

# Run the query once and keep the results. The count printed below then comes
# from exactly the documents shown, without a second round trip and without
# Cursor.count(), which newer PyMongo releases no longer provide.
matching_entities = list(db.entities.find(query, projection))
for entity in matching_entities:
    pprint(entity)

print("%s Entitäten mit importer parteispenden14 gefunden" % len(matching_entities))

{u'_id': ObjectId('550f0eabe073b81931ff68ff'),
 u'name': u'Firma R & W Industriebeteiligungen GmbH'}
{u'_id': ObjectId('550f0eabe073b81931ff6902'),
 u'name': u'R&W Industriebeteiligungen GmbH'}
{u'_id': ObjectId('550f0eabe073b81931ff6906'), u'name': u'MLPD'}
3 Entitäten mit importer parteispenden14 gefunden
