In [125]:
import pymongo
from datetime import datetime
import numpy as np

In [22]:
# Connect to MongoDB using pymongo's default connection settings.
# NOTE: despite its name, `db` holds the MongoClient, not a single database.
db = pymongo.MongoClient()

# Collection handles: `houses` lives in the `houses` database,
# `dail` lives in the `texts` database.
houses = db["houses"]["houses"]
dail = db["texts"]["dail"]

Retrieve the start and end dates for an individual Dáil, then retrieve the documents for that Dáil.

In [74]:
# Look up the house record with house_type '0' and house_num "4"
# (presumably the 4th Dáil -- confirm against the `houses` schema).
# The projection requests only the house number and the two term dates.
house_query = {"house_type": '0', 'house_num': "4"}
house_fields = {"house_num": 1, "start_term": 1, "end_term": 1}
dail4 = houses.find_one(house_query, house_fields)

# Show the term boundaries; `.date()` is called on each stored term value so
# only the calendar date is printed.
print(
    "Dáil:", dail4['house_num'],
    "\nStart date:", dail4['start_term'].date(),
    "\nEnd date:", dail4['end_term'].date()
)

Dáil: 4 
Start date: 1923-09-19 
End date: 1927-05-23


### Comparison of different aggregation methods

1. MongoDB's aggregation pipeline (fastest)
2. NumPy sum function over MongoDB cursor converted to list
3. NumPy sum over MongoDB generator object (slowest and overly complicated)
4. Python sum function over MongoDB generator object

In [120]:
print("MongoDB aggregation pipeline\n---")

%time d = dail.aggregate([{"$match": {"date": {"$gte":dail4["start_term"], "$lte":dail4['end_term']}}}, {"$group": {"_id": None, "sum": {"$sum": "$len_doc"}}}]).next()

print("\nTotal number of tokens (words) uttered: {:,.0f}".format(d['sum']))

MongoDB aggregation pipeline
---
CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 296 ms

Total number of tokens (words) uttered: 12,992,570


In [121]:
obj = dail.find({"date": {"$gte":dail4["start_term"], "$lte":dail4['end_term']}}, {"len_doc":1})
print("Numpy sum over list to array\n---")
%time a = np.sum(np.array(list(f['len_doc'] for f in obj)))
print("\nTotal number of tokens (words) uttered: {:,.0f}".format(a))

Numpy sum over list to array
---
CPU times: user 328 ms, sys: 12 ms, total: 340 ms
Wall time: 743 ms

Total number of tokens (words) uttered: 12,992,570


In [123]:
obj = dail.find({"date": {"$gte":dail4["start_term"], "$lte":dail4['end_term']}}, {"len_doc":1})
print("Numpy sum over array from generator.\n---")
print("Counting...")
%time c = obj.count()
print("Creating array of requisite length...")
%time a = np.zeros(c)
print("Replacing elements in array from obj...")
%time for i, el in enumerate(obj): a[i] = el['len_doc']
print("Summing array...")
%time a= np.sum(a)
print("\nTotal number of tokens (words) uttered: {:,.0f}".format(a))

Numpy sum over array from generator.
---
Counting...
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 169 ms
Creating array of requisite length...
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 171 µs
Replacing elements in array from obj...
CPU times: user 408 ms, sys: 24 ms, total: 432 ms
Wall time: 701 ms
Summing array...
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 232 µs

Total number of tokens (words) uttered: 12,992,570


In [124]:
obj = dail.find({"date": {"$gte":dail4["start_term"], "$lte":dail4['end_term']}}, {"len_doc":1})
print("Python sum over object.\n---")
%time a = sum(f['len_doc'] for f in obj)
print("\nTotal number of tokens (words) uttered: {:,.0f}".format(a))

Python sum over object.
---
CPU times: user 360 ms, sys: 4 ms, total: 364 ms
Wall time: 686 ms

Total number of tokens (words) uttered: 12,992,570


In [62]:
# Create a cursor over the documents dated within the term, projecting only
# `len_doc`. No visible cell consumes this cursor afterwards.
obj = dail.find(
    {"date": {"$gte": dail4["start_term"], "$lte": dail4['end_term']}},
    {"len_doc": 1}
)

1