# MongoDB review

In [1]:
# import the required libraries

import pymongo
from datetime import datetime

import pandas as pd

In [2]:
# Open a connection to tne Mongo server
client = pymongo.MongoClient('mongodb://localhost:27351')

In [3]:
client

MongoClient(host=['localhost:27351'], document_class=dict, tz_aware=False, connect=True)

In [4]:
# remove the doctor who db if it already exists
try:
    client.drop_database('doctorwho')
except NameError:
    print("DB doesn't exist yet.")

In [5]:
# Create a doctor who database akd a test_collection within it.
dw_db = client.doctorwho
tc = dw_db.test_collection


## Create

In [6]:
# insert a single document
tc.insert_one({'name':'William', 'birthyear': 1908})

<pymongo.results.InsertOneResult at 0x7fccc0d0ed80>

In [7]:
tc.find_one()

{'_id': ObjectId('5ae05ec60fd01f06464543aa'),
 'birthyear': 1908,
 'name': 'William'}

In [8]:
# insert a few (zip takes some lists and returns a list of tuples)
for n, b, c in zip('Patrick Jon Gom Peter Colin Sylvester Paul Christopher David Matt Peter'.split(),
                  [1920, 1919, 1934, 1951, 1943, 1943, 1959, 1964, 1971, 1982, 1958],
                  [1,2,3,4,5,6,7,8,9,10,11]):
    tc.insert_one({'name': n, 'birthyear': b, 'c': c})

## Read

In [9]:
tc.find_one()

{'_id': ObjectId('5ae05ec60fd01f06464543aa'),
 'birthyear': 1908,
 'name': 'William'}

In [10]:
tc.find_one({'name':'Peter'})

{'_id': ObjectId('5ae05ec60fd01f06464543ae'),
 'birthyear': 1951,
 'c': 4,
 'name': 'Peter'}

In [11]:
tc.find_one({'birthyear':1943})

{'_id': ObjectId('5ae05ec60fd01f06464543af'),
 'birthyear': 1943,
 'c': 5,
 'name': 'Colin'}

In [12]:
# find() will find all the documents that match the query, 
# returning a cursor that can be iterated over
tc.find({'name': 'Peter'})

<pymongo.cursor.Cursor at 0x7fccc047ada0>

In [13]:
# convert cursor to a list to make a list of dicts
list(tc.find({'name': 'Peter'}))

[{'_id': ObjectId('5ae05ec60fd01f06464543ae'),
  'birthyear': 1951,
  'c': 4,
  'name': 'Peter'},
 {'_id': ObjectId('5ae05ec60fd01f06464543b5'),
  'birthyear': 1958,
  'c': 11,
  'name': 'Peter'}]

In [14]:
# we can iterate directly over the cursor 
# but this is a one pass process
cursor = tc.find({'name': 'Peter'})
for p in cursor:
    print(p)

{'c': 4, 'birthyear': 1951, 'name': 'Peter', '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'c': 11, 'birthyear': 1958, 'name': 'Peter', '_id': ObjectId('5ae05ec60fd01f06464543b5')}


In [15]:
# if we try to print again we get nothing as the cursor has
# already been iterated over once
for p in cursor:
    print(p)

### using the cursor to tell us how many documents match the query

In [16]:
q = {'name': 'Peter'}
tc.find(q).count()

2

In [17]:
# Mongo projection
for p in tc.find({'name': 'Peter'}, ['birthyear']):
    print(p)

{'birthyear': 1951, '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'birthyear': 1958, '_id': ObjectId('5ae05ec60fd01f06464543b5')}


`_id` is included by default.  we can adjust using include/exclude

In [18]:
for p in tc.find({'name': 'Peter'}, {'birthyear':1,'_id':0}):
    print(p)

{'birthyear': 1951}
{'birthyear': 1958}


### Limit the number of documents returned

In [19]:
for p in tc.find({'birthyear': {'$lt':1950}}, ['name','birthyear'], limit=3):
    print(p['name'])

William
Patrick
Jon


## Reading into DataFrames


In [20]:
# we can create a DF from a list of Dicts
# so via a cast we can create a DF using the results of a find() 
pd.DataFrame(list(tc.find({})))

Unnamed: 0,_id,birthyear,c,name
0,5ae05ec60fd01f06464543aa,1908,,William
1,5ae05ec60fd01f06464543ab,1920,1.0,Patrick
2,5ae05ec60fd01f06464543ac,1919,2.0,Jon
3,5ae05ec60fd01f06464543ad,1934,3.0,Gom
4,5ae05ec60fd01f06464543ae,1951,4.0,Peter
5,5ae05ec60fd01f06464543af,1943,5.0,Colin
6,5ae05ec60fd01f06464543b0,1943,6.0,Sylvester
7,5ae05ec60fd01f06464543b1,1959,7.0,Paul
8,5ae05ec60fd01f06464543b2,1964,8.0,Christopher
9,5ae05ec60fd01f06464543b3,1971,9.0,David


## Update

adding a surname to one of the records""

In [21]:
patrick = tc.find_one({"name": "Patrick"})
print(patrick)

{'c': 1, 'birthyear': 1920, 'name': 'Patrick', '_id': ObjectId('5ae05ec60fd01f06464543ab')}


In [22]:
result = tc.update_one({'name': 'Patrick'}, {'$set': {'surname': 'Trouton'}})
# we can interrogate the result to see what effect the update had on the collection
result.matched_count, result.modified_count

(1, 1)

(one document found, one document updated).

In [23]:
for p in tc.find({'name': 'Patrick'}):
    print(p)

{'c': 1, 'birthyear': 1920, 'name': 'Patrick', 'surname': 'Trouton', '_id': ObjectId('5ae05ec60fd01f06464543ab')}


In [24]:
# Update many
# set a new key:value for those doctors named Peter
result = tc.update_many({'name': 'Peter'}, {'$set': {'multi_updated': True}})
result.matched_count, result.modified_count

(2, 2)

In [25]:
for p in tc.find():
    print(p)

{'birthyear': 1908, 'name': 'William', '_id': ObjectId('5ae05ec60fd01f06464543aa')}
{'c': 1, 'birthyear': 1920, 'name': 'Patrick', 'surname': 'Trouton', '_id': ObjectId('5ae05ec60fd01f06464543ab')}
{'c': 2, 'birthyear': 1919, 'name': 'Jon', '_id': ObjectId('5ae05ec60fd01f06464543ac')}
{'c': 3, 'birthyear': 1934, 'name': 'Gom', '_id': ObjectId('5ae05ec60fd01f06464543ad')}
{'c': 4, 'birthyear': 1951, 'name': 'Peter', 'multi_updated': True, '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'c': 5, 'birthyear': 1943, 'name': 'Colin', '_id': ObjectId('5ae05ec60fd01f06464543af')}
{'c': 6, 'birthyear': 1943, 'name': 'Sylvester', '_id': ObjectId('5ae05ec60fd01f06464543b0')}
{'c': 7, 'birthyear': 1959, 'name': 'Paul', '_id': ObjectId('5ae05ec60fd01f06464543b1')}
{'c': 8, 'birthyear': 1964, 'name': 'Christopher', '_id': ObjectId('5ae05ec60fd01f06464543b2')}
{'c': 9, 'birthyear': 1971, 'name': 'David', '_id': ObjectId('5ae05ec60fd01f06464543b3')}
{'c': 10, 'birthyear': 1982, 'name': 'Matt', '_id': Ob

In [26]:
# remove the new value using the unset method (the value is ignored)
tc.update_many({'name':'Peter'},{'$unset':{'multi_updated': ''}})
for p in tc.find():
    print(p)

{'birthyear': 1908, 'name': 'William', '_id': ObjectId('5ae05ec60fd01f06464543aa')}
{'c': 1, 'birthyear': 1920, 'name': 'Patrick', 'surname': 'Trouton', '_id': ObjectId('5ae05ec60fd01f06464543ab')}
{'c': 2, 'birthyear': 1919, 'name': 'Jon', '_id': ObjectId('5ae05ec60fd01f06464543ac')}
{'c': 3, 'birthyear': 1934, 'name': 'Gom', '_id': ObjectId('5ae05ec60fd01f06464543ad')}
{'c': 4, 'birthyear': 1951, 'name': 'Peter', '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'c': 5, 'birthyear': 1943, 'name': 'Colin', '_id': ObjectId('5ae05ec60fd01f06464543af')}
{'c': 6, 'birthyear': 1943, 'name': 'Sylvester', '_id': ObjectId('5ae05ec60fd01f06464543b0')}
{'c': 7, 'birthyear': 1959, 'name': 'Paul', '_id': ObjectId('5ae05ec60fd01f06464543b1')}
{'c': 8, 'birthyear': 1964, 'name': 'Christopher', '_id': ObjectId('5ae05ec60fd01f06464543b2')}
{'c': 9, 'birthyear': 1971, 'name': 'David', '_id': ObjectId('5ae05ec60fd01f06464543b3')}
{'c': 10, 'birthyear': 1982, 'name': 'Matt', '_id': ObjectId('5ae05ec60fd01f0

In [27]:
# to make individual updates to each document, we can use the 
# documents `_id`

import datetime
for p in tc.find():
    tc.update_one({'_id': p['_id']}, {'$set': {'age': datetime.datetime.now().year - p['birthyear']}})

for p in tc.find():
    print(p)

{'birthyear': 1908, 'name': 'William', 'age': 110, '_id': ObjectId('5ae05ec60fd01f06464543aa')}
{'c': 1, 'age': 98, 'birthyear': 1920, 'name': 'Patrick', 'surname': 'Trouton', '_id': ObjectId('5ae05ec60fd01f06464543ab')}
{'c': 2, 'birthyear': 1919, 'name': 'Jon', 'age': 99, '_id': ObjectId('5ae05ec60fd01f06464543ac')}
{'c': 3, 'birthyear': 1934, 'name': 'Gom', 'age': 84, '_id': ObjectId('5ae05ec60fd01f06464543ad')}
{'c': 4, 'birthyear': 1951, 'name': 'Peter', 'age': 67, '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'c': 5, 'birthyear': 1943, 'name': 'Colin', 'age': 75, '_id': ObjectId('5ae05ec60fd01f06464543af')}
{'c': 6, 'birthyear': 1943, 'name': 'Sylvester', 'age': 75, '_id': ObjectId('5ae05ec60fd01f06464543b0')}
{'c': 7, 'birthyear': 1959, 'name': 'Paul', 'age': 59, '_id': ObjectId('5ae05ec60fd01f06464543b1')}
{'c': 8, 'birthyear': 1964, 'name': 'Christopher', 'age': 54, '_id': ObjectId('5ae05ec60fd01f06464543b2')}
{'c': 9, 'birthyear': 1971, 'name': 'David', 'age': 47, '_id': Obje

## Activity 3
Classify the people into 2 groups born in or before 1945 and those born after

`'age': 'old'` and '`age':'young'`

In [28]:
tc.update_many({'birthyear': {'$lte': 1945}}, {'$set': {'age': 'old'}})
tc.update_many({'birthyear': {'$gt': 1945}}, {'$set': {'age': 'young'}})

for p in tc.find():
    print(p)

{'birthyear': 1908, 'name': 'William', 'age': 'old', '_id': ObjectId('5ae05ec60fd01f06464543aa')}
{'c': 1, 'age': 'old', 'birthyear': 1920, 'name': 'Patrick', 'surname': 'Trouton', '_id': ObjectId('5ae05ec60fd01f06464543ab')}
{'c': 2, 'birthyear': 1919, 'name': 'Jon', 'age': 'old', '_id': ObjectId('5ae05ec60fd01f06464543ac')}
{'c': 3, 'birthyear': 1934, 'name': 'Gom', 'age': 'old', '_id': ObjectId('5ae05ec60fd01f06464543ad')}
{'c': 4, 'birthyear': 1951, 'name': 'Peter', 'age': 'young', '_id': ObjectId('5ae05ec60fd01f06464543ae')}
{'c': 5, 'birthyear': 1943, 'name': 'Colin', 'age': 'old', '_id': ObjectId('5ae05ec60fd01f06464543af')}
{'c': 6, 'birthyear': 1943, 'name': 'Sylvester', 'age': 'old', '_id': ObjectId('5ae05ec60fd01f06464543b0')}
{'c': 7, 'birthyear': 1959, 'name': 'Paul', 'age': 'young', '_id': ObjectId('5ae05ec60fd01f06464543b1')}
{'c': 8, 'birthyear': 1964, 'name': 'Christopher', 'age': 'young', '_id': ObjectId('5ae05ec60fd01f06464543b2')}
{'c': 9, 'birthyear': 1971, 'name':

## Embedded Documents
Values in documents can be themselves documents.  For instance, we can encapsulate each person's name in a subdocument.

In [29]:
# delete the test collection data
tc.drop()
# insert a few doctors
for f, s, b in zip('William Patrick Jon Tom Peter Colin Sylvester Paul Christopher David Matt Peter'.split(),
                  'Hartnell Troughten Pertwee Baker McCoy McGann Eccleston Tennant Smith Capaldi'.split(),
                  [1908, 1920, 1919, 1934, 1951, 1943, 1943, 1959, 1964, 1971, 1982, 1958]):
    tc.insert_one({'name': {'forename': f, 'surname': s}, 'birthyear': b})
for p in tc.find():
    print(p)


{'birthyear': 1908, 'name': {'surname': 'Hartnell', 'forename': 'William'}, '_id': ObjectId('5ae05ec60fd01f06464543b6')}
{'birthyear': 1920, 'name': {'surname': 'Troughten', 'forename': 'Patrick'}, '_id': ObjectId('5ae05ec60fd01f06464543b7')}
{'birthyear': 1919, 'name': {'surname': 'Pertwee', 'forename': 'Jon'}, '_id': ObjectId('5ae05ec60fd01f06464543b8')}
{'birthyear': 1934, 'name': {'surname': 'Baker', 'forename': 'Tom'}, '_id': ObjectId('5ae05ec60fd01f06464543b9')}
{'birthyear': 1951, 'name': {'surname': 'McCoy', 'forename': 'Peter'}, '_id': ObjectId('5ae05ec60fd01f06464543ba')}
{'birthyear': 1943, 'name': {'surname': 'McGann', 'forename': 'Colin'}, '_id': ObjectId('5ae05ec60fd01f06464543bb')}
{'birthyear': 1943, 'name': {'surname': 'Eccleston', 'forename': 'Sylvester'}, '_id': ObjectId('5ae05ec60fd01f06464543bc')}
{'birthyear': 1959, 'name': {'surname': 'Tennant', 'forename': 'Paul'}, '_id': ObjectId('5ae05ec60fd01f06464543bd')}
{'birthyear': 1964, 'name': {'surname': 'Smith', 'for

We can also include a list of notable stories for each person. Note the use of the dot notation to identify keys in a sub-document.

In [31]:
result = tc.update_one({'name.forename': 'William', 'name.surname': 'Hartnell'},
                       {'$set': {'episodes': ['An Unearthly Child', 'The Daleks', 'The Tenth Planet']}})
result.matched_count, result.modified_count

(1, 1)

In [32]:
tc.find_one({'name.forename': 'William'})

{'_id': ObjectId('5ae05ec60fd01f06464543b6'),
 'birthyear': 1908,
 'episodes': ['An Unearthly Child', 'The Daleks', 'The Tenth Planet'],
 'name': {'forename': 'William', 'surname': 'Hartnell'}}

# Importing Data

we will now import a larger dataset.  Using the [Ultimate Doctor Who](http://www.ultimatedoctorwho.com/).  First let's look at the data in the csv

In [34]:
!head data/Ultimate_Doctor_Who_resave.csv

Story ID,Year,Season,Title,No. of parts,Pt. 1 air date,Pt. 1 viewers (in millons),Pt. 2 air date,Pt.2 viewers,Pt. 3 air date,Pt.3 viewers,Pt. 4 air date,Pt.4 viewers,Pt.5 air date,Pt. 5 viewers,Pt.6 air date,Pt.6 viewers,Pt. 7 air date,Pt.7 viewers,pt. 8 air date,pt. 8 viewers,pt. 9 air date,pt. 9 viewers,pt. 10 air date,pt. 10 viewers,pt. 11 air date,pt. 11 viewers,pt. 12 air date,pt. 12 viewers,Type of Broadcast,Doctor Number,Doctor,Guest Doctor(s),Companion 1,Companion 2,Companion 3,Companion 4,Companion 5,Companion 6,Companion 7,Companion 8,Appearance of UNIT,Recurring Villains,Firsts
1,1963,1,An Unearthly Child,4,11/26/63,4.4,11/30/63,5.9,12/7/63,6.9,12/14/63,6.4,,,,,,,,,,,,,,,,,Serial,1,William Hartnell,,Susan Foreman,Barbara Wright,Ian Chesterton,,,,,,,,
2,1964,1,The Daleks,7,12/21/63,6.9,12/28/13,6.4,1/4/64,8.9,1/11/64,9.9,1/18/64,9.9,1/25/64,10.4,2/1/64,10.4,,,,,,,,,,,Serial,1,William Hartnell,,Susan Foreman,Barbara Wright,Ian Chesterton,,,,,,,Daleks,
3,1964,1,The Edge of D

Looks like a CSV file.

## `mongoimport`

important parameters:

    - `drop`, drops collection
    - `db` and `collection` specify where the data should go.
    - `headerline` indicates first line is a header
    - `ignore blanks` empty values will not be created
    - `file` where the data resides.
   

In [35]:
!/usr/bin/mongoimport --port 27351 --drop --db doctorwho --collection episodes \
    --type csv --headerline --ignoreBlanks \
    --file data/Ultimate_Doctor_Who_resave.csv

2018-04-25T11:18:28.805+0000	connected to: localhost:27351
2018-04-25T11:18:28.805+0000	dropping: doctorwho.episodes
2018-04-25T11:18:28.826+0000	imported 244 documents


In [36]:
# Open the imported database and collection
episodes = dw_db.episodes

In [37]:
# see the size of the dataset
episodes.find().count()

244

In [38]:
# look at a sample document
episodes.find_one()

{'Companion 1': 'Susan Foreman',
 'Companion 2': 'Barbara Wright',
 'Companion 3': 'Ian Chesterton',
 'Doctor': 'William Hartnell',
 'Doctor Number': 1,
 'No': {' of parts': 4},
 'Pt': {' 1 air date': '11/26/63',
  ' 1 viewers (in millons)': 4.4,
  ' 2 air date': '11/30/63',
  ' 3 air date': '12/7/63',
  ' 4 air date': '12/14/63',
  '2 viewers': 5.9,
  '3 viewers': 6.9,
  '4 viewers': 6.4},
 'Season': 1,
 'Story ID': 1,
 'Title': 'An Unearthly Child',
 'Type of Broadcast': 'Serial',
 'Year': 1963,
 '_id': ObjectId('5ae064042efe9851395cc307')}

_note that 'No. of parts' became_
        
        'No': {' of parts': 4}

_this is due to the way mongoympost handles dots in the column name._

## Cleaning the dataset.

Collect the various companions into a list.  and delete the fields using `'$push'` and `'$unset'`

`$push` adds an item to a list and creates one if it doesn't already exist.

In [40]:
# iterate through all episodes
for e in episodes.find():
    # iterate through each key
    for key in list(e.keys()):
        # check if it is a companion key
        if key.startswith('Companion '):
            # update the information
            episodes.update_one({'_id': e['_id']},
                               {'$push': {'Companions': e[key]},
                               '$unset': {key: 1}})

In [41]:
episodes.find_one()

{'Companions': ['Barbara Wright', 'Ian Chesterton', 'Susan Foreman'],
 'Doctor': 'William Hartnell',
 'Doctor Number': 1,
 'No': {' of parts': 4},
 'Pt': {' 1 air date': '11/26/63',
  ' 1 viewers (in millons)': 4.4,
  ' 2 air date': '11/30/63',
  ' 3 air date': '12/7/63',
  ' 4 air date': '12/14/63',
  '2 viewers': 5.9,
  '3 viewers': 6.9,
  '4 viewers': 6.4},
 'Season': 1,
 'Story ID': 1,
 'Title': 'An Unearthly Child',
 'Type of Broadcast': 'Serial',
 'Year': 1963,
 '_id': ObjectId('5ae064042efe9851395cc307')}