In [None]:
# Import the required libraries

import pymongo
import datetime
import collections

import pandas as pd
import scipy.stats

In [None]:
# Connect to the local MongoDB server, then take handles on the `accidents`
# database and on the two collections this notebook queries.
client = pymongo.MongoClient('mongodb://localhost:27351/')
db = client['accidents']

# Item access and attribute access name the same collection objects.
accidents = db['accidents']
labels = db['labels']

In [None]:
# Build two in-memory lookup tables from the `labels` collection.
# Both are defaultdicts of str, so looking up an unknown key gives ''
# instead of raising KeyError.

# Field name -> expanded name.
expanded_name = collections.defaultdict(str)
for doc in labels.find({'expanded': {"$exists": True}}):
    expanded_name[doc['label']] = doc['expanded']

# (field name, code) -> human-readable label for that code.
# Codes that parse as integers are stored under int keys; any other code
# is kept exactly as it appears in the document.
label_of = collections.defaultdict(str)
for doc in labels.find({'codes': {"$exists": True}}):
    field = doc['label']
    for code, meaning in doc['codes'].items():
        try:
            key = (field, int(code))
        except ValueError:
            key = (field, code)
        label_of[key] = meaning

# Activity 1

In [None]:
# Count accidents for every (speed limit, severity) combination,
# restricted to accidents where the speed limit is 30mph or above.
pipeline = [
    # Keep only accidents with a speed limit of at least 30.
    {"$match": {"Speed_limit": {"$gte": 30}}},
    # One group per distinct speed-limit/severity pair; count its members.
    {"$group": {
        "_id": {
            "Speed_limit": "$Speed_limit",
            "Accident_Severity": "$Accident_Severity",
        },
        "num_accidents": {"$sum": 1},
    }},
]
results = list(accidents.aggregate(pipeline))
results

# Activity 2

In [None]:
# Each aggregation result keeps its grouping key nested under '_id';
# lift those values out so every result becomes one flat row.
results_long_df = pd.DataFrame([
    {
        'Accident_Severity': group['_id']['Accident_Severity'],
        'Speed_limit': group['_id']['Speed_limit'],
        'num_accidents': group['num_accidents'],
    }
    for group in results
])
results_long_df

In [None]:
# Reshape to a wide table: one row per speed limit, one column per severity.
results_df = results_long_df.pivot(
    index='Speed_limit',
    columns='Accident_Severity',
    values='num_accidents',
)
# Replace the numeric severity codes in the header with their readable
# labels, naming the column axis at the same time.
results_df.columns = pd.Index(
    [label_of['Accident_Severity', code] for code in results_df.columns],
    name="Severity",
)
results_df.index.name = "Speed limit"
results_df

A lot of the tasks in this section will end up producing results in a similar form: a list of groups, each with a key consisting of two items. Rather than getting distracted by repeatedly applying the same sequence of reshaping steps, we can build a small function to do it for us:

In [None]:
def results_to_table(results, index_name, column_name, results_name,
                     fillna=None,
                     relabel_index=False, relabel_columns=False,
                     index_label=None, column_label=None):
    """Reshape grouped aggregation results into a wide (pivoted) DataFrame.

    Each item of `results` is a dict, possibly containing nested dicts
    (for example a two-part grouping key held under '_id'). Nested dicts
    are flattened to the top level before the rows are pivoted.

    Parameters
    ----------
    results : iterable of dict
        The grouped results, one dict per group.
    index_name : str
        Key whose values become the row index of the table.
    column_name : str
        Key whose values become the column headers of the table.
    results_name : str
        Key whose values fill the cells of the table.
    fillna : scalar, optional
        If not None, replaces the missing cells left by index/column
        combinations that do not occur in `results`. A value of 0 is honoured.
    relabel_index, relabel_columns : bool
        If true, replace the raw codes on that axis with the labels found
        in the module-level `label_of` lookup, keyed by (name, code).
    index_label, column_label : str, optional
        Display names for the two axes. When omitted (or empty),
        `index_name` / `column_name` are used instead.

    Returns
    -------
    pandas.DataFrame
        The pivoted table, with both axes named.
    """

    def flatten(d):
        """Return a flat dict with the items of any nested dicts moved to the top level.

        If the same key occurs at more than one level, the one seen last wins.
        """
        flat = {}
        for key, value in d.items():
            if isinstance(value, dict):
                flat.update(flatten(value))
            else:
                flat[key] = value
        return flat

    df = pd.DataFrame([flatten(r) for r in results])
    df = df.pivot(index=index_name, columns=column_name, values=results_name)

    # Optional prettification. The fill is done on a copy, not in place,
    # so no hidden mutation of an intermediate frame is relied upon.
    if fillna is not None:
        df = df.fillna(fillna)
    if relabel_columns:
        df.columns = [label_of[column_name, c] for c in df.columns]
    if relabel_index:
        df.index = [label_of[index_name, r] for r in df.index]

    # Axis names are set last: assigning a plain list to an axis above
    # would otherwise discard a name set earlier.
    df.columns.name = column_label if column_label else column_name
    df.index.name = index_label if index_label else index_name
    return df

The function in use:

In [None]:
# Rebuild the Activity 2 table with a single call to results_to_table().
# `results` still holds the Activity 1 aggregation output at this point.
results_df = results_to_table(results, 'Speed_limit', 'Accident_Severity', 'num_accidents',
                              relabel_columns=True,
                              index_label="Speed limit", column_label="Severity")
results_df

In [None]:
# Grouped bar chart of the table above. The legend is anchored just outside
# the right-hand edge of the axes so it cannot cover any bars.
ax = results_df.plot.bar()
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

# Activity 3

In [None]:
# Count accidents for every (number of casualties, number of vehicles) pair.
pipeline = [
    {"$group": {
        "_id": {
            "Number_of_Casualties": "$Number_of_Casualties",
            "Number_of_Vehicles": "$Number_of_Vehicles",
        },
        "num_accidents": {"$sum": 1},
    }},
]
results = list(accidents.aggregate(pipeline))
results

In [None]:
# Casualties down the rows, vehicles across the columns; combinations that
# never occur are shown as 0.
results_to_table(
    results,
    index_name='Number_of_Casualties',
    column_name='Number_of_Vehicles',
    results_name='num_accidents',
    fillna=0,
)

# Activity 4

In [None]:
# Count accidents for every (junction detail, severity) combination,
# restricted to accidents with a speed limit of at least 30.
pipeline = [
    {"$match": {"Speed_limit": {"$gte": 30}}},
    {"$group": {
        "_id": {
            "Junction_Detail": "$Junction_Detail",
            "Accident_Severity": "$Accident_Severity",
        },
        "num_accidents": {"$sum": 1},
    }},
]
results = list(accidents.aggregate(pipeline))
results

In [None]:
# Junction detail by accident severity, with readable labels on both axes
# and 0 wherever a combination never occurs.
results_df = results_to_table(
    results, 'Junction_Detail', 'Accident_Severity', 'num_accidents',
    fillna=0,
    relabel_index=True,
    relabel_columns=True,
    index_label="Junction detail",
    column_label="Accident severity",
)
results_df

In [None]:
# Bar chart of the junction-detail table, with the legend placed to the
# right of the axes.
ax = results_df.plot.bar()
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

# Activity 5
Create a group for each speed limit, then `project` to calculate the average numbers of casualties and vehicles.

In [None]:
# Average casualties and vehicles per accident at each speed limit.
pipeline = [
    # Per speed limit: total the casualties and vehicles, and count the accidents.
    {"$group": {
        "_id": {"Speed_limit": "$Speed_limit"},
        "total_casualties": {"$sum": "$Number_of_Casualties"},
        "total_vehicles": {"$sum": "$Number_of_Vehicles"},
        "num_accidents": {"$sum": 1},
    }},
    # Lift the speed limit out of '_id' and turn the totals into means
    # (total divided by number of accidents).
    {"$project": {
        "Speed_limit": "$_id.Speed_limit",
        "_id": 0,
        "average_casualties": {"$divide": ["$total_casualties", "$num_accidents"]},
        "average_vehicles": {"$divide": ["$total_vehicles", "$num_accidents"]},
    }},
    # Present the speed limits in ascending order.
    {"$sort": {"Speed_limit": 1}},
]
results = list(accidents.aggregate(pipeline))
results

In [None]:
# Tabulate the averages: one row per speed limit, one column per measure.
# Each column is given as a {speed limit: average} mapping, so the speed
# limits become the row index.
results_df = pd.DataFrame({
    'Average Casualties': {row['Speed_limit']: row['average_casualties'] for row in results},
    'Average Vehicles': {row['Speed_limit']: row['average_vehicles'] for row in results},
})
results_df.index.name = 'Speed limit'
results_df

In [None]:
# Bar chart of the two averages at each speed limit, with the legend placed
# to the right of the axes.
ax = results_df.plot.bar()
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

# Activity 6

In [None]:
# Count casualties for every (age band, casualty severity) combination.
pipeline = [
    # One document per casualty subdocument.
    {"$unwind": "$Casualties"},
    # Because of the unwind, each document counted here is a single casualty,
    # even though the counter keeps the name 'num_accidents'.
    {"$group": {
        "_id": {
            "Age_Band_of_Casualty": "$Casualties.Age_Band_of_Casualty",
            "Casualty_Severity": "$Casualties.Casualty_Severity",
        },
        "num_accidents": {"$sum": 1},
    }},
]
results = list(accidents.aggregate(pipeline))
results

In [None]:
# Casualty age band by casualty severity, with readable labels on both axes.
rdf = results_to_table(results, 'Age_Band_of_Casualty', 'Casualty_Severity', 'num_accidents',
                       fillna=0,
                       relabel_index=True, relabel_columns=True,
                       index_label="Age band of casualty", column_label="Casualty severity")
# Relabel the first row as 'Unknown'.
# NOTE(review): assumes the first row is the unknown-age band (presumably
# code -1, which sorts first) -- confirm against the data.
# A named Index is used because assigning a bare list would silently drop
# the index name that results_to_table() has just set.
rdf.index = pd.Index(['Unknown'] + list(rdf.index[1:]), name=rdf.index.name)
rdf

In [None]:
# Plot `rdf`, the Activity 6 casualty table built in the previous cell.
# (`results_df` at this point still holds the Activity 5 averages.)
ax = rdf.plot(kind='bar')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

# Activity 7
This is a complex query, so I've described the stages in the pipeline below.

In [None]:
pipeline = [
    # Stage 1: keep only the two lists of subdocuments.
    {"$project": {"Vehicles": 1, "Casualties": 1}},

    # Stage 2: unwind both lists, giving one document per vehicle/casualty
    # pairing within each accident.
    {"$unwind": "$Vehicles"},
    {"$unwind": "$Casualties"},

    # Stage 3: discard pairings we know we won't need. Only casualty class 2
    # is kept, and only where neither age band is -1.
    {"$match": {
        "Casualties.Casualty_Class": 2,
        "Casualties.Age_Band_of_Casualty": {"$ne": -1},
        "Vehicles.Age_Band_of_Driver": {"$ne": -1},
    }},

    # Stage 4: flag the documents that pair a casualty with the correct
    # vehicle, and keep only the two age bands alongside the flag.
    {"$project": {
        "casualty_for_vehicle": {
            "$eq": ["$Vehicles.Vehicle_Reference", "$Casualties.Vehicle_Reference"],
        },
        "Age_Band_of_Driver": "$Vehicles.Age_Band_of_Driver",
        "Age_Band_of_Casualty": "$Casualties.Age_Band_of_Casualty",
    }},

    # Stage 5: retain only the documents with the flag set.
    {"$match": {"casualty_for_vehicle": True}},

    # Stage 6: count the documents in each (driver band, casualty band) group.
    {"$group": {
        "_id": {
            "Age_Band_of_Driver": "$Age_Band_of_Driver",
            "Age_Band_of_Casualty": "$Age_Band_of_Casualty",
        },
        "count": {"$sum": 1},
    }},
]
results = list(accidents.aggregate(pipeline))
results

Fuller explanation of the stages.

1. We're only interested in the contents of the casualty and vehicle sub-documents, so we can discard all the information about the accident as a whole.
1. Unwind both lists of subdocuments. This gives a new document for each combination of vehicle and casualty in the accident. For example, if there were two vehicles and three casualties involved in an accident, these `unwind` stages would produce six documents, one for each vehicle/casualty combination.
1. We can discard some of these documents now. We keep only documents where the age band of both driver and casualty are known, and where the casualty is a passenger.
1. Vehicles and casualties have a vehicle reference. We would like to match directly on them, keeping only documents where they're the same. Unfortunately, Mongo doesn't allow that. Therefore, we create a new key in the document, which is `True` if the casualty and vehicle match, `False` otherwise. We also drop other keys apart from the age bands.
1. Now we can filter on the `casualty_for_vehicle` flag from the previous step, and retain only those documents where it's set.
1. Finally, do the standard grouping and counting to find the results we're after.

In [None]:
# Casualty age band (rows) against driver age band (columns), with readable
# labels on both axes and 0 wherever a combination never occurs.
results_df = results_to_table(
    results, 'Age_Band_of_Casualty', 'Age_Band_of_Driver', 'count',
    fillna=0,
    relabel_index=True,
    relabel_columns=True,
    index_label="Age band of casualty",
    column_label="Age band of driver",
)
results_df

In [None]:
# The same table, but keeping the raw age-band codes on both axes instead of
# replacing them with labels.
results_unlabelled_df = results_to_table(
    results, 'Age_Band_of_Casualty', 'Age_Band_of_Driver', 'count',
    fillna=0,
    index_label="Age band of casualty",
    column_label="Age band of driver",
)
results_unlabelled_df

In [None]:
# Back to long format: one row per (casualty band, driver band) pair.
# The stacked values have no name, so after reset_index() they sit in a
# column called 0.
results_long_df = (
    results_unlabelled_df
    .stack()
    .reset_index()
)
results_long_df

In [None]:
# Bubble chart of the age-band pairs: one marker per (casualty band, driver
# band) combination, sized by how many casualties fall in that combination.
# It is drawn through pandas' plotting accessor, like the bar charts earlier
# in the notebook, so the cell relies only on names imported at the top.
ax = results_long_df.plot.scatter(
    x='Age band of casualty',
    y='Age band of driver',
    # Marker size follows the square root of the count (column 0), scaled by
    # 1.5, so the largest groups do not swamp the smaller ones.
    s=(results_long_df[0] ** 0.5 * 1.5).values,
    alpha=0.5,
)
ax.set_xlabel('Age band of casualty')
ax.set_ylabel('Age band of driver');

In [None]:
# Expand the grouped counts back into individual observations: each
# (driver band, casualty band) pair is repeated `count` times, then the
# pairs are split into two parallel tuples for the rank correlation.
drivers, passengers = zip(*[
    pair
    for group in results
    for pair in [(group['_id']['Age_Band_of_Driver'],
                  group['_id']['Age_Band_of_Casualty'])] * group['count']
])

scipy.stats.spearmanr(drivers, passengers)

The reported _p_ value of zero (in practice, a value too small to be represented) means that a correlation this strong is extremely unlikely to have arisen by chance: there is some connection between the ages of drivers and passengers.