In [None]:
# Import the required libraries

import collections
import datetime
import uuid

import folium
import matplotlib.pyplot as plt
import pandas as pd
import pymongo
import scipy.stats

In [None]:
# Connect to the Mongo server and bind short names to the collections of the
# `accidents` database (accidents, labels and roads) used by later cells.
client = pymongo.MongoClient('mongodb://localhost:27351/')

db = client['accidents']
accidents, labels, roads = db['accidents'], db['labels'], db['roads']

In [None]:
# Build two in-memory lookup tables from the `labels` collection:
#   expanded_name[key]   -> expanded (long-form) name of a key
#   label_of[key, code]  -> human-readable text for a coded value
# Both are defaultdict(str), so looking up an unknown key gives '' instead of
# raising KeyError.
expanded_name = collections.defaultdict(str)
for doc in labels.find({'expanded': {"$exists": True}}):
    expanded_name[doc['label']] = doc['expanded']

label_of = collections.defaultdict(str)
for doc in labels.find({'codes': {"$exists": True}}):
    for raw_code, text in doc['codes'].items():
        # Index by the integer form of the code when int() accepts it;
        # otherwise keep the code exactly as stored.
        try:
            code = int(raw_code)
        except ValueError:
            code = raw_code
        label_of[doc['label'], code] = text

In [None]:
def results_to_table(results, index_name, column_name, results_name,
                     fillna=None,
                     relabel_index=False, relabel_columns=False,
                     index_label=None, column_label=None):
    """Pivot a sequence of (possibly nested) result dicts into a 2-D DataFrame.

    Each item of `results` is flattened so that the entries of any nested
    dict are lifted to the top level (the nested dict's own key is dropped).
    The flattened records are then pivoted: `index_name` supplies the row
    labels, `column_name` the column labels and `results_name` the values.

    Parameters
    ----------
    results : iterable of dict
        Records to tabulate, e.g. the documents of an aggregation result.
    index_name, column_name, results_name : str
        Flattened keys used for the rows, the columns and the cell values.
    fillna : optional
        If not None, value used to replace missing cells after pivoting.
    relabel_index, relabel_columns : bool
        If true, replace row / column labels with the corresponding entries
        of the module-level `label_of` lookup.
    index_label, column_label : str, optional
        Names for the index / columns axes; default to `index_name` and
        `column_name` when empty or None.

    Returns
    -------
    pandas.DataFrame
    """

    def flatten(record):
        """Return a new dict with nested dict entries merged into the top level."""
        flat = {}
        for key, value in record.items():
            if isinstance(value, dict):
                flat.update(flatten(value))
            else:
                flat[key] = value
        return flat

    flat_records = list(map(flatten, results))
    table = pd.DataFrame(flat_records).pivot(
        index=index_name, columns=column_name, values=results_name)

    # Optional cosmetic steps.
    if fillna is not None:
        table = table.fillna(fillna)
    if relabel_columns:
        table.columns = [label_of[column_name, col] for col in table.columns]
    if relabel_index:
        table.index = [label_of[index_name, row] for row in table.index]

    table.columns.name = column_label if column_label else column_name
    table.index.name = index_label if index_label else index_name
    return table

## Activity 1

In [None]:
# For every district (ONS LA Name) and road class, count the road sections
# and total their LenNet values.

# The class is the single character at index 1 of RCat.
project_stage = {'$project': {'ONS LA Name': '$ONS LA Name',
                              'class': {'$substr': ['$RCat', 1, 1]},
                              'LenNet': '$LenNet'}}

# Keep only the 'R' and 'U' classes.
match_stage = {'$match': {'class': {'$in': ['R', 'U']}}}

group_stage = {'$group': {'_id': {'ONS_LA_Name': '$ONS LA Name',
                                  'class': '$class'},
                          'length': {'$sum': '$LenNet'},
                          'count': {'$sum': 1}}}

pipeline = [project_stage, match_stage, group_stage]
results = list(roads.aggregate(pipeline))
results

In [None]:
# Districts as rows, road classes as columns, section counts as values;
# a district with no sections of a class gets 0.
ons_class_df = results_to_table(results,
                                index_name='ONS_LA_Name',
                                column_name='class',
                                results_name='count',
                                fillna=0)
ons_class_df

In [None]:
# Scatter each district's 'R' section count against its 'U' section count.
fig, ax = plt.subplots()
ax.scatter(ons_class_df['R'], ons_class_df['U'])
ax.set_xlabel('No rural')
ax.set_ylabel('No urban')
plt.show()

In [None]:
# Which is that district in the top-right corner?
# Look up the row of the district with the largest 'R' count.
top_rural_district = ons_class_df['R'].idxmax()
ons_class_df.loc[top_rural_district]

Find the most rural and urban districts.

In [None]:
# Five districts with the most 'U' sections.
ons_class_df.sort_values(by='U', ascending=False).head(5)

In [None]:
# Five districts with the most 'R' sections.
ons_class_df.sort_values(by='R', ascending=False).head(5)

In [None]:
# Districts with more than 150 sections in at least one of the two classes.
ons_class_df[ons_class_df[['R', 'U']].gt(150).any(axis=1)]

In [None]:
# Districts with fewer than 2 sections in at least one of the two classes.
ons_class_df[ons_class_df[['R', 'U']].lt(2).any(axis=1)]

Now to look at the road lengths.

In [None]:
# Same layout as the counts table, but holding the summed lengths instead;
# a district with no sections of a class gets 0.
ons_class_len_df = results_to_table(results,
                                    index_name='ONS_LA_Name',
                                    column_name='class',
                                    results_name='length',
                                    fillna=0)
ons_class_len_df

In [None]:
# Scatter each district's total 'R' length against its total 'U' length.
fig, ax = plt.subplots()
ax.scatter(ons_class_len_df['R'], ons_class_len_df['U'])
ax.set_xlabel('Len rural')
ax.set_ylabel('Len urban')
plt.show()

Again, look at the most urban and most rural.

In [None]:
# Five districts with the greatest total 'U' length.
ons_class_len_df.sort_values(by='U', ascending=False).head(5)

In [None]:
# Five districts with the greatest total 'R' length.
ons_class_len_df.sort_values(by='R', ascending=False).head(5)

There are a lot of districts on the left-hand edge of the scatter plot. Can we pick them out? How about finding the ratio of urban to rural lengths for each district, and then looking for the outliers?

In [None]:
# Add the urban:rural length ratio as a new 'UvR' column, then show the ten
# districts with the smallest ratio.
# NOTE(review): districts whose 'R' length is 0 are presumably given inf/NaN
# by pandas rather than raising - confirm against the displayed output.
ons_class_len_df['UvR'] = ons_class_len_df['U'].div(ons_class_len_df['R'])
ons_class_len_df.sort_values(by='UvR').head(10)

In [None]:
# Ten districts with the largest urban:rural length ratio.
ons_class_len_df.sort_values(by='UvR', ascending=False).head(10)

How many districts are urban-only?

In [None]:
# Per-column non-null counts for the districts with zero 'R' length, shown
# alongside the same counts for all districts.
urban_only_df = ons_class_len_df.loc[ons_class_len_df['R'].eq(0)]
urban_only_df.count(), ons_class_len_df.count()

In [None]:
# Fraction of districts that are urban-only (zero 'R' length). Computed from
# the table rather than typed in from a previous run's output, so it stays
# correct if the underlying data changes.
(ons_class_len_df['R'] == 0).sum() / len(ons_class_len_df)

### Observations
There are 206 districts. 24 of them (about 12%) are entirely urban, with no rural road sections. Five are entirely rural. The entirely urban districts are in the centers of large cities, with London being dominant here.

Several districts have many and long sections in both categories. These include several London commuter-belt districts (Kent, Surrey, Hertfordshire).

## Activity 2

In [None]:
# Average each per-vehicle-type field over all road sections, grouped by
# road category (RCat).
vehicle_fields = ['Fd2WMV', 'FdBUS', 'FdCar', 'FdHGV', 'FdLGV', 'FdPC']

group_spec = {'_id': '$RCat'}
group_spec.update({field: {'$avg': '$' + field} for field in vehicle_fields})
pipeline = [{'$group': group_spec}]

results = list(roads.aggregate(pipeline))
results

In [None]:
# Put the results in a DataFrame: one row per road category and one column
# per vehicle-type field, relabelled with the human-readable names held in
# `label_of` and `expanded_name`.

results_df = pd.DataFrame(results).set_index('_id')
results_df.index = [label_of['RCat', code] for code in results_df.index]
results_df.columns = [expanded_name[field] for field in results_df.columns]
results_df

In [None]:
# Grouped bar chart: one group of bars per road category.
results_df.plot.bar()

To see if the vehicle mixes are different on different roads, we'll have to use a chi-squared test. First, we need to ignore the pedal cycles category, as there are no cyclists on motorways. Then we can find the expected volumes for each vehicle type, assuming no variation between road types.

In [None]:
# Remove the 'Pedal cycles' column, then sort rows and columns by label so
# the layout is deterministic.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
reduced_results_df = (
    results_df
    .drop(columns='Pedal cycles')
    .sort_index(axis=0)
    .sort_index(axis=1)
)
reduced_results_df

In [None]:
# Reused from notebook 14.3
def expected_of_df(actual_df):
    """Return the table of expected values for a table of observed values.

    Each expected cell is (column total * row total / grand total), i.e. the
    value the cell would have if rows and columns were independent. The
    result has the same row and column order as `actual_df`.
    """
    grand_total = actual_df.sum().sum()
    column_totals = {col: actual_df[col].sum() for col in actual_df}
    row_totals = {row: actual_df.loc[row].sum() for row in actual_df.index}

    expected = pd.DataFrame(
        {col: {row: column_totals[col] * row_totals[row] / grand_total
               for row in actual_df.index}
         for col in actual_df})

    # Building from a dict does not guarantee the original ordering, so
    # restore the column order and then the row order explicitly.
    return expected[actual_df.columns].reindex(actual_df.index)

In [None]:
# Expected values for each road category / vehicle type cell, assuming the
# vehicle mix does not vary between road categories.
expected_results_df = expected_of_df(actual_df=reduced_results_df)
expected_results_df

In [None]:
# Chi-squared test of observed against expected values over the whole table.
# With axis=None, scipy treats all k = rows * cols cells as one sample and
# uses k - 1 - ddof degrees of freedom for the p-value. The expected values
# were derived from the row and column totals, so the correct figure is
# (rows - 1) * (cols - 1), which requires ddof = rows + cols - 2.
n_rows, n_cols = reduced_results_df.shape
scipy.stats.chisquare(reduced_results_df, expected_results_df,
                      ddof=n_rows + n_cols - 2, axis=None)

Yes, different road types really do have different traffic mixes.