In [None]:
# Import the required libraries

import collections
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
import scipy.stats

In [None]:
# Connect to the Mongo server on localhost, then take handles on the accidents
# database and on its two collections: the accident records and the labels
client = pymongo.MongoClient('mongodb://localhost:27351/')

db = client['accidents']
accidents, labels = db['accidents'], db['labels']

In [None]:
# Pull the label metadata into in-memory lookups

# expanded_name: label -> its 'expanded' text ('' for any label not loaded here)
expanded_name = collections.defaultdict(str)
expanded_name.update((doc['label'], doc['expanded'])
                     for doc in labels.find({'expanded': {"$exists": True}}))

# label_of: (label, code) -> the text stored for that code ('' for unknown pairs)
label_of = collections.defaultdict(str)
for label_doc in labels.find({'codes': {"$exists": True}}):
    field = label_doc['label']
    for raw_code, meaning in label_doc['codes'].items():
        # Key on an integer code whenever the stored code parses as one
        try:
            code = int(raw_code)
        except ValueError:
            code = raw_code
        label_of[field, code] = meaning

# Activity 1

In [None]:
# Fetch every accident with more than 8 casualties, projecting only the
# fields that the following cells read
big_accidents = list(
    accidents.find(
        {'Number_of_Casualties': {'$gt': 8}},
        [
            'Accident_Index',
            'Number_of_Casualties',
            'Number_of_Vehicles',
            'Vehicles.Vehicle_Type',
            'Vehicles.Vehicle_Reference',
            'Casualties.Vehicle_Reference',
        ],
    )
)
len(big_accidents)

In [None]:
# Show the raw documents returned by the query above, for inspection
big_accidents

In [None]:
# For each large accident, print a summary line followed by one line per
# vehicle giving the first word of its type label and how many of the
# accident's casualties reference that vehicle
for accident in big_accidents:
    print('Acc index {}; {} casualties, {} vehicles'.format(accident['Accident_Index'],
                                                            accident['Number_of_Casualties'],
                                                            accident['Number_of_Vehicles']))
    # Tally casualties per vehicle reference once, rather than rescanning the
    # casualty list for every vehicle
    casualties_by_vehicle = collections.Counter(casualty['Vehicle_Reference']
                                                for casualty in accident['Casualties'])
    for vehicle in accident['Vehicles']:
        # label_of is a defaultdict(str): indexing an unknown code would insert
        # an empty entry and ''.split()[0] would raise IndexError, so look the
        # label up with .get() and fall back to a placeholder name
        label_words = label_of.get(('Vehicle_Type', vehicle['Vehicle_Type']), '').split()
        type_name = label_words[0] if label_words else 'Unknown'
        print('\t{}: {} casualties'.format(type_name,
                                           casualties_by_vehicle[vehicle['Vehicle_Reference']]))

Many of these involve buses and minibuses, which explains why the accidents with the most casualties involve few vehicles: the casualties are mostly passengers on a single bus.

# Activity 2

In [None]:
# List every (code, label text) pair recorded for the Vehicle_Type key
[(code, text)
 for (key, code), text in label_of.items()
 if key == 'Vehicle_Type']

In [None]:
# One row per accident that involves at least one vehicle of type 10 or 11
# (presumably the minibus and bus/coach codes -- confirm against the
# Vehicle_Type listing)
coach_cursor = accidents.find({'Vehicles.Vehicle_Type': {'$in': [10, 11]}},
                              ['Number_of_Casualties', 'Number_of_Vehicles'])
coach_unrolled_df = pd.DataFrame(list(coach_cursor))
len(coach_unrolled_df)

In [None]:
# One row per accident in which no vehicle is of type 10 or 11
# (the complement of the coach query)
non_coach_cursor = accidents.find({'Vehicles.Vehicle_Type': {'$not': {'$in': [10, 11]}}},
                                  ['Number_of_Casualties', 'Number_of_Vehicles'])
non_coach_unrolled_df = pd.DataFrame(list(non_coach_cursor))
len(non_coach_unrolled_df)

Check we've got them all ...

In [None]:
# Every accident should fall into exactly one of the two frames.
# count_documents({}) replaces Cursor.count(), which was deprecated in
# PyMongo 3.7 and removed in PyMongo 4.0
len(coach_unrolled_df) + len(non_coach_unrolled_df) == accidents.count_documents({})

### Find the regression for coach accidents

In [None]:
# Count the coach accidents for each combination of casualty count (rows)
# and vehicle count (columns)
coach_df = pd.crosstab(index=coach_unrolled_df['Number_of_Casualties'],
                       columns=coach_unrolled_df['Number_of_Vehicles'])
coach_df

In [None]:
# Least-squares fit with the number of casualties as x and the number of
# vehicles as y, over the coach accidents
coach_regressionline = scipy.stats.linregress(x=coach_unrolled_df['Number_of_Casualties'],
                                              y=coach_unrolled_df['Number_of_Vehicles'])

# The fitted line is y = m x + b: m is the slope, b the intercept
coach_m, coach_b = coach_regressionline.slope, coach_regressionline.intercept
(coach_m, coach_b)

In [None]:
# Reshape the casualties-by-vehicles count table into long form: one row per
# (Number_of_Casualties, Number_of_Vehicles) pair, with the count in column 0
coach_long_df = coach_df.stack().reset_index()

fig, ax = plt.subplots()

# Marker size is 1.5 times the square root of the number of accidents in each cell
ax.scatter(coach_long_df['Number_of_Casualties'],
           coach_long_df['Number_of_Vehicles'],
           s=np.sqrt(coach_long_df[0])*1.5,
           alpha=0.5)

# Overlay the fitted regression line y = coach_m * x + coach_b
x = np.linspace(0, 30, 20)
ax.plot(x, coach_m*x + coach_b)

# Title the figure so it can be told apart from the non-coach plot
ax.set_title('Coach accidents: casualties vs vehicles')
ax.set_xlabel('Number of casualties')
ax.set_ylabel('Number of vehicles')
plt.show()

In [None]:
# Pearson correlation (and its p-value) between casualties and vehicles
# across the coach accidents
scipy.stats.pearsonr(x=coach_unrolled_df['Number_of_Casualties'],
                     y=coach_unrolled_df['Number_of_Vehicles'])

This shows very little correlation, but the extremely small *p* value means we can reject the null hypothesis that the number of casualties is independent of the number of vehicles. In other words, there's a small but real correlation: accidents with more vehicles tend to have more casualties.

### Find the regression for non-coach accidents

In [None]:
# Count the non-coach accidents for each combination of casualty count (rows)
# and vehicle count (columns)
non_coach_df = pd.crosstab(index=non_coach_unrolled_df['Number_of_Casualties'],
                           columns=non_coach_unrolled_df['Number_of_Vehicles'])
non_coach_df

In [None]:
# Least-squares fit with the number of casualties as x and the number of
# vehicles as y, over the non-coach accidents
non_coach_regressionline = scipy.stats.linregress(x=non_coach_unrolled_df['Number_of_Casualties'],
                                                  y=non_coach_unrolled_df['Number_of_Vehicles'])

# The fitted line is y = m x + b: m is the slope, b the intercept
non_coach_m, non_coach_b = non_coach_regressionline.slope, non_coach_regressionline.intercept
(non_coach_m, non_coach_b)

In [None]:
# Reshape the casualties-by-vehicles count table into long form: one row per
# (Number_of_Casualties, Number_of_Vehicles) pair, with the count in column 0
non_coach_long_df = non_coach_df.stack().reset_index()

fig, ax = plt.subplots()

# Marker size is 1.5 times the square root of the number of accidents in each cell
ax.scatter(non_coach_long_df['Number_of_Casualties'],
           non_coach_long_df['Number_of_Vehicles'],
           s=np.sqrt(non_coach_long_df[0])*1.5,
           alpha=0.5)

# Overlay the fitted regression line y = non_coach_m * x + non_coach_b
x = np.linspace(0, 30, 20)
ax.plot(x, non_coach_m*x + non_coach_b)

# Title the figure so it can be told apart from the coach plot
ax.set_title('Non-coach accidents: casualties vs vehicles')
ax.set_xlabel('Number of casualties')
ax.set_ylabel('Number of vehicles')
plt.show()

In [None]:
# Pearson correlation (and its p-value) between casualties and vehicles
# across the non-coach accidents
scipy.stats.pearsonr(x=non_coach_unrolled_df['Number_of_Casualties'],
                     y=non_coach_unrolled_df['Number_of_Vehicles'])

This shows even less correlation than the case for coach accidents. Again, the very small *p* value means we can reject the null hypothesis that the number of casualties is independent of the number of vehicles. In other words, there's a slight, but real, correlation between the number of vehicles and the number of casualties.