In [None]:
# Import the required libraries

import pymongo
import datetime
import collections

import pandas as pd
import scipy.stats

In [None]:
# Connect to the Mongo server that holds the accidents data.
# (Alternative URI for a server listening on port 27017: 'mongodb://localhost:27017/')
client = pymongo.MongoClient('mongodb://localhost:27351/')

# Handles on the accidents database and its two collections: the accident records and the labels.
db = client['accidents']
accidents = db['accidents']
labels = db['labels']

In [None]:
# Cache the label metadata in memory so that keys and codes can be translated without further queries.

# expanded_name: key name -> expanded, human-readable name of that key.
expanded_name = collections.defaultdict(str)
for label_doc in labels.find({'expanded': {"$exists": True}}):
    expanded_name[label_doc['label']] = label_doc['expanded']

# label_of: (key name, code) -> human-readable text for that code.
# Codes that parse as integers are stored as ints; any other code is kept exactly as found.
label_of = collections.defaultdict(str)
for label_doc in labels.find({'codes': {"$exists": True}}):
    for raw_code, code_text in label_doc['codes'].items():
        try:
            code = int(raw_code)
        except ValueError:
            code = raw_code
        label_of[label_doc['label'], code] = code_text

# Activity 1

In [None]:
# Codes and labels recorded for the driver age bands, in code order
sorted((code, text) for (key, code), text in label_of.items() if key == 'Age_Band_of_Driver')

In [None]:
# Codes and labels recorded for the casualty age bands, in code order
sorted((code, text) for (key, code), text in label_of.items() if key == 'Age_Band_of_Casualty')

In [None]:
# Codes and labels recorded for the casualty classes, in code order
sorted((code, text) for (key, code), text in label_of.items() if key == 'Casualty_Class')

In [None]:
# One row per casualty of class 2 (treated as passengers in this activity), pairing the
# casualty's age band with the age band of the driver of the vehicle they are recorded against.
# Pairs where either age band is coded -1 are left out.
passenger_rows = []
for accident in accidents.find({}):
    for casualty in accident['Casualties']:
        for vehicle in accident['Vehicles']:
            # Only pair a casualty with the vehicle it refers to.
            if casualty['Vehicle_Reference'] != vehicle['Vehicle_Reference']:
                continue
            if casualty['Casualty_Class'] != 2:
                continue
            if casualty['Age_Band_of_Casualty'] == -1:
                continue
            if vehicle['Age_Band_of_Driver'] == -1:
                continue
            passenger_rows.append({
                'Age_Band_of_Driver': vehicle['Age_Band_of_Driver'],
                'Age_Band_of_Casualty': casualty['Age_Band_of_Casualty'],
            })

driver_passenger_unrolled_df = pd.DataFrame(passenger_rows)
driver_passenger_unrolled_df

In [None]:
# Contingency table of counts: driver age bands down the rows, casualty age bands across the columns.
driver_passenger_df = pd.crosstab(
    index=driver_passenger_unrolled_df['Age_Band_of_Driver'],
    columns=driver_passenger_unrolled_df['Age_Band_of_Casualty'],
)
driver_passenger_df

In [None]:
# Long form of the table: one row per (driver band, casualty band) pair, with the count in column 0.
stacked_counts = driver_passenger_df.stack()
driver_passenger_long_df = stacked_counts.reset_index()
driver_passenger_long_df

In [None]:
# Bubble plot of the driver / passenger age-band counts.
# Drawn through the pandas plotting API because neither `plt` nor `np` is imported in this
# notebook, so plt.scatter / np.sqrt raise NameError when the notebook is run on a fresh kernel.
ax = driver_passenger_long_df.plot.scatter(
    x='Age_Band_of_Driver',
    y='Age_Band_of_Casualty',
    # Marker size scales with the square root of the count in each cell.
    s=driver_passenger_long_df[0] ** 0.5 * 1.5,
    alpha=0.5)
ax.set_xlabel('Driver age band')
ax.set_ylabel('Passenger age band');

We can now apply the `spearmanr` function to calculate Spearman's *ρ* (rho) value of correlation. 

Note that each passenger casualty must be on its own row: if there are 44,000 passenger casualties, the `spearmanr` function must be passed lists with 44,000 items.

Recall that values near +1 show good positive correlation, values near -1 show good negative correlation, and values near 0 show no particular correlation. The `scipy` function returns a second value, the *p* value of the result. 

In [None]:
# Spearman rank correlation between the two age bands, using one entry per unrolled row.
driver_bands = driver_passenger_unrolled_df['Age_Band_of_Driver']
passenger_bands = driver_passenger_unrolled_df['Age_Band_of_Casualty']
scipy.stats.spearmanr(driver_bands, passenger_bands)

This shows a reasonable correlation: the older the driver, the older the passengers. 

A *p* value of (effectively) zero means that this correlation is extremely unlikely to have arisen by chance: we should reject the null hypothesis that the ages of drivers and passengers are unrelated. In other words, the correlation is a real one.

# Activity 2

In [None]:
# What are the weather types?
[(code, label_of[key, code]) for key, code in label_of if key == 'Weather_Conditions']

In [None]:
# Fetch only the speed limit and weather condition of every accident: one row per accident.
speed_weather_records = list(accidents.find({}, ['Speed_limit', 'Weather_Conditions']))
speed_by_weather_unrolled_df = pd.DataFrame(speed_weather_records)

# Count the accidents for each combination of speed limit (rows) and weather condition (columns).
speed_by_weather_df = pd.crosstab(
    index=speed_by_weather_unrolled_df['Speed_limit'],
    columns=speed_by_weather_unrolled_df['Weather_Conditions'],
)


In [None]:
# Show the table of counts: speed limits down the rows, weather codes across the columns.
speed_by_weather_df

In [None]:
# Replace the weather codes in the column headers with their human-readable labels.
# .get() with the current header as the fallback is used instead of plain indexing because
# `label_of` is a defaultdict(str): indexing a header it does not know (which is every header
# once this cell has already been run) would return '' and also add a junk entry to `label_of`.
speed_by_weather_df.columns = [label_of.get(('Weather_Conditions', w), w)
                               for w in speed_by_weather_df.columns]
speed_by_weather_df

We need to remove some of the rows and columns, to ensure that every cell has at least 5 elements. Let's remove the 10 and 20mph zone data, and the 'missing' weather column.

In [None]:
# Remove the sparse categories (aiming for at least 5 accidents in every remaining cell):
# the 10 and 20 mph rows, and the 'Data missing or out of range' weather column.
# errors='ignore' skips any label that is not present, so the cell still works when the data
# has no 'missing' column and can be re-run without raising KeyError.
speed_by_weather_df = speed_by_weather_df.drop(
    index=[10, 20],
    columns=['Data missing or out of range'],
    errors='ignore')
speed_by_weather_df

In [None]:
# Bar chart of the observed counts; the legend is anchored at the right-hand edge of the axes.
ax = speed_by_weather_df.plot.bar()
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

In [None]:
# Copied from main notebook, reworked so that the totals are computed once instead of once per cell.
def expected_of_df(actual_df):
    """Return the counts expected if the rows and columns of `actual_df` were independent.

    Each cell of the result is (row total * column total) / grand total, where the
    totals are taken over `actual_df`.

    Parameters
    ----------
    actual_df : pandas.DataFrame
        Table of observed counts (numeric values).

    Returns
    -------
    pandas.DataFrame
        Expected counts, with the same index and columns, in the same order, as `actual_df`.
    """
    row_totals = actual_df.sum(axis=1).values
    col_totals = actual_df.sum(axis=0).values
    grand_total = col_totals.sum()
    # Outer product via broadcasting: (n_rows, 1) * (n_cols,) -> (n_rows, n_cols).
    expected = row_totals[:, None] * col_totals / grand_total
    return pd.DataFrame(expected, index=actual_df.index, columns=actual_df.columns)

In [None]:
# Counts we would expect in each cell if speed limit and weather condition were independent
expected_speed_by_weather_df = expected_of_df(speed_by_weather_df)
expected_speed_by_weather_df

In [None]:
# Bar chart of the expected counts, laid out like the chart of observed counts for comparison.
ax = expected_speed_by_weather_df.plot.bar()
ax.legend(bbox_to_anchor=(1, 0.5), loc='center left')

In [None]:
# Chi-squared test of independence between speed limit and weather condition.
# scipy.stats.chisquare takes its p-value from a chi-squared distribution with
# (number of cells - 1 - ddof) degrees of freedom, whereas a test of independence on an
# r x c table has (r - 1) * (c - 1). Passing ddof = (r - 1) + (c - 1) makes up the difference;
# the test statistic itself is unchanged.
n_rows, n_cols = speed_by_weather_df.shape
scipy.stats.chisquare(speed_by_weather_df, expected_speed_by_weather_df,
                      ddof=(n_rows - 1) + (n_cols - 1), axis=None)

The very small *p* value shows that this is a significant result: weather conditions affect accident rates differently on different roads.

Note that the chi-squared test doesn't tell us anything about *how* the weather conditions affect accident rates, only that they do.