In [17]:
# Loading in the pandas module
import pandas as pd

# Load the deaths table from its CSV file
deaths = pd.read_csv('datasets/deaths.csv')

# Report the table's dimensions as a (rows, columns) tuple
n_rows, n_cols = deaths.shape
print((n_rows, n_cols))

# Preview the first five records
deaths.head(5)

(489, 3)


Unnamed: 0,Death,X coordinate,Y coordinate
0,1,51.513418,-0.13793
1,1,51.513418,-0.13793
2,1,51.513418,-0.13793
3,1,51.513361,-0.137883
4,1,51.513361,-0.137883


In [19]:
# Inspect column names, dtypes and non-null counts (still under the original CSV headers)
deaths.info()

# Map the original CSV headers to snake_case column names
newcols = {
    'Death': 'death_count',
    'X coordinate': 'x_latitude',
    'Y coordinate': 'y_longitude',
}

# Rebind `deaths` to the renamed frame instead of mutating it in place:
# `rename` returns a new DataFrame, so the object displayed by the earlier
# cell is left untouched and this cell stays safe to re-run.
deaths = deaths.rename(columns=newcols)

# Summary statistics for the renamed columns
deaths.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 3 columns):
Death           489 non-null int64
X coordinate    489 non-null float64
Y coordinate    489 non-null float64
dtypes: float64(2), int64(1)
memory usage: 11.5 KB


Unnamed: 0,death_count,x_latitude,y_longitude
count,489.0,489.0,489.0
mean,1.0,51.513398,-0.136403
std,0.0,0.000705,0.001503
min,1.0,51.511856,-0.140074
25%,1.0,51.512964,-0.137562
50%,1.0,51.513359,-0.136226
75%,1.0,51.513875,-0.135344
max,1.0,51.515834,-0.132933


In [21]:
# Keep only the two coordinate columns (latitude first, then longitude)
coordinate_columns = ['x_latitude', 'y_longitude']
locations = deaths[coordinate_columns]

# Convert the frame into a list of [latitude, longitude] pairs, one per row
deaths_list = locations.values.tolist()

# Number of coordinate pairs collected
len(deaths_list)

489

In [23]:
# Draw every entry of `deaths_list` as a translucent red circle on a folium map
import folium

# NOTE: the name `map` shadows Python's builtin `map`; it is kept because the
# pumps cell further down reads this variable.
# NOTE(review): recent folium releases appear to have dropped the built-in
# Stamen tile sets -- confirm 'Stamen Toner' still resolves in your environment.
map = folium.Map(location=[51.5132119, -0.13666], tiles='Stamen Toner', zoom_start=17)
for death_location in deaths_list:
    marker = folium.CircleMarker(
        death_location,
        radius=8,
        color='red',
        fill=True,
        fill_color='red',
        opacity=0.4,
    )
    marker.add_to(map)
map

In [25]:
# Load the pumps table from its CSV file
pumps = pd.read_csv('datasets/pumps.csv')

# Keep the two coordinate columns and turn them into a list of
# ['X coordinate', 'Y coordinate'] pairs, one per pump
locations_pumps = pumps[['X coordinate', 'Y coordinate']]
pumps_list = locations_pumps.values.tolist()

# `map1` is a second name for the existing `map` object (not a copy), so the
# pump markers are added on top of whatever `map` already contains.
map1 = map
for pump_location, pump_name in zip(pumps_list, pumps['Pump Name']):
    # One standard marker per pump, with the pump's name as its popup text
    folium.Marker(pump_location, popup=pump_name).add_to(map1)
map1

In [27]:
# Load the daily counts, parsing the `date` column as datetimes on read
dates = pd.read_csv('datasets/dates.csv', parse_dates=['date'])

# Date the pump handle was removed (8th of September 1854)
handle_removed = pd.to_datetime('1854/9/8')

# Weekday name of each date.
# `Series.dt.weekday_name` was removed in pandas 1.0; `Series.dt.day_name()`
# is its replacement and has been available since pandas 0.23.
dates['day_name'] = dates['date'].dt.day_name()

# Flag rows dated strictly after the removal date (the 8th itself is False)
dates['handle'] = dates['date'] > handle_removed

# Check the dataset and datatypes
dates.info()

# Totals before (False) and after (True) the handle was removed.
# `numeric_only=True` restricts the sum to the numeric columns explicitly;
# without it, newer pandas versions try to sum the datetime `date` column
# as well and raise a TypeError instead of silently dropping it.
dates.groupby(['handle']).sum(numeric_only=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 6 columns):
order       43 non-null int64
date        43 non-null datetime64[ns]
attacks     43 non-null int64
deaths      43 non-null int64
day_name    43 non-null object
handle      43 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 1.8+ KB


Unnamed: 0_level_0,order,attacks,deaths
handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,231,528,500
True,715,43,116


In [29]:
import bokeh
from bokeh.plotting import output_notebook, figure, show
# Import INLINE explicitly: a bare `import bokeh` does not guarantee that the
# `bokeh.resources` submodule has been loaded.
from bokeh.resources import INLINE

# Render Bokeh output inside the notebook using inline resources
output_notebook(INLINE)

# Set up figure.
# `width` / `height` replace `plot_width` / `plot_height`, which were removed
# in Bokeh 3.0.
p = figure(
    width=900,
    height=450,
    x_axis_type='datetime',
    tools='lasso_select, box_zoom, save, reset, wheel_zoom',
    toolbar_location='above',
    x_axis_label='Date',
    y_axis_label='Number of Deaths/Attacks',
    title='Number of Cholera Deaths/Attacks before and after 8th of September 1854 (removing the pump handle)',
)

# Plot on figure.
# `legend_label` (Bokeh >= 1.4) replaces the `legend` glyph keyword, which was
# removed in Bokeh 3.0.
p.line(dates['date'], dates['deaths'], color='red', alpha=1, line_width=3, legend_label='Cholera Deaths')
# Point markers on the deaths series; unselected points are drawn as faded grey
p.circle(dates['date'], dates['deaths'], color='black', nonselection_fill_alpha=0.2, nonselection_fill_color='grey')
p.line(dates['date'], dates['attacks'], color='black', alpha=1, line_width=2, legend_label='Cholera Attacks')

show(p)