In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Run `ls` on the input directory, decode the raw bytes it returns as UTF-8,
# and print the resulting file listing.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the animal-bite records from the input directory into a DataFrame.
bites_csv_path = "../input/Health_AnimalBites.csv"
frame = pd.read_csv(bites_csv_path)

In [None]:
# Preview the first five rows to see which columns are available.
frame.head(n=5)

In [None]:
# Count the rows recorded for each species and order the species from the
# highest count to the lowest. The result has the columns "SpeciesIDDesc" and "Count".
by_species = (
    frame.groupby("SpeciesIDDesc")
    .size()
    .reset_index(name="Count")
    .sort_values(by="Count", ascending=False)
)

In [None]:
# End the cell with the bare expression so Jupyter renders the per-species counts
# as a formatted table rather than the plain-text dump that print() produces.
by_species

Dogs and cats dominate the list as the animals most often reported for biting.  I am not necessarily after causality here, but this could be due to the fact that humans may spend far more time around dogs and cats than around most of the rest of these animals, although the dogs/cats in question may or may not be domesticated.

In [None]:
# Sort the bite_date column in descending order and look at the five largest
# values to see what the latest recorded dates look like.
frame["bite_date"].sort_values(ascending=False).iloc[:5]

We are looking at some years that are WAAAAAY out of bounds for the dataset.  I noticed the problem when I tried to convert the "bite_date" column to a datetime object using pd.to_datetime().  I am going to split the text on the '-' character, keep the year, and throw out observations that are out of bounds.

In [None]:
# Split every bite_date string on "-" with the vectorized .str accessor and keep
# element 0 of each resulting list, which should be the year (still as text here).
frame["year"] = frame["bite_date"].str.split("-").str.get(0)

In [None]:
# Keep only the incidents that have a year, i.e. drop those missing a recorded date.
# .copy() makes the result an independent DataFrame, so assigning to one of its
# columns afterwards does not raise pandas' SettingWithCopyWarning.
frame = frame[frame["year"].notna()].copy()

In [None]:
# Convert the year strings to 64-bit integers. assign() builds a new DataFrame
# instead of writing into a filtered slice, which avoids SettingWithCopyWarning.
frame = frame.assign(year=frame["year"].astype(np.int64))

In [None]:
# Throw out the out-of-bounds dates: keep only incidents whose year is before
# FIRST_INVALID_YEAR, i.e. drop everything dated 2018 or later.
FIRST_INVALID_YEAR = 2018
frame = frame[frame["year"] < FIRST_INVALID_YEAR]

In [None]:
# Number of recorded incidents per year, indexed by year, in a "Count" column.
by_year = frame.groupby("year").size().to_frame(name="Count")
# End the cell with the bare expression so Jupyter renders the preview as a table
# instead of the plain-text output of print().
by_year.head()

We now have a cleaned, integer "year" column that we can use.

In [None]:
# Number of distinct victim_zip values. dropna=False means a missing zip code
# (NaN), if any are present, is counted as one distinct value as well.
frame["victim_zip"].nunique(dropna=False)

The point here is that there are 227 unique zip codes present in the dataset.  Let's just look at the ones where bites are most prevalent.

In [None]:
# Count the recorded bites for each victim zip code and order the zip codes from
# the highest count to the lowest. The zip code is the index; "Count" is the column.
by_zip = (
    frame.groupby("victim_zip")
    .size()
    .to_frame(name="Count")
    .sort_values(by="Count", ascending=False)
)

In [None]:
# Keep only the zip codes where more than BITE_THRESHOLD bites have been reported.
BITE_THRESHOLD = 25
targets = by_zip[by_zip["Count"] > BITE_THRESHOLD]
# Check the result; ending the cell with the bare expression lets Jupyter render
# it as a table instead of the plain-text output of print().
targets.head()

We just created a DataFrame whose index is the zip code and whose "Count" column holds the number of bites associated with that zip code (restricted to the zip codes that had more than 25 bites).  We can use the index values containing the zip codes to help filter the original "frame" and grab only those rows whose zip codes are represented in "targets".

A heatmap showing bites over time in these zip codes might be interesting to look at.

In [None]:
fig, ax = plt.subplots(figsize=(12, 12))
# Heatmap data: keep the rows whose victim_zip appears in the index of `targets`,
# count the rows per (victim_zip, year) pair, and unstack so that each zip code is
# a row and each year is a column.
sns.heatmap(
    frame[frame["victim_zip"].isin(targets.index)]
    .groupby(["victim_zip", "year"])
    .size()
    .unstack(),
    ax=ax,
    annot=True,
    fmt=".0f",  # cells are whole counts; the default ".2g" would print 110 as 1.1e+02
)
# The trailing semicolon suppresses the text repr of the returned object.
ax.set_title("Recorded bites per year in zip codes with more than 25 bites");

The above line is a bit wordy, so I will break it down for those that are having trouble seeing the process:

* We can generate a heatmap using the sns.heatmap() method.  But we still need to point the method to the appropriate data
* The data we need should contain only those observations in the original DataFrame ('frame') whose zip codes account for more than 25 bites (see the 'targets' DataFrame we created above).  Pandas has the nifty .isin() Series method, which checks each value of a Series against a collection of values and returns a boolean Series marking the matches.  So we are effectively filtering the original DataFrame by saying:  "If a 'victim_zip' value from the original frame appears in the index of 'targets', then I want to keep that row for use in the heatmap".
* With the correct observations present, we can craft the DataFrame into what is effectively a cross-tabulation showing total observations at the intersection of each zip code and year.  You can use either pd.crosstab, or conduct a groupby using a list, and then unstacking the innermost index. 



I think the most interesting observation is that the number of recorded incidents for these zip codes seems to pick up drastically after the year 2010.  Somehow the reporting of the data improved in only one year, between 2009 and 2010.

## Work in progress!

In [None]:
# Filter to just the observations whose species is DOG or CAT and preview
# the first few of them.
frame[frame["SpeciesIDDesc"].isin(["DOG", "CAT"])].head()