In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [None]:
# Load the 911 call records from the input CSV into a DataFrame.
df = pd.read_csv('../input/montcoalert/911.csv')

In [None]:
# Number of rows and columns in the loaded data.
df.shape

In [None]:
# Preview the first few rows.
df.head()

In [None]:
# Summary of the columns: names, dtypes and non-null counts.
df.info()

Having looked at what is in the data above, my first question is: what are the top zip codes for 911 calls?

In [None]:
# Most frequent zip codes among the calls.
df['zip'].value_counts().head()

In [None]:
# Most frequent townships (`twp`) among the calls.
df['twp'].value_counts().head()

How many unique titles are there?

In [None]:
# Count of distinct values in the title column.
df['title'].nunique()

I want to combine the lat and lng data into one column and round it, to possibly get a higher-resolution picture of where the majority of calls come from:

In [None]:
# Round each (lat, lng) pair to two decimals and store it as one "lat, lng" string.
rounded_coords = df[['lat', 'lng']].values.round(2)
df['latlng'] = [', '.join(map(str, pair)) for pair in rounded_coords]
df.head()

In [None]:
# Most common rounded coordinate pairs.
df['latlng'].value_counts().head()

In [None]:
# Number of distinct rounded coordinate pairs.
df['latlng'].nunique()

That would be a pretty high resolution compared to townships, but maybe a bit much to chart with the simple methods I'm planning.

I'll separate the reasons for calls into their general categories, as documented in the title column.

In [None]:
# The text before the first ':' in `title` is the general call category.
# Use the vectorised string accessor instead of a per-row Python lambda.
df['reason'] = df['title'].str.split(':', n=1).str[0]
# Only the last expression of a cell is displayed, so show the category counts.
df['reason'].value_counts()

In [None]:
# Bar chart of how many calls fall under each general reason.
sns.countplot(data=df, x='reason', palette='inferno')

In [None]:
# Check the type of a single timestamp value: the column object itself is
# always a pandas Series, which says nothing about whether the values still
# need to be parsed.
type(df['timeStamp'].iloc[0])

Converting the timestamp data to pandas datetime objects and adding columns for month, hour, and day of the week:

In [None]:
# Parse the timeStamp column into pandas datetimes so date parts can be extracted.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

In [None]:
# pandas numbers weekdays 0 (Monday) through 6 (Sunday); every value needs a
# key here, otherwise .map() turns the unmapped days into NaN.
dmap = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thur', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
# Derive calendar parts with the vectorised .dt accessor (timeStamp must
# already be a datetime column).
df['month'] = df['timeStamp'].dt.month
df['hour'] = df['timeStamp'].dt.hour
df['DOW'] = df['timeStamp'].dt.dayofweek.map(dmap)

In [None]:
# Preview the frame with the new month, hour and DOW columns.
df.head()

In [None]:
# Global seaborn look for all following plots.
sns.set_theme(context="notebook", style="whitegrid")

In [None]:
# Draw the weekdays in calendar order rather than in order of first appearance.
dow_order = ['Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
plt.figure(figsize=(10,6))
sns.countplot(x="DOW", hue="reason", data=df, order=dow_order, palette='plasma_r')
# place the legend outside the figure/plot
plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)
plt.title("Reasons for 911 calls per Day of Week")
plt.tight_layout()

In [None]:
# Calls per month, split by general reason.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='month', hue='reason', palette='ocean')
# Anchor the legend just outside the right edge of the axes.
plt.legend(borderaxespad=0, bbox_to_anchor=(1.01, 1))
plt.title('Reasons for 911 Calls per Month')
plt.tight_layout()

In [None]:
# Number of calls recorded in each month.
df['month'].value_counts()

Interesting. In the CSV file I was given, a few months of data were missing, I suppose to make it a challenge to fill that gap with some inferences. Anyway, here the data is full and complete.

In [None]:
# Non-null counts of every column, one row per month.
byMonth = df.groupby(by='month').count()
byMonth.head()

In [None]:
# Monthly call volume, using the count of the `title` column as the call count.
plt.figure(figsize=(10, 6))
sns.lineplot(data=byMonth, x='month', y='title')

In [None]:
# lmplot is a figure-level function that creates its own figure, so the size is
# given through height/aspect (6 in tall, 10 in wide) instead of a separate
# plt.figure() call, which would only leave an empty figure behind.
sns.lmplot(x='month', y='title', data=byMonth.reset_index(), height=6, aspect=10 / 6)

It's actually not that different even with the missing month data! Also interesting.

In [None]:
# Calendar date (without the time of day) of each call.
df['date'] = df['timeStamp'].dt.date
df.head()

In [None]:
# Daily call volume across all reasons.
byDate = df.groupby(by='date').count()
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=byDate, x='date', y='title')
ax.set_xlabel('date')
ax.set_ylabel('911 call count')
plt.title('call volume by date')

In [None]:
# Daily call volume restricted to calls whose reason is 'EMS'.
date_ems = df.loc[df['reason'].eq('EMS')].groupby('date').count()
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=date_ems, x='date', y='title')
ax.set_xlabel('date')
ax.set_ylabel('911 call count')
plt.title('ems')

In [None]:
# Daily call volume restricted to calls whose reason is 'Traffic'.
date_traffic = df.loc[df['reason'].eq('Traffic')].groupby('date').count()
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=date_traffic, x='date', y='title')
ax.set_xlabel('date')
ax.set_ylabel('911 call count')
plt.title('traffic')

In [None]:
# Daily call volume restricted to calls whose reason is 'Fire'.
date_fire = df.loc[df['reason'].eq('Fire')].groupby('date').count()
plt.figure(figsize=(12, 6))
ax = sns.lineplot(data=date_fire, x='date', y='title')
ax.set_xlabel('date')
ax.set_ylabel('911 call count')
plt.title('fire')

Let's see if there are any patterns between the hour of the day and call volume.

In [None]:
# Call counts as a day-of-week (rows) by hour (columns) table.
dayHour = df.groupby(['DOW', 'hour'])['reason'].count().unstack()
dayHour

In [None]:
# Heatmap of call counts by day of week and hour.
plt.figure(figsize=(10, 6))
ax = sns.heatmap(data=dayHour)
ax.set_title('call frequency by day and hour')
ax.set_xlabel('hour')
plt.ylabel('day of the week')

In [None]:
# clustermap is figure-level: it builds its own figure, so the size is passed
# to it directly and the title is set on the returned grid's figure rather
# than through plt.figure()/plt.title().
grid = sns.clustermap(dayHour, figsize=(10, 6))
grid.fig.suptitle('call frequency by day and hour', y=1.02)

Does the pattern continue on a monthly basis?

In [None]:
# Call counts as a month (rows) by hour (columns) table.
monthHour = df.groupby(['month', 'hour'])['reason'].count().unstack()
monthHour

In [None]:
# Heatmap of call counts by month and hour.
plt.figure(figsize=(10, 6))
ax = sns.heatmap(data=monthHour)
ax.set_title('call frequency by month and hour')
ax.set_xlabel('hour')
plt.ylabel('month')

In [None]:
# Cluster the month-by-hour table (not the day-by-hour one) so the plot matches
# its title. clustermap is figure-level, so size and title go through the
# returned grid instead of plt.figure()/plt.title().
grid = sns.clustermap(monthHour, figsize=(10, 6))
grid.fig.suptitle('call frequency by month and hour', y=1.02)

Aaaaaand just for fun to check the call volume by reason in each township:

In [None]:
plt.figure(figsize=(18,6))
sns.countplot(x="twp", hue="reason", data=df, palette='plasma_r')
# place the legend outside the figure/plot
plt.legend(bbox_to_anchor=(1.01, 1),borderaxespad=0)
# The x axis here is the township, so the title says so.
plt.title("Reasons for 911 calls per Township")
# Township names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.tight_layout()

So next I would like to explore the different townships to see if there are any troublesome areas where you might want to be extra careful, based on some of the 911 data.
So I'll create a new column holding the specific reason for each EMS call and plot both the top and bottom 30 reasons people call for EMS:

In [None]:
# Take the detail after ': ' from each EMS title; rows with any other reason
# receive no value and are left as NaN.
is_ems = df['reason'] == 'EMS'
df['specific_EMS'] = df.loc[is_ems, 'title'].apply(lambda title: title.split(': ')[1])

In [None]:
# The 30 most frequent specific EMS reasons.
plt.figure(figsize=(12, 8))
df.loc[df['reason'] == 'EMS', 'specific_EMS'].value_counts().head(30).plot.bar()

In [None]:
# The 30 least frequent specific EMS reasons.
plt.figure(figsize=(12, 8))
df.loc[df['reason'] == 'EMS', 'specific_EMS'].value_counts().tail(30).plot.bar()

Neighborhoods that are associated with heavy drug use tend to have somewhat more problems, and that is possibly reflected in the data here. So let's see if there's any correlation between overdoses and some violent crime, say stabbings, in any particular townships.

In [None]:
# The 25 townships with the most overdose calls.
plt.figure(figsize=(12, 8))
ax = (
    df[df['specific_EMS'] == 'OVERDOSE']
    .groupby('twp')['title'].count()
    .sort_values(ascending=False)[:25]
    .plot(kind='bar')
)
# Label through the Axes methods. (Writing `plt.xlabel = '...'` would rebind
# the pyplot function to a string instead of labelling the plot.)
ax.set_title('Overdoses per township')
ax.set_xlabel('Township')
ax.set_ylabel('Number of Overdoses')
ax.tick_params(axis='x', rotation=90)
# Only one series is drawn, so no legend is needed; fit the layout last so the
# rotated tick labels are taken into account.
plt.tight_layout()

In [None]:
# Stabbing calls per township, most affected first.
plt.figure(figsize=(12, 8))
ax = (
    df[df['specific_EMS'] == 'STABBING']
    .groupby('twp')['title'].count()
    .sort_values(ascending=False)
    .plot(kind='bar')
)
# Label through the Axes methods. (Writing `plt.xlabel = '...'` would rebind
# the pyplot function to a string instead of labelling the plot.)
ax.set_title('Stabbings per Township')
ax.set_xlabel('Township')
ax.set_ylabel('Number of Stabbings')
ax.tick_params(axis='x', rotation=90)
# Fit the layout last so the rotated tick labels are taken into account.
plt.tight_layout()

Once again Norristown and Pottstown have the overwhelming majority of stabbings. Considering both the plots above I would try to avoid those areas if possible.

There is probably a lot more that would be interesting to find out within this dataset but for now this is my initial submission of any kind and I will keep it at this for now. Cheers!