In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the raw 911-call export; kept in one place so it is easy to repoint.
CALLS_CSV_PATH = '../input/montcoalert/911.csv'
df = pd.read_csv(CALLS_CSV_PATH)

In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

# Basic Questions

In [None]:
# Top 5 zip codes for 911 calls: count rows per zip, then rank by the 'e' column count.
calls_per_zip = df.groupby('zip').count()
calls_per_zip.sort_values(by='e', ascending=False).head(5)

In [None]:
# Same ranking as a single Series: value_counts sorts by frequency, descending.
zip_frequencies = df['zip'].value_counts()
zip_frequencies.head(5)

In [None]:
# Top 5 townships (twp) for 911 calls: count rows per township, rank by the 'e' column count.
calls_per_twp = df.groupby('twp').count()
calls_per_twp.sort_values(by='e', ascending=False).head(5)

In [None]:
# Same ranking as a single Series: value_counts sorts by frequency, descending.
twp_frequencies = df['twp'].value_counts()
twp_frequencies.head(5)

# Feature Creation

Looking at the `title` column, we can see that the entries follow the pattern "reason:department". To create new features, we will extract both the reason and the department from the `title` column.

In [None]:
# Look at a few raw title strings before splitting them into features.
df['title'].head()

In [None]:
def reason(x):
    """Return the part of a title string that precedes the first ':'.

    If the string contains no ':', the whole string is returned unchanged.
    """
    before_colon, _, _ = x.partition(':')
    return before_colon

def department(x):
    """Return the second ':'-separated segment of a title, stripped of whitespace.

    Parameters
    ----------
    x : str
        A title string such as ``'EMS: BACK PAINS/INJURY'``.

    Returns
    -------
    str
        The text between the first and second ':' with surrounding whitespace
        removed, or ``''`` when the title contains no ':' at all.
    """
    segments = x.split(':')
    # A title without a separator has no second segment; return an empty
    # string instead of raising IndexError so that one malformed row does not
    # abort the whole column-wide apply.
    if len(segments) < 2:
        return ''
    return segments[1].strip()

# Derive both new feature columns element-wise from the raw title strings.
titles = df['title']
df['reason'] = titles.map(reason)
df['department'] = titles.map(department)

In [None]:
# Compare the cardinality of the raw title with the two derived features.
n_titles = df['title'].nunique()
n_departments = df['department'].nunique()
n_reasons = df['reason'].nunique()

print('There are {} different titles'.format(n_titles))
print('There are {} different departments'.format(n_departments))
print('There are {} different reasons'.format(n_reasons))

In [None]:
# Number of calls per reason category, most frequent first.
df['reason'].value_counts()

In [None]:
# Bar chart of the number of calls per reason.
sns.countplot(data=df, x='reason')

In [None]:
# Before conversion: the timeStamp column is still stored as strings.
df.info()

In [None]:
# Parse the timeStamp strings into datetime values and store them back in place.
parsed_timestamps = pd.to_datetime(df['timeStamp'])
df['timeStamp'] = parsed_timestamps

In [None]:
# After conversion: the timeStamp column now holds datetime values.
df.info()

In [None]:
# With timeStamp as datetime we can extract calendar attributes from it.
# The vectorized .dt accessor replaces a per-row Python lambda: same values,
# computed in one pass over the column.
df['hour'] = df['timeStamp'].dt.hour
df['month'] = df['timeStamp'].dt.month
# Integer weekday, Monday=0 ... Sunday=6.
df['day_of_week'] = df['timeStamp'].dt.dayofweek

In [None]:
# The integer weekday (Monday=0 ... Sunday=6) follows the pattern of the
# dictionary dmap, which we use to translate it into short day names.
# The names are derived from timeStamp rather than from the existing
# day_of_week column so that this cell is idempotent: mapping the already
# mapped strings a second time would turn every value into NaN.
dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['day_of_week'] = df['timeStamp'].dt.dayofweek.map(dmap)

In [None]:
# Looking at the countplot per reason, we can see that the number of Traffic
# calls declines at the weekend, which makes sense because there is less
# traffic on weekends anyway.

# Fix the bar order explicitly; otherwise the weekdays appear in the order in
# which they first occur in the data instead of Mon..Sun.
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
sns.countplot(x='day_of_week', hue='reason', data=df, order=weekday_order)
# Place the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Calls per month, split by reason; the legend is placed outside the axes.
month_ax = sns.countplot(data=df, x='month', hue='reason')
month_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Calendar date (without time of day) of each call, used for daily aggregation.
# .dt.date yields the same datetime.date objects as the per-row lambda did.
df['date'] = df['timeStamp'].dt.date

# TODO: add plots for outliers and a plot for 2020 (Covid)

In [None]:
# Daily number of 911 calls across all reasons.
# Selecting 'e' before counting avoids counting every other column as well.
daily_calls = df.groupby('date')['e'].count()
ax = daily_calls.plot.line(figsize=(12, 5))

# Title and axis labels so the figure stands alone, matching the per-reason plots.
ax.set_title('All calls')
ax.set_xlabel('date')
ax.set_ylabel('number of calls')
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.tight_layout()

In [None]:
# Daily number of calls whose reason is 'Traffic'.
traffic_daily = df.loc[df['reason'] == 'Traffic'].groupby('date')['e'].count()
traffic_ax = traffic_daily.plot.line(figsize=(12, 5))
traffic_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
traffic_ax.set_title('Traffic')
plt.tight_layout()

In [None]:
# Daily number of calls whose reason is 'Fire'.
fire_daily = df.loc[df['reason'] == 'Fire'].groupby('date')['e'].count()
fire_ax = fire_daily.plot.line(figsize=(12, 5))
fire_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fire_ax.set_title('Fire')
plt.tight_layout()

In [None]:
# Daily number of calls whose reason is 'EMS'.
ems_daily = df.loc[df['reason'] == 'EMS'].groupby('date')['e'].count()
ems_ax = ems_daily.plot.line(figsize=(12, 5))
ems_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ems_ax.set_title('EMS')
plt.tight_layout()

In [None]:
# Call counts as a weekday x hour matrix (rows: day_of_week, columns: hour).
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df_unstack = (
    df.groupby(by=['day_of_week', 'hour'])['e']
    .count()
    # A weekday/hour pair with no calls is a count of 0, not a missing value;
    # NaN cells would also make the clustering of this matrix fail.
    .unstack(fill_value=0)
    # groupby sorts the day names alphabetically; restore calendar order.
    .reindex(weekday_order, fill_value=0)
)
df_unstack

In [None]:
# Heatmap of call counts by weekday (rows) and hour (columns).
hour_fig, hour_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_unstack, ax=hour_ax)

In [None]:
# clustermap is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) only leaves behind an empty figure and has no
# effect on the plot size. The intended size is passed via figsize instead.
sns.clustermap(df_unstack, figsize=(12, 6))

In [None]:
# Call counts as a weekday x month matrix (rows: day_of_week, columns: month).
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
df_month = (
    df.groupby(by=['day_of_week', 'month'])['e']
    .count()
    # A weekday/month pair with no calls is a count of 0, not a missing value;
    # NaN cells would also make the clustering of this matrix fail.
    .unstack(fill_value=0)
    # groupby sorts the day names alphabetically; restore calendar order.
    .reindex(weekday_order, fill_value=0)
)
df_month

In [None]:
# Heatmap of call counts by weekday (rows) and month (columns).
month_fig, month_heat_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_month, ax=month_heat_ax)

In [None]:
# Hierarchically clustered view of the weekday x month counts, diverging palette.
sns.clustermap(data=df_month, cmap='coolwarm')