# Data

# Imports

In [None]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('notebook')
matplotlib.rcParams['figure.figsize'] = (12,8) 

# Exploratory Data Analysis

In [None]:
# Read the 911 call log from the Kaggle input directory and preview it.
csv_path = '../input/montcoalert/911.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Column names, non-null counts and dtypes of the raw frame
df.info()

In [None]:
# Names of the columns pandas parsed as numeric
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = numeric_frame.columns.values
print('Numeric cols :', num_cols)

In [None]:
# Names of every remaining (non-numeric / categorical) column
non_numeric_frame = df.select_dtypes(exclude=[np.number])
cat_cols = non_numeric_frame.columns.values
print('Categorical cols :', cat_cols)

In [None]:
# Number of calls per zip code, most frequent first (missing zips are not counted).
df['zip'].value_counts()
# The length of this Series is the number of distinct zip codes
# (reported as 204 when the notebook was written).

## Township (twp)

In [None]:
# Number of 911 calls per township, busiest first; when the output is
# truncated it shows the most and the least frequent townships.
df['twp'].value_counts()
# There are 68 unique townships in our dataset.

In [None]:
# Horizontal count plot of the calls received per township, busiest first
twp_order = df['twp'].value_counts().index
plt.figure(figsize=(12, 12))
sns.countplot(y=df['twp'], order=twp_order)
plt.tight_layout()

# From the graph, 9 townships made more than 20000 calls each between 2015 and 2020.
# More than half of the townships did not exceed 10000 calls each over the same period.

## Title

In [None]:
# The 20 most frequent call titles
title_counts = df['title'].value_counts()
title_counts.iloc[:20]

In [None]:
# The title column has two parts separated by ':'.
# Split it once (vectorised) into 'category' and 'purpose' of the call, then
# drop the original column.
# The guard keeps the cell re-runnable: after the first run 'title' no longer
# exists and an unguarded drop would raise a KeyError.
if 'title' in df.columns:
    title_parts = df['title'].str.split(':', n=1, expand=True)
    # Strip any whitespace surrounding the separator so the labels are clean.
    df['category'] = title_parts[0].str.strip()
    df['purpose'] = title_parts[1].str.strip()
    df = df.drop(columns='title')

In [None]:
# Number of calls in each top-level category
df['category'].value_counts()
# There are three main categories of calls in the dataset:
# Emergency Medical Services (EMS), Traffic and Fire.

In [None]:
# Calls per category. The variable is passed as x= because recent seaborn
# releases take `data` as the first positional parameter and require the
# plotted variable to be given by keyword.
sns.countplot(x=df['category'])

In [None]:
purpose_counts = df['purpose'].value_counts()
display(purpose_counts)
# There are 95 different purposes/sub-categories of 911 calls as per the data.

print('\n ----In percentage---- \n')
# Share of all calls for the three most frequent purposes: the fraction is
# rounded to two decimals and then expressed as a percentage.
purpose_share = (purpose_counts / len(df['purpose'])).round(2)
100 * purpose_share.head(3)

In [None]:
# There are 95 unique purposes and most of them are infrequent,
# so only the 30 most frequent purposes are plotted against their count.
top_purposes = df['purpose'].value_counts().head(30).index
sns.countplot(y=df['purpose'], order=top_purposes)


# Based on the data, 'VEHICLE ACCIDENT' is the most common reason, accounting for a whopping 22% of the 911 calls
# (roughly 0.15 million of the 663522 calls) over the 5-year span.
# NOTE(review): this was originally written as "1.5 million calls", which exceeds the dataset total - confirm the exact count.
# The second is 'DISABLED VEHICLE', accounting for 7% of calls (47909 calls).
# Finally, ANIMAL COMPLAINT, HIT + RUN, PRISONER IN CUSTODY and FOOT PATROL are the least common reasons.


In [None]:
# Purposes of the calls filed under the "EMS" category
is_ems = df['category'] == 'EMS'
EMS = df.loc[is_ems, 'purpose']
EMS.nunique()
# There are 81 unique call purposes under the "EMS" category

In [None]:
# Plot the 30 most frequent of the 81 call purposes under the "EMS" category
ems_top = EMS.value_counts().head(30).index
plt.gca().set_title("EMS")
sns.countplot(y=EMS, order=ems_top)

# From the graph, the most common purposes for 911 calls under the EMS category are FALL VICTIM, RESPIRATORY and CARDIAC EMERGENCY,
# each constituting over 10% of all calls under EMS.

In [None]:
# Purposes of the calls filed under the "Fire" category
is_fire = df['category'] == 'Fire'
Fire = df.loc[is_fire, 'purpose']
Fire.nunique()

# There are 60 unique call purposes under the "Fire" category

In [None]:
# Plot the 25 most frequent call purposes under the "Fire" category
fire_top = Fire.value_counts().head(25).index
plt.gca().set_title("Fire")
sns.countplot(y=Fire, order=fire_top)


# From the graph, the most common purpose for 911 calls under the "Fire" category is FIRE ALARM, which accounts for 38% of all Fire calls.
# Further, VEHICLE ACCIDENT and FIRE INVESTIGATION each add around 10% of the Fire calls.

In [None]:
# Purposes of the calls filed under the "Traffic" category, with their counts
is_traffic = df['category'] == 'Traffic'
Traffic = df.loc[is_traffic, 'purpose']
Traffic.value_counts()

# 7 unique sub-categories/purposes under the "Traffic" category

In [None]:
# All Traffic purposes, most frequent first
traffic_order = Traffic.value_counts().index
plt.gca().set_title("Traffic")
sns.countplot(y=Traffic, order=traffic_order)

# From the graph, "VEHICLE ACCIDENT" is again the top purpose, accounting for 64% of all calls under the Traffic category.
# Next, "DISABLED VEHICLE" and "ROAD OBSTRUCTION" together make up 31%.
# Interestingly, these three reasons alone constitute 95% of the 911 calls related to Traffic.

## Time Stamp

In [None]:
# The timeStamp column is read from the CSV as plain strings
print(df.dtypes['timeStamp'])
df['timeStamp']

# Note:
# The observations run from Dec-2015 to Jul-2020, so 2015 and 2020 are only partially covered
# and offer fewer data points for making inferences about those years.

In [None]:
# Parse the timeStamp strings into datetime values and confirm the new dtype
parsed_stamps = pd.to_datetime(df['timeStamp'])
df['timeStamp'] = parsed_stamps
print(parsed_stamps.dtype)

In [None]:
# Derive calendar features from the datetime column.
# The vectorised .dt accessor replaces the row-by-row apply(lambda ...) calls:
# same values, computed in one pass per column.
stamp = df['timeStamp'].dt
df['date'] = stamp.date
df['year'] = stamp.year
df['month'] = stamp.month      # 1 (January) .. 12 (December)
df['day'] = stamp.dayofweek    # 0 (Monday) .. 6 (Sunday)
df['hour'] = stamp.hour        # 0 .. 23

In [None]:
# Preview the columns derived from the timestamp
derived_cols = ['date', 'year', 'month', 'day', 'hour']
df.loc[:, derived_cols]

## Year


In [None]:
# Line graph of the total 911 call count (all categories) per year, 2015 to 2020.
# Only the 'category' column is counted instead of counting every column first.
calls_per_year = df.groupby('year')['category'].count()
sns.lineplot(data=calls_per_year)

In [None]:
# Bar graph of the 911 call count per category for each year from 2015 to 2020.
# The variable is passed as x= because recent seaborn releases take `data` as
# the first positional parameter.
sns.countplot(x=df['year'], hue=df['category'])

## Month


In [None]:
# Distinct values currently stored in the month column, in order of appearance
df['month'].unique()

In [None]:
# Map month numbers to their short names.
# The names are derived from timeStamp rather than from df['month'] itself, so
# the cell can be re-run safely: mapping the already-mapped column a second
# time would turn every value into NaN.
MONTH_NAMES = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
               7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
df['month'] = df['timeStamp'].dt.month.map(MONTH_NAMES)
df['month'].unique()

In [None]:
# Line graph of the 911 call count in each month (Jan-Dec).
# groupby sorts the month names alphabetically (Apr, Aug, Dec, ...), so the
# counts are re-indexed into calendar order before plotting; sort=False keeps
# the points in that order.
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
calls_per_month = df.groupby('month')['category'].count().reindex(calendar_order)
sns.lineplot(data=calls_per_month, marker='o', sort=False)

In [None]:
# Bar graph of the 911 call count per month, sorted in decreasing order.
# The variable is passed as x= because recent seaborn releases take `data` as
# the first positional parameter.
months_by_volume = df['month'].value_counts().index
sns.countplot(x=df['month'], order=months_by_volume)

# From the graph, 'January' received the highest number of calls and 'September' the least.
# The numbers of calls received in 'March', 'June' and 'July' are almost the same.
# Similarly for 'May' and 'December'.

In [None]:
# Bar graph of the 911 call count per month for each category, months sorted by decreasing total.
# The variable is passed as x= because recent seaborn releases take `data` as
# the first positional parameter.
months_by_volume = df['month'].value_counts().index
sns.countplot(x=df['month'], order=months_by_volume, hue=df['category'])
# This is the same as the previous graph, with each monthly total split by category.

# From the graph, the calls related to 'Fire' are approximately the same in all months,
# averaging around 8000 calls each month.

## Day

In [None]:
# Distinct values currently stored in the day-of-week column, in order of appearance
df['day'].unique()

In [None]:
# Map day-of-week numbers (0 = Monday .. 6 = Sunday) to their short names.
# The names are derived from timeStamp rather than from df['day'] itself, so
# the cell can be re-run safely: mapping the already-mapped column a second
# time would turn every value into NaN.
DAY_NAMES = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
df['day'] = df['timeStamp'].dt.dayofweek.map(DAY_NAMES)
df['day'].unique()

In [None]:
# Line graph of the overall 911 call count for each day of the week, in ascending order of count.
calls_by_day = df['day'].value_counts()
sns.lineplot(data=calls_by_day.iloc[::-1], marker='o')


# From the graph, the fewest 911 calls are received during weekends, especially on Sundays.
# The count rises significantly on Monday (mostly Traffic, as seen in the bar graph)
# and gradually increases on the other days until it peaks on Fridays.
# NOTE(review): the Friday peak was originally given as "~ 1.03 million calls", which exceeds the
# dataset total of 663522 - it is presumably ~103 thousand; confirm against value_counts().

In [None]:
# Bar graph of the 911 call count per category for each day of the week,
# days sorted in ascending order of total calls.
# The variable is passed as x= because recent seaborn releases take `data` as
# the first positional parameter.
days_ascending = df['day'].value_counts()[::-1].index
sns.countplot(x=df['day'], order=days_ascending, hue=df['category'])

## Month-Year Matrix

**Note:**

Note that 2020 is usually excluded when calculating the total calls received in a 'Year', because it contains only 7 months of data (Jan-Jul).

In this section, however, we compare the monthly calls of every year. April 2020 contains complete data from the 1st to the 30th, so it is included in the comparison.

In [None]:
# Dates of every call logged in an April (any year)
in_april = df['month'] == 'Apr'
df.loc[in_april, 'date']

In [None]:
# Month and Year Matrix: number of calls for every (month, year) pair.
# Only the 'category' column is counted, and the rows are re-indexed into
# calendar order because groupby sorts the month names alphabetically.
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
year_month = (
    df.groupby(['month', 'year'])['category']
      .count()
      .unstack()
      .reindex(calendar_order)
)
display(year_month)
sns.heatmap(year_month, cmap='coolwarm', linewidths=0.25)


# As stated before, 2015 and 2020 don't have complete data, hence the null values.
# A heatmap is used for better visualization.

# From the heatmap,
# 'March 2018' recorded the highest number of 911 calls (14923) in our dataset,
# and 'April 2020' recorded the fewest calls (8243), excluding 2015 as it only starts on 10th Dec.



## Day of Week and Month Matrix

In [None]:
# Day of the Week and Month Matrix: number of calls for every (day, month) pair.
# groupby sorts both label sets alphabetically, so rows are re-indexed
# Mon..Sun and columns Jan..Dec to read in natural order.
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_day = (
    df.groupby(['day', 'month'])['category']
      .count()
      .unstack()
      .reindex(index=weekday_order, columns=calendar_order)
)
display(month_day)

In [None]:
# Heatmap of the day-of-week x month matrix
heatmap_style = dict(cmap='coolwarm', lw=0.25)
sns.heatmap(month_day, **heatmap_style)

# From the heatmap:
# the highest number of calls was documented on 'Fridays' in 'March' (10941 calls);
# the lowest number of calls was documented on 'Sundays' in 'November' (5196 calls).

In [None]:
# Cluster map showing which months, and which days of the week, have similar call patterns.
# standard_scale=1 rescales each column to the 0-1 range for more contrast.
cluster_style = dict(cmap='coolwarm', standard_scale=1)

sns.clustermap(month_day, **cluster_style)

## Day of the Week and Hour Matrix

In [None]:
# Day of the Week and Hour Matrix: number of calls for every (day, hour) pair.
# Rows are re-indexed Mon..Sun because groupby sorts the day names alphabetically.
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_hour = (
    df.groupby(['day', 'hour'])['category']
      .count()
      .unstack()
      .reindex(weekday_order)
)
display(day_hour)

In [None]:
# Total calls per hour of the day, summed over all days of the week
# (sum() adds down the rows, giving one total per hour column).
day_hour.sum()

In [None]:
# Heatmap of the day-of-week x hour matrix
heatmap_style = dict(cmap='coolwarm', lw=0.25)
sns.heatmap(day_hour, **heatmap_style)

# From the heatmap:
# 911 calls start rising significantly from 7 a.m. (except on weekends) and stay high until 8 p.m. (hour 20).
# Most calls are observed at hour 17; the highest single value is hour 17 on Fridays, with 7113 calls.
# Fewest calls are observed at hour 4; the lowest single value is hour 4 on Wednesdays, with 1128 calls.

# Data Insights

**To summarize,**

A total of 663522 calls to 911 were observed between *10-Dec-2015* and *29-Jul-2020*.


<br>

1. **Individual Feature Analysis**

<br>

|  | Most Calls Received | Least Calls Received |
| ---------------: | :---------- | :----------- |
| **Call Category** | EMS | Fire |
| **Call Purpose** | Vehicle Accident | - |
| **Township** | Lower Merion | Lehigh County |
| **Year** | 2018 | 2017 (excluding partial years)|
| **Month** | January | September|
| **Day of Week**| Friday | Sunday  |
| **Hour of Day** | 17 (5 p.m.) | 4 (4 a.m.) |

<br>

2. **Combined Analysis**

| Particular | Highest on | Least on |
| ---------------: | :---------- | :----------- |
| **Month in Years** | March 2018 (14923) | April 2020 (8243) |
| **Day of Week in Months** | Fridays in March (10941) | Sundays in November (5196) |
| **Hour in Days** | 5 p.m on Fridays (7113)| 4 a.m on Wednesdays (1128) |
