In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In this notebook, I will carry out the following tasks:

**Data Cleaning**
1. Remove useless data
2. Change the data type of timeStamp from object to datetime

**Data analysis to answer several questions that may reveal insights:**
1. What are the top 5 zip codes for the number of 911 calls?
1. What are the top 5 townships(twp) for the number of 911 calls?
1. Which month has the highest 911 calls?
1. Which day of the week has the highest 911 calls?
1. Which hour has the highest 911 calls?
1. What is the most common reason for a 911 call?
1. The total number of each reason for 911 calls for each day of a week in all years.
1. The number of 911 calls for each reason yearly
1. The trends of each reason in all years
1. Find the trends of the total number of 911 calls by year.
1. Does Season have an association with the number of 911 calls?
1. Does the hour of the day have any association with the number of 911 calls?

# Data Cleaning

**1. Remove meaningless data**

In [None]:
# Path of the 911 calls CSV, relative to the notebook's working directory.
csv_path = '../input/montcoalert/911.csv'
df = pd.read_csv(csv_path)

Before analysing or cleaning the data, let's first take a look at its summary information.

In [None]:
# Summarise columns, non-null counts and dtypes to spot missing values and wrong types.
df.info()

In [None]:
# Preview the first five rows (five is the default row count for head()).
df.head()

According to the results above, some variables have null values. In addition, the data type of the variable timeStamp is object; to make timeStamp easier to use and analyse, I will convert it to a datetime data type.

But before that, according to the details of the data set provided by its author, the variable e is actually a useless variable, so I will delete it first.

In [None]:
# Drop column 'e', which the notebook text describes as useless.
# The membership check keeps this cell safe to re-run: a bare `del df['e']`
# raises KeyError once the column is already gone.
if 'e' in df.columns:
    del df['e']

**2. Change the data type of timeStamp from object to datetime**

In [None]:
from datetime import datetime

# Parse the timeStamp column with pandas, store it back, then re-check the dtypes.
parsed_timestamps = pd.to_datetime(df['timeStamp'])
df['timeStamp'] = parsed_timestamps
df.info()

# Data Analysis

**1. What are the top 5 zip codes for the number of 911 calls?**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Count calls per zip code (sorted most frequent first) and show the top five.
zip_counts = df['zip'].value_counts()
zip_counts.head(5)

According to the result above, these five zip codes have the highest numbers of 911 calls.

**2. What are the top 5 townships(twp) for number of 911 calls?**

In [None]:
# Count calls per township (sorted most frequent first) and show the top five.
township_counts = df['twp'].value_counts()
township_counts.head(5)

As we can see, LOWER MERION, ABINGTON, NORRISTOWN, UPPER MERION and CHELTENHAM are the top 5 townships in the number of 911 calls.

**3. Which month has the highest 911 calls?**

In [None]:
# Extract the calendar month from the parsed timestamps as a new column.
# The vectorised .dt accessor replaces a per-row Python lambda; it requires
# timeStamp to already be a datetime column (converted earlier in the notebook).
df['month'] = df['timeStamp'].dt.month
df['month'].head()

# Retrieve the month from timeStamp as a new column

In [None]:
# Calls per month, most frequent first; show the leading rows.
month_counts = df['month'].value_counts()
month_counts.head()

In [None]:
# Name the column via x= and pass the frame via data=: newer seaborn treats the
# first positional argument as `data`, not `x`, so a bare Series no longer
# produces the intended per-month bars.
sns.countplot(x='month', data=df)

According to the results above, January has the highest number of 911 calls in total. June and March are also very close to the highest number.

**4. Which day of the week has the highest 911 calls?**

In [None]:
# Extract the weekday number from the parsed timestamps as a new column.
# pandas numbers weekdays from Monday = 0 to Sunday = 6.
# The vectorised .dt accessor replaces a per-row Python lambda.
df['day'] = df['timeStamp'].dt.weekday
df['day'].head()

In [None]:
# pandas numbers weekdays from Monday = 0 to Sunday = 6, so the labels must
# start at 'Mon'; starting the table at 'Sun' shifts every label by one day.
dayOfWeek = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thur', 4:'Fri', 5:'Sat', 6:'Sun'}
# Map from the timestamp rather than from df['day'] so that re-running this cell
# does not try to map already-converted labels (which would yield NaN).
df['day'] = df['timeStamp'].dt.weekday.map(dayOfWeek)

In [None]:
# Name the column via x= and pass the frame via data=: newer seaborn treats the
# first positional argument as `data`, not `x`.
sns.countplot(x='day', data=df)

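Thursday has the highest number of 911 calls in total. (When reading the plot, keep in mind that `weekday()` numbers the days from Monday = 0 to Sunday = 6, so the day labels must follow that order.)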

**5. Which hour has the highest 911 calls?**

In [None]:
# Extract the hour of day from the parsed timestamps as a new column.
# The vectorised .dt accessor replaces a per-row Python lambda.
df['hour'] = df['timeStamp'].dt.hour
df['hour'].head()

In [None]:
# Per-hour non-null counts for every column (kept as a notebook variable;
# it is not used by the plot below).
hourOfDay = df.groupby('hour').count()
# Name the column via x= and pass the frame via data=: newer seaborn treats the
# first positional argument as `data`, not `x`.
sns.countplot(x='hour', data=df)

According to the results above, 5 pm (hour 17) has the highest number of 911 calls.

**6. What is the most common reason for a 911 call?**

In [None]:
# The reason is the part of the title before the first ':'.
# The .str accessor is vectorised and leaves missing titles as NaN instead of
# raising, which the per-row lambda would do on a non-string value.
df['Reason'] = df['title'].str.split(':').str[0]
df['Reason'].head(5)

In [None]:
# Name the column via x= and pass the frame via data=: newer seaborn treats the
# first positional argument as `data`, not `x`.
sns.countplot(x='Reason', data=df)

According to the countplot above, EMS is the most common reason for 911 calls.

**7. The total number of each reason for 911 calls for each day of a week in all years.**

In [None]:
# Number of rows with a non-null 'lat' for each (day, Reason) pair.
calls_by_day_reason = df.groupby(['day', 'Reason'])['lat'].count()
calls_by_day_reason

In [None]:
# One group of bars per day, split by call reason.
sns.countplot(
    data=df,
    x='day',
    hue='Reason',
    palette='viridis',
)

**8. The number of 911 calls for each reason yearly**

In [None]:
# Extract the calendar year from the parsed timestamps as a new column.
# The vectorised .dt accessor replaces a per-row Python lambda.
df['Year'] = df['timeStamp'].dt.year
df['Year'].head()

In [None]:
# Number of rows with a non-null 'lat' for each (Year, Reason) pair.
calls_by_year_reason = df.groupby(['Year', 'Reason'])['lat'].count()
calls_by_year_reason

In [None]:
# One group of bars per reason, split by year, drawn on an explicit 10x7 figure.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='Reason', hue='Year', palette='viridis', ax=ax)

**9. The trends of each reason in all years**

In [None]:
# Yearly count of EMS calls (rows with a non-null 'lat'), drawn as a line.
ems_calls = df.loc[df['Reason'] == 'EMS']
ems_calls.groupby('Year')['lat'].count().plot()
plt.title('Trends of EMS by Year')

In [None]:
# Yearly count of Fire calls (rows with a non-null 'lat'), drawn as a line.
fire_calls = df.loc[df['Reason'] == 'Fire']
fire_calls.groupby('Year')['lat'].count().plot()
plt.title('Trends of Fire by Year')

In [None]:
# Yearly count of Traffic calls (rows with a non-null 'lat'), drawn as a line.
traffic_calls = df.loc[df['Reason'] == 'Traffic']
traffic_calls.groupby('Year')['lat'].count().plot()
plt.title('Trends of Traffic by Year')

**10. Find the trends of the total number of 911 calls by year.**

In [None]:
# Per-year non-null counts for every column; the 'lat' count is used as the call total.
year = df.groupby('Year').count()
calls_per_year = year.reset_index()
sns.lineplot(x='Year', y='lat', data=calls_per_year)
plt.title('The trends of number of 911 calls by year')


**11. Does season have an association with the number of 911 calls?**

In [None]:
# Month number -> season name, three consecutive months per season.
season = {
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'autumn', 10: 'autumn', 11: 'autumn',
    12: 'winter', 1: 'winter', 2: 'winter',
}

# Label every call with the season of its month.
df['season'] = df['month'].map(season)

In [None]:
# Season x Year table of call counts (rows with a non-null 'lat').
yearOfSeason = df.groupby(['season', 'Year'])['lat'].count().unstack()
# Drop any year column that is missing a season, so the heatmap only compares
# years that have a count for every season.
complete_years = yearOfSeason.dropna(axis=1)
sns.heatmap(complete_years, cmap='coolwarm')

According to the heatmap above, we can't see any pattern indicating an association between the season and the number of 911 calls.

**12. Does the hour of the day have any association with the number of 911 calls?**

In [None]:
# Day x hour table of call counts (rows with a non-null 'Reason').
dayHour = df.groupby(['day', 'hour'])['Reason'].count().unstack()
dayHour.head()

In [None]:
# Heatmap of the day x hour call counts on an explicit 10x7 figure.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(data=dayHour, cmap='coolwarm', ax=ax)

According to the heatmap above, there is a pattern that indicates an association between the hour of the day and the number of 911 calls. For example, there are only a few calls at night and in the early morning, but the number of calls rises steadily into the afternoon.