In [None]:
##Import relevant libraries necessary for Exploratory Data Analysis

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime


In [None]:
%matplotlib inline

In [None]:
## Load the 911 calls CSV, parsing the `timeStamp` column into datetimes on read
data = pd.read_csv(
    "../input/montcoalert/911.csv",
    parse_dates=["timeStamp"],
)

The data contains the following features:

* lat: String variable, Latitude
* lng: String variable, Longitude
* desc: String variable, Description of the Emergency Call
* zip: String variable, Zip Code
* title: String variable, Title
* timeStamp: String variable, YYYY-MM-DD HH:MM:SS (parsed into a `datetime` when the file is loaded)
* twp: String variable, Township
* addr: String variable, Address
* e: String variable, Always 1

In [None]:
# Print each column's dtype and non-null count to spot columns with missing values
data.info()

The summary printed by *pandas* shows that the `twp` and `zip` columns contain missing values. Next, the rows containing missing values will be dropped, along with the `e` column.

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# Build a cleaned frame: drop every row with a missing value, then drop the `e` column.
# `dropna` already returns a new frame, so `data` itself is left untouched.
clean_data = (
    data
    .dropna(axis="index")
    .drop(columns="e")
)
clean_data.head()

In [None]:
# (rows, columns) remaining after the cleaning step
clean_data.shape

Next, we'll answer a few questions, such as:
- Which townships make emergency calls most frequently?
- Why do people call 911 most frequently?

In [None]:
# Count calls per township once, then report both the township names and their counts.
# (Previously only the counts were printed, so the townships themselves were never identified.)
twp_counts = clean_data["twp"].value_counts()
print(
    "{} made the most 911 emergency calls ({}), while {} made the fewest ({})".format(
        twp_counts.idxmax(), twp_counts.max(), twp_counts.idxmin(), twp_counts.min()
    )
)



In [None]:
# Number of distinct values in the `title` column
clean_data["title"].nunique() 

To explore this data further, a new feature will be derived from the `title` feature. The title has many unique values because it encodes expanded categories; the new feature will hold the compressed (top-level) categories.

In [None]:
## Derive the broad call category: the part of `title` before the first ":"
# The vectorized string accessor replaces the per-row lambda.
clean_data["reason"] = clean_data["title"].str.split(":", n=1).str[0]

clean_data.head()

In [None]:
# Number of distinct call reasons.
# NOTE(review): 144 is presumably the unique-title count from the earlier cell — confirm against its output.
clean_data["reason"].nunique() #A huge difference from 144

In [None]:
# How often is each reason given for calling 911? (sorted from most to least frequent)
clean_data["reason"].value_counts()

In [None]:
## Ignore warnings from seaborn
import warnings

warnings.filterwarnings(action = "ignore", category = FutureWarning,module = "seaborn")


In [None]:
# Pass the column by keyword: seaborn's first positional parameter is `data` in
# recent releases, so a positional Series is deprecated / misinterpreted there.
sns.countplot(x="reason", data=clean_data, palette="winter")

Next, we'll explore how the day of the week affects the number of calls made. First we have to create new columns; fortunately, the dataset's `timeStamp` feature was already parsed into `datetime` format.

In [None]:
# Take the timestamp of the first row; pandas Timestamps provide helpers
# that return the weekday and month names directly.
time_0 = clean_data["timeStamp"].iat[0]
first_day_name = time_0.day_name()
first_month_name = time_0.month_name()
print(f"This first day in the data's entry is {first_day_name}")
print(f"This first month in the data's entry is {first_month_name}")

In [None]:
# Derive calendar features from the parsed timestamps with the vectorized `.dt`
# accessor instead of a per-row lambda for each column.
call_times = clean_data["timeStamp"]
clean_data["date"] = call_times.dt.date                 # datetime.date objects
clean_data["day"] = call_times.dt.day_name().str[:3]    # three-letter weekday, e.g. "Mon"
clean_data["month"] = call_times.dt.month
clean_data["hour"] = call_times.dt.hour
clean_data["year"] = call_times.dt.year

clean_data.head()

In [None]:
# How many emergency calls are made on each day of the week, split by reason?
plt.figure(figsize=(10,7))
# `x` must be given by keyword: a positional Series together with `data=` raises
# "multiple values for argument 'data'" on seaborn >= 0.12.
# `order` puts the weekdays in calendar order instead of order of first appearance.
sns.countplot(x="day", data=clean_data, hue="reason", palette="rocket",
              order=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
plt.legend(bbox_to_anchor = (1.15,1), loc = 0)

In [None]:
# How many emergency calls are made in each month, split by reason?
plt.figure(figsize=(10,7))
# `x` must be given by keyword: a positional Series together with `data=` raises
# "multiple values for argument 'data'" on seaborn >= 0.12.
sns.countplot(x="month", data=clean_data, hue="reason", palette="rocket")
plt.legend(bbox_to_anchor = (1.15,1), loc = 0)

In [None]:
# For every calendar date, count the non-null entries in each remaining column
calls_per_date = clean_data.groupby(by="date")
byDate = calls_per_date.count()
byDate.head()

In [None]:
# NOTE(review): this repeats the preview already shown where `byDate` is built; the cell is redundant.
byDate.head()

In [None]:
def date_tick_gen(startDate: tuple, endDate: tuple, num_tick:int):
    """
    Generate evenly spaced dates between two dates, for use as axis ticks.

    startDate: Tuple. Input start date with tuple of 3 integers in format Y, M, d
    endDate: Tuple. Input end date with tuple of 3 integers in format Y, M, d
    num_tick: Int. Approximate number of ticks to generate (must be >= 1)

    Returns a list of ``datetime.date`` objects starting at ``startDate`` and
    spaced ``int(days_between / num_tick)`` days apart. ``endDate`` itself is
    excluded, so identical start and end dates give an empty list. When the span
    is not an exact multiple of the step, more than ``num_tick`` ticks can be
    returned.

    Raises ValueError if ``num_tick`` is smaller than 1.
    """
    if num_tick < 1:
        raise ValueError("num_tick must be a positive integer")

    start = datetime.date(*startDate)
    end = datetime.date(*endDate)
    diff = (end - start).days

    step = int(diff / num_tick)
    if step == 0:
        # The span is shorter than num_tick days: fall back to a one-day step
        # (in the direction of travel) because range() rejects a zero step.
        step = 1 if diff >= 0 else -1

    return [start + datetime.timedelta(days=offset) for offset in range(0, diff, step)]

# Axis limits; these dates were chosen so the chart won't be truncated
start = datetime.date(2015,11,20)
end = datetime.date(2020,8,10)

# Build the ticks from the same limits so the two can never drift apart.
# Tweak the num_tick argument (15) to see its effect on the plot.
ticks = date_tick_gen((start.year, start.month, start.day),
                      (end.year, end.month, end.day), 15)

# Use explicit figure/axes objects rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(17,7))

# After dropping rows with missing values, the per-date `zip` count equals the number of calls
byDate["zip"].plot(ax=ax)

ax.set_xlim(start, end)
ax.set_xticks(ticks)
ax.set(title="911 calls per day", xlabel="Date", ylabel="Number of calls")

fig.tight_layout()

There was a surge in emergency calls between 25 Feb 2018 and 29 Mar 2018. What event(s) could have caused this?

In [None]:
# Plot the daily number of Traffic-related emergency calls to see when they peaked
fig, ax = plt.subplots(figsize=(17,7))

# Keep only the Traffic calls, then count them per date
byDateTraffic = clean_data[clean_data["reason"] == "Traffic"].groupby("date").count()
byDateTraffic["zip"].plot(ax=ax)

# `start`, `end` and `ticks` are the axis limits and tick dates defined for the overall daily plot
ax.set_xlim(start, end)
ax.set_xticks(ticks)
ax.set(title="Traffic-related 911 calls per day", xlabel="Date", ylabel="Number of calls")

fig.tight_layout()

Looking at these, a question arises: on which days of the week, in which months, are emergency calls most likely to be made?

To answer this, we count the calls for every combination of month and day of the week.

In [None]:
# Count calls for every (month, weekday) pair and pivot weekdays into columns.
# `unstack` leaves the weekday columns in alphabetical order (Fri, Mon, Sat, ...),
# so reindex them into calendar order to make the table and heatmap readable.
weekday_order = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
byMonthDay = (
    clean_data.groupby(["month", "day"])["zip"]
    .count()
    .unstack("day")
    .reindex(columns=weekday_order)
)
byMonthDay.head()

In [None]:
# Heatmap of call counts: one row per month, one column per weekday
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(byMonthDay, cmap="coolwarm", ax=ax)

From the heatmap, we can deduce that Fridays in March are associated with a high number of emergency calls.

## Geo Spatial Plotting

In [None]:
#Import necessary libraries
import geopandas as gpd
import folium
from folium import Marker
from folium.plugins import MarkerCluster

In [None]:
warnings.filterwarnings(action = "ignore", category = FutureWarning,module = "pyproj")

# Columns carried over into the geospatial frame
geo_columns = ['lat', 'lng', 'desc', 'year','twp', 'date','addr','reason']

# Build point geometries from longitude (x) and latitude (y).
# The CRS is passed as an authority string; the old {"init": "epsg:4326"}
# dictionary form is deprecated by pyproj.
geo_data = gpd.GeoDataFrame(
    data=clean_data[geo_columns],
    geometry=gpd.points_from_xy(clean_data["lng"], clean_data["lat"]),
    crs="EPSG:4326",
)

geo_data.head()

In [None]:
# Overview map of the USA
usa_center = [37.6, -95.665]
map_1 = folium.Map(
    location=usa_center,
    tiles="cartodbpositron",
    zoom_start=4,
)
map_1

In [None]:
# Show the first 100 calls recorded in 2020 as clustered markers
sub_gData = geo_data[geo_data["year"] == 2020]
sample_calls = sub_gData.iloc[:100]

# Centre the map on the plotted calls. The previous hard-coded centre was the
# middle of the USA, where none of the markers are visible at zoom level 10.
if sample_calls.empty:
    map_center = [37.6, -95.665]  # nothing to plot: fall back to the USA overview centre
else:
    map_center = [sample_calls["lat"].mean(), sample_calls["lng"].mean()]

map_2 = folium.Map(location=map_center, tiles="cartodbpositron", zoom_start = 10)

mc = MarkerCluster()

for idx, row in sample_calls.iterrows():
    popup = row["addr"] +" " + row["twp"].title()   # address plus title-cased township
    tooltip = row["reason"]
    mc.add_child(Marker([row["lat"],row["lng"]], popup = popup,tooltip=tooltip))

# Attach the cluster to map_2; it was previously added to map_1, which left map_2 empty.
# add_child returns the map, so the notebook displays it as the cell output.
map_2.add_child(mc)