In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    # Print the full path of every file found below the input directory.
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the 911 call CSV from the Kaggle input directory into `df`
# and preview its first rows.
df= pd.read_csv('/kaggle/input/montcoalert/911.csv')
df.head()

**Data Analysis**

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Print each column's dtype and non-null count
df.info()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
df.duplicated().sum() #count the rows that exactly repeat an earlier row

In [None]:
# Remove repeated rows while keeping the first occurrence of each one.
# keep=False would discard *every* copy of a duplicated row, which deletes
# genuine calls from the data instead of only the redundant repeats.
df.drop_duplicates(keep='first', inplace=True)
df.duplicated().sum()  # sanity check: no duplicated rows should remain

In [None]:
# Summary statistics for the numeric columns
df.describe()

**Data Quality Issues**

* The columns are not properly named: one cannot infer from the column names what they represent, so the columns are given meaningful names.
* Zip code is stored as a float; it is changed to a string because we don't need to perform any mathematical operations on zip codes.
* Converting timeStamp column to datetime object

**New features to be added in the dataset**

* Extracting reason of emergency call from the title and making a new column named "Reason".
* Extracting hour,month,day,year from time and creating respective columns.

In [None]:
#creating a copy of original dataset
# (`df_clean` is an independent copy, so changes to it do not alter `df`)
df_clean=df.copy()
df_clean.head()

In [None]:
# Replace the terse source column names with self-explanatory ones,
# then show the resulting column index.
df_clean = df_clean.rename(
    columns={
        "lat": "latitude",
        "lng": "longitude",
        "desc": "description",
        "twp": "township",
        "addr": "address",
        "timeStamp": "time",
    }
)
df_clean.columns

In [None]:
# Store zip codes as text (they are labels, not quantities) and parse the
# timestamp strings into real datetimes.
# NOTE: astype(str) renders a missing zip as the literal text 'nan'.
df_clean['zip'] = df_clean['zip'].astype(str)
df_clean['time'] = pd.to_datetime(df_clean['time'])
# Only the last expression of a cell is rendered, so the cell ends with the
# dtype summary that confirms both conversions.
df_clean.info()

In [None]:
#removing the decimal point from zip column
def change_zip(x):
    """Return the five-character zip code held in the string ``x``.

    ``x`` is expected to be the text form of a float zip code such as
    ``'19401.0'``.  The fractional part is dropped and purely numeric codes
    are left-padded with zeros to five digits, so a code that lost its
    leading zero in the float conversion (``'8502.0'``) becomes ``'08502'``
    instead of the ``'8502.'`` that a blind five-character slice yields.
    Any other text (for example ``'nan'``) is truncated to its first five
    characters, as before.
    """
    whole = x.split('.', 1)[0]
    if whole.isdigit():
        # Pad short codes; the slice keeps over-long digit runs at five chars.
        return whole.zfill(5)[:5]
    return x[0:5]
# Pass every zip value through change_zip and preview the result.
df_clean['zip'] = df_clean['zip'].map(change_zip)
df_clean.head()

In [None]:
#extracting reason from title
def make_reason(x):
    """Return the part of the title ``x`` that precedes its first ':'.

    When ``x`` contains no colon the whole string is returned.
    """
    category, _sep, _detail = x.partition(':')
    return category
# Derive the 'Reason' column from each call title and preview the result.
df_clean['Reason'] = df_clean['title'].map(make_reason)
df_clean.head()

In [None]:
#extracting month,day,year,hour from timestamp column
# The vectorised .dt accessor replaces the per-row apply(lambda ...) calls.
# day_name() always returns English day names, whereas strftime('%A')
# follows the process locale.
df_clean['Hour'] = df_clean['time'].dt.hour
df_clean['Month'] = df_clean['time'].dt.month
df_clean['Day of Week'] = df_clean['time'].dt.day_name()
df_clean['Year'] = df_clean['time'].dt.year
df_clean['Date'] = df_clean['time'].dt.date
df_clean.head()

**Data Analysis and Visualization**

In [None]:
# Number of non-null 'e' entries for each Reason value
df_clean['e'].groupby(df_clean['Reason']).count()

In [None]:
# Distinct values found in the township column
df_clean['township'].unique()

In [None]:
# Distinct values found in the zip column
df_clean['zip'].unique()

In [None]:
# Number of non-null 'e' entries for each (Date, Reason) pair
df_clean['e'].groupby([df_clean['Date'], df_clean['Reason']]).count()

**Distribution of Latitude**

In [None]:
# Histogram of call latitudes, limited to values strictly between 39 and 41.
plt.figure(dpi=80, figsize=(8, 4))
lat = df_clean['latitude'].loc[lambda s: (s > 39) & (s < 41)]
plt.hist(lat)
plt.title("Distribution of latitude")
plt.ylabel("COUNT")
plt.xlabel("LATITUDE")
plt.show()

**Distribution of longitude**

In [None]:
# Histogram of call longitudes, limited to values strictly between -76 and -74.
plt.figure(figsize=(8,4), dpi=80)
long=df_clean[(df_clean['longitude']>-76) & (df_clean['longitude']<-74)]['longitude']
# Draw the longitude selection computed above; passing the latitude series
# here would produce a latitude histogram under a longitude title.
plt.hist(long);
plt.xlabel("LONGITUDE")
plt.ylabel("COUNT");
plt.title("Distribution of longitude");

**Reason for 911 calls**

In [None]:
# Bar chart of the number of calls for each Reason.
plt.figure(dpi=80, figsize=(8, 4))
sns.countplot(data=df_clean, x='Reason').set_title("Reason for 911 Calls");

**Number of 911 calls each day**

In [None]:
# Calls per weekday, split by Reason, with the bars in calendar order.
plt.figure(dpi=80, figsize=(8, 4))
order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
sns.countplot(data=df_clean, x='Day of Week', hue='Reason', order=order)
plt.title("Number of 911 Calls Per Day")
plt.legend(loc='lower left');

**Number of 911 calls per month**

In [None]:
# Calls per month number, split by Reason.
plt.figure(dpi=80, figsize=(8, 4))
sns.countplot(data=df_clean, x='Month', hue='Reason')
plt.title("Number of 911 Calls per month")
plt.legend(loc='lower left');

**Number of 911 calls per year**

In [None]:
# Calls per year, split by Reason.
plt.figure(dpi=80, figsize=(8, 4))
sns.countplot(data=df_clean, x='Year', hue='Reason')
plt.title("Number of 911 Calls per year")
plt.legend(loc='upper left');

**911 calls per month, day of week and hour**

In [None]:
# Line plots of call volume by month, by weekday and by hour.  Each series is
# the per-group count of non-null 'e' entries.

#month
plt.figure(figsize=(8,4), dpi=80)
calls_by_month = df_clean.groupby('Month')['e'].count()
plt.plot(calls_by_month)
plt.xlabel("Month");  # the x axis carries the month number, not a count
plt.ylabel("Count");
plt.title("911 Calls per month");

#day
plt.figure(figsize=(8,4), dpi=80)
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
            'Friday', 'Saturday', 'Sunday']
# groupby sorts the day names alphabetically; reindex so the line runs
# Monday -> Sunday instead of Friday, Monday, Saturday, ...
calls_by_day = df_clean.groupby('Day of Week')['e'].count().reindex(weekdays)
plt.plot(calls_by_day)
plt.xlabel("Day");
plt.ylabel("Count");
plt.title("911 Calls per day");

#hour
plt.figure(figsize=(8,4), dpi=80)
calls_by_hour = df_clean.groupby('Hour')['e'].count()
plt.plot(calls_by_hour)
plt.xlabel("Hour of the Day");
plt.ylabel("Count");
plt.title("911 Calls per hour");

In [None]:
# Heatmap of call counts with weekdays as rows and hours as columns.
plt.figure(figsize=(8,4), dpi=80)
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
            'Friday', 'Saturday', 'Sunday']
dayHour = (
    df_clean.groupby(['Day of Week', 'Hour'])['Reason']
    .count()
    .unstack()
    # groupby orders the day names alphabetically; put the rows in calendar order
    .reindex(weekdays)
)
sns.heatmap(dayHour)
plt.title("911 Calls by day of week and hour");

**911 calls per township**

In [None]:
# Horizontal bar chart of calls per township, split by Reason, drawn on a
# tall figure so each township gets its own row.
plt.figure(dpi=200, figsize=(20, 50))
sns.countplot(data=df_clean, y='township', hue='Reason');