In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #data visualization
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
# NOTE: once the loops finish, `dirname` and `filename` keep the last values
# they were bound to; the read_csv call in the next code cell relies on that.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### **Exploratory data analysis (EDA)**

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): `dirname` and `filename` are the loop variables left over from
# the directory walk in the first cell, so this reads whichever file the walk
# visited last. If the input directory ever holds more than one file, the file
# loaded depends on walk order -- consider spelling out the intended CSV path.
df = pd.read_csv(os.path.join(dirname, filename))

# info() prints its report and returns None, so it is called on its own line
# instead of being packed into a tuple (which displayed a stray `None` and
# turned the describe() table into a plain-text repr).
df.info()

# Last expression of the cell: summary statistics, shown with rich display.
df.describe()

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
#Top 5 zip codes for 911 calls
#Use groupby
# Count non-null values per zip code, rank by the `lat` count, keep the top five.
calls_by_zip = df.groupby(['zip']).count()
ranked_by_zip = calls_by_zip.sort_values(by=['lat'], ascending=False)
ranked_by_zip['lat'].head(5)


In [None]:
#Use value_counts()
# value_counts() already returns the counts in descending order.
zip_call_counts = df['zip'].value_counts()
zip_call_counts.head(5)

In [None]:
#Top 5 towns for 911 calls
# Same approach as for zip codes: count per township, rank by the `lat` count.
calls_by_twp = df.groupby(['twp']).count()
ranked_by_twp = calls_by_twp.sort_values(by=['lat'], ascending=False)
ranked_by_twp['lat'].head(5)


In [None]:
#Unique title codes
# Number of distinct non-null values in the `title` column.
df['title'].nunique(dropna=True)

### **Creating new features**
* In the `title` column there are "Reasons/Departments" specified before the title code. These are EMS, Fire, and Traffic. Use .apply() with a custom lambda expression (or the vectorized `.str` accessor) to create a new column called `reason` that contains this string value.
* Convert the `timeStamp` column from strings to DateTime objects

In [None]:
# The reason/department is the text before the first ':' in `title`.
# The vectorized .str accessor replaces a row-by-row lambda and propagates
# missing titles as NaN instead of raising AttributeError on a non-string.
df['reason'] = df['title'].str.split(':', n=1).str[0]

# Number of distinct reasons extracted.
df['reason'].nunique()

In [None]:
# Number of rows with a non-null `lat` for each reason.
df.groupby('reason')['lat'].count()

In [None]:
# Bar chart of how many calls fall under each reason.
reason_axes = df['reason'].value_counts().plot(kind='bar')
reason_axes.set_xlabel('Reason')

In [None]:
# Inspect the Python type of one timeStamp value (second row) before the
# column is converted in the next cell.
second_stamp = df['timeStamp'].iloc[1]
type(second_stamp)

In [None]:
# Parse the timeStamp column with pd.to_datetime.
df['timeStamp'] = pd.to_datetime(df['timeStamp'])

# Show one parsed value together with the fields used as features below.
time = df['timeStamp'].iloc[1]
(time, time.hour, time.month, time.day, time.dayofweek)

In [None]:
# Derive calendar features from the parsed timestamps.
# The .dt accessor computes each field for the whole column at once, which
# replaces four row-by-row lambdas. It requires `timeStamp` to already be a
# datetime column (converted in the previous cell).
timestamp_parts = df['timeStamp'].dt
df['hour'] = timestamp_parts.hour
df['month'] = timestamp_parts.month
df['day'] = timestamp_parts.day
df['dayofweek'] = timestamp_parts.dayofweek  # integer code, 0 = Monday ... 6 = Sunday


In [None]:
# Map pandas' integer weekday codes (0 = Monday) to short labels.
daymap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}

# Derive the labels from `timeStamp` rather than from the current contents of
# `dayofweek`: the cell previously overwrote its own input, so running it a
# second time raised KeyError on 'Mon'. Built this way it can be re-run safely.
df['dayofweek'] = df['timeStamp'].dt.dayofweek.map(daymap)

# Quick look at the result without dumping the whole column.
df['dayofweek'].head()

### **Data visualization**


In [None]:
import seaborn as sb

1. Plot of number of incident reports by days of the week and categories of incidents

In [None]:
# Calls per day of week, split by reason.
# `dayofweek` holds string labels, so without an explicit order the bars are
# not guaranteed to run Mon -> Sun; `daymap` (defined with the weekday
# labels) lists them in calendar order.
sb.countplot(x='dayofweek', hue='reason', data=df, order=list(daymap.values()))

2. Plot of number of incident reports by months and categories of incidents

In [None]:
# Calls per month, split by reason.
sb.countplot(data=df, x='month', hue='reason')

3. Plot of number of incident reports by months, try fitting a linear model


In [None]:
# Count rows per month and turn `month` back into a regular column so it can
# be used as the x variable.
df_month = df.groupby(['month']).count().reset_index()

# Scatter of monthly counts (non-null `lat`) with a fitted regression line.
sb.lmplot(data=df_month, x='month', y='lat')

4. Plot of # of incidents per day

In [None]:
#1 Use seaborn
# Calendar date of each call. `.dt.date` is the vectorized equivalent of
# calling .date() on every timestamp one at a time.
df['date'] = df['timeStamp'].dt.date

# One bar per distinct date.
sb.countplot(x='date', data=df)


In [None]:
#2 Use plt
# Aggregate to one row per date, with `date` as a regular column.
df_date = df.groupby(['date']).count().reset_index()

# Line chart of the daily count (non-null `lat`) on a wide canvas.
plt.figure(figsize=(20, 7))
plt.plot(df_date['date'], df_date['lat'])

5. Plot of # of incidents per day and per reason

In [None]:
#1. Use seaborn
# Bars per date, split by reason, drawn on a wide canvas.
plt.figure(figsize=(20, 8))
sb.countplot(data=df, x='date', hue='reason')

In [None]:
#2. Use plt
# Draw one wide line chart per reason showing that reason's daily call count.
reason = df['reason'].unique().tolist()

for i in reason:
    # Restrict to the current reason and aggregate to one row per date.
    df_reason = df.loc[df['reason'] == i]
    df_reason_bydate = df_reason.groupby(['date']).count().reset_index()

    # A separate figure for each reason, titled with the reason name.
    plt.figure(figsize=(20, 5))
    plt.plot(df_reason_bydate['date'], df_reason_bydate['lat'])
    plt.title(i)

6. Create a heat map and cluster map for Day vs Hour

In [None]:
#Create a new dataset for the heatmap, using unstack
# Rows: day of week; columns: hour of day; values: number of calls
# (count of non-null `lat`).
df_new = df.groupby(['dayofweek','hour']).count()['lat']
df_new = df_new.unstack(level = -1)

# groupby sorts the string day labels alphabetically (Fri, Mon, Sat, ...).
# Put the rows back in calendar order, using the label order of `daymap`,
# so the heatmap reads Mon -> Sun.
df_new = df_new.reindex(list(daymap.values()))
df_new.head()

In [None]:
# Heatmap of call counts: day of week (rows) against hour of day (columns).
sb.heatmap(data=df_new, cmap='viridis')

In [None]:
# Cluster map of the same day-of-week by hour table.
sb.clustermap(data=df_new, cmap='viridis')

7. Create a heatmap and cluster map for Month vs. Day

In [None]:
# Rows: month; columns: day of week; values: number of calls
# (count of non-null `lat`).
df_monthday = df.groupby(['month','dayofweek']).count()['lat'].unstack(level = -1)

# The unstacked day-of-week columns come out alphabetically sorted; restore
# calendar order (Mon -> Sun) using the label order of `daymap`.
df_monthday = df_monthday.reindex(columns=list(daymap.values()))
df_monthday

In [None]:
# Heatmap of call counts: month (rows) against day of week (columns).
sb.heatmap(data=df_monthday)

In [None]:
# Cluster map of the same month by day-of-week table.
sb.clustermap(data=df_monthday)