Let’s begin by installing and importing the necessary packages/libraries. This is a fundamental first step in every exploratory data analysis.

In [24]:
# Install sodapy
# Sodapy is a python client for the Socrata Open Data API.
!pip install sodapy



In [25]:
import pandas as pd
import numpy as np
from sodapy import Socrata
import seaborn as sns
from matplotlib import pyplot as plt

###### Loading the dataset from an API endpoint using sodapy

In [26]:
# Connection settings, collected in one place so they are easy to change.
SOCRATA_DOMAIN = "data.cincinnati-oh.gov"
DATASET_ID = "vnsz-a3wp"
ROW_LIMIT = 492500  # maximum number of records requested from the endpoint

# Passing None as the application token (and no username/password) creates an
# unauthenticated client, which only works with public data sets.
client = Socrata(SOCRATA_DOMAIN, None)

# sodapy returns the JSON response as a Python list of dictionaries,
# one dictionary per record.
results_cincinnati = client.get(DATASET_ID, limit=ROW_LIMIT)

# Build a pandas DataFrame from the list of record dictionaries.
results_df = pd.DataFrame.from_records(results_cincinnati)



###### Initial exploration to understand and summarize the dataset

In [27]:
# Report how many rows and columns the dataset contains.
n_rows, n_cols = results_df.shape
print("Cincinnati 911 dataset contains ", n_rows, " rows and " , n_cols, "columns.")

Cincinnati 911 dataset contains  492500  rows and  17 columns.


###### Explore the first few rows of the dataset

In [28]:
# Preview the first five records to get a feel for the columns and values.
results_df.head(n=5)

Unnamed: 0,address_x,latitude_x,longitude_x,agency,create_time_incident,disposition_text,event_number,incident_type_id,incident_type_desc,neighborhood,arrival_time_primary_unit,beat,closed_time_incident,dispatch_time_primary_unit,cfd_incident_type,cfd_incident_type_group,community_council_neighborhood
0,PRESIDENT DR,39.1534720000896,-84.560768,CFD,2020-09-01T16:45:57.000,MED: MT RESPONSE NO TRANSPORT,CFD200901000147,29A1,1ST PARTY CALLER WITH INJURY TO NOT DANGEROUS ...,VILLAGES AT ROLL HILL,2020-09-01T16:54:08.000,ST12,2020-09-01T17:04:49.000,2020-09-01T16:50:28.000,BLS,TRAFFIC / TRANSPORTATION INCIDENTS,VILLAGES AT ROLL HILL
1,HIGHLAND AV / RIDGE RD,39.1702690000896,-84.426011,CFD,2020-04-09T09:10:59.000,MED: MT RESPONSE NO TRANSPORT,CFD200409000078,29A1,1ST PARTY CALLER WITH INJURY TO NOT DANGEROUS ...,,2020-04-09T09:18:50.000,OTHER JURIS 8,2020-04-09T09:19:15.000,2020-04-09T09:11:10.000,BLS,TRAFFIC / TRANSPORTATION INCIDENTS,
2,CLIFTON AV / W MARTIN LUTHER KING JR DR,39.1350670000896,-84.519264,CFD,2020-01-10T16:18:16.000,"EMS: NO TRANSPORT,IN: INVESTIG",CFD200110000136,29A1,1ST PARTY CALLER WITH INJURY TO NOT DANGEROUS ...,CUF,2020-01-10T16:18:47.000,ST34,2020-01-10T16:33:53.000,2020-01-10T16:18:47.000,BLS,TRAFFIC / TRANSPORTATION INCIDENTS,CORRYVILLE - HEIGHTS
3,VINE ST,39.1039320000897,-84.514527,CFD,2019-12-27T17:41:30.000,MEDT: MEDIC TRANSPORT,CFD191227000180,29A1,1ST PARTY CALLER WITH INJURY TO NOT DANGEROUS ...,DOWNTOWN,2019-12-27T17:45:12.000,ST03,2019-12-27T18:24:43.000,2019-12-27T17:42:14.000,BLS,TRAFFIC / TRANSPORTATION INCIDENTS,DOWNTOWN
4,N I75 TO EXIT 1B,39.0962690000897,-84.521328,CFD,2019-10-21T19:01:00.000,MEDT: MEDIC TRANSPORT,CFD191021000218,29A1,1ST PARTY CALLER WITH INJURY TO NOT DANGEROUS ...,DOWNTOWN,2019-10-21T19:06:17.000,ST14,2019-10-21T19:48:01.000,2019-10-21T19:01:42.000,BLS,TRAFFIC / TRANSPORTATION INCIDENTS,DOWNTOWN


###### Look at the date range of the dataset

In [29]:
# Earliest and latest incident creation timestamps in the dataset.
# NOTE(review): the values appear to be ISO-8601 strings rather than datetimes,
# so min()/max() compare them as text; for this fixed-width format that should
# coincide with chronological order -- confirm before relying on it.
incident_created = results_df["create_time_incident"]
print(incident_created.min())
print(incident_created.max())

2015-01-01T00:02:52.000
2020-11-13T00:07:34.000


###### Statistical description of numerical data

In [30]:
# Summary statistics for the DataFrame via describe().
# NOTE(review): the output below reports count/unique/top/freq for every
# column (including latitude_x / longitude_x), which is what describe()
# returns when a frame has no numeric columns. No mean/std/quantiles are
# produced here; the coordinate columns would have to be converted to a
# numeric dtype first to get a genuine numerical description.
results_df.describe()

Unnamed: 0,address_x,latitude_x,longitude_x,agency,create_time_incident,disposition_text,event_number,incident_type_id,incident_type_desc,neighborhood,arrival_time_primary_unit,beat,closed_time_incident,dispatch_time_primary_unit,cfd_incident_type,cfd_incident_type_group,community_council_neighborhood
count,492412,468959.0,468959.0,492500,492500,489456,492500,490897,333958,492500,419665,491972,487676,446317,479236,479227,492500
unique,16737,302341.0,273527.0,2,491676,221,492498,695,623,51,419223,554,486923,446261,6,107,72
top,READING RD,39.1060190000598,-84.52617,CFD,2016-11-06T21:19:33.000,MEDT: MEDIC TRANSPORT,CFD180110000001,FALARM,DIFFICULTY SPEAKING BETWEEN BREATHS,WESTWOOD,2017-12-31T18:52:39.000,ST35,2017-02-07T18:09:30.000,2019-07-12T09:14:32.000,ALS,SICK PERSON,WESTWOOD
freq,12161,19.0,20.0,326529,3,133562,2,31223,18060,37581,4,24052,5,2,200247,48503,38038


###### Statistical description of categorical data

In [31]:
# Keep only the object-dtype (string/categorical) columns.
object_columns = results_df.select_dtypes(include="object")

# Names of those columns, kept under the original variable name.
categorical = object_columns.columns

# count / unique / top / freq for each categorical column
object_columns.describe()

Unnamed: 0,address_x,latitude_x,longitude_x,agency,create_time_incident,disposition_text,event_number,incident_type_id,incident_type_desc,neighborhood,arrival_time_primary_unit,beat,closed_time_incident,dispatch_time_primary_unit,cfd_incident_type,cfd_incident_type_group,community_council_neighborhood
count,492412,468959.0,468959.0,492500,492500,489456,492500,490897,333958,492500,419665,491972,487676,446317,479236,479227,492500
unique,16737,302341.0,273527.0,2,491676,221,492498,695,623,51,419223,554,486923,446261,6,107,72
top,READING RD,39.1060190000598,-84.52617,CFD,2016-11-06T21:19:33.000,MEDT: MEDIC TRANSPORT,CFD180110000001,FALARM,DIFFICULTY SPEAKING BETWEEN BREATHS,WESTWOOD,2017-12-31T18:52:39.000,ST35,2017-02-07T18:09:30.000,2019-07-12T09:14:32.000,ALS,SICK PERSON,WESTWOOD
freq,12161,19.0,20.0,326529,3,133562,2,31223,18060,37581,4,24052,5,2,200247,48503,38038


###### Checking whether the dataset contains null values

In [32]:
# Build the null mask once and reuse it for both checks below.
null_mask = results_df.isnull()

# Is any null value present at all?
# Printed explicitly: a notebook cell only displays its last expression, so a
# bare expression on this line would be evaluated and silently discarded.
print("Any null values present:", null_mask.values.any())

# Total number of null values in each column
null_mask.sum()

address_x                             88
latitude_x                         23541
longitude_x                        23541
agency                                 0
create_time_incident                   0
disposition_text                    3044
event_number                           0
incident_type_id                    1603
incident_type_desc                158542
neighborhood                           0
arrival_time_primary_unit          72835
beat                                 528
closed_time_incident                4824
dispatch_time_primary_unit         46183
cfd_incident_type                  13264
cfd_incident_type_group            13273
community_council_neighborhood         0
dtype: int64

###### Understanding a few target columns

In [33]:
# Distinct incident type codes, in order of first appearance.
results_df["incident_type_id"].unique()

array(['29A1', '24A1', '24C2', '24C1', '24D4', '911CALL', '1A1', '24C3',
       '1D1', '1D0', '1C3', '1C4', '1C6', '1C2', '1C5', '1C0', '1C1',
       '13C2', '13C2C', '26C2', '6C1', '13C3', '21D5', '10C1', '23C2',
       '18C2', '6C1A', '6C1E', '6C1O', '28C2', '23C2I', '19C2', '15D8',
       '23C2V', '13C3C', '11D1', '36C2A', '36C2S', '36C2B', '36C2C',
       '36C1A', '36C1S', '23C6V', '23C6', '12D3', 'AIRF', '20A1', '13A1',
       '15C1', '14B1', '14A1', '31C1', '14C1', '8C1', '8B1', '2C2', '2E1',
       '2D1', '2D0', '2D4', '2D3', '2C1', '2D2', '2A1', '2A2', '2B1',
       '26C1', '3D7', '3D4', '3D5', '3A2', '3D2', '3A1', '3B1', '3B2',
       '3A3', '3D1', '3B3', '23C3I', '23C3', '27D1G', '21D1', '17D2',
       '27D1S', '29D6', '30D1', '8D1', '4D1', '14E1', '1D2', '5D2',
       '4D3A', '4D4A', '4A2A', '4D2A', '4A1A', '4B1A', '4B2A', '4D1A',
       '4B3A', 'ASSLTF', '3D9', 'SUICF', '12C7', 'BLDGF', 'ACCIF', '26C4',
       '24D6', '24D7', '24C4', '5C3', '5C2', '5A2', '5A1', '5D1', '5C1'

In [34]:
# Number of records per cfd_incident_type value, most frequent first.
# value_counts() leaves out missing values by default.
cfd_incident_type = results_df["cfd_incident_type"]
cfd_incident_type.value_counts()

ALS     200247
BLS     168110
FIRE     75787
OTHE     33285
MEDI       960
NOT        847
Name: cfd_incident_type, dtype: int64

In [35]:
# Number of records per neighborhood, most frequent first.
# Note: "N/A" shows up as an ordinary string category in the counts, while
# isnull() reports zero nulls for this column -- so records without a known
# neighborhood are not detected by the null check.
neighborhood = results_df["neighborhood"]
neighborhood.value_counts()

WESTWOOD                          37581
DOWNTOWN                          30947
AVONDALE                          28174
EAST PRICE HILL                   27246
N/A                               22989
WEST PRICE HILL                   21337
OVER-THE-RHINE                    20180
WEST END                          19188
WALNUT HILLS                      18451
COLLEGE HILL                      18169
MADISONVILLE                      15132
CUF                               15025
ROSELAWN                          13374
OAKLEY                            11276
WINTON HILLS                      10637
MT. AIRY                          10362
EVANSTON                          10310
CORRYVILLE                        10120
BOND HILL                          9758
MT. WASHINGTON                     9637
HARTWELL                           9216
CLIFTON                            8536
NORTHSIDE                          8073
QUEENSGATE                         7704
MT. AUBURN                         7630
