In [1]:
# import requisite module
import pandas as pd 
import json
import seaborn as sns
import numpy as np

In [2]:
# Fetch the dataset from the SODA JSON endpoint; the `$limit` query
# parameter raises the number of rows returned by the request.
SODA_URL = 'https://data.sfgov.org/resource/fjjd-jecq.json?$limit=2000000'
json_data = pd.read_json(SODA_URL)

The SODA API currently returns at most 1000 rows per query by default. To retrieve more than 1000 rows, I added the `$limit=` parameter to the JSON URL, which sets the maximum number of rows to fetch from the dataset.

In [8]:
# Hold the downloaded records in a DataFrame named for what they contain
calls_for_service = pd.DataFrame(data=json_data)

In [9]:
# Dimensions of the loaded frame as (rows, columns)
calls_for_service.shape

(2000000, 14)

In [17]:
# Preview the first five rows to see the columns and typical values
calls_for_service.head()

Unnamed: 0,address,address_type,agency_id,call_date,call_dttm,call_time,city,crime_id,disposition,offense_date,original_crimetype_name,report_date,state
0,1500 Block Of Pine St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20T11:50:00.000,2018-11-03 11:50:00,San Francisco,162641608,REP,2016-09-20T00:00:00.000,Complaint Unkn,2016-09-20T00:00:00.000,CA
1,100 Block Of Erie St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20T12:36:00.000,2018-11-03 12:36:00,San Francisco,162641785,UTL,2016-09-20T00:00:00.000,909,2016-09-20T00:00:00.000,CA
2,900 Block Of Market St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20T14:01:00.000,2018-11-03 14:01:00,San Francisco,162642180,HAN,2016-09-20T00:00:00.000,Burglary,2016-09-20T00:00:00.000,CA
3,1900 Block Of Palou Av,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20T14:30:00.000,2018-11-03 14:30:00,San Francisco,162642293,REP,2016-09-20T00:00:00.000,Burglary,2016-09-20T00:00:00.000,CA
4,Florida St/division St,Intersection,1,2016-09-20T00:00:00.000,2016-09-20T14:49:00.000,2018-11-03 14:49:00,San Francisco,162642379,HAN,2016-09-20T00:00:00.000,Encampment,2016-09-20T00:00:00.000,CA


The dataset has 2 million rows and 14 columns.

In [10]:
# Per-column dtype and non-null count summary. `show_counts` replaces the
# `null_counts` keyword, which was deprecated in pandas 1.2 and removed in 2.0.
calls_for_service.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 14 columns):
address                    2000000 non-null object
address_type               2000000 non-null object
agency_id                  2000000 non-null int64
call_date                  2000000 non-null object
call_dttm                  2000000 non-null object
call_time                  2000000 non-null datetime64[ns]
city                       1944403 non-null object
common_location            197653 non-null object
crime_id                   2000000 non-null int64
disposition                2000000 non-null object
offense_date               2000000 non-null object
original_crimetype_name    2000000 non-null object
report_date                2000000 non-null object
state                      2000000 non-null object
dtypes: datetime64[ns](1), int64(2), object(11)
memory usage: 213.6+ MB


In [11]:
# Number of missing values in each column
calls_for_service.isna().sum()

address                          0
address_type                     0
agency_id                        0
call_date                        0
call_dttm                        0
call_time                        0
city                         55597
common_location            1802347
crime_id                         0
disposition                      0
offense_date                     0
original_crimetype_name          0
report_date                      0
state                            0
dtype: int64

In [12]:
# common_location is null for most rows, so remove that column
calls_for_service = calls_for_service.drop(columns=['common_location'])

In [13]:
# Confirm the column count after dropping common_location
calls_for_service.shape

(2000000, 13)

In [14]:
# Backward-fill the city column: each null takes the value of the next
# non-null row below it.
# NOTE(review): this assumes neighbouring rows share a city, and any nulls at
# the very end of the column would stay unfilled — confirm this is intended.
calls_for_service['city'] = calls_for_service['city'].bfill()

In [15]:
# Re-check missing values per column after the cleaning steps
calls_for_service.isna().sum()


address                    0
address_type               0
agency_id                  0
call_date                  0
call_dttm                  0
call_time                  0
city                       0
crime_id                   0
disposition                0
offense_date               0
original_crimetype_name    0
report_date                0
state                      0
dtype: int64

Observe that there are no longer any null values in the city column.

In [29]:
# Convert the call_dttm timestamp strings into pandas datetime values
call_timestamps = pd.to_datetime(calls_for_service['call_dttm'])
calls_for_service['call_dttm'] = call_timestamps

In [16]:
# Load the Excel lookup table that gives the meaning of each radio code
# used in the calls DataFrame, then summarise its columns.
radio_code_xl = pd.read_excel(io="Radio_Codes_2016.xlsx")
radio_code = pd.DataFrame(data=radio_code_xl)
radio_code.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 2 columns):
original_crimetype_name    178 non-null object
Meaning                    178 non-null object
dtypes: object(2)
memory usage: 2.9+ KB


In [18]:
# Translate radio codes found in original_crimetype_name into their meanings
# using the radio code lookup table. Values with no matching code are kept
# as they were.

# Compare everything as strings so codes match regardless of the loaded dtype
calls_for_service['original_crimetype_name'] = calls_for_service['original_crimetype_name'].astype(str)
radio_code['original_crimetype_name'] = radio_code['original_crimetype_name'].astype(str)
radio_code['Meaning'] = radio_code['Meaning'].astype(str)

# Series indexed by code -> meaning, used as the mapping table
code_to_meaning = radio_code.set_index('original_crimetype_name')['Meaning']
crime_names = calls_for_service['original_crimetype_name']
calls_for_service['original_crimetype_name'] = crime_names.map(code_to_meaning).fillna(crime_names)

In [30]:
# Preview the first rows again to check the replaced crime type names
calls_for_service.head()

Unnamed: 0,address,address_type,agency_id,call_date,call_dttm,call_time,city,crime_id,disposition,offense_date,original_crimetype_name,report_date,state
0,1500 Block Of Pine St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20 11:50:00,2018-11-03 11:50:00,San Francisco,162641608,REP,2016-09-20T00:00:00.000,Complaint Unkn,2016-09-20T00:00:00.000,CA
1,100 Block Of Erie St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20 12:36:00,2018-11-03 12:36:00,San Francisco,162641785,UTL,2016-09-20T00:00:00.000,Interview a citizen,2016-09-20T00:00:00.000,CA
2,900 Block Of Market St,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20 14:01:00,2018-11-03 14:01:00,San Francisco,162642180,HAN,2016-09-20T00:00:00.000,Burglary,2016-09-20T00:00:00.000,CA
3,1900 Block Of Palou Av,Premise Address,1,2016-09-20T00:00:00.000,2016-09-20 14:30:00,2018-11-03 14:30:00,San Francisco,162642293,REP,2016-09-20T00:00:00.000,Burglary,2016-09-20T00:00:00.000,CA
4,Florida St/division St,Intersection,1,2016-09-20T00:00:00.000,2016-09-20 14:49:00,2018-11-03 14:49:00,San Francisco,162642379,HAN,2016-09-20T00:00:00.000,Encampment,2016-09-20T00:00:00.000,CA


In [21]:
# Number of rows for each distinct original crime type
crime_type_sizes = calls_for_service.groupby(['original_crimetype_name']).size()
data_by_city_crime = crime_type_sizes.reset_index(name='count')
data_by_city_crime

Unnamed: 0,original_crimetype_name,count
0,Audible,234
1,Logged/Advised Call,4
2,"""Bike Mob""",1
3,"""Chop Shop""",1
4,"""Info"" Only",1
5,& 601,1
6,& 851,1
7,& 919,1
8,& Dog Attack,1
9,'Fire Hazard',1


In [25]:
# Remove surrounding whitespace and special characters from the crime type
# values, then recount the occurrences of each cleaned value.

# `.str.strip()` returns a new Series rather than modifying the column in
# place, so its result must be captured for the whitespace strip to apply.
crime_names = calls_for_service['original_crimetype_name'].str.strip()

# Strip the listed leading / trailing characters with the vectorised string
# methods (same character sets as before).
calls_for_service['original_crimetype_name'] = (
    crime_names
    .str.lstrip('`&[***~ +-".,//')
    .str.rstrip(' ***."')
)

data_by_city_crime=calls_for_service.groupby(['original_crimetype_name']).size().reset_index(name='count')
data_by_city_crime

Unnamed: 0,original_crimetype_name,count
0,,32
1,'Fire Hazard',1
2,'S,2
3,(221 Taken),1
4,(459),1
5,(487),1
6,(Nom 851),1
7,0-0,1
8,000,7
9,000 Abandoned Pkg,1


In [26]:
# Number of rows recorded for each city
city_sizes = calls_for_service.groupby(['city']).size()
data_by_city = city_sizes.reset_index(name='count')
data_by_city

Unnamed: 0,city,count
0,Brisbane,38
1,Daly City,1001
2,Fort Mason,67
3,Hunters Point,573
4,Presidio,526
5,San Francisco,1986644
6,Treasure Isla,9924
7,Yerba Buena,1227


In [27]:
# Show the 25 crime types with the highest counts, largest first
data_by_city_crime.sort_values(by='count', ascending=False).iloc[:25]

Unnamed: 0,original_crimetype_name,count
11246,Passing Call,260977
15199,Traffic Stop,210708
14829,Suspicious Person,104815
8946,Homeless Complaint,102769
518,22500e,76451
10499,Muni Inspection,68826
5635,Audible Alarm,58056
15299,Trespasser,52632
16460,Well Being Check,47954
14831,Suspicious Vehicle,47448


In [28]:
# Distinct agency_id values present in the data
calls_for_service['agency_id'].unique()

array([1], dtype=int64)