In [1]:
import geopy
import pandas as pd

In [2]:
# Load the raw crime records from a CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="crime_data_raw.csv")

In [3]:
# Preview the last 10 rows of the raw data
df.tail(10)

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1_2,crm_cd,crm_cd_desc,...,status,status_desc,crm_cd_1,location,lat,lon,crm_cd_2,cross_street,crm_cd_3,crm_cd_4
690444,230406516,2023-03-19T00:00:00.000,2023-03-19T00:00:00.000,122,4,Hollenbeck,488,1,510,VEHICLE - STOLEN,...,IC,Invest Cont,510.0,3200 GARNET ST,34.0248,-118.2076,,,,
690445,230405171,2023-02-05T00:00:00.000,2023-02-02T00:00:00.000,1830,4,Hollenbeck,428,1,236,INTIMATE PARTNER - AGGRAVATED ASSAULT,...,IC,Invest Cont,236.0,3000 BUDAU AV,34.0808,-118.1738,,,,
690446,231405429,2023-02-05T00:00:00.000,2023-02-05T00:00:00.000,1250,14,Pacific,1444,1,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",...,IC,Invest Cont,341.0,4300 LINCOLN BL,33.9846,-118.4428,,,,
690447,230806871,2023-03-03T00:00:00.000,2023-03-02T00:00:00.000,620,8,West LA,889,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,2000 S HOLT AV,34.0412,-118.3814,,,,
690448,231104474,2023-01-12T00:00:00.000,2023-01-12T00:00:00.000,1240,11,Northeast,1107,1,440,THEFT PLAIN - PETTY ($950 & UNDER),...,IC,Invest Cont,440.0,1000 MILWAUKEE AV,34.1214,-118.1915,,,,
690449,230804266,2023-01-08T00:00:00.000,2023-01-08T00:00:00.000,1030,8,West LA,839,1,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",...,IC,Invest Cont,341.0,10200 SANTA MONICA BL,34.0611,-118.4184,,,,
690450,230109201,2023-03-23T00:00:00.000,2023-03-22T00:00:00.000,1830,1,Central,111,1,440,THEFT PLAIN - PETTY ($950 & UNDER),...,IC,Invest Cont,440.0,700 W CESAR E CHAVEZ AV,34.0606,-118.2439,,,,
690451,231604807,2023-01-27T00:00:00.000,2023-01-26T00:00:00.000,1800,16,Foothill,1663,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",...,IC,Invest Cont,740.0,12500 BRANFORD ST,34.2466,-118.4054,,,,
690452,231606525,2023-03-22T00:00:00.000,2023-03-22T00:00:00.000,1000,16,Foothill,1602,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,IC,Invest Cont,230.0,12800 FILMORE ST,34.279,-118.4116,,,,
690453,230906458,2023-03-05T00:00:00.000,2023-03-05T00:00:00.000,900,9,Van Nuys,914,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),...,IC,Invest Cont,745.0,14500 HARTLAND ST,34.1951,-118.4487,,,,


# Preprocessing

In [4]:
# Count the missing (NaN) values in each column
df.isnull().sum()

dr_no                  0
date_rptd              0
date_occ               0
time_occ               0
area                   0
area_name              0
rpt_dist_no            0
part_1_2               0
crm_cd                 0
crm_cd_desc            0
mocodes            94972
vict_age               0
vict_sex           90483
vict_descent       90489
premis_cd              8
premis_desc          362
weapon_used_cd    450488
weapon_desc       450488
status                 0
status_desc            0
crm_cd_1               9
location               0
lat                    0
lon                    0
crm_cd_2          638918
cross_street      578212
crm_cd_3          688721
crm_cd_4          690402
dtype: int64

In [5]:
# Keep only the rows that satisfy all of the following:
#   - the primary crime code (crm_cd_1) is present
#   - the premise code (premis_cd) is present
#   - latitude and longitude are both non-zero (0.0 is treated as invalid
#     location information)
df = df[
    df['crm_cd_1'].notna()
    & df['premis_cd'].notna()
    & (df['lat'] != 0.0)
    & (df['lon'] != 0.0)
]

In [6]:
# Inspect the data type of each column to decide which ones need converting
df.dtypes

dr_no               int64
date_rptd          object
date_occ           object
time_occ            int64
area                int64
area_name          object
rpt_dist_no         int64
part_1_2            int64
crm_cd              int64
crm_cd_desc        object
mocodes            object
vict_age            int64
vict_sex           object
vict_descent       object
premis_cd         float64
premis_desc        object
weapon_used_cd    float64
weapon_desc        object
status             object
status_desc        object
crm_cd_1          float64
location           object
lat               float64
lon               float64
crm_cd_2          float64
cross_street       object
crm_cd_3          float64
crm_cd_4          float64
dtype: object

In [7]:
# Parse the report-date and occurrence-date columns (read in as strings)
# into datetime values
date_columns = ['date_rptd', 'date_occ']
df[date_columns] = df[date_columns].apply(pd.to_datetime)

In [8]:
# Restrict the data to the criminal activities relevant to this project,
# identified by the crime code attribute (crm_cd).
# Crime codes of interest for this project, chosen according to the UCR handbook
interest_crmcd = [210, 310, 320, 510, 520, 433, 330, 331, 410, 420, 421, 350, 351, 352, 353, 341, 343, 440]
# Keep only the entries whose crime code appears in the list above
df_interest = df.loc[df['crm_cd'].isin(interest_crmcd)]

In [9]:
# Cast the code columns to integers, then discard the columns that are not
# needed for the rest of the analysis
df_interest = (
    df_interest
    .astype({'crm_cd': int, 'premis_cd': int, 'crm_cd_1': int})
    .drop(columns=[
        'part_1_2',
        'weapon_used_cd',
        'weapon_desc',
        'crm_cd_2',
        'cross_street',
        'crm_cd_3',
        'crm_cd_4',
    ])
)

In [10]:
# Renumber the rows with a fresh default index after filtering; drop=True
# discards the old index instead of keeping it as a column.
# The result is rebound rather than mutated with inplace=True, so the step
# is an explicit assignment like the other transformation cells.
df_interest = df_interest.reset_index(drop=True)

In [11]:
# Preview the last 10 rows of the filtered data
df_interest.tail(10)

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,crm_cd,crm_cd_desc,mocodes,...,vict_sex,vict_descent,premis_cd,premis_desc,status,status_desc,crm_cd_1,location,lat,lon
306441,230706403,2023-02-23,2023-02-22,2130,7,Wilshire,785,510,VEHICLE - STOLEN,,...,,,101,STREET,IC,Invest Cont,510,2000 S HARCOURT AV,34.0382,-118.3431
306442,231004484,2023-01-10,2023-01-04,2255,10,West Valley,1075,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),1822 0325,...,M,W,402,MARKET,IC,Invest Cont,343,17800 VENTURA BL,34.1629,-118.5202
306443,231306158,2023-02-11,2023-02-08,2200,13,Newton,1317,330,BURGLARY FROM VEHICLE,1307 0344 1822,...,M,H,101,STREET,IC,Invest Cont,330,1700 E 16TH ST,34.0233,-118.2424
306444,230905273,2023-02-04,2023-01-24,1815,9,Van Nuys,979,330,BURGLARY FROM VEHICLE,0321 0358 0344,...,F,W,101,STREET,IC,Invest Cont,330,MILBANK ST,34.1536,-118.4181
306445,232007343,2023-03-21,2023-03-20,1640,20,Olympic,2038,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",1202 0344 1822,...,M,H,602,BANK,IC,Invest Cont,341,3100 WILSHIRE BL,34.0618,-118.2891
306446,230406516,2023-03-19,2023-03-19,122,4,Hollenbeck,488,510,VEHICLE - STOLEN,,...,,,101,STREET,IC,Invest Cont,510,3200 GARNET ST,34.0248,-118.2076
306447,231405429,2023-02-05,2023-02-05,1250,14,Pacific,1444,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",0325 0344,...,X,X,402,MARKET,IC,Invest Cont,341,4300 LINCOLN BL,33.9846,-118.4428
306448,231104474,2023-01-12,2023-01-12,1240,11,Northeast,1107,440,THEFT PLAIN - PETTY ($950 & UNDER),1822 0344,...,M,W,101,STREET,IC,Invest Cont,440,1000 MILWAUKEE AV,34.1214,-118.1915
306449,230804266,2023-01-08,2023-01-08,1030,8,West LA,839,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",0344 1822,...,M,O,717,HEALTH SPA/GYM,IC,Invest Cont,341,10200 SANTA MONICA BL,34.0611,-118.4184
306450,230109201,2023-03-23,2023-03-22,1830,1,Central,111,440,THEFT PLAIN - PETTY ($950 & UNDER),1602 0344 0346,...,M,B,717,HEALTH SPA/GYM,IC,Invest Cont,440,700 W CESAR E CHAVEZ AV,34.0606,-118.2439


In [12]:
# Write the filtered dataset to disk as a UTF-8 CSV, leaving out the row index
df_interest.to_csv("crime_data_complete.csv", encoding='utf-8', index=False)

# Obtain Geographical Information

**(Not planned for implementation at this time)**

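We want to obtain zip code information from the latitude and longitude columns by using `geopy`. The main challenge is that the geocoding services `geopy` relies on allow only a very low number of requests. A possible solution is to sacrifice some precision in the latitude and longitude values: rounding the coordinates makes many records share the same coordinate pair, so each distinct pair only needs to be looked up once.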