# Arrests

Two data sets:
1. Franzen Scraped
2. Larson Scraped (naive)

## Franzen Scraped

In [1]:
# Libraries, base paths, etc
import pandas as pd
import numpy as np
import json
import csv

# Root of the data tree, given relative to the notebook's working directory.
dirData = '../data/'
# Sub-directory holding the externally sourced (scraped) inputs.
dirDataExt = dirData + 'external/'
# NOTE(review): presumably the destination for derived outputs -- it is not
# referenced by any of the cells visible here; confirm before relying on it.
dirDataProc = dirData + 'processed/'

# One folder per scrape; every file read below lives under one of these two.
dirScrapeFran = dirDataExt + 'franzen-scrape/'
dirScrapeLars = dirDataExt + 'larson-scrape/'

In [2]:
# Helper to read newline-delimited JSON (NDJSON) files
def readNDLJson(f):
    """Parse a newline-delimited JSON file into a list of decoded objects.

    Parameters
    ----------
    f : str
        Path of the file to read. Every non-blank line must contain one
        complete JSON document.

    Returns
    -------
    list
        One decoded object per non-blank line, in file order.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line is not valid JSON.
    """
    with open(f) as data:
        # Blank or whitespace-only lines (typically a trailing newline at the
        # end of an export) carry no record; skip them rather than letting
        # json.loads fail on an empty document.
        jsonList = [json.loads(line) for line in data if line.strip()]
    return jsonList

# Helper function to format the arrests
def formatRecord(record, ind=False):
    """Flatten the nested `_id` (and optionally `datetime`) fields of a record.

    The record is modified in place and the very same dict is returned.

    Parameters
    ----------
    record : dict
        Must contain `_id` as a mapping with an `$oid` entry. When `ind`
        enables it, must also contain `datetime` as a mapping with a
        `$date` entry.
    ind : bool, optional
        When equal to True, `record['datetime']` is replaced by its
        `$date` value. Defaults to False (only `_id` is flattened).

    Returns
    -------
    dict
        `record` itself, after the in-place replacement(s).
    """
    # The explicit comparison is intentional: only a value equal to True
    # switches the datetime step on (an arbitrary truthy value does not).
    unwrapDatetime = (ind == True)
    if unwrapDatetime:
        wrappedDate = record['datetime']
        record['datetime'] = wrappedDate['$date']

    wrappedId = record['_id']
    record['_id'] = wrappedId['$oid']

    return record

In [3]:
# Read and format the data
# Each file is parsed with readNDLJson, then every record is flattened by
# formatRecord. formatRecord edits the dict it is given and hands the same
# dict back, so the lists are updated without re-binding the loop variable.

# activities.json: flatten both `_id` and `datetime`
actJson = readNDLJson(dirScrapeFran + 'activities.json')
for arrest in actJson:
    formatRecord(arrest, True)

# activityList.json: flatten `_id` only
actListJson = readNDLJson(dirScrapeFran + 'activityList.json')
for activity in actListJson:
    formatRecord(activity)

# dispositionList.json: flatten `_id` only
disJson = readNDLJson(dirScrapeFran + 'dispositionList.json')
for disp in disJson:
    formatRecord(disp)

In [4]:
# Spot-check one formatted record: `_id` and `datetime` should now be plain
# strings. (2020 is a list position -- presumably an arbitrary pick.)
actJson[2020]

{u'_id': u'50fcbd7ec0025f8c936c11ae',
 u'activity': u'ANIMAL/NEGLECT',
 u'address': u'938 Longfellow Place, Iowa City, IA 52240, USA',
 u'apt': u'',
 u'date': {u'$date': u'2013-01-07T14:29:00.000+0000'},
 u'datetime': u'2013-01-07T14:29:00.000+0000',
 u'details': u'Linked to CFS#: 13003060',
 u'dispatch': u'13003059',
 u'disposition': u'COMPLETED',
 u'inc': u'00234',
 u'link': u'?dis=13003059&date=01072013',
 u'loc': [-91.515237, 41.650727],
 u'time': u'02:29 pm'}

In [5]:
# Read the formatted lists into DataFrames
# Fields left out of the activity frame.
actExcCol = [
    '_id', 'address', 'addresses', 'apt', 'date',
    'dispatch', 'inc', 'link', 'time',
]
activity = pd.DataFrame.from_records(data=actJson, exclude=actExcCol)

# The two lookup lists only leave out an 'index' field.
actList = pd.DataFrame.from_records(data=actListJson, exclude=['index'])
dispList = pd.DataFrame.from_records(data=disJson, exclude=['index'])

In [6]:
# From police activity, calc johnson county blotter
disposition = activity['disposition']

# Arrests: rows whose disposition is either spelling of "arrest made".
isArrest = disposition.eq('ARREST MADE') | disposition.eq('ARREST MADEA')
arrestsFran = activity[isArrest]

# Tickets: rows that were charged and released.
isTicket = disposition.eq('CHARGED/RELEASED')
ticketsFran = activity[isTicket]

# Arrests first, then tickets; original row labels are kept.
blotter = pd.concat([arrestsFran, ticketsFran])

# Note: .size is the number of cells (rows x columns), not the row count.
print(blotter.size)
blotter.head(1)

9772


Unnamed: 0,activity,datetime,details,disposition,lat,loc,lon
0,TRAFFIC STOP,2013-01-20T16:50:00.000+0000,CERTIFIED SENT,ARREST MADE,,"[-91.500164, 41.635156]",


In [7]:
# Simplify disposition
# Collapse the disposition text into a one-letter code: 'A' for either
# spelling of "arrest made", 'C' for everything else in the blotter
# (the CHARGED/RELEASED rows).
#
# The membership test matters: the expression ('ARREST MADE' or 'ARREST MADEA')
# evaluates to just 'ARREST MADE', so comparing against it would label every
# 'ARREST MADEA' row as 'C'.
arrestDispositions = ('ARREST MADE', 'ARREST MADEA')
blotter['chargeType'] = blotter['disposition'].apply(
    lambda x: 'A' if x in arrestDispositions else 'C'
)
blotter['datetime'] = pd.to_datetime(blotter['datetime'])

# Redefine the blotter
# Keep only these four columns; 'disposition' is dropped here, so this cell
# needs the preceding blotter-building cell re-run before it can run again.
blotter = blotter[['datetime', 'activity', 'loc', 'chargeType']]

## Larson Scraped (naive)

In [8]:
# csv files and data structures
# Paths of the two naive-scrape CSV files; both are later fed row by row to
# formatLarson.
jcOld = dirScrapeLars + '10-15-naive.csv'
jcNew = dirScrapeLars + '9-16-naive.csv'

# Shared accumulator: formatLarson appends one dict per usable CSV row, and
# the list is later turned into a DataFrame.
arrestsNaive = []

In [9]:
# Define a formatting function to read naive web scrapes
def formatLarson(arr, out=None):
    """Turn one CSV row of a naive scrape into a blotter-style record.

    The row is consumed (its fields are popped) and, when usable, a dict with
    the keys `loc`, `datetime`, `chargeType` and `activity` is appended to
    the output list. Rows that repeat the header, or whose date cell has no
    text between asterisks, are silently skipped.

    Parameters
    ----------
    arr : dict
        One row as produced by `csv.DictReader`. Must contain the columns
        "*Incident #*" and "*Offense Date*\\nDate of Birth"; usable rows must
        also contain "*C/A*", "Arrest Location" and "*Charge(s)*".
        The dict is modified: the fields that are read are removed from it.
    out : list, optional
        List that receives the record. Defaults to the notebook-level
        `arrestsNaive` accumulator.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the required columns is missing from `arr`.
    """
    # The incident number is only inspected to recognise header rows that
    # are repeated inside the scraped data.
    key = arr.pop("*Incident #*")
    if key == "*Incident #*":
        return

    dates = arr.pop("*Offense Date*\nDate of Birth")
    try:
        # The wanted date is the text after the first asterisk, up to the next.
        arrestDate = dates.split("*")[1]
    except (AttributeError, IndexError):
        # No asterisk-delimited date (or an empty/None cell): discard the
        # row, since it cannot be placed in time.
        return

    arrestType = arr.pop("*C/A*")
    arrestAddress = arr.pop("Arrest Location")
    arrestCharges = arr.pop("*Charge(s)*")

    # Resolve the default sink only now, so skipped rows never touch it.
    if out is None:
        out = arrestsNaive

    out.append({"loc": arrestAddress, "datetime": arrestDate, "chargeType": arrestType,
                "activity": arrestCharges})

In [10]:
# Helper function to coerce the charges into something more concise
def chargesList(arr):
    """Break a newline-separated charge string into a list of charges.

    Each line loses its first two characters; what remains is kept only when
    it is at least three characters long.

    Parameters
    ----------
    arr : str
        Charges separated by newline characters.

    Returns
    -------
    list of str
        The trimmed lines that passed the length check, in original order.
    """
    trimmed = (line[2:] for line in arr.split('\n'))
    return [text for text in trimmed if len(text) >= 3]

In [11]:
# Read in the files
# Empty the accumulator first so that re-running this cell does not append a
# second copy of every row. It is cleared in place because formatLarson
# appends to this same list object.
del arrestsNaive[:]

for path in (jcOld, jcNew):
    # `with` guarantees the file handle is closed once the file is consumed.
    with open(path) as csvFile:
        # `row` stays a notebook-level name, exactly as before.
        for row in csv.DictReader(csvFile):
            formatLarson(row)


# DataFrame and calc datetime
naiveDF = pd.DataFrame.from_records(arrestsNaive)
naiveDF['datetime'] = pd.to_datetime(naiveDF['datetime'])
# Turn each raw charge string into a list of individual charges.
naiveDF['activity'] = naiveDF['activity'].apply(chargesList)
naiveDF.head()

Unnamed: 0,activity,chargeType,datetime,loc
0,[ Assault Causing Injury],A,2015-10-30 23:30:00,2116 FRIENDSHIP ST
1,[ Harassment 3rd Degree],A,2015-11-03 12:00:00,IOWA CITY
2,[ OWI],A,2015-11-05 01:57:00,111 S LUCAS
3,[ Drive while license under suspension/cancelled],C,2015-11-04 23:31:00,RIVERSIDE DR/GRAND
4,"[ Child endangerment/abuse- no injury, Interf...",A,2015-11-04 23:43:00,1926 BROADWAY ST APT B


In [None]:
# Eyeball the first 50 parsed charge lists.
naiveDF['activity'].iloc[0:50]