# Arrests

Two data sets:
1. Franzen Scraped
2. Larson Scraped (naive)

## Franzen Scraped

In [1]:
# Libraries, base baths, etc
import pandas as pd
import numpy as np
import json
import csv

dirData = '../data/'
dirDataExt = dirData + 'external/'
dirDataProc = dirData + 'processed/'

dirScrapeFran = dirDataExt + 'franzen-scrape/'
dirScrapeLars = dirDataExt + 'larson-scrape/'

In [2]:
# Helper to read the ndljson's
def readNDLJson(f):
    with open(f) as data:
        jsonList = [json.loads(line) for line in data]
    return jsonList

# Helper function to format the arrests
def formatRecord(record, ind=False):
    if (ind == True):
        record['datetime'] = record['datetime']['$date']
    record['_id'] = record['_id']['$oid']
    
    return record

In [3]:
# Read and format the data

actJson = readNDLJson(dirScrapeFran + 'activities.json')
for arrest in actJson:
    arrest = formatRecord(arrest, True)
    
actListJson = readNDLJson(dirScrapeFran + 'activityList.json')
for activity in actListJson:
    activity = formatRecord(activity)

disJson = readNDLJson(dirScrapeFran + 'dispositionList.json')
for disp in disJson:
    disp = formatRecord(disp)

In [4]:
actJson[2020]

{u'_id': u'50fcbd7ec0025f8c936c11ae',
 u'activity': u'ANIMAL/NEGLECT',
 u'address': u'938 Longfellow Place, Iowa City, IA 52240, USA',
 u'apt': u'',
 u'date': {u'$date': u'2013-01-07T14:29:00.000+0000'},
 u'datetime': u'2013-01-07T14:29:00.000+0000',
 u'details': u'Linked to CFS#: 13003060',
 u'dispatch': u'13003059',
 u'disposition': u'COMPLETED',
 u'inc': u'00234',
 u'link': u'?dis=13003059&date=01072013',
 u'loc': [-91.515237, 41.650727],
 u'time': u'02:29 pm'}

In [74]:
# Read the formatted lists into DataFrame
actExcCol = ['_id', 'address', 'addresses', 'apt', 'date', 'dispatch', 'inc', 'link', 'time']
activity = pd.DataFrame.from_records(actJson, exclude=actExcCol)

# Coerce the activity datetime to datetime.index to get actual dates instead of UTC timestamps
activity['datetime'] = pd.to_datetime(activity['datetime'], yearfirst=True)

# Read in the final files
actList = pd.DataFrame.from_records(actListJson, exclude=['index'])
dispList = pd.DataFrame.from_records(disJson, exclude=['index'])

In [75]:
# From police activity, calc johnson county blotter
arrestsFran = activity[(activity['disposition'] == 'ARREST MADE') | (activity['disposition'] == 'ARREST MADEA')]
ticketsFran = activity[activity['disposition'] == 'CHARGED/RELEASED']
blotter = pd.concat([arrestsFran, ticketsFran])

print ("The emulated arrest blotter has {} records.".format(blotter.size))
blotter.head(1)

The emulated arrest blotter has 9772 records.


Unnamed: 0,activity,datetime,details,disposition,lat,loc,lon
0,TRAFFIC STOP,2013-01-20 16:50:00,CERTIFIED SENT,ARREST MADE,,"[-91.500164, 41.635156]",


In [72]:
# Simplify disposition
blotter['chargeType'] = blotter['disposition'].apply(lambda x: 'A' if (x == ('ARREST MADE' or 'ARREST MADEA')) else 'C')

# Redefine the blotter
blotter = blotter[['datetime', 'activity', 'loc', 'chargeType']]

## Naive Scraping

In [None]:
# csv files and data structures
jcOld = dirScrapeLars + '10-15-naive.csv'
jcNew = dirScrapeLars + '9-16-naive.csv'

arrestsNaive = []

In [None]:
# Define a formatting function to read naive web scrapes
def formatLarson(arr):
    #Use incident number as the key, it's unique. 
    key = row.pop("*Incident #*")
    if key == "*Incident #*":
        return

    dates = row.pop("*Offense Date*\nDate of Birth")
    try:
        arrestDate = dates.split("*")[1]
    except:
        # if there is no date, discard as we can't classify it
        return
        
    arrestType = row.pop("*C/A*")
    arrestAddress = row.pop("Arrest Location")
    arrestCharges = row.pop("*Charge(s)*")
    
    arrestsNaive.append({"loc":arrestAddress, "datetime": arrestDate, "chargeType":arrestType, 
                         "activity":arrestCharges})

In [None]:
# Helper function to coerce the charges into something more concise
def chargesList(arr):
    splitted = arr.split('\n')
    newCharges = []
    for charge in splitted:
        if (len(charge[2:]) >= 3):
            newCharges.append(charge[2:])
        
    return newCharges

In [None]:
# Read in the files
for row in csv.DictReader(open(jcOld)):
    formatLarson(row)

for row in csv.DictReader(open(jcNew)):
    formatLarson(row)
    
    
# DataFrame and calc datetime
naiveDF = pd.DataFrame.from_records(arrestsNaive)
naiveDF['datetime'] = pd.to_datetime(naiveDF['datetime'])
naiveDF['activity'] = naiveDF['activity'].apply(lambda x: chargesList(x))
naiveDF.head()

In [None]:
naiveDF['activity'].iloc[0:50]

## Merging

[Very to-do]

## Gameday Calculation


1. Read in `iowa-home-games.csv`
2. Make Set of home game dates (sets are faster to compute `isInSet()`/inclusion)
3. Create function to calculate `gameday(Y/N)` or `gamedayWeekend(Y/N)`

In [8]:
# Read in the games, set the index as the date, and infer it's format
iowaHomeGames = pd.DataFrame.from_csv(open(dirDataProc + 'iowa-home-games.csv'), index_col='Date', infer_datetime_format=True)
iowaHomeGames.head()

Unnamed: 0_level_0,Unnamed: 0,Visitor,Visitor Score,Home Team,Home Score,Line,Win,Delta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1978-09-16,98,Northwestern,3.0,Iowa,20.0,14.0,1,17.0
1978-09-23,151,Iowa State,31.0,Iowa,0.0,-2.5,0,-31.0
1978-10-07,333,Utah,13.0,Iowa,9.0,,0,-4.0
1978-10-28,529,Purdue,34.0,Iowa,7.0,-10.5,0,-27.0
1978-11-04,579,Michigan,34.0,Iowa,0.0,-27.0,0,-34.0


In [166]:
# get set of home dates
gamedays = pd.Series(iowaHomeGames.index).map(lambda x: x.date()).tolist()

# add the gameday column, currently it looks like no gamedays are in franzens arrests
activity['GD'] = activity['datetime'].head().map(lambda x: x.date()).isin(gamedays)

## Plotting

In [None]:
# libraries & set up
import matplotlib.pyplot as plt
import matplotlib

# Overall matplotlib config
plt.style.use('ggplot')

%matplotlib inline

In [None]:
# Top arrests
topArrests = blotter['activity'].value_counts(ascending=True)
topArrestsPlotted = topArrests.iloc[-15:,]



# Create axes
ax = plt.subplot(111)

# add gridlines
ax.xaxis.grid(zorder=0)

# Remove axes lines
ax.spines["top"].set_visible(False)    
ax.spines["bottom"].set_visible(False)    
ax.spines["right"].set_visible(False)    
ax.spines["left"].set_visible(False)    

# remove axes ticks
ax.get_xaxis().tick_bottom()    
ax.get_yaxis().tick_left()

# Plot it
plt.barh(range(topArrestsPlotted.size), topArrestsPlotted)
plt.yticks(range(topArrestsPlotted.size), topArrestsPlotted.index)


plt.show()

In [None]:
# Arrests over time

# Get the # arrests on each day
arrestDates = blotter['datetime'].apply(lambda x: x.date()).value_counts()
arrestsByDay = arrestDates.reindex(pd.to_datetime(arrestDates.index))

### IN PROGRESS ###

In [None]:
# Small Multiples
### We want a plot for each crime, so we can see the ebb & flow each week
### Therefore, we group on actvity and then calc the num of occurences by day

# set the datetime index
sm = blotter.set_index('datetime')
sm['dayWeek'] = sm.index.dayofweek

# We want the grouping in the comments
smData = sm.groupby(['activity', 'dayWeek']).size()

# ....but we don't want EVERY activity
# So we create a set to test for inclusion, and filter the selection
smCrimes = set(topArrestsPlotted[-9:].index)

In [None]:
topArrestsPlotted