# University of Iowa Formatting


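This notebook loads the University of Iowa police docket, normalizes the incident addresses, builds a timestamp for each record, tallies the charge codes, and saves the processed data. The to-do list appears further down.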

In [6]:
# Libraries

from postal.expand import expand_address
import pandas as pd
import pickle

import datetime

In [2]:
# Directories
# Raw inputs are read from data/external; results are written to data/processed.

# -- input --
dataDir = '../../data/external/ui-police/'

# -- output --
processDir = '../../data/processed/ui-police/'
chargeDir = '../../data/processed/charges/'

# Raw arrest docket, located inside the input directory
arrestsRaw = ''.join([dataDir, 'docket.csv'])

In [3]:
# addresses

def formatAddr(raw):
    '''Function to format an address only if the raw string is valid.
    Input: String containing address (', Iowa City, Iowa' is appended
        before the string is handed to expand_address)
    Output: The first expansion returned by expand_address, with a leading
        'nb'/'eb'/'sb'/'wb' direction token removed.
        False if raw is not a non-empty string.
        None (after printing a notice) if expand_address returns nothing.'''

    # Error checking. `or` short-circuits, so len() is never evaluated on a
    # non-string value (e.g. None, or the float NaN pandas uses for a
    # missing cell); the bitwise `|` evaluated both sides and raised TypeError.
    if not isinstance(raw, str) or len(raw) == 0:
        return False

    # Prepare and get info
    query = raw + ', Iowa City, Iowa'
    expanded = expand_address(query)

    # Nothing came back: notify and hand back None explicitly
    if len(expanded) == 0:
        print (raw + ' did not expand!')
        return None

    bestCand = expanded[0]

    # Drop a leading direction token only when it is a whole word (followed
    # by a space); matching on the first two letters alone would also
    # truncate addresses that merely begin with those letters.
    directions = ('nb ', 'eb ', 'sb ', 'wb ')
    if bestCand.startswith(directions):
        return bestCand[3:]
    return bestCand

In [4]:
# read in the raw docket and rename its columns to the names used in the
# rest of this notebook (inc* = incident, home* = home address)
arrDF = pd.read_csv(arrestsRaw).rename(columns={
    'date': 'incDate',
    'time': 'incTime',
    'arrLocation': 'incAddr',
    'address': 'homeAddr',
})

arrDF.head(2)

Unnamed: 0,incDate,name,age,homeAddr,incAddr,date2,chargeCode,charge,incTime,homeCity,homeState,homeZip
0,10/12/16,"KESSELL, BRANDON GAGE",18,1232 BURGE HALL,HANCHER FOOT BRIDGE EAST,,124.401,POSSESSION:CONTROL SUBSTANCE,1:13,IOWA CITY,IA,52242
1,10/12/16,"KESSELL, BRANDON GAGE",18,1232 BURGE HALL,HANCHER FOOT BRIDGE EAST,,124.414,DRUG PARAPHERNALIA,1:13,IOWA CITY,IA,52242


In [5]:
# parse the incident date strings into datetimes (the time of day is
# combined with the date separately)
arrDF['incDate'] = pd.to_datetime(arrDF['incDate'])

In [14]:
def createTimestamp(date, time):
    '''Combine a calendar date and a clock string into a single datetime.
    Input: date -- object exposing .year, .month and .day
           time -- string whose first two ':'-separated fields are the
                   hour and the minute
    Output: datetime.datetime at that date, hour and minute'''

    # Split once; any field after the minute is ignored
    pieces = time.split(':')
    hourVal = int(pieces[0])
    minuteVal = int(pieces[1])

    return datetime.datetime(date.year, date.month, date.day, hourVal, minuteVal)
    
# Build one timestamp per row from that row's incident date and time
arrDF['timestamp'] = arrDF.apply(
    lambda row: createTimestamp(row['incDate'], row['incTime']),
    axis=1,
)

In [12]:
# address normalization: lower-case the incident address, then run each
# value through formatAddr
arrDF['incAddr'] = arrDF['incAddr'].str.lower().apply(formatAddr)

- Tabulate the charges and their codes
- Clean up the addresses
- [*DONE*] Build a timestamp by combining the date and time fields

In [31]:
# charges
# Tally how often each (chargeCode, charge) pair occurs, most frequent first.
# size() counts rows per group; counting the 'age' column instead would
# silently leave out any row whose age is missing.
charges = (
    arrDF.groupby(['chargeCode', 'charge'])
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)
charges.to_csv(chargeDir + 'ui-charge-codes.csv')

---

***Saving***

In [17]:
# Write the processed docket to processDir in two formats:
# a CSV without the index column, and a pickle of the DataFrame.
arrDF.to_csv(processDir + 'ui-police-docket.csv', index=False)
arrDF.to_pickle(processDir + 'ui-police-docket.pickle')