# Iowa City Formatting

To do list here.

In [3]:
# Libraries

from postal.expand import expand_address
import pandas as pd
import pickle

In [4]:
# Directories

dataDir = '../../data/external/iowa-city-police/'
arrestsRaw = dataDir + 'arrest-loc-inc.csv'
citationsRaw = dataDir + 'citation.csv'

processDir = '../../data/processed/iowa-city-police/'

### Citations

In [5]:
citeCols = ['lName', 'fName', 'homeAddr', 'homeCity', 'dob',\
            'timestamp', 'charge', 'incAddr', 'incDate']

# read in: add cols, parse dates, ...
citeDF = pd.read_csv((citationsRaw), header=None, names=citeCols, \
                    parse_dates=['dob', 'incDate'])

# drop any rows that dont have a name or address associated with them
citeDF.dropna(subset=['lName', 'incAddr'], inplace=True)

In [4]:
citeDF.tail(25)

Unnamed: 0,lName,fName,homeAddr,homeCity,dob,timestamp,charge,incAddr,incDate
53413,CRONBAUGH,DAVID,1702 ALGONQUIN RD,IOWA CITY,01/16/1948,2014004793,Disobey Stop Sign,FOSTER RD AND NO NAME RD,2014-05-02
53414,BROWN,COREY,722 WESTWINDS DR.,IOWA CITY,12/16/1984,2014005899,No Valid Dl,MELROSE AVE,2014-05-28
53415,COTANT,PETER,1704 10TH ST,CORALVILLE,10/14/1966,2016004196,No Valid Dl,JOHNSON/MARKET,2016-04-30
53416,COTANT,PETER,1704 10TH ST,CORALVILLE,10/14/1966,2016004196,Disobey Stop Sign,JOHNSON/MARKET,2016-04-30
53417,ZHANG,LULU,2743 TRIPLE CROWN LN.,IOWA CITY,04/09/1982,2015002777,No Valid Dl,E HARRISON ST AND S LINN ST,2015-03-19
53418,FURLONG,NINA,4589 SAND RD SE,IOWA CITY,03/26/1970,2012005848,Operate W/O Registration (Expired),GILBERT ST/HWY 6,2012-05-11
53419,WILLIAMS,NICHOLAS,2208 MIAMI DR,IOWA CITY,03/07/1991,2011025497,No Seatbelt/improper Seatbelt,1ST AVE/BRADFORD,2011-10-08
53420,MCALLISTER,MARY,27334 N BANKSTON RD,NEW VIENNA,12/20/1994,2016008000,Speeding,STATE 1/DODGE ST MEASURING 706 FEET SOUTHWEST ...,2016-08-21
53421,BLAUE,MEGAN,311 DOUGLASS CT,IOWA CITY,09/23/1979,2016001283,No Seatbelt/improper Seatbelt,MUSCATINE AVE AND 2ND AVE,2016-02-05
53422,WATSON,CARRIE,140 S WESTMINSTER ST,IOWA CITY,02/12/1967,2014006647,Speeding,2662 E. WASHINGTON ST.,2014-06-18


In [5]:
test = citeDF.iloc[-1]['timestamp']
test

'2016005229'

In [7]:
pd.to_datetime(test, format='YYYY%S')

ValueError: time data '2016005229' does not match format 'YYYY%S' (match)

In [5]:
# Variables

def formatAge(dob, inc):
    '''Function to get the timedelta from two dates in years (age)
    Input: two pandas parsed dates
    OutPut: Integer that represents num years'''
    
    if (dob or inc) in [None, False]:
        return
    
    
# needs normalized date cols
#citeDF['age'] = citeDF['incDate'] - citeDF['dob']

SyntaxError: invalid syntax (<ipython-input-5-18b1ff4605ca>, line 8)

In [6]:
# addresses

def formatAddr(raw):
    '''Function to format an address only if the raw string is valid.
    Input: String containing address
    Output: Normalized address'''
    
    # Error checking
    if ((type(raw) != str) | (len(raw) == 0)):
        return False
    
    # Prepare and get info
    query = raw + ', Iowa City, Iowa'
    expanded = expand_address(query)
    
    # return normalized address or notify
    directions = ['nb', 'eb', 'sb', 'wb']
    if (len(expanded) > 0):
        bestCand = expanded[0]
        if (bestCand[:2]) in directions:
            return bestCand[3:]
        else:
            return bestCand
    else:
        print (raw + ' did not expand!')
        
citeDF['incFormAddr'] = citeDF['incAddr'].apply(lambda x: formatAddr(x))

In [7]:
# do the charges
# cluster them

In [8]:
# Save out
citeDF.to_csv((processDir + 'icpd-cites.csv'), index=False)
citeDF.to_pickle((processDir + 'icpd-cites.pickle'))

-------------------

### Arrests

In [6]:
arrCols = ['name', 'dob', 'homeAddr', 'homeCity', 'timestamp',\
          'incDate', 'incAddr', 'incAct', 'charge']

# read in: add cols, parse dates, ...
arrDF = pd.read_csv((arrestsRaw), header=None, names=arrCols,\
                    parse_dates=['dob', 'incDate'])
print ('{} arrests'.format(arrDF.size))

# drop any rows that dont have a name or address associated with them
arrDF.dropna(subset=['name', 'incAddr'], inplace=True)

532134 arrests


In [10]:
arrDF['incFormAddr'] = arrDF['incAddr'].apply(lambda x: formatAddr(x))

In [11]:
arrDF.head(3)

Unnamed: 0,name,dob,homeAddr,homeCity,timestamp,incDate,incAddr,incAct,charge,incFormAddr
0,"Hamann, Brian Joel",1980-09-18,97 Taupe Ln,Reeds Spring,2006014000.0,2006-03-26,1 Av S/Court St,Taken into Custody,Public Intoxication,1 avenue south court street iowa city iowa
1,"PATHEUANGSIN, WALLY",1991-03-17,1903 Hollywood Blvd,Iowa City,2007032000.0,2007-06-27,1 Av S/Court St,Taken into Custody,Operate Veh Without Owners Consent,1 avenue south court street iowa city iowa
2,"HOWARD, CURLEY LEE",2066-02-01,2401 Hwy 6 E 3007,Iowa City,2010004000.0,2010-01-24,1 Av S/Lower Muscatine Rd,On View,DRIVING WHILE BARRED,1 avenue south lower muscatine road iowa city...


In [None]:
# timestamps
def formatStamps(raw):
    year = raw[:5]
    seconds = raw[5:]

In [None]:
# save out
arrDF.to_csv((processDir + 'arrests-icpd.csv'), index=False)
arrDF.to_pickle((processDir + 'arrests-icpd.pickle'))

---

## **Analysis**

### Charges

What are the most popular charges? How can we normalize the charges into tidy bins?

In [12]:
# lower is first preprocessing step
citeDF['charge'] = citeDF.charge.str.lower()
arrDF.charge = arrDF.charge.str.lower()

In [13]:
# value counts
charges = pd.concat([citeDF['charge'], arrDF['charge']], ignore_index=True)
chargeCounts = pd.DataFrame({'num': charges.value_counts(), 'perc':charges.value_counts(normalize=True)})

print ('There are {} charges (not fixed for errors)'.format(chargeCounts.size))

chargeCounts.reset_index(inplace=True, drop=False)
chargeCounts.rename(columns={'index': 'charge'}, inplace=True)

There are 1064 charges (not fixed for errors)


In [16]:
chargeCounts.head().style.bar(subset=['num', 'perc'])

Unnamed: 0,charge,num,perc
0,speeding,13918,0.128081
1,public intoxication,10633,0.0978503
2,paula,6290,0.0578838
3,no proof of insurance,4562,0.0419819
4,no seatbelt/improper seatbelt,4230,0.0389266


In [17]:
# save that ish
chargeCounts.to_csv((processDir + 'charge-counts-icpd.csv'), index=False)
chargeCounts.to_pickle((processDir + 'charge-counts-icpd.pickle'))

In [18]:
chargeGroups = pd.read_csv((processDir + 'charge-groups-icpd.csv'))
chargeGroups.head()

Unnamed: 0,charge,subcat,num,perc
0,speeding,speeding,13918,0.128081
1,public intoxication,public intoxication,10633,0.09785
2,paula,paula,6290,0.057884
3,no proof of insurance,insurance,4562,0.041982
4,no seatbelt/improper seatbelt,seatbelt,4230,0.038927


In [30]:
chargeGroups[['subcat', 'num']].groupby('subcat').agg('sum')\
    .sort_values('num', ascending=False)\
    .style.bar(subset=['num'])

Unnamed: 0_level_0,num
subcat,Unnamed: 1_level_1
speeding,17831
disobey traffic,12621
public intoxication,11803
DL,10241
paula,6290
operating,5715
possession,5302
insurance,4562
seatbelt,4230
theft,3975


### Addresses

In [13]:
arrDF[arrDF['name'] == 'MILLS, ALEXANDER D']

Unnamed: 0,name,dob,homeAddr,homeCity,timestamp,incDate,incAddr,incAct,charge
27604,"MILLS, ALEXANDER D",1991-04-19,215 E PRENTISS,IOWA CITY,2011006000.0,2011-02-11,215 E PRENTISS ST,On View,Unlawful Use Authentic Dl/id Of Another
27605,"MILLS, ALEXANDER D",1991-04-19,215 E PRENTISS,IOWA CITY,2011006000.0,2011-02-11,215 E PRENTISS ST,On View,Disorderly House
27606,"MILLS, ALEXANDER D",1991-04-19,215 E PRENTISS,IOWA CITY,2011006000.0,2011-02-11,215 E PRENTISS ST,On View,Public Intoxication
36224,"MILLS, ALEXANDER D",1991-04-19,215 E PRENTISS,IOWA CITY,2011014000.0,2011-03-30,400 S GILBERT ST,On View,Public Intoxication
