# **Filter & Merging**

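Combine the data sources and trim the columns we don't need. Three sources were planned (ICPD citations, ICPD arrests, and the UI police docket), but the citations steps are currently commented out, so only the arrests and the docket are merged below.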

In [1]:
# Libraries

from postal.expand import expand_address
import pandas as pd
import pickle

In [2]:
# Directory layout, relative to this notebook.
# Both agencies use the same sub-folder name under the input and output roots.
icFolder = 'iowa-city-police/'
uiFolder = 'ui-police/'

# input roots
dataDir = '../../data/external/'
icDir = dataDir + icFolder
uiDir = dataDir + uiFolder

# output roots
processDir = '../../data/processed/'
chargeDir = processDir + 'charges/'
icOut = processDir + icFolder
uiOut = processDir + uiFolder

In [None]:
# Load the previously processed frames from the output folders.
#citeDF = pd.read_pickle((icOut + 'icpd-cites.pickle'))
arrPicklePath = icOut + 'arrests-icpd.pickle'
dockPicklePath = uiOut + 'ui-police-docket.pickle'

arrDF = pd.read_pickle(arrPicklePath)
dockDF = pd.read_pickle(dockPicklePath)

In [8]:
# Charge labels: one CSV per source, indexed by the raw charge text.
icLabelPath = chargeDir + 'ic-charges.csv'
uiLabelPath = chargeDir + 'ui-charges.csv'

arrLabels = pd.read_csv(icLabelPath, index_col='charge')
dockLabels = pd.read_csv(uiLabelPath, index_col='charge')

# Plain dicts mapping charge -> category, used for the lookups further down.
arrDict = arrLabels['category'].to_dict()
dockDict = dockLabels['category'].to_dict()

### Columns, Variables

In [None]:
#citeDF.columns

In [None]:
# Show the column labels of the arrests frame.
arrDF.columns

In [None]:
# Show the column labels of the docket frame.
dockDF.columns

We want to keep the following variables, and we'll work our way towards this schema:

```
Arrest Record
---------------------------------
    name (last, first middle)
    age
    date/timestamp (unix)
    incAddr
    charge
```

In [None]:
# Name
#citeDF['name'] = citeDF['lName'] + ', ' + citeDF['fName']

In [None]:
# dob parsing
#citeDF.dropna(subset=['dob'], inplace=True)
#citeDF['dob'] = pd.to_datetime(citeDF.dob, infer_datetime_format=True, exact=False, errors='coerce')

In [3]:
# Age in years, 365 days in a year
def calcAge(dob, inc):
    """Return the age in whole years between a birth date and an incident date.

    A year is approximated as 365 days (leap days are ignored), and the
    absolute difference is used, so argument order does not affect the result.

    Parameters
    ----------
    dob : pd.Timestamp
        Date of birth.
    inc : pd.Timestamp
        Date of the incident.

    Returns
    -------
    int or None
        Whole years between the two dates, or None when either argument is
        not a valid (non-NaT) pandas Timestamp.
    """
    # Validate BOTH arguments. The previous check, `type(dob) or type(inc)`,
    # always evaluated to `type(dob)` and so never looked at `inc`. It also
    # used `pd.tslib`, which is no longer available in pandas.
    for value in (dob, inc):
        if not isinstance(value, pd.Timestamp) or pd.isnull(value):
            return None

    td = abs((inc - dob).days)
    return int(td // 365)

In [None]:
#citeDF['age'] = citeDF.apply(lambda x: calcAge(x.dob, x.incDate), axis=1)
#citeDF.head(2)

Now we move on to the Iowa City Police Department arrests. We only need:

- [x] Age calculation
- [ ] Timestamp to UTC calc

In [5]:
# Age at the time of the incident, computed row by row from dob and incDate.
arrDF['age'] = arrDF.apply(
    lambda row: calcAge(row['dob'], row['incDate']),
    axis=1,
)

In [10]:
# Look up the category of every arrest charge.
# A charge that is missing from arrDict raises KeyError here.
arrDF['category'] = arrDF['charge'].apply(lambda charge: arrDict[charge])

Next, the University of Iowa Police docket. We rename its address column to `incFormAddr` so that the same column list can be used when we filter the columns below.

In [11]:
# Rename the docket's address column to the name used in the slim column list.
addrRename = {'incAddr': 'incFormAddr'}
dockDF.rename(columns=addrRename, inplace=True)

In [16]:
# add groups
# Label each docket charge with its category; a charge that is not in the
# lookup gets an empty string. dict.get replaces dict.has_key, which was
# removed in Python 3, and gives the same result on Python 2.
dockDF['category'] = dockDF.charge.apply(lambda x: dockDict.get(x, ''))

KeyError: 'CITY,PUBLIC URINATION & DEFECATION'

---

Now we filter the columns so that we have a slimmer, normalized dataset. We'll also add a `source` column recording which dataset each record came from.

In [17]:
# Tag every row with a one-letter code for the frame it came from:
# 'a' for arrDF, 'd' for dockDF ('c' was used for the disabled citeDF step).
# citeDF['source'] = 'c'
dockDF['source'] = 'd'
arrDF['source'] = 'a'

In [22]:
# Columns kept in the slim, combined dataset.
slimCols = [
    'name', 'age', 'incDate', 'timestamp',
    'category', 'charge', 'incFormAddr', 'source',
]

# Stack the arrests on top of the docket, keeping only the slim columns.
# combDF = pd.concat([citeDF[slimCols], arrDF[slimCols], dockDF[slimCols]])
slimFrames = [arrDF[slimCols], dockDF[slimCols]]
combDF = pd.concat(slimFrames)

In [23]:
# The charge is what we ultimately analyse, so rows without one are removed.
# Row counts before and after are printed to show how many were dropped.

before = combDF.shape[0]
print('{} total records'.format(before))

combDF.dropna(subset=['charge'], inplace=True)

remaining = combDF.shape[0]
report = '{} after dropping those without a charge,\n{} deleted'
print(report.format(remaining, before - remaining))

86406 total records
86406 after dropping those without a charge,
0 deleted


In [24]:
# Write the combined frame to the processed folder as a pickle and as a CSV.
# The CSV is written without the index column.
picklePath = processDir + 'combined-activity.pickle'
csvPath = processDir + 'combined-activity.csv'

combDF.to_pickle(picklePath)
combDF.to_csv(csvPath, index=False)

In [26]:
# Display the last rows of the combined frame.
combDF.tail()

Unnamed: 0,name,age,incDate,timestamp,category,charge,incFormAddr,source
27437,"SCHREINER, SCOTT ANDREW",22.0,1994-07-07,1994-07-07 02:12:00,PUBLIC INTOX,PUBLIC INTOX,van allen loading dock iowa city iowa,d
27438,"HENRY, BURL LYNN",31.0,1994-06-29,1994-06-29 00:45:00,ASSAULT,ASSAULT C/SER INJURY,lindquist building iowa city iowa,d
27439,"HENRY, BURL LYNN",31.0,1994-06-29,1994-06-29 00:45:00,CRIMINAL MISCHIEF,CRIMINAL MISCHIEF 2ND DEGREE,lindquist building iowa city iowa,d
27440,"HENRY, BURL LYNN",31.0,1994-06-29,1994-06-29 00:45:00,ASSAULT,ASSAULT C/SER INJURY,lindquist building iowa city iowa,d
27441,"AYLSWORTH,",41.0,1994-06-28,1994-06-28 18:00:00,THEFT,THEFT 5TH - BY DECEPTION,hospital ramp interstate iowa city iowa,d
