# Cleaning Geo-coding Data
The purpose of this notebook/script is to clean the geo-code data points associated with each shooting incident that is documented in Newsroom DB.

In [1]:
import pandas as pd
import numpy as np

In [2]:
#reading .csv file. You can replace shootings_1.csv with any other files that you are interested in analyzing
shootings = pd.read_csv('./shootings_1.csv') 

In [3]:
'''
I tend to find that type "date and time" provides programmers with an object oriented accessability
to days, month, and year of a date and time object. That's the main reason why I use it here.
'''
from datetime import datetime
shootings['Date']=shootings['Date'].replace(np.nan,'') #replacing empty NaN values with empty strings
def crap_to_dt(d):
    if(d==''):
        return (d)
    else:
        return (datetime.strptime(str(d) ,'%Y-%m-%d').date()) #converting dates from type string to type date and time
    
shootings['Date']=shootings['Date'].apply(crap_to_dt)
shootings['Date']= pd.to_datetime(shootings['Date'],errors='coerce') #apply pandas date and time function for future indexing

In [4]:
shootings = shootings[shootings['Date'].dt.year>2011] #pulling all the data from 2012 onwards.

In [5]:
geoDF = shootings[['Date','Shooting Location','Geocode Override','Link']]
geoDF.head(3)

Unnamed: 0,Date,Shooting Location,Geocode Override,Link
60,2012-01-04,"2256 N. Lockwood Avenue, Chicago, IL","(41.922035,-87.758888)",
61,2012-01-05,"3622 W. Lawrence Avenue, Chicago, IL","(41.968612,-87.719245)",
62,2012-01-05,"548 N. Monticello Avenue, Chicago, IL","(41.891433,-87.717758)",


In [6]:
'''
Although this is weird, but empty cells need to be replaced and filled by another value. In this case it's "Foo" (without quotations)
This step is necessary as some entries in the shootings file have no geo-code.
'''
geoDF['Geocode Override']=geoDF['Geocode Override'].replace(np.nan,'FOO')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [7]:
geoDF['Geocode Override'].isnull().any() #checking to see if we still have any empty cells?

False

In [8]:
long = []
lat = []

def cleanGeo (geocode):
    if(str(geocode).startswith('(')):
        lat_x,long_y = str(geocode).replace('(','').replace(')','').split(',')
        lat.append(float(lat_x))
        long.append(float(long_y))
    else:
        long.append('')
        lat.append('')
geoDF['Geocode Override'].apply(cleanGeo)

60       None
61       None
62       None
63       None
64       None
65       None
66       None
67       None
68       None
69       None
70       None
71       None
72       None
73       None
74       None
75       None
76       None
77       None
78       None
79       None
80       None
81       None
82       None
83       None
84       None
85       None
86       None
87       None
88       None
89       None
         ... 
16589    None
16590    None
16591    None
16592    None
16593    None
16594    None
16595    None
16596    None
16597    None
16598    None
16599    None
16600    None
16601    None
16602    None
16603    None
16604    None
16605    None
16606    None
16607    None
16608    None
16609    None
16610    None
16611    None
16612    None
16613    None
16614    None
16615    None
16616    None
16617    None
16618    None
Name: Geocode Override, dtype: object

In [9]:
'''
Checking if lat and long have the same length. This makes sense, because geo-code data always come in pairs.
'''
print (len(lat))
print (len(long))

16105
16105


In [10]:
'''
adding the cleaned latitude (lat) and longtitude (long) values to a new data frame called geoDF.
'''
geoDF['lat'] = lat
geoDF['long'] = long

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [20]:
geoDF #here's a view of geoDF.

Unnamed: 0,Date,Shooting Location,Geocode Override,Link,lat,long
60,2012-01-04,"2256 N. Lockwood Avenue, Chicago, IL","(41.922035,-87.758888)",,41.922,-87.7589
61,2012-01-05,"3622 W. Lawrence Avenue, Chicago, IL","(41.968612,-87.719245)",,41.9686,-87.7192
62,2012-01-05,"548 N. Monticello Avenue, Chicago, IL","(41.891433,-87.717758)",,41.8914,-87.7178
63,2012-01-05,"8500 S. Lafayette Avenue, Chicago, IL","(41.7398218810558,-87.6255959272385)",,41.7398,-87.6256
64,2012-01-07,"7100 S. University Avenue, Chicago, IL","(41.76583,-87.597396)",,41.7658,-87.5974
65,2012-01-10,"712 S. Central Park Avenue, Chicago, IL","(41.8722492456436,-87.715652436018)",,41.8722,-87.7157
66,2012-01-10,"1165 N. Milwaukee Avenue, Chicago, IL","(41.901546,-87.663116)",,41.9015,-87.6631
67,2012-01-12,"6135 S. Cottage Grove Avenue, Chicago, IL","(41.7833708971739,-87.605921253562)",,41.7834,-87.6059
68,2012-01-16,"6714 S. Cottage Grove Avenue, Chicago, IL","(41.772567,-87.606155)",,41.7726,-87.6062
69,2012-01-20,"315 W. Chicago Avenue, Chicago, IL","(41.8965272605419,-87.6363495737314)",,41.8965,-87.6363


In [21]:
geoDF.to_csv('geoCleaned.csv') #this is to write geoDF to a .csv file.

----------
### Per Kyle Request (06/12/2017)

In [15]:
'''
based on a request from Kyle B., here is the geo-code data for 2017 only.
'''
geoDF_2017 = geoDF[geoDF['Date'].dt.year==2017]
geoDF_2017.to_csv('geoCleaned_2017Only.csv')
