In [1]:
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#loading all the csvs into a list
df_list = []

#loading crime data
for year in range(2010, 2019):
    df = pd.read_csv('../Data/{} Crime Log.csv'.format(year))
    #combining IncidentFromDate and IncidentFromTime into a single datetime index
    df['time'] = df['IncidentFromDate'] + ' ' + df['IncidentFromTime']
    df['time'] = pd.to_datetime(df['time'])
    df = df.dropna(subset=['time']).drop_duplicates('time').set_index('time')
    df_list.append(df)
        

#concatenating into single DataFrame
crime_data = pd.concat(df_list).sort_index()

crime_data.head()

Unnamed: 0_level_0,OCANumber,IncidentFromDate,IncidentFromTime,IncidentToDate,IncidentToTime,OffenseCode,Offense Description,CaseStatus,CaseDisposition,LocationCode,PatrolZone,LocationLandmark,LocationStreetNumber,LocationDirectional,LocationStreet,LocationLatitude,LocationLongitude,CreatedSource
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-01 01:36:00,10010002,01/01/2010,01:36:00,,,7399,Miscellaneous Offenses,Closed/Cleared,,NONCAM,Z4,,,,,,,1.0
2010-01-01 01:55:00,10010001,01/01/2010,01:55:00,01/01/2010,02:58:00,7399,Miscellaneous Offenses,Closed/Cleared,,ONCAMRES,Z1,Center Street Apartments North 132 Z1,939.0,,"Hemphill Avenue, NW",,,1.0
2010-01-01 02:00:00,10010004,01/01/2010,02:00:00,01/01/2010,04:00:00,2308,Larceny – From Building,Inactive,,NONCAM,Z2,Sigma Nu Fraternity 325 Z2,750.0,,"Fowler Street, NW",,,1.0
2010-01-01 12:00:00,10010032,01/01/2010,12:00:00,01/08/2010,15:20:00,2204,Burglary – No Forced Entry – Residence,Closed/Cleared,Unfounded,ONCAMRES,Z2,North Avenue Apartments North Building,120.0,,"North Avenue, NW",,,1.0
2010-01-02 06:15:00,10010005,01/02/2010,06:15:00,,,7399,Miscellaneous Offenses,Closed/Cleared,,ONCAM,Z2,North Avenue Apartments,120.0,,"North Avenue, NW",,,1.0


In [4]:
#renaming and removing extraneous columns
c = crime_data
c = c.iloc[:, [5, 6, 8, 9, 10, 11, 12, 14, 15, 16]]
c.columns = ['code', 'description', 'disposition', 'location', 'patrol_zone', 'landmark', 'street_number', 'street', 'lat', 'long']
c.head()

Unnamed: 0_level_0,code,description,disposition,location,patrol_zone,landmark,street_number,street,lat,long
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-01 01:36:00,7399,Miscellaneous Offenses,,NONCAM,Z4,,,,,
2010-01-01 01:55:00,7399,Miscellaneous Offenses,,ONCAMRES,Z1,Center Street Apartments North 132 Z1,939.0,"Hemphill Avenue, NW",,
2010-01-01 02:00:00,2308,Larceny – From Building,,NONCAM,Z2,Sigma Nu Fraternity 325 Z2,750.0,"Fowler Street, NW",,
2010-01-01 12:00:00,2204,Burglary – No Forced Entry – Residence,Unfounded,ONCAMRES,Z2,North Avenue Apartments North Building,120.0,"North Avenue, NW",,
2010-01-02 06:15:00,7399,Miscellaneous Offenses,,ONCAM,Z2,North Avenue Apartments,120.0,"North Avenue, NW",,


Let's start cleaning by dropping any rows that don't have any location or description data. We'll also need to drop any records that aren't crimes. These are represented by the code 9999.

In [5]:
print("Shape before dropping any records:", c.shape)

#dropping crimes without any location information
c = c.dropna(subset=['landmark', 'lat', 'long'], how='all')
print("Shape after dropping crimes without locations:", c.shape)

#dropping records without any descriptive information
c = c.dropna(subset=['code', 'description'], how='all')
print("Shape after dropping crimes without code or description:", c.shape)

#dropping non=crimes
c = c[c['code'] != '9999']
print("Shape after dropping non-crimes:", c.shape)

Shape before dropping any records: (19077, 10)
Shape after dropping crimes without locations: (18088, 10)
Shape after dropping crimes without code or description: (18058, 10)
Shape after dropping non-crimes: (11642, 10)


Now let's start cleaning individual columns

In [6]:
c.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11642 entries, 2010-01-01 01:55:00 to 2018-12-01 09:00:00
Data columns (total 10 columns):
code             11642 non-null object
description      11140 non-null object
disposition      5383 non-null object
location         11622 non-null object
patrol_zone      11392 non-null object
landmark         11494 non-null object
street_number    8421 non-null float64
street           8523 non-null object
lat              3122 non-null object
long             3124 non-null float64
dtypes: float64(2), object(8)
memory usage: 1000.5+ KB


### Some possible things to look into cleaning:
- street number isn't an integer
- lat isn't a float
- code isn't an integer
- many null lat/long coordinates

We can start by converting street number and lat to the right type

In [7]:
#filling null values in lat, long, and street_number with 0
c[['street_number', 'lat', 'long']] = c[['street_number', 'lat', 'long']].fillna(0)

#converting street_number to int
c['street_number'] = c['street_number'].astype(int)

#converting lat to float
c['lat'] = c['lat'].astype(float)

Now we can look into cleaning the code column. Let's look at the values it can take.

In [8]:
c['code'].unique()

array([7399.0, 2308.0, 2204.0, 2305.0, 2404.0, 2317.0, 2902.0, 5707.0,
       3562.0, 2605.0, 4104.0, 2399.0, 2303.0, 5309.0, 5404.0, 4199.0,
       2999.0, 5308.0, 1316.0, 2406.0, 5311.0, 2901.0, 2203.0, 1315.0,
       1313.0, 1399.0, 1199.0, 2314.0, 3564.0, 1205.0, 4803.0, 1299.0,
       4899.0, 3605.0, 2604.0, 2304.0, 2205.0, 2202.0, 5399.0, 2903.0,
       5403.0, 4004.0, 2307.0, 2434.0, 7345.0, 5299.0, 1103.0, 3599.0,
       5314.0, 2408.0, 2502.0, 2301.0, 2699.0, 2302.0, 1207.0, 9999.0,
       5006.0, 4802.0, 3512.0, 3699.0, 3707.0, 5799.0, 1206.0, 5211.0,
       2606.0, 2804.0, 6201.0, 8399.0, 2599.0, 5212.0, 2701.0, 1203.0,
       3502.0, 3532.0, 1204.0, 3611.0, 5215.0, 3571.0, '5404', '7399',
       '2308', '2399', '5707', '2303', '2304', '4104', '1313', '2902',
       '4199', '2999', '2605', '3562', '3550', '2903', '2305', '2901',
       '1316', '2699', '2317', '2206', '4802', '5309', '8399', '1399',
       '2803', '1204', '9999F', '5311', '2604', '1315', '2204', '5399',
     

Many of these codes are duplicates, only with an extra 0 at the end due to being converted to a float instead of an int.

We can merge them properly by converting to an int, after assigning a unique integer to the codes with alphabetical characters (i.e. 9999W, 9999F, 9999V, 9999T, 9999N, 9999CTW, and 7399R)

In [10]:
#Making a dictionary to map codes with alphabetical values to a unique integer
code_dict = {
    '9999W': '99991',
    '9999F': '99992',
    '9999V': '99993',
    '9999T': '99994',
    '9999N': '99995',
    '9999CTW': '99996',
    '7399R': '73991'
}

#Mapping only the records with codes in the dictionary
c.loc[c.code.isin(code_dict.keys()), 'code'] = c['code'].map(code_dict)
#converting to int
c['code'] = c['code'].astype(int)

c['code'].unique()

array([ 7399,  2308,  2204,  2305,  2404,  2317,  2902,  5707,  3562,
        2605,  4104,  2399,  2303,  5309,  5404,  4199,  2999,  5308,
        1316,  2406,  5311,  2901,  2203,  1315,  1313,  1399,  1199,
        2314,  3564,  1205,  4803,  1299,  4899,  3605,  2604,  2304,
        2205,  2202,  5399,  2903,  5403,  4004,  2307,  2434,  7345,
        5299,  1103,  3599,  5314,  2408,  2502,  2301,  2699,  2302,
        1207,  9999,  5006,  4802,  3512,  3699,  3707,  5799,  1206,
        5211,  2606,  2804,  6201,  8399,  2599,  5212,  2701,  1203,
        3502,  3532,  1204,  3611,  5215,  3571,  3550,  2206,  2803,
       99992,  5499,  2424, 99991,  2099,  2602,  4105,  5203,  1102,
        3572,  4101,  5202,  3561,  7901,  2501,  5312,  2199,  5213,
        3513,  3504,  3805,  6399,  3899,  1314,  2589,  3542,  3522,
        3799,  1209,  3560,  1104,  4801,  3533, 99994, 99995,  2310,
       99993, 73991,  1116,  5313, 99996], dtype=int64)

Now that the code column is clean, we can look into filling some of the empty coordinates. I will fill rows that have a landmark but no coordinate data with the coordinate data from other rows with the same landmark.

Let's look at the landmark column first.

In [11]:
c['landmark'].value_counts().head(10)

North Avenue Apartments                238
Campus Recreation Center               191
Beringause Building   46   Z1          188
Student Center                         178
Campus Recreation Center   122   Z4    151
Student Center   104   Z3              138
Bobby Dodd Stadium                      96
Beringause Building                     85
Barnes and Noble Bookstore              83
Barnes and Noble Bookstore 172 Z2       67
Name: landmark, dtype: int64

Many of the landmarks are the same, but some have the associated zone patrol info added to the end. Let's get rid of this zone patrol info to merge the values with the same landmark. Notice how the values with the added zone info have the pattern:

some spaces - a number - some spaces - zone

We can look for this pattern with regular expression. 

In [12]:
#filling null values with blank strings
c['landmark'] = c['landmark'].fillna('')
#truncating the matched pattern from values with zone info
c['landmark'] = c['landmark'].str.replace("\s+\d+\s+Z\d", '')

c['landmark'].value_counts().head(10)

Campus Recreation Center                  373
Student Center                            317
North Avenue Apartments                   282
Beringause Building                       273
Barnes and Noble Bookstore                166
Bobby Dodd Stadium                        160
                                          148
Glenn Residence Hall                      123
Georgia Tech Hotel & Conference Center    111
Technology Square Parking Deck            106
Name: landmark, dtype: int64

This data looks much more clean. Now we can use these landmarks to fill in missing lat/long coordinates

In [13]:
#creating a dataframe that links landmarks to their coordinates
landmark_coord_df = c[['landmark', 'lat', 'long']][(c.lat != 0)&(c.long != 0)]
landmark_coord_df = landmark_coord_df.drop_duplicates('landmark').set_index('landmark')

landmark_lat = landmark_coord_df['lat']
landmark_long = landmark_coord_df['long']

    
landmark_coord_df.head()

Unnamed: 0_level_0,lat,long
landmark,Unnamed: 1_level_1,Unnamed: 2_level_1
Technology Square Parking Deck,-84.388811,33.775559
Beringause Building,-84.402153,33.779417
Global Learning Center,33.776862,-84.389938
"Hemphill Avenue, NW @ Tenth Street, NW",-84.404264,33.772482
Arby's,-84.388985,33.763993


In [14]:
print("Number of missing latitudes before:", c[c.lat == 0].shape[0])
print("Number of missing longitudes before", c[c.long == 0].shape[0])

#filling missing values using fill_coord
c.loc[c.landmark.isin(landmark_coord_df.index), 'lat'] = c.landmark.map(landmark_lat)
c.loc[c.landmark.isin(landmark_coord_df.index), 'long'] = c.landmark.map(landmark_long)

print("Number of missing latitudes after:", c[c.lat == 0].shape[0])
print("Number of missing longitudes after:", c[c.long == 0].shape[0])

Number of missing latitudes before: 8520
Number of missing longitudes before 8518
Number of missing latitudes after: 3208
Number of missing longitudes after 3207


In [15]:
c.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11642 entries, 2010-01-01 01:55:00 to 2018-12-01 09:00:00
Data columns (total 10 columns):
code             11642 non-null int32
description      11140 non-null object
disposition      5383 non-null object
location         11622 non-null object
patrol_zone      11392 non-null object
landmark         11642 non-null object
street_number    11642 non-null int32
street           8523 non-null object
lat              11642 non-null float64
long             11642 non-null float64
dtypes: float64(2), int32(2), object(6)
memory usage: 1.2+ MB
