In [1]:
# Dependencies and Setup
import os
import pandas as pd
import numpy as np

# Raw data files, given as bare file names (resolved relative to the working directory)
file_2014 = 'austin_crime.csv'
file_2015 = 'Annual_Crime_Dataset_2015.csv'
file_2016 = '2016_Annual_Crime_Data.csv'

# Read each crime report file into its own pandas data frame
crime_data_2014, crime_data_2015, crime_data_2016 = (
    pd.read_csv(raw_path) for raw_path in (file_2014, file_2015, file_2016)
)

In [2]:
# Show the row count of each raw frame, one per line, with a dashed rule between them
raw_row_counts = [len(frame) for frame in (crime_data_2014, crime_data_2015, crime_data_2016)]
rule = "-------------"
print(*raw_row_counts, sep="\n" + rule + "\n")

159464
-------------
38573
-------------
37461


In [3]:
# Drop the unnecessary columns from the 2014 frame in a single drop() call.
# (read_csv already returned a DataFrame, so no pd.DataFrame() wrap is needed.)
unused_columns_2014 = [
    "latitude",
    "location",
    "longitude",
    "census_tract",
    "unique_key",
    "zipcode",
    "clearance_date",
    "council_district_code",
    "description",
    "location_description",
    "address",
]
crime_data_2014 = crime_data_2014.drop(unused_columns_2014, axis=1)

# Drop every row that has a missing value in any of the remaining columns
crime_data_2014 = crime_data_2014.dropna()

# Extract only 2014 data. Selecting year == 2014 directly (instead of removing
# just the 2015 rows) guarantees that no other year can slip through.
crime_data_2014 = crime_data_2014[crime_data_2014["year"] == 2014]

# Reorder columns. The explicit .copy() makes the result an independent frame,
# so the column assignments in later cells never act on a view of the old one.
crime_data_2014 = crime_data_2014[
    ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate', 'year']
].copy()
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,2014-04-04 00:00:00,Cleared by Arrest,B,3121345.0,10082705.0,2014.0
9,Auto Theft,2014-04-04 00:00:00,Not cleared,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,2014-05-08 00:00:00,Not cleared,H,3125886.0,10047276.0,2014.0
17,Auto Theft,2014-12-18 00:00:00,Not cleared,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,2014-04-16 00:00:00,Not cleared,B,3117183.0,10103211.0,2014.0


In [4]:
# Reduce each timestamp to characters 5-6 of its string form (the month digits)
timestamp_text = crime_data_2014['timestamp'].astype(str)
crime_data_2014['timestamp'] = timestamp_text.str.slice(5, 7)
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,4,Cleared by Arrest,B,3121345.0,10082705.0,2014.0
9,Auto Theft,4,Not cleared,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,5,Not cleared,H,3125886.0,10047276.0,2014.0
17,Auto Theft,12,Not cleared,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,4,Not cleared,B,3117183.0,10103211.0,2014.0


In [5]:
# Abbreviate clearance status to the first character of its string form
status_text = crime_data_2014['clearance_status'].astype(str)
crime_data_2014['clearance_status'] = status_text.str.slice(0, 1)
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,4,C,B,3121345.0,10082705.0,2014.0
9,Auto Theft,4,N,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,5,N,H,3125886.0,10047276.0,2014.0
17,Auto Theft,12,N,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,4,N,B,3117183.0,10103211.0,2014.0


In [6]:
# Add a working column that starts out as a duplicate of primary_type
crime_data_2014 = crime_data_2014.assign(primary_type_cleaned=crime_data_2014['primary_type'])

In [7]:
import re
# Collapse every "Theft: <subtype>" label (case-insensitive) into plain "Theft".
regex_pat = re.compile(r'^Theft:.*$', flags=re.IGNORECASE)

# Assign the replaced values back explicitly. Calling replace(inplace=True) on a
# column selection is chained in-place mutation, which pandas does not guarantee
# will write through to the parent frame.
crime_data_2014["primary_type"] = crime_data_2014["primary_type_cleaned"].replace(
    regex_pat, "Theft", regex=True
)

# The working column has served its purpose; remove it again
crime_data_2014 = crime_data_2014.drop("primary_type_cleaned", axis=1)
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft,4,C,B,3121345.0,10082705.0,2014.0
9,Auto Theft,4,N,C,3118304.0,10072414.0,2014.0
14,Theft,5,N,H,3125886.0,10047276.0,2014.0
17,Auto Theft,12,N,C,3125976.0,10072207.0,2014.0
20,Theft,4,N,B,3117183.0,10103211.0,2014.0


In [8]:
# Tally the 2014 rows by district value
district_counts_2014 = crime_data_2014["district"].value_counts()
district_counts_2014

D     4701
E     4437
F     3974
A     3835
I     3745
B     3673
H     3387
G     2603
C     2531
AP     110
Name: district, dtype: int64

In [9]:
# Tally the 2014 rows by month value held in the timestamp column
month_counts_2014 = crime_data_2014["timestamp"].value_counts()
month_counts_2014

07    3000
08    2942
10    2915
05    2822
03    2761
09    2756
01    2750
06    2711
04    2667
12    2642
11    2602
02    2428
Name: timestamp, dtype: int64

In [10]:
# Tally the 2014 rows by primary_type after the theft labels were collapsed
primary_type_counts_2014 = crime_data_2014["primary_type"].value_counts()
primary_type_counts_2014

Theft                                           28143
Auto Theft                                       2162
Aggravated Assault                               1827
Robbery                                           839
Homicide: Murder & Nonnegligent Manslaughter       25
Name: primary_type, dtype: int64

In [11]:
# Number of rows left in the cleaned 2014 set
crime_data_2014.shape[0]

32996

In [12]:
# Remove the unnecessary columns from the 2015 and 2016 frames.
# Both files use the same header names, so one list serves both.
unused_columns = [
    "GO Primary Key",
    "GO Highest Offense Desc",
    "Clearance Date",
    "GO Location Zip",
    "GO Location",
    "GO Census Tract",
    "Council District",
]
crime_data_2015 = crime_data_2015.drop(unused_columns, axis=1)
crime_data_2016 = crime_data_2016.drop(unused_columns, axis=1)

# Discard every row that has a missing value in any remaining column
crime_data_2015 = crime_data_2015.dropna()
crime_data_2016 = crime_data_2016.dropna()

In [13]:
# Give the 2015 columns the same names as the 2014 frame so the sets can be merged.
# The assignment is positional: it relies on the remaining columns being in this order.
merge_columns_2015 = ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate']
crime_data_2015.columns = merge_columns_2015

# Tag every row with its year, stored as a float like the 2014 frame's year column
crime_data_2015 = crime_data_2015.assign(year=2015.0)

In [14]:
# Edit timestamp column to show only month: keep the second '-'-separated field.
# This is a vectorised replacement for the former iterrows()/set_value() loop;
# DataFrame.set_value is deprecated and no longer exists in current pandas.
# A value without a '-' now becomes NaN instead of raising IndexError.
crime_data_2015['timestamp'] = crime_data_2015['timestamp'].str.split('-').str[1]

  """


In [15]:
# Tally the 2015 rows by primary_type
primary_type_counts_2015 = crime_data_2015["primary_type"].value_counts()
primary_type_counts_2015

Theft          26624
Burglary        4846
Auto Theft      1982
Agg Assault     1829
Robbery          826
Murder            18
Name: primary_type, dtype: int64

In [16]:
# Discard rows whose district is the "UK" code (presumably "unknown" - TODO confirm),
# then tally what is left by district
is_uk_district = crime_data_2015["district"] == "UK"
crime_data_2015 = crime_data_2015[~is_uk_district]
crime_data_2015["district"].value_counts()

D     5226
E     4719
B     4691
F     4422
I     4118
A     3950
H     3508
C     2747
G     2577
AP     166
Name: district, dtype: int64

In [17]:
# Tally the 2015 rows by month value held in the timestamp column
month_counts_2015 = crime_data_2015["timestamp"].value_counts()
month_counts_2015

Jul    3311
Aug    3153
May    3129
Jun    3060
Apr    3019
Mar    3016
Dec    2973
Sep    2961
Jan    2949
Nov    2908
Oct    2887
Feb    2758
Name: timestamp, dtype: int64

In [18]:
# Give the 2016 columns the same names as the 2014 frame so the sets can be merged.
# The assignment is positional: it relies on the remaining columns being in this order.
merge_columns_2016 = ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate']
crime_data_2016.columns = merge_columns_2016

# Tag every row with its year, stored as a float like the 2014 frame's year column
crime_data_2016 = crime_data_2016.assign(year=2016.0)

In [19]:
# Edit timestamp column to show only month: keep the second '-'-separated field.
# This is a vectorised replacement for the former iterrows()/set_value() loop;
# DataFrame.set_value is deprecated and no longer exists in current pandas.
# A value without a '-' now becomes NaN instead of raising IndexError.
crime_data_2016['timestamp'] = crime_data_2016['timestamp'].str.split('-').str[1]

  """


In [20]:
# Tally the 2016 rows by month value held in the timestamp column
month_counts_2016 = crime_data_2016["timestamp"].value_counts()
month_counts_2016

Jan    3069
Mar    3064
Apr    3014
Oct    3007
Jun    2975
May    2926
Sep    2881
Dec    2876
Nov    2854
Aug    2830
Jul    2801
Feb    2676
Name: timestamp, dtype: int64

In [21]:
# Discard rows whose district is one of the codes "88" or "UK",
# then tally what is left by district
excluded_districts = ["88", "UK"]
keep_row = ~crime_data_2016["district"].isin(excluded_districts)
crime_data_2016 = crime_data_2016[keep_row]
crime_data_2016["district"].value_counts()

D     5083
E     4409
F     4172
B     4159
I     4065
H     3957
A     3793
C     2653
G     2518
AP     154
Name: district, dtype: int64

In [22]:
# Tally the 2016 rows by primary_type
primary_type_counts_2016 = crime_data_2016["primary_type"].value_counts()
primary_type_counts_2016

Theft          24907
Burglary        5036
Agg Assault     2090
Auto Theft      1989
Robbery          911
Murder            30
Name: primary_type, dtype: int64

In [23]:
# Number of rows left in the cleaned 2015 set
crime_data_2015.shape[0]

36124

In [24]:
# Number of rows left in the cleaned 2016 set
crime_data_2016.shape[0]

34963

In [25]:
# Destination paths for the cleaned per-year CSV files
output_data_file_2014, output_data_file_2015, output_data_file_2016 = (
    "clean_csvs/crime_2014.csv",
    "clean_csvs/crime_2015.csv",
    "clean_csvs/crime_2016.csv",
)

In [26]:
# Write each cleaned yearly frame to its CSV file (index column included).
# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise a fresh checkout fails on the first write.
cleaned_outputs = [
    (crime_data_2014, output_data_file_2014),
    (crime_data_2015, output_data_file_2015),
    (crime_data_2016, output_data_file_2016),
]
for cleaned_frame, output_path in cleaned_outputs:
    output_dir = os.path.dirname(output_path)
    if output_dir:  # a bare file name has no directory component to create
        os.makedirs(output_dir, exist_ok=True)
    cleaned_frame.to_csv(output_path)