In [7]:
# Dependencies and Setup
import os
import pandas as pd
import numpy as np

# Raw data files, one CSV per reporting year (bare file names, so they are
# resolved against the notebook's working directory)
file_2014, file_2015, file_2016 = (
    os.path.join(csv_name)
    for csv_name in (
        'austin_crime.csv',
        'Annual_Crime_Dataset_2015.csv',
        '2016_Annual_Crime_Data.csv',
    )
)

# Load each yearly crime report into its own pandas DataFrame
crime_data_2014, crime_data_2015, crime_data_2016 = (
    pd.read_csv(csv_path) for csv_path in (file_2014, file_2015, file_2016)
)

In [8]:
# Row counts of the three raw files, printed one per line with a dashed rule between them
print(
    len(crime_data_2014),
    len(crime_data_2015),
    len(crime_data_2016),
    sep="\n-------------\n",
)

159464
-------------
38573
-------------
37461


In [9]:
# Clean the 2014 crime frame: drop unnecessary columns, drop incomplete rows,
# keep only 2014 records, and put the columns in the shared order.

# Columns of the raw 2014 file that the cleaned output does not keep
unused_columns_2014 = [
    "latitude",
    "location",
    "longitude",
    "census_tract",
    "unique_key",
    "zipcode",
    "clearance_date",
    "council_district_code",
    "description",
    "location_description",
    "address",
]

# read_csv already returned a DataFrame, so the columns can be dropped in a
# single call; dropna() then removes every row that still has at least one
# missing value in the remaining columns
crime_data_2014 = crime_data_2014.drop(columns=unused_columns_2014).dropna()

# Extract only 2014 data: keep the rows whose year is 2014 rather than removing
# just the 2015 rows, so a record from any other year cannot slip through
crime_data_2014 = crime_data_2014[crime_data_2014["year"] == 2014]

# Reorder columns; .copy() hands the following cells an independent frame to
# assign into (original row labels are preserved)
crime_data_2014 = crime_data_2014[
    ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate', 'year']
].copy()
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,2014-04-04 00:00:00,Cleared by Arrest,B,3121345.0,10082705.0,2014.0
9,Auto Theft,2014-04-04 00:00:00,Not cleared,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,2014-05-08 00:00:00,Not cleared,H,3125886.0,10047276.0,2014.0
17,Auto Theft,2014-12-18 00:00:00,Not cleared,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,2014-04-16 00:00:00,Not cleared,B,3117183.0,10103211.0,2014.0


In [10]:
# Reduce the timestamp to its month field: characters 5-6 of the text form
# (e.g. "2014-04-04 00:00:00" -> "04")
# NOTE(review): the 2015/2016 cells slice [2:5] and their outputs show "Jan",
# so the month representation differs between years — confirm this is intended.
crime_data_2014['timestamp'] = crime_data_2014['timestamp'].astype(str).str[5:7]
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,4,Cleared by Arrest,B,3121345.0,10082705.0,2014.0
9,Auto Theft,4,Not cleared,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,5,Not cleared,H,3125886.0,10047276.0,2014.0
17,Auto Theft,12,Not cleared,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,4,Not cleared,B,3117183.0,10103211.0,2014.0


In [11]:
# Abbreviate clearance status to its first character
# (e.g. "Cleared by Arrest" -> "C", "Not cleared" -> "N")
crime_data_2014['clearance_status'] = crime_data_2014['clearance_status'].astype(str).str[:1]
crime_data_2014.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
4,Theft: Shoplifting,4,C,B,3121345.0,10082705.0,2014.0
9,Auto Theft,4,N,C,3118304.0,10072414.0,2014.0
14,Theft: All Other Larceny,5,N,H,3125886.0,10047276.0,2014.0
17,Auto Theft,12,N,C,3125976.0,10072207.0,2014.0
20,Theft: All Other Larceny,4,N,B,3117183.0,10103211.0,2014.0


In [12]:
# Number of rows left in the cleaned 2014 frame
len(crime_data_2014.index)

32996

In [13]:
# Trim the 2015 and 2016 frames: drop unnecessary columns, then drop incomplete rows.

# Both annual files are trimmed with the same list of column names
annual_unused_columns = [
    "GO Primary Key",
    "GO Highest Offense Desc",
    "Clearance Date",
    "GO Location Zip",
    "GO Location",
    "GO Census Tract",
    "Council District",
]

# Drop the unused columns, then every row that has a missing value in any
# of the columns that remain
crime_data_2015 = crime_data_2015.drop(columns=annual_unused_columns).dropna()
crime_data_2016 = crime_data_2016.drop(columns=annual_unused_columns).dropna()

In [14]:
# Rename columns for merge (assigned by position, so the six remaining columns
# must already be in this order)
crime_data_2015.columns = ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate']

# Add year column
crime_data_2015['year'] = 2015.0

# Edit timestamp column to show only the month abbreviation.
# The previous fixed slice [2:5] is only correct when exactly two characters
# precede the month (e.g. "1-Jan-15"); a two-digit day such as "10-Jan-15"
# would come out as "-Ja". Take the first run of three letters instead, and
# fall back to the old slice for any value without one so those rows keep
# their previous result.
# NOTE(review): the raw date format is not visible in this notebook — confirm.
report_dates_2015 = crime_data_2015['timestamp'].astype(str)
crime_data_2015['timestamp'] = (
    report_dates_2015.str.extract(r'([A-Za-z]{3})', expand=False)
    .fillna(report_dates_2015.str[2:5])
)
crime_data_2015.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
0,Robbery,Jan,N,E,3130483.0,10102366.0,2015.0
1,Robbery,Jan,N,I,3124730.0,10090296.0,2015.0
2,Burglary,Jan,N,E,3135985.0,10117220.0,2015.0
3,Burglary,Jan,N,I,3129896.0,10096032.0,2015.0
4,Burglary,Jan,N,F,3110455.0,10039340.0,2015.0


In [15]:
# Rename columns for merge (assigned by position, so the six remaining columns
# must already be in this order)
crime_data_2016.columns = ['primary_type', 'timestamp', 'clearance_status', 'district', 'x_coordinate', 'y_coordinate']

# Add year column
crime_data_2016["year"] = 2016.0

# Edit timestamp column to show only the month abbreviation.
# The previous fixed slice [2:5] is only correct when exactly two characters
# precede the month (e.g. "1-Jan-16"); a two-digit day such as "10-Jan-16"
# would come out as "-Ja". Take the first run of three letters instead, and
# fall back to the old slice for any value without one so those rows keep
# their previous result.
# NOTE(review): the raw date format is not visible in this notebook — confirm.
report_dates_2016 = crime_data_2016['timestamp'].astype(str)
crime_data_2016['timestamp'] = (
    report_dates_2016.str.extract(r'([A-Za-z]{3})', expand=False)
    .fillna(report_dates_2016.str[2:5])
)
crime_data_2016.head()

Unnamed: 0,primary_type,timestamp,clearance_status,district,x_coordinate,y_coordinate,year
0,Agg Assault,Jan,C,D,3067322.0,10062796.0,2016.0
1,Theft,Jan,C,G,3114957.0,10070462.0,2016.0
2,Robbery,Jan,N,E,3129181.0,10106923.0,2016.0
3,Theft,Jan,N,G,3113643.0,10070357.0,2016.0
5,Agg Assault,Jan,N,C,3146947.0,10077985.0,2016.0


In [16]:
# Number of rows left in the cleaned 2015 frame
len(crime_data_2015.index)

36125

In [17]:
# Number of rows left in the cleaned 2016 frame
len(crime_data_2016.index)

34973

In [18]:
# Destination paths for the cleaned per-year CSV files
output_data_file_2014, output_data_file_2015, output_data_file_2016 = (
    "clean_csvs/crime_2014.csv",
    "clean_csvs/crime_2015.csv",
    "clean_csvs/crime_2016.csv",
)

In [19]:
# Write each cleaned frame to its CSV file (row index included, as before).
# to_csv does not create missing folders, so make sure the destination
# directory exists first; otherwise a fresh checkout fails on this cell.
for cleaned_frame, csv_path in (
    (crime_data_2014, output_data_file_2014),
    (crime_data_2015, output_data_file_2015),
    (crime_data_2016, output_data_file_2016),
):
    target_dir = os.path.dirname(csv_path)
    if target_dir:  # a bare file name has no directory component to create
        os.makedirs(target_dir, exist_ok=True)
    cleaned_frame.to_csv(csv_path)