In [4]:
import pandas as pd
import urllib.request

# Where the raw data comes from and where the raw / cleaned copies are stored.
DATA_URL = "https://data.vbgov.com/api/views/7gep-fpmg/rows.csv?accessType=DOWNLOAD"
RAW_DATA_FILENAME = "Police_Calls_for_Service.csv"
CLEAN_DATA_FILENAME = "Clean_Data.csv"

# Columns removed from the data before it is saved.
# Comment an entry out to keep that column.
dropped_features = [
    "Incident Number",
    "Report Number",
    "Call Type",
    "Zone",
    "Case Disposition",
    "Priority",
    "Subdivision",
    # "Call Date/Time",
    "Entry Date/Time",
    "Dispatch Date/Time",
    "En Route Date/Time",
    "On Scene Date/Time",
    "Close Date/Time",
    # "Location",
]

# Columns that get One Hot Encoded.
# Uncomment an entry to encode that column (it must not also be dropped above).
cat_features = [
    # "Incident Number",
    # "Report Number",
    # "Call Type",
    # "Zone",
    # "Case Disposition",
    # "Priority",
    # "Subdivision",
    # "Call Date/Time",
    # "Entry Date/Time",
    # "Dispatch Date/Time",
    # "En Route Date/Time",
    # "On Scene Date/Time",
    # "Close Date/Time",
    # "Location",
]

# A column cannot be both removed and encoded; warn if the two lists overlap.
if set(dropped_features).intersection(cat_features):
    print("Warning: some features are tagged to be both dropped and One Hot Encoded")

In [5]:
# Load the raw data, downloading the datafile first if it is not on disk.
try:
    data = pd.read_csv(RAW_DATA_FILENAME)
except FileNotFoundError:
    # Only a missing file triggers a download. Any other failure (parse error,
    # permission problem, interrupt) is left to propagate instead of being
    # hidden behind a re-download.
    print(RAW_DATA_FILENAME + " not found. Downloading file...")
    urllib.request.urlretrieve(DATA_URL, RAW_DATA_FILENAME)
    data = pd.read_csv(RAW_DATA_FILENAME)
    print("File Downloaded")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Split "Call Date/Time" into separate day / month / hour / year columns.
# Skipped when the column is configured to be dropped, and also when it is no
# longer present (e.g. this cell already ran), so re-running the cell is safe.
if "Call Date/Time" not in dropped_features and "Call Date/Time" in data.columns:
    print("Separating Call Date/Time into Date, Month, and Time")

    # Work on the text form of the timestamp. The pieces are taken purely by
    # position: "<month>/<day>/<year> <hour>:..." and are kept as strings.
    call_stamp = data["Call Date/Time"].astype(str)
    slash_parts = call_stamp.str.split("/", n=3)

    call_month = slash_parts.str[0]
    call_day = slash_parts.str[1]
    call_year = slash_parts.str[2].str.split(" ").str[0]
    # NOTE(review): only the leading hour field is kept; if the raw timestamps
    # carry an AM/PM suffix it is discarded here -- confirm against the CSV.
    call_hour = call_stamp.str.split(" ").str[1].str.split(":").str[0]

    # A value that does not have the expected shape (e.g. a missing timestamp)
    # yields NaN in the affected columns instead of raising an IndexError.
    data = data.drop(columns=["Call Date/Time"])

    data['Call_Date'] = call_day
    data['Call_Month'] = call_month
    data['Call_Time'] = call_hour
    data['Call_Year'] = call_year

Separating Call Date/Time into Date, Month, and Time


In [7]:
# Drop the features we're not using
print("Dropping unused features")
data = data.drop(dropped_features, axis=1)

# Drop rows containing incomplete data
# Note: Different features indicate missing values differently in this
# dataset. These only check for missing values for the features that we
# have used in this project.
print("Dropping rows with incomplete data")

# Boolean mask, True for every row that carries a "missing value" marker in
# one of the columns that were kept. Built column-wise instead of row by row.
incomplete = pd.Series(False, index=data.index)

if "Case Disposition" not in dropped_features:
    incomplete |= data["Case Disposition"] == "No Report"

if "Call Date/Time" not in dropped_features:
    # Call_Year is produced by string splitting, so it holds text such as
    # "1899"; comparing it to the integer 1899 would never match. Compare as
    # text so the placeholder year is actually filtered out.
    incomplete |= data["Call_Year"].astype(str) == "1899"

if "Priority" not in dropped_features:
    incomplete |= data["Priority"] == "E"

if "Zone" not in dropped_features:
    incomplete |= data["Zone"] == "UNK"

# Remove the flagged rows, then any row that still has a NaN in any column.
data = data.loc[~incomplete].dropna()

Dropping unused features
Dropping rows with incomplete data


In [8]:
# One Hot Encode the configured categorical columns, then write the cleaned
# dataset to disk without the index column.
print("Performing One Hot Encoding")
data = pd.get_dummies(
    data=data,
    columns=cat_features,
)
data.to_csv(
    path_or_buf=CLEAN_DATA_FILENAME,
    index=False,
)
print("Saved clean data to", CLEAN_DATA_FILENAME)

Performing One Hot Encoding
Saved clean data to Clean_Data.csv
