In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import math
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn import tree, linear_model
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import explained_variance_score

In [36]:
# Load every raw table and tag each frame with a short label right after it is
# read, so later cells can print which table they are reporting on.
# low_memory=False makes pandas read crime.csv in one pass instead of in chunks.
crime = pd.read_csv("./data/crime.csv", low_memory=False)
crime.name = "crime"

police = pd.read_csv("./data/police_sta.csv")
police.name = "police"

fire = pd.read_csv("./data/fire_dept.csv")
fire.name = "fire"

community = pd.read_csv("./data/community_centers.csv")
community.name = "community"

parks = pd.read_csv("./data/parks.csv")
parks.name = "parks"

# Keep all tables in one list so they can be inspected in a single loop.
datasets = [crime, police, fire, community, parks]

In [50]:
# For each table, print its label, then a per-column flag telling whether that
# column contains at least one missing value, then a blank separator line.
for frame in datasets:
    print(frame.name, frame.isnull().any(), "", sep="\n")

crime
INCIDENT_NUMBER        False
OFFENSE_CODE           False
OFFENSE_CODE_GROUP     False
OFFENSE_DESCRIPTION    False
DISTRICT                True
REPORTING_AREA         False
SHOOTING                True
OCCURRED_ON_DATE       False
YEAR                   False
MONTH                  False
DAY_OF_WEEK            False
HOUR                   False
UCR_PART                True
STREET                  True
Lat                     True
Long                    True
Location               False
dtype: bool

police
X               False
Y               False
OBJECTID        False
BLDG_ID         False
BID             False
ADDRESS         False
POINT_X         False
POINT_Y         False
NAME            False
NEIGHBORHOOD    False
CITY            False
ZIP             False
FT_SQFT          True
STORY_HT         True
PARCEL_ID       False
dtype: bool

fire
X             False
Y             False
OBJECTID_1    False
OBJECTID       True
BFD_ID         True
MAP_ID         True
MAPCODE      

In [62]:
### crime ###
# UCR_PART is Uniform Crime Reporting, and attempts to lump crimes into categories.
# Can't live without Lat/Long, so rows missing either coordinate are dropped.
# dropna() returns a new frame unless inplace=True, so a bare call would throw
# the result away and leave `crime` untouched. Dropping in place keeps `crime`
# the same object that the `datasets` list references and keeps its `.name`
# label; re-running the cell is harmless because no such rows remain.
crime.dropna(subset=["Lat", "Long"], inplace=True)

# My guess is that SHOOTING only started being recorded late into the dataset.
# This check only confirms that at least one row has a non-null SHOOTING value.
crime["SHOOTING"].notnull().any()

True

In [76]:
### police ###

# Per-column missing-value flags for the police stations table.
# The columns that do contain nulls are not needed for this analysis.
police.isna().any()

X               False
Y               False
OBJECTID        False
BLDG_ID         False
BID             False
ADDRESS         False
POINT_X         False
POINT_Y         False
NAME            False
NEIGHBORHOOD    False
CITY            False
ZIP             False
FT_SQFT          True
STORY_HT         True
PARCEL_ID       False
dtype: bool

In [78]:
### fire ###

# Per-column missing-value flags for the fire departments table.
# Many columns have nulls, but the only columns that matter here are X/Y
# (XCOORD and YCOORD are the same, but unsigned).
fire.isna().any()

X             False
Y             False
OBJECTID_1    False
OBJECTID       True
BFD_ID         True
MAP_ID         True
MAPCODE        True
LOCCODE        True
LOCDEPT        True
LOCNAME        True
LOCCONTACT     True
LOCPHONE       True
LOCSTNO        True
LOCADDR       False
LOCOWNER       True
LOCWARD        True
LOCPARCL       True
LOCPRECT       True
LOCPLAN        True
STUDY          True
ABOVE          True
ABOVE_DESC     True
SOURCE         True
GEOADDRESS    False
PD            False
CT90           True
XCOORD         True
YCOORD         True
dtype: bool

In [79]:
### community ###

# Per-column missing-value flags for the community centers table.
# The columns that do contain nulls are not needed for this analysis.
community.isna().any()

X           False
Y           False
FID         False
OBJECTID     True
SITE        False
PHONE       False
FAX          True
STREET      False
NEIGH       False
ZIP         False
dtype: bool

In [80]:
### parks ###

# Per-column missing-value flags for the parks table.
# The columns that do contain nulls are not needed for this analysis.
parks.isna().any()

OBJECTID            False
SITE_NAME           False
OWNERSHIP           False
PROTECTION           True
TYPECODE            False
DISTRICT            False
ACRES               False
ADDRESS             False
ZonAgg              False
TypeLong            False
OS_Own_Jur           True
OS_Mngmnt            True
POS                 False
PA                  False
ALT_NAME             True
AgncyJuris           True
Shape_STArea__      False
Shape_STLength__    False
ShapeSTArea         False
ShapeSTLength       False
PARK_ID              True
REGION              False
dtype: bool