In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import math
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn import tree, linear_model
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import explained_variance_score

In [125]:
crime = pd.read_csv("./data/crime.csv", low_memory=False).add_prefix("crime_")
police = pd.read_csv("./data/police_sta.csv").add_prefix("police_")
fire = pd.read_csv("./data/fire_dept.csv").add_prefix("fire_")
community = pd.read_csv("./data/community_centers.csv").add_prefix("community_")
parks = pd.read_csv("./data/parks.csv").add_prefix("parks_")
weather = pd.read_csv("./data/weather.csv").add_prefix("weather_")

crime.name = "crime"
police.name = "police"
fire.name = "fire"
community.name = "community"
parks.name = "parks"
weather.name = "weather"

datasets = [crime, police, fire, community, parks, weather]

In [127]:
for df in datasets:
    # Some of the datasets have Lat/Long as Y/X
    # They also have longcoord and latcoord, but they're unsigned, so dropping them
    df.columns = df.columns.str.replace('X', 'long')
    df.columns = df.columns.str.replace('Y', 'lat')
    df.drop(columns~=["coord"])
    
    # Makes typing easier
    df.columns = [x.lower() for x in df.columns]
    
    print(dataset.isnull().any())
    print()

SyntaxError: invalid syntax (<ipython-input-127-5e6d01153e1a>, line 6)

In [114]:
### crime ###
# UCR_PART is Uniform Crime Reporting, and attempts to lump crimes into categories
# Can't live without Lat/Long, so we'll have to drop those
crime.dropna(subset=["crime_Lat", "crime_Long"], inplace=True)

# My guess is that they only started recording this late into the dataset
crime["crime_SHOOTING"].notnull().any()

KeyError: ['Lat', 'Long']

In [100]:
### police ###

# None of the nulls here matter
police.isnull().any()

X               False
Y               False
OBJECTID        False
BLDG_ID         False
BID             False
ADDRESS         False
POINT_X         False
POINT_Y         False
NAME            False
NEIGHBORHOOD    False
CITY            False
ZIP             False
FT_SQFT          True
STORY_HT         True
PARCEL_ID       False
dtype: bool

In [88]:
### fire ###

# Lots of nulls here, but only ones that actually matter are X/Y (XCOORD and YCOORD are the same, but unsigned)
fire.isnull().any()

X             False
Y             False
OBJECTID_1    False
OBJECTID       True
BFD_ID         True
MAP_ID         True
MAPCODE        True
LOCCODE        True
LOCDEPT        True
LOCNAME        True
LOCCONTACT     True
LOCPHONE       True
LOCSTNO        True
LOCADDR       False
LOCOWNER       True
LOCWARD        True
LOCPARCL       True
LOCPRECT       True
LOCPLAN        True
STUDY          True
ABOVE          True
ABOVE_DESC     True
SOURCE         True
GEOADDRESS    False
PD            False
CT90           True
XCOORD         True
YCOORD         True
dtype: bool

In [89]:
### community ###

# None of the nulls here matter
community.isnull().any()

X           False
Y           False
FID         False
OBJECTID     True
SITE        False
PHONE       False
FAX          True
STREET      False
NEIGH       False
ZIP         False
dtype: bool

In [90]:
### parks ###

# None of the nulls here matter
parks.isnull().any()

OBJECTID            False
SITE_NAME           False
OWNERSHIP           False
PROTECTION           True
TYPECODE            False
DISTRICT            False
ACRES               False
ADDRESS             False
ZonAgg              False
TypeLong            False
OS_Own_Jur           True
OS_Mngmnt            True
POS                 False
PA                  False
ALT_NAME             True
AgncyJuris           True
Shape_STArea__      False
Shape_STLength__    False
ShapeSTArea         False
ShapeSTLength       False
PARK_ID              True
REGION              False
dtype: bool

In [91]:
### weather ###

# No nulls, hooray
weather.isnull().any()

Year                         False
Month                        False
Day                          False
High Temp (F)                False
Avg Temp (F)                 False
Low Temp (F)                 False
High Dew Point (F)           False
Avg Dew Point (F)            False
Low Dew Point (F)            False
High Humidity (%)            False
Avg Humidity (%)             False
Low Humidity (%)             False
High Sea Level Press (in)    False
Avg Sea Level Press (in)     False
Low Sea Level Press (in)     False
High Visibility (mi)         False
Avg Visibility (mi)          False
Low Visibility (mi)          False
High Wind (mph)              False
Avg Wind (mph)               False
High Wind Gust (mph)         False
Snowfall (in)                False
Precip (in)                  False
Events                       False
dtype: bool

In [103]:
police.columns.tolist()

['X',
 'Y',
 'OBJECTID',
 'BLDG_ID',
 'BID',
 'ADDRESS',
 'POINT_X',
 'POINT_Y',
 'NAME',
 'NEIGHBORHOOD',
 'CITY',
 'ZIP',
 'FT_SQFT',
 'STORY_HT',
 'PARCEL_ID']

In [None]:
features = data.iloc[:,3:].columns.tolist()
target = data.iloc[:,2].name
correlations = {}
for f in features:
    data_temp = data[[f,target]]
    x1 = data_temp[f].values
    x2 = data_temp[target].values
    key = f + ' vs ' + target
    correlations[key] = pearsonr(x1,x2)[0]
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]