# Notebook 021: Baseline Classifier Model

This notebook contains a baseline crime-type classifier model built on an initial set of property-related predictors.

In [10]:
import urllib
import os
import operator
import pathlib
import requests
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LassoCV

import statsmodels.api as sm

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [11]:
# --- root directories, all relative to the notebook's working directory ---
DATA_ROOT = '../data/'
MODEL_ROOT = '../models'
FIGURES_ROOT = '../figures/model-baseline'
WRITEDIR_ROOT = '../models/baseline'

# --- input directories derived from DATA_ROOT ---
SHAPEDIR_ROOT = os.path.join(DATA_ROOT, 'raw')
READDIR_ROOT = os.path.join(DATA_ROOT, 'processed')
FEATURES_ROOT = os.path.join(DATA_ROOT, 'processed')

# --- tabular inputs ---
readfile_model = os.path.join(READDIR_ROOT, 'crime-model-data-v1.csv')
readfile_feat_property = os.path.join(
    FEATURES_ROOT, 'property-assessment-features-2013-2019.csv'
)

# --- shapefile inputs: each relative path is joined onto SHAPEDIR_ROOT ---
(
    readfile_zipshapes,
    readfile_cityshape,
    readfile_streetshapes,
    readfile_tractshapes,
    readfile_hoodshapes,
    readfile_zonesubshapes,
    readfile_openshapes,
) = (
    os.path.join(SHAPEDIR_ROOT, relative_shp)
    for relative_shp in (
        'shapefile/zipcodes/ZIP_Codes.shp',
        'shapefile/city-boundary/City_of_Boston_Boundary.shp',
        'shapefile/street-segments/Boston_Street_Segments.shp',
        'shapefile/census-tracts/Census_2010_Tracts.shp',
        'shapefile/boston-neighborhoods/Boston_Neighborhoods.shp',
        'shapefile/zoning-subdistricts/Zoning_Subdistricts.shp',
        'shapefile/open-spaces/Open_Space.shp',
    )
)

# echo the model data, property features, tract shapefile and write directory
# paths so they are recorded in the cell output
print(
    'readfile paths for datasets used in this notebook are:\n\t{}\n\t{}\n\t{}\n\t{}'.format(
        readfile_model,
        readfile_feat_property,
        readfile_tractshapes,
        WRITEDIR_ROOT,
    )
)

readfile paths for datasets used in this notebook are:
	../data/processed/crime-model-data-v1.csv
	../data/processed/property-assessment-features-2013-2019.csv
	../data/raw/shapefile/census-tracts/Census_2010_Tracts.shp
	../models/baseline


In [12]:
# create the directory for saving figures; makedirs also creates any missing
# parent directory and exist_ok makes the cell safe to re-run
os.makedirs(FIGURES_ROOT, exist_ok=True)

# create the directory for saving model outputs, with the same guarantees
os.makedirs(WRITEDIR_ROOT, exist_ok=True)

In [13]:
# load the model dataset from the CSV path held in readfile_model
df = pd.read_csv(filepath_or_buffer=readfile_model)

In [14]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
df.info()

# rich display of the first rows as the cell's output
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151072 entries, 0 to 151071
Data columns (total 29 columns):
crime-type                           151072 non-null object
INCIDENT_NUMBER                      151072 non-null object
OFFENSE_DESCRIPTION                  151072 non-null object
timestamp                            151072 non-null object
lat                                  151072 non-null float64
lon                                  151072 non-null float64
year                                 151072 non-null int64
month                                151072 non-null int64
day-of-week                          151072 non-null object
hour                                 151072 non-null int64
ZIP5                                 151044 non-null float64
ZIP5_area                            151044 non-null float64
Name                                 151002 non-null object
Neighborhood_area                    151002 non-null float64
Neighborhood_area_2                  151002 non

Unnamed: 0,crime-type,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,timestamp,lat,lon,year,month,day-of-week,hour,...,commercial-mix-ratio,industrial-mix-ratio,owner-occupied-ratio,residential-median-value,residential-gini-coef,industrial-mix-ratio-3yr-cagr,commercial-mix-ratio-3yr-cagr,owner-occupied-ratio-3yr-cagr,residential-median-value-3yr-cagr,residential-gini-coef-3yr-cagr
0,fraud,I192078177,forgery / counterfeiting,2019-08-01 17:46:00,42.304922,-71.102981,2019,8,Thursday,17,...,0.0,0.000294,0.0,756500.0,0.0,-0.034814,0.0,0.0,0.024264,0.0
1,harassment-disturbance,I192078061,harassment,2019-06-12 21:00:00,42.355553,-71.152747,2019,6,Wednesday,21,...,0.069416,0.0,0.550355,745950.0,0.199606,0.0,-0.010424,-0.028224,0.068142,0.011166
2,theft,I192078038,larceny theft of mv parts & accessories,2019-03-10 08:00:00,42.345625,-71.041291,2019,3,Sunday,8,...,0.47813,0.000938,0.456287,538500.0,0.228793,-0.084828,0.010549,-0.014131,0.06034,0.018436
3,theft,I192078015,larceny all others,2019-07-08 10:29:00,42.339304,-71.051604,2019,7,Monday,10,...,0.0,0.0,,,,0.0,0.0,,,
4,theft,I192077997,auto theft - leased/rented vehicle,2019-04-13 08:00:00,42.328564,-71.068353,2019,4,Saturday,8,...,0.375058,0.076862,0.460751,355500.0,0.172898,0.060168,-0.007779,-0.023444,0.082123,0.025972


## Preprocess the model observations dataframe

In [18]:
# day-of-week names used as indicator column names; Monday is not in this list
weekdays_list = ['Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# placeholder list for months; currently empty
months_list = []

# number of observations per day of the week
df['day-of-week'].value_counts()

Friday       22753
Wednesday    22146
Thursday     21941
Saturday     21687
Monday       21618
Tuesday      21451
Sunday       19476
Name: day-of-week, dtype: int64

In [21]:
# one-hot-encode day of week, keeping only the columns named in weekdays_list
# (Monday is left out and so acts as the reference category).
# - reindex replaces the drop + column selection so that a weekday missing from
#   the data produces an all-zero column instead of raising a KeyError
# - dtype=int pins the indicators to 0/1 integers; the pandas default differs
#   between versions (bool from pandas 2.0 onwards)
weekday_dummies_df = (
    pd.get_dummies(df['day-of-week'], dtype=int)
    .reindex(columns=weekdays_list, fill_value=0)
)

# append the day-of-week indicator columns to df
df[weekdays_list] = weekday_dummies_df[weekdays_list]

In [22]:
# preview the first five rows of df
df.head(n=5)

Unnamed: 0,crime-type,INCIDENT_NUMBER,OFFENSE_DESCRIPTION,timestamp,lat,lon,year,month,day-of-week,hour,...,commercial-mix-ratio-3yr-cagr,owner-occupied-ratio-3yr-cagr,residential-median-value-3yr-cagr,residential-gini-coef-3yr-cagr,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,fraud,I192078177,forgery / counterfeiting,2019-08-01 17:46:00,42.304922,-71.102981,2019,8,Thursday,17,...,0.0,0.0,0.024264,0.0,0,0,1,0,0,0
1,harassment-disturbance,I192078061,harassment,2019-06-12 21:00:00,42.355553,-71.152747,2019,6,Wednesday,21,...,-0.010424,-0.028224,0.068142,0.011166,0,1,0,0,0,0
2,theft,I192078038,larceny theft of mv parts & accessories,2019-03-10 08:00:00,42.345625,-71.041291,2019,3,Sunday,8,...,0.010549,-0.014131,0.06034,0.018436,0,0,0,0,0,1
3,theft,I192078015,larceny all others,2019-07-08 10:29:00,42.339304,-71.051604,2019,7,Monday,10,...,0.0,,,,0,0,0,0,0,0
4,theft,I192077997,auto theft - leased/rented vehicle,2019-04-13 08:00:00,42.328564,-71.068353,2019,4,Saturday,8,...,-0.007779,-0.023444,0.082123,0.025972,0,0,0,0,1,0


In [None]:
# read in required data and related shapefiles
# df_crime = pd.read_csv(readfile_crime, dtype=str)
# df_offense_mapkey = pd.read_csv(readfile_crime_match_key)
# df_offense_mapkey_2 = pd.read_csv(readfile_crime_match_key_2)
# df_sam = pd.read_csv(readfile_sam, dtype=str)
# gdf_zips = gpd.read_file(readfile_zipshapes)
# gdf_boston = gpd.read_file(readfile_cityshape)
# gdf_streets = gpd.read_file(readfile_streetshapes)
# gdf_tracts = gpd.read_file(readfile_tractshapes)
# gdf_hoods = gpd.read_file(readfile_hoodshapes)
# gdf_zonesubs = gpd.read_file(readfile_zonesubshapes)
# gdf_openspace = gpd.read_file(readfile_openshapes)

# df_feat_prop = pd.read_csv(readfile_feat_property, dtype={'shape-id': str, 'fiscal-year': int})

# # convert SAM data lat/lon values to floats
# df_sam[['X', 'Y']] = df_sam[['X', 'Y']].astype(float)
# # convert crime data lat/lon values to floats
# df_crime[['Lat', 'Long']] = df_crime[['Lat', 'Long']].astype(float)