In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from geopy.geocoders import Nominatim
from geopy.distance import vincenty
import googlemaps
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss

import cPickle as pickle

import re

In [4]:
crime2016 = pd.read_pickle('../final_dataset/crime2016_complete.pkl')

In [32]:
crime2016.dropna(inplace=True)

In [33]:
crime2016.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 115622 entries, 0 to 117200
Data columns (total 70 columns):
incidntnum                                                                                                                          115622 non-null int64
category                                                                                                                            115622 non-null object
descript                                                                                                                            115622 non-null object
dayofweek                                                                                                                           115622 non-null object
date                                                                                                                                115622 non-null object
time                                                                                                               

## only look at crimes (drop non-criminal)

In [43]:
crime_only = crime2016[crime2016['category'] != 'non-criminal']

In [44]:
crime_only['new_resolution'].value_counts()

0    69598
1    32017
Name: new_resolution, dtype: int64

## features X and target variable y

In [45]:
crime_only.drop(['incidntnum', 'date', 'time', 'address_x', 'x', 'y', 'location',\
                 'address_y', 'new_time', 'year', 'resolution', 'pdid'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [46]:
features = [i for i in crime_only.columns if i != 'new_resolution']

In [47]:
features

['category',
 'descript',
 'dayofweek',
 'pddistrict',
 'neighborhood',
 'hour',
 'day',
 'month',
 'morning',
 'afternoon',
 'night',
 'zipcode',
 'sex and age - total population',
 'sex and age - total population - male',
 'sex and age - total population - female',
 'sex and age - under 5 years',
 'sex and age - 5 to 9 years',
 'sex and age - 10 to 14 years',
 'sex and age - 15 to 19 years',
 'sex and age - 20 to 24 years',
 'sex and age - 25 to 34 years',
 'sex and age - 35 to 44 years',
 'sex and age - 45 to 54 years',
 'sex and age - 55 to 59 years',
 'sex and age - 60 to 64 years',
 'sex and age - 65 to 74 years',
 'sex and age - 75 to 84 years',
 'sex and age - 85 years and over',
 'sex and age - median age (years)',
 'sex and age - 18 years and over',
 'sex and age - 21 years and over',
 'sex and age - 62 years and over',
 'sex and age - 65 years and over',
 'sex and age - 18 years and over.1',
 'sex and age - 18 years and over - male',
 'sex and age - 18 years and over - femal

In [48]:
y = crime_only['new_resolution']

In [49]:
y.value_counts()

0    69598
1    32017
Name: new_resolution, dtype: int64

### standardize demographic columns

In [58]:
crime_only[features[12:]] = (crime_only[features[12:]] - crime_only[features[12:]].mean())\
                            / (crime_only[features[12:]].max() - crime_only[features[12:]].min())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [60]:
crime_only.head()

Unnamed: 0,category,descript,dayofweek,pddistrict,neighborhood,hour,day,month,morning,afternoon,...,race - race alone or in combination with one or more other races - total population - white,race - race alone or in combination with one or more other races - total population - black or african american,race - race alone or in combination with one or more other races - total population - american indian and alaska native,race - race alone or in combination with one or more other races - total population - asian,race - race alone or in combination with one or more other races - total population - native hawaiian and other pacific islander,race - race alone or in combination with one or more other races - total population - some other race,hispanic or latino and race - total population - hispanic or latino (of any race),median_income,median_home_value,median_rent
0,assault,battery,friday,northern,pacific heights,8,26,8,1,0,...,0.044363,-0.211685,-0.430032,-0.212324,-0.207004,-0.240539,-0.203746,0.314049,0.493659,0.367514
1,warrants,warrant arrest,friday,northern,pacific heights,8,26,8,1,0,...,0.044363,-0.211685,-0.430032,-0.212324,-0.207004,-0.240539,-0.203746,0.314049,0.493659,0.367514
3,other offenses,"drivers license, suspended or revoked",saturday,northern,pacific heights,2,23,7,0,0,...,0.044363,-0.211685,-0.430032,-0.212324,-0.207004,-0.240539,-0.203746,0.314049,0.493659,0.367514
4,larceny/theft,grand theft from locked auto,thursday,northern,pacific heights,12,5,5,0,1,...,0.044363,-0.211685,-0.430032,-0.212324,-0.207004,-0.240539,-0.203746,0.314049,0.493659,0.367514
5,larceny/theft,grand theft from locked auto,thursday,northern,pacific heights,1,15,9,0,0,...,0.044363,-0.211685,-0.430032,-0.212324,-0.207004,-0.240539,-0.203746,0.314049,0.493659,0.367514


## make dummy variables for the categorical columns

In [63]:
crime_only[features[:12]]

Unnamed: 0,category,descript,dayofweek,pddistrict,neighborhood,hour,day,month,morning,afternoon,night,zipcode
0,assault,battery,friday,northern,pacific heights,8,26,8,1,0,0,94123
1,warrants,warrant arrest,friday,northern,pacific heights,8,26,8,1,0,0,94123
3,other offenses,"drivers license, suspended or revoked",saturday,northern,pacific heights,2,23,7,0,0,1,94123
4,larceny/theft,grand theft from locked auto,thursday,northern,pacific heights,12,5,5,0,1,0,94123
5,larceny/theft,grand theft from locked auto,thursday,northern,pacific heights,1,15,9,0,0,1,94123
6,missing person,found person,tuesday,bayview,mission bay,8,14,6,1,0,0,94158
7,assault,threats against life,friday,bayview,mission bay,10,8,1,1,0,0,94158
8,extortion,attempted extortion,friday,bayview,mission bay,10,8,1,1,0,0,94158
11,missing person,missing adult,thursday,bayview,mission bay,7,7,1,1,0,0,94158
17,missing person,missing adult,tuesday,bayview,mission bay,17,9,2,0,1,0,94158


## Decision Tree Classifier

In [50]:
from sklearn.tree import DecisionTreeClassifier

In [64]:
dtc = DecisionTreeClassifier()