In this project, I'm using Boston Crime data (district, date, reporting area, etc.) to predict the nature of a crime (its offense code group).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

crime = pd.read_csv("crime.csv", encoding = "ISO-8859-1")
crime = crime.set_index('INCIDENT_NUMBER')
print("Dataset dimensions: " + str(crime.shape))
categorical = crime.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i + ": "  + str(column.nunique()))

crime['SHOOTING'] = crime['SHOOTING'].notnull()
#crime.isna().any()
print(crime.dtypes)
crime.head()

Dataset dimensions: (319073, 16)
OFFENSE_CODE_GROUP: 67
OFFENSE_DESCRIPTION: 244
DISTRICT: 12
REPORTING_AREA: 879
SHOOTING: 1
OCCURRED_ON_DATE: 233229
DAY_OF_WEEK: 7
UCR_PART: 4
STREET: 4657
Location: 18194
OFFENSE_CODE             int64
OFFENSE_CODE_GROUP      object
OFFENSE_DESCRIPTION     object
DISTRICT                object
REPORTING_AREA          object
SHOOTING                  bool
OCCURRED_ON_DATE        object
YEAR                     int64
MONTH                    int64
DAY_OF_WEEK             object
HOUR                     int64
UCR_PART                object
STREET                  object
Lat                    float64
Long                   float64
Location                object
dtype: object


Unnamed: 0_level_0,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,False,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
I182070943,1402,Vandalism,VANDALISM,C11,347,False,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,False,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,False,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,False,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [2]:
#Create tuple from latitude & longitude
#crime['LAT_LONG'] = list(zip(crime.Lat, crime.Long))
crime = crime.dropna()
print("number of dropped NaN\'s (out of 319073): {}".format(319073 - len(crime)))

# column drops and NA drop.
X = crime.drop(['OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 
                'OFFENSE_DESCRIPTION', 'DISTRICT', 
                'REPORTING_AREA', 'STREET', 'Location', 
                'OCCURRED_ON_DATE', 'DAY_OF_WEEK', 
                'Lat', 'Long', 'UCR_PART', 'SHOOTING'], 1)


# Convert OCCURED_ON_DATE to datetime
#X['Date'] = pd.to_datetime(crime.OCCURRED_ON_DATE)

number of dropped NaN's (out of 319073): 22500


In [3]:
# Create dummies separately.
districts = pd.get_dummies(crime.DISTRICT)
streets = pd.get_dummies(crime.STREET)
days_of_week = pd.get_dummies(crime.DAY_OF_WEEK)
reporting_areas = pd.get_dummies(crime.REPORTING_AREA)
shooting = pd.get_dummies(crime.SHOOTING)


# Concat with other variables
X = pd.get_dummies(X, sparse=True)
#print(X.dtypes)
X = pd.concat([X, districts, days_of_week, reporting_areas, shooting], axis=1)

print(X.dtypes)

Y = crime.OFFENSE_CODE_GROUP

#print(X.dtypes)

YEAR         int64
MONTH        int64
HOUR         int64
A1           uint8
A15          uint8
A7           uint8
B2           uint8
B3           uint8
C11          uint8
C6           uint8
D14          uint8
D4           uint8
E13          uint8
E18          uint8
E5           uint8
Friday       uint8
Monday       uint8
Saturday     uint8
Sunday       uint8
Thursday     uint8
Tuesday      uint8
Wednesday    uint8
             uint8
000          uint8
1            uint8
10           uint8
100          uint8
101          uint8
102          uint8
103          uint8
             ...  
939          uint8
94           uint8
940          uint8
941          uint8
942          uint8
943          uint8
944          uint8
945          uint8
946          uint8
948          uint8
949          uint8
95           uint8
950          uint8
951          uint8
952          uint8
953          uint8
954          uint8
956          uint8
957          uint8
958          uint8
959          uint8
96          

In [4]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 100 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [5]:
mlp.score(X,Y)

0.15710803073779475

In [44]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, Y, cv=5)



array([0.07769474, 0.08204428, 0.07722529, 0.08025847, 0.07905532])

In [57]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
cross_val_score(rfc, X, Y, cv=5)


  from numpy.core.umath_tests import inner1d


array([0.05167306, 0.04856267, 0.04343501, 0.03008216, 0.02726337])