In [1]:
#Import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import argparse
import json
import pprint
import requests
import sys
import urllib
import os 
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.parse import urlencode

In [2]:
#Assign database file to a variable
db_file = "../Resources/Los_Angeles/crimedata.db"

#Connect to database file
conn = sqlite3.connect(db_file)

In [3]:
#Query the la crime data table and put into a pandas dataframe
crime_df = pd.read_sql_query("select * from sherry_la_crime_clean_data;", conn)
crime_df.head()

Unnamed: 0,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode
0,2,201,510,16.0,101.0
1,2,275,510,16.0,101.0
2,3,325,510,16.0,101.0
3,4,421,510,16.0,101.0
4,5,566,510,16.0,101.0


In [4]:
crime_df.count()

AreaID               174766
ReportingDistrict    174766
CrimeCode            174766
VictimAge            174766
PremiseCode          174766
dtype: int64

In [5]:
# Most victim age crimes are commited against
ages = crime_df.VictimAge.value_counts()
ages.head(11)

17.0    12516
16.0    12093
0.0     11503
26.0     4411
27.0     4323
25.0     4271
28.0     4179
29.0     4131
24.0     4063
30.0     3966
31.0     3960
Name: VictimAge, dtype: int64

In [6]:
area_ID = crime_df.AreaID.value_counts() 
area_ID.head(10)

3     26280
1     24183
6     20940
7     18542
2     18192
9     17306
5     16300
4     15293
8     15290
13      816
Name: AreaID, dtype: int64

In [7]:
premise_desc = crime_df.PremiseCode.value_counts() 
premise_desc.head(10)

101.0    38064
502.0    24341
501.0    22651
108.0    12778
102.0    11263
203.0     9227
122.0     6446
707.0     3320
210.0     3165
123.0     2574
Name: PremiseCode, dtype: int64

In [8]:
crime_code = crime_df.CrimeCode.value_counts() 
crime_code.head(10)

624    16820
330    14673
440    13046
510    11412
310    10286
740    10077
626     8903
230     8644
210     7344
420     6942
Name: CrimeCode, dtype: int64

In [9]:
report_district = crime_df.ReportingDistrict.value_counts() 
report_district.head(10)

645    2735
646    2247
363    1898
162    1857
636    1672
152    1502
666    1497
156    1327
111    1316
182    1184
Name: ReportingDistrict, dtype: int64

In [10]:
crime_df.dtypes

AreaID               object
ReportingDistrict    object
CrimeCode            object
VictimAge            object
PremiseCode          object
dtype: object

In [11]:
#COnvert the datatype of each column to numeric for machine learning
crime_df['AreaID'] = crime_df['AreaID'].apply(pd.to_numeric, errors='coerce')
crime_df['CrimeCode'] = crime_df['CrimeCode'].apply(pd.to_numeric, errors='coerce')
crime_df['ReportingDistrict'] = crime_df['ReportingDistrict'].apply(pd.to_numeric, errors='coerce')
crime_df['VictimAge'] = crime_df['VictimAge'].apply(pd.to_numeric, errors='coerce')
crime_df['PremiseCode'] = crime_df['PremiseCode'].apply(pd.to_numeric, errors='coerce')

crime_df = crime_df.fillna(0)
crime_df.reset_index()
crime_df.dtypes

AreaID                 int64
ReportingDistrict      int64
CrimeCode              int64
VictimAge            float64
PremiseCode          float64
dtype: object

In [12]:
# model_small_df = crime_df.loc[crime_df['CrimeCode'].isin(['646'])]
# 440 - 0.628928407174613
#3     26280
#6     20940
#7     18542
#2     18192
#9     17306
#5     16300
#4     15293
#8     15290
#13      816
model_small_df = crime_df.loc[crime_df['AreaID'].isin(['3'])]
model_small_df.head()

Unnamed: 0,AreaID,ReportingDistrict,CrimeCode,VictimAge,PremiseCode
2,3,325,510,16.0,101.0
14,3,329,510,16.0,101.0
9078,3,357,510,16.0,101.0
9154,3,396,510,16.0,101.0
9168,3,361,510,16.0,101.0


In [13]:
model_small_df.count()

AreaID               26280
ReportingDistrict    26280
CrimeCode            26280
VictimAge            26280
PremiseCode          26280
dtype: int64

In [14]:
target = model_small_df["CrimeCode"]
data = model_small_df.drop("CrimeCode", axis=1)
feature_names = model_small_df.columns
print(target.shape, data.shape)

(26280,) (26280, 4)


In [15]:
data.head()

Unnamed: 0,AreaID,ReportingDistrict,VictimAge,PremiseCode
2,3,325,16.0,101.0
14,3,329,16.0,101.0
9078,3,357,16.0,101.0
9154,3,396,16.0,101.0
9168,3,361,16.0,101.0


In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [17]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')

In [18]:
 # Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] gamma=0.0001, C=1 ...............................................


In [None]:
# List the best parameters for this dataset
print(grid.best_params_)

In [None]:
# List the best score
print(grid.best_score_)

In [None]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["blue", "red"]))