In [1]:
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
import pandas as pd
import os

def load_police_data():
    csv_path = os.path.join(os.path.join("police_data"), "crimes_2012_to_2017.csv")
    return pd.read_csv(csv_path, index_col=0)

In [3]:
police_data = load_police_data()

In [4]:
police_data = police_data.loc[:, ~police_data.columns.str.contains('^Unnamed')]
police_data = police_data.drop('Location', 1)
police_data = police_data.drop('ID', 1)
police_data = police_data.drop('Case Number', 1)
police_data = police_data.drop('Block', 1)
police_data = police_data.drop('Domestic', 1)
police_data = police_data.drop('Beat', 1)
police_data = police_data.drop('FBI Code', 1)
police_data = police_data.drop('X Coordinate', 1)
police_data = police_data.drop('Y Coordinate', 1)
police_data = police_data.drop('Updated On', 1)
police_data = police_data.drop('IUCR', 1)

  police_data = police_data.drop('Location', 1)
  police_data = police_data.drop('ID', 1)
  police_data = police_data.drop('Case Number', 1)
  police_data = police_data.drop('Block', 1)
  police_data = police_data.drop('Domestic', 1)
  police_data = police_data.drop('Beat', 1)
  police_data = police_data.drop('FBI Code', 1)
  police_data = police_data.drop('X Coordinate', 1)
  police_data = police_data.drop('Y Coordinate', 1)
  police_data = police_data.drop('Updated On', 1)
  police_data = police_data.drop('IUCR', 1)


In [5]:
police_data

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,District,Ward,Community Area,Year,Latitude,Longitude
3,05/03/2016 11:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,24.0,29.0,2016,41.864073,-87.706819
89,05/03/2016 09:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,3.0,20.0,42.0,2016,41.782922,-87.604363
197,05/03/2016 11:31:00 PM,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,15.0,37.0,25.0,2016,41.894908,-87.758372
673,05/03/2016 10:10:00 PM,BATTERY,SIMPLE,SIDEWALK,False,15.0,28.0,25.0,2016,41.885687,-87.749516
911,05/03/2016 10:00:00 PM,THEFT,$500 AND UNDER,RESIDENCE,False,15.0,28.0,25.0,2016,41.886297,-87.761751
...,...,...,...,...,...,...,...,...,...,...,...
6250330,05/03/2016 11:33:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,28.0,30.0,2016,41.849222,-87.691556
6251089,05/03/2016 11:30:00 PM,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,True,7.0,17.0,69.0,2016,41.760744,-87.633335
6251349,05/03/2016 12:15:00 AM,BATTERY,AGGRAVATED: HANDGUN,SIDEWALK,False,8.0,15.0,66.0,2016,41.779235,-87.685207
6253257,05/03/2016 09:07:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,4.0,7.0,46.0,2016,41.745252,-87.552773


In [6]:
police_data= police_data.dropna()
police_data

Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,District,Ward,Community Area,Year,Latitude,Longitude
3,05/03/2016 11:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,24.0,29.0,2016,41.864073,-87.706819
89,05/03/2016 09:40:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,3.0,20.0,42.0,2016,41.782922,-87.604363
197,05/03/2016 11:31:00 PM,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,15.0,37.0,25.0,2016,41.894908,-87.758372
673,05/03/2016 10:10:00 PM,BATTERY,SIMPLE,SIDEWALK,False,15.0,28.0,25.0,2016,41.885687,-87.749516
911,05/03/2016 10:00:00 PM,THEFT,$500 AND UNDER,RESIDENCE,False,15.0,28.0,25.0,2016,41.886297,-87.761751
...,...,...,...,...,...,...,...,...,...,...,...
6250330,05/03/2016 11:33:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,28.0,30.0,2016,41.849222,-87.691556
6251089,05/03/2016 11:30:00 PM,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,True,7.0,17.0,69.0,2016,41.760744,-87.633335
6251349,05/03/2016 12:15:00 AM,BATTERY,AGGRAVATED: HANDGUN,SIDEWALK,False,8.0,15.0,66.0,2016,41.779235,-87.685207
6253257,05/03/2016 09:07:00 PM,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,4.0,7.0,46.0,2016,41.745252,-87.552773


In [7]:
from datetime import datetime

# Time Conversion Function
def time_convert(date_time):
    s1 = date_time[:11]
    s2 = date_time[11:]
    
    month = s1[:2]
    date = s1[3:5]
    year = s1[6:10]
    
    hr = s2[:2]
    mins = s2[3:5]
    sec = s2[6:8]
    time_frame = s2[9:]
    if(time_frame == 'PM'):
        if (int(hr) != 12):
            hr = str(int(hr) + 12)
    else:
        if(int(hr) == 12):
            hr = '00'
    
    final_date = datetime(int(year), int(month), int(date), int(hr), int(mins), int(sec))
    return final_date



# Using apply() of pandas to apply time_convert on every row of the Date column
police_data['Date'] = police_data['Date'].apply(time_convert)
police_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  police_data['Date'] = police_data['Date'].apply(time_convert)


Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,District,Ward,Community Area,Year,Latitude,Longitude
3,2016-05-03 23:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,24.0,29.0,2016,41.864073,-87.706819
89,2016-05-03 21:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,3.0,20.0,42.0,2016,41.782922,-87.604363
197,2016-05-03 23:31:00,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,15.0,37.0,25.0,2016,41.894908,-87.758372
673,2016-05-03 22:10:00,BATTERY,SIMPLE,SIDEWALK,False,15.0,28.0,25.0,2016,41.885687,-87.749516
911,2016-05-03 22:00:00,THEFT,$500 AND UNDER,RESIDENCE,False,15.0,28.0,25.0,2016,41.886297,-87.761751
...,...,...,...,...,...,...,...,...,...,...,...
6250330,2016-05-03 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,28.0,30.0,2016,41.849222,-87.691556
6251089,2016-05-03 23:30:00,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,True,7.0,17.0,69.0,2016,41.760744,-87.633335
6251349,2016-05-03 00:15:00,BATTERY,AGGRAVATED: HANDGUN,SIDEWALK,False,8.0,15.0,66.0,2016,41.779235,-87.685207
6253257,2016-05-03 21:07:00,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,4.0,7.0,46.0,2016,41.745252,-87.552773


In [8]:

def month(x):
    return x.strftime("%-m")

police_data['Month'] = police_data['Date'].apply(month)

police_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  police_data['Month'] = police_data['Date'].apply(month)


Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,District,Ward,Community Area,Year,Latitude,Longitude,Month
3,2016-05-03 23:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,24.0,29.0,2016,41.864073,-87.706819,5
89,2016-05-03 21:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,3.0,20.0,42.0,2016,41.782922,-87.604363,5
197,2016-05-03 23:31:00,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,15.0,37.0,25.0,2016,41.894908,-87.758372,5
673,2016-05-03 22:10:00,BATTERY,SIMPLE,SIDEWALK,False,15.0,28.0,25.0,2016,41.885687,-87.749516,5
911,2016-05-03 22:00:00,THEFT,$500 AND UNDER,RESIDENCE,False,15.0,28.0,25.0,2016,41.886297,-87.761751,5
...,...,...,...,...,...,...,...,...,...,...,...,...
6250330,2016-05-03 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,28.0,30.0,2016,41.849222,-87.691556,5
6251089,2016-05-03 23:30:00,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,True,7.0,17.0,69.0,2016,41.760744,-87.633335,5
6251349,2016-05-03 00:15:00,BATTERY,AGGRAVATED: HANDGUN,SIDEWALK,False,8.0,15.0,66.0,2016,41.779235,-87.685207,5
6253257,2016-05-03 21:07:00,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,4.0,7.0,46.0,2016,41.745252,-87.552773,5


In [9]:


def hour(x):
    return x.strftime("%H")

def day(x):
    return x.strftime("%d")

police_data['Hour_Day'] = police_data['Date'].apply(hour)
police_data['Day'] = police_data['Date'].apply(day)

police_data




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  police_data['Hour_Day'] = police_data['Date'].apply(hour)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  police_data['Day'] = police_data['Date'].apply(day)


Unnamed: 0,Date,Primary Type,Description,Location Description,Arrest,District,Ward,Community Area,Year,Latitude,Longitude,Month,Hour_Day,Day
3,2016-05-03 23:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,24.0,29.0,2016,41.864073,-87.706819,5,23,03
89,2016-05-03 21:40:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,3.0,20.0,42.0,2016,41.782922,-87.604363,5,21,03
197,2016-05-03 23:31:00,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,STREET,False,15.0,37.0,25.0,2016,41.894908,-87.758372,5,23,03
673,2016-05-03 22:10:00,BATTERY,SIMPLE,SIDEWALK,False,15.0,28.0,25.0,2016,41.885687,-87.749516,5,22,03
911,2016-05-03 22:00:00,THEFT,$500 AND UNDER,RESIDENCE,False,15.0,28.0,25.0,2016,41.886297,-87.761751,5,22,03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250330,2016-05-03 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,10.0,28.0,30.0,2016,41.849222,-87.691556,5,23,03
6251089,2016-05-03 23:30:00,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,True,7.0,17.0,69.0,2016,41.760744,-87.633335,5,23,03
6251349,2016-05-03 00:15:00,BATTERY,AGGRAVATED: HANDGUN,SIDEWALK,False,8.0,15.0,66.0,2016,41.779235,-87.685207,5,00,03
6253257,2016-05-03 21:07:00,BATTERY,DOMESTIC BATTERY SIMPLE,SIDEWALK,False,4.0,7.0,46.0,2016,41.745252,-87.552773,5,21,03


In [16]:


police_data = police_data[police_data["Year"] == 2016]

X = police_data[['Year', 'Month', 'Day', 'District']]
y = police_data['Primary Type']

from sklearn.model_selection import train_test_split
seed = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

police_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 250732 entries, 3 to 6253474
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   Date                  250732 non-null  datetime64[ns]
 1   Primary Type          250732 non-null  object        
 2   Description           250732 non-null  object        
 3   Location Description  250732 non-null  object        
 4   Arrest                250732 non-null  bool          
 5   District              250732 non-null  float64       
 6   Ward                  250732 non-null  float64       
 7   Community Area        250732 non-null  float64       
 8   Year                  250732 non-null  int64         
 9   Latitude              250732 non-null  float64       
 10  Longitude             250732 non-null  float64       
 11  Month                 250732 non-null  object        
 12  Hour_Day              250732 non-null  object        
 13

In [17]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

clf = OneVsRestClassifier(AdaBoostClassifier())
clf.fit(X_train, y_train)

In [18]:


results = cross_val_score(clf, X, y, cv=3)





In [19]:
print(("Accuracy: %.3f%% (%.3f%%)") % (results.mean()*100.0, results.std()*100.0))


Accuracy: 15.319% (6.944%)
