# Feature Engineering with Toronto Crime Data

Learning how to encode data for machine learning using numerical encoding and one-hot encoding.


In [4]:
import pandas as pd

# Load the Toronto crime CSV, then print a quick overview of it:
# the first rows, the column index, the per-column dtypes, and the shape.
crime_data = pd.read_csv("./data/toronto_crime_data.csv")
for overview in (
    crime_data.head(),
    crime_data.columns,
    crime_data.dtypes,
    crime_data.shape,
):
    print(overview)

   OBJECTID EVENT_UNIQUE_ID          REPORT_DATE             OCC_DATE  \
0         1  GO-20141637937  1/1/2014 5:00:00 AM  3/1/2012 5:00:00 AM   
1         2  GO-20141260793  1/1/2014 5:00:00 AM  1/1/2014 5:00:00 AM   
2         3  GO-20141259834  1/1/2014 5:00:00 AM  1/1/2014 5:00:00 AM   
3         4  GO-20141260264  1/1/2014 5:00:00 AM  1/1/2014 5:00:00 AM   
4         5  GO-20141260264  1/1/2014 5:00:00 AM  1/1/2014 5:00:00 AM   

   REPORT_YEAR REPORT_MONTH  REPORT_DAY  REPORT_DOY  REPORT_DOW  REPORT_HOUR  \
0         2014      January           1           1  Wednesday            16   
1         2014      January           1           1  Wednesday             3   
2         2014      January           1           1  Wednesday             0   
3         2014      January           1           1  Wednesday             1   
4         2014      January           1           1  Wednesday             1   

   ...              OFFENCE MCI_CATEGORY  HOOD_158          NEIGHBOURHOOD_158  \

In [10]:

# Columns routed to the numeric branch of the preprocessing pipeline.
# NOTE(review): UCR_CODE is stored as int64 but looks like a classification
# code rather than a measured quantity -- confirm it belongs in this group.
numeric_cols = [
    "OCC_YEAR",
    "OCC_DAY",
    "OCC_HOUR",
    "UCR_CODE",
    "LONG_WGS84",
    "LAT_WGS84",
]

# Columns routed to the categorical (one-hot encoded) branch.
cat_cols = [
    "DIVISION",
    "LOCATION_TYPE",
    "PREMISES_TYPE",
    "OFFENCE",
    "NEIGHBOURHOOD_158",
    "OCC_MONTH",
]

In [15]:
# Show the dtypes of each feature group, then the total number of columns
# in the full frame.
for feature_group in (numeric_cols, cat_cols):
    print(crime_data[feature_group].dtypes)
print(crime_data.shape[1])

OCC_YEAR      float64
OCC_DAY       float64
OCC_HOUR        int64
UCR_CODE        int64
LONG_WGS84    float64
LAT_WGS84     float64
dtype: object
DIVISION             object
LOCATION_TYPE        object
PREMISES_TYPE        object
OFFENCE              object
NEIGHBOURHOOD_158    object
OCC_MONTH            object
dtype: object
31


# Preprocessing Numeric and Categorical Values Using Pipelines

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric branch: a single step that replaces missing values with the
# median of each column.
numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
num_tf = Pipeline(steps=numeric_steps)

print(num_tf)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])


In [19]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with the most frequent value of
# each column, then one-hot encode. With handle_unknown="ignore", a category
# not seen during fit is encoded as all zeros instead of raising an error.
cat_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own branch. Columns not named in either
# group fall under ColumnTransformer's default remainder handling.
preproc = ColumnTransformer(transformers=[
    ("num", num_tf, numeric_cols),
    ("cat", cat_tf, cat_cols),
])

print(preproc)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['OCC_YEAR', 'OCC_DAY', 'OCC_HOUR', 'UCR_CODE',
                                  'LONG_WGS84', 'LAT_WGS84']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['DIVISION', 'LOCATION_TYPE', 'PREMISES_TYPE',
                                  'OFFENCE', 'NEIGHBOURHOOD_158',
                                  'OCC_MONTH'])])
