In [141]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [142]:
# loading the raw data
df = pd.read_csv('Chicago_Crimes_2012_to_2017.csv')

In [143]:
# checking for missing values
df_missing = df.isnull().sum()
df_missing

Unnamed: 0                  0
ID                          0
Case Number                 1
Date                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description     1658
Arrest                      0
Domestic                    0
Beat                        0
District                    1
Ward                       14
Community Area             40
FBI Code                    0
X Coordinate            37083
Y Coordinate            37083
Year                        0
Updated On                  0
Latitude                37083
Longitude               37083
Location                37083
dtype: int64

In [144]:
# checking the size of the data
df.shape

(1456714, 23)

In [145]:
# since we're working with such a large dataset, I feel comfortable getting rid of the rows with any missing data
# we would still have well over 1 mil. rows of data to work with
df = df.dropna(axis = 0, how='any')
df_new_missing = df.isnull().sum()
df_new_missing

Unnamed: 0              0
ID                      0
Case Number             0
Date                    0
Block                   0
IUCR                    0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
Beat                    0
District                0
Ward                    0
Community Area          0
FBI Code                0
X Coordinate            0
Y Coordinate            0
Year                    0
Updated On              0
Latitude                0
Longitude               0
Location                0
dtype: int64

In [146]:
# checking the shape again, just to make sure we're still working with a healthy dataset
df.shape

(1418365, 23)

In [147]:
# getting rid of any unecessary/redundant features
# i want to get rid of 'Unnamed: 0', 'ID', 'Case Number', 'Date', 'Description', 'Updated On', 'Location', 'Primary Type'
# 'Unnamed: 0', 'ID', 'Case Number' are all just identifiers, which might not be necessary, but i don't mind keeping 'ID'
# the other features seem a bit too specific, and might make it difficult, but I can also see their importance in predicting crime
# 'Primary Type' is describing the crime, but IUCR is the code for the crime, which makes it a bit redundant to have both

In [148]:
# sorting into categories of data
categorical = df[['Block','Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'FBI Code']]
numerical = df[['IUCR', 'Ward', 'District', 'Beat', 'Community Area', 'Year']]
dates = df[['Date', 'Updated On']]
locations = df[['Latitude', 'Longitude', 'Location']]

In [149]:
# from here, i want to one-hot encode pretty much everything ? that we decide to keep from the above columns
# different interations of each model could also include adjusting which features we remove, how they're encoded, etc. etc. :3

In [150]:
# getting rid of unnamed: 0, ID, case number, updated on, description, lat, long, loc 
# target variable : community area
# testing : block, IUCR, location description, beat, FBI Code, 

In [151]:
df.columns

Index(['Unnamed: 0', 'ID', 'Case Number', 'Date', 'Block', 'IUCR',
       'Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code',
       'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [152]:
# removing unwanted features
columns_to_remove = ['Unnamed: 0', 'ID', 'Case Number', 'Updated On', 'Description', 'Latitude', 'Longitude', 'Location', 
                'X Coordinate', 'Y Coordinate', 'FBI Code', 'Block', 'Date']
df = df.drop(columns = columns_to_remove)

In [153]:
# making 'Arrest' 1/0
df['Arrest'] = df['Arrest'].astype(int)

# changing 'domestic' feature to be 0/1
df['Domestic'] = df['Domestic'].astype(int)

In [154]:
df.head()

Unnamed: 0,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,Year
0,486,BATTERY,APARTMENT,1,1,1022,10.0,24.0,29.0,2016
1,486,BATTERY,RESIDENCE,0,1,313,3.0,20.0,42.0,2016
2,470,PUBLIC PEACE VIOLATION,STREET,0,0,1524,15.0,37.0,25.0,2016
3,460,BATTERY,SIDEWALK,0,0,1532,15.0,28.0,25.0,2016
4,820,THEFT,RESIDENCE,0,1,1523,15.0,28.0,25.0,2016


In [155]:
# possible ideas for avoiding label encoding cat. variables

In [156]:
columns_to_encode = ['Primary Type', 'Location Description', 'IUCR']
df = pd.get_dummies(df, columns=columns_to_encode)

for column in df.columns:
        if column.startswith('Primary Type_') or column.startswith('Location Description_') or column.startswith('IUCR_'):
            df[column] = df[column].astype(int)

In [157]:
df.head()

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,Year,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,...,IUCR_5094,IUCR_5110,IUCR_5111,IUCR_5112,IUCR_5113,IUCR_5114,IUCR_5121,IUCR_5130,IUCR_5131,IUCR_5132
0,1,1,1022,10.0,24.0,29.0,2016,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,313,3.0,20.0,42.0,2016,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1524,15.0,37.0,25.0,2016,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1532,15.0,28.0,25.0,2016,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1523,15.0,28.0,25.0,2016,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [159]:
# standardizing all numbers in the dataframe

numerical_features = ['Beat', 'District', 'Ward', 'Year', 'Community Area']

scaler = StandardScaler()
scaler.fit(df[numerical_features])
scaled_features = scaler.transform(df[numerical_features])
df[numerical_features]=scaled_features

df.head()

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,Year,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,...,IUCR_5094,IUCR_5110,IUCR_5111,IUCR_5112,IUCR_5113,IUCR_5114,IUCR_5121,IUCR_5130,IUCR_5131,IUCR_5132
0,1,1,-0.185476,-0.181792,0.083027,-0.395454,1.510226,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,1,-1.210577,-1.195622,-0.206959,0.211174,1.510226,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0.540336,0.542373,1.025482,-0.582108,1.510226,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0.551903,0.542373,0.373013,-0.582108,1.510226,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0.53889,0.542373,0.373013,-0.582108,1.510226,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# saving this as a csv file
df.to_csv('final_cleaning.csv', index=False)