In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [22]:
# Read the raw crime records CSV into the working DataFrame `df`.
raw_csv_path = 'Chicago_Crimes_2012_to_2017.csv'
df = pd.read_csv(raw_csv_path)

In [23]:
# Count the missing (NaN) entries in every column and display the totals.
df_missing = df.isna().sum()
df_missing

Unnamed: 0                  0
ID                          0
Case Number                 1
Date                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description     1658
Arrest                      0
Domestic                    0
Beat                        0
District                    1
Ward                       14
Community Area             40
FBI Code                    0
X Coordinate            37083
Y Coordinate            37083
Year                        0
Updated On                  0
Latitude                37083
Longitude               37083
Location                37083
dtype: int64

In [24]:
# Report the dataset size as (rows, columns) before any cleaning.
n_rows, n_cols = df.shape
n_rows, n_cols

(1456714, 23)

In [25]:
# The dataset is large, so every row with at least one missing value is
# discarded; well over a million rows remain afterwards.
# NOTE(review): most of the rows removed here are missing only coordinate /
# location columns, and those columns are dropped again further down -
# consider restricting the check to the columns that are actually kept.
df = df.dropna()  # defaults: drop rows (axis 0) containing any NaN

# Re-count the missing values to confirm nothing is left.
df_new_missing = df.isna().sum()
df_new_missing

Unnamed: 0              0
ID                      0
Case Number             0
Date                    0
Block                   0
IUCR                    0
Primary Type            0
Description             0
Location Description    0
Arrest                  0
Domestic                0
Beat                    0
District                0
Ward                    0
Community Area          0
FBI Code                0
X Coordinate            0
Y Coordinate            0
Year                    0
Updated On              0
Latitude                0
Longitude               0
Location                0
dtype: int64

In [26]:
# Report (rows, columns) again to confirm plenty of data survived the drop.
n_rows, n_cols = df.shape
n_rows, n_cols

(1418365, 23)

In [27]:
# getting rid of any unnecessary/redundant features
# i want to get rid of 'Unnamed: 0', 'ID', 'Case Number', 'Date', 'Description', 'Updated On', 'Location', 'Primary Type'
# 'Unnamed: 0', 'ID', 'Case Number' are all just identifiers, which might not be necessary, but i don't mind keeping 'ID'
# the other features seem a bit too specific, and might make it difficult, but I can also see their importance in predicting crime
# 'Primary Type' is describing the crime, but IUCR is the code for the crime, which makes it a bit redundant to have both

In [28]:
# Split the columns into groups by the kind of information they hold.
# Each group is materialised as its own DataFrame; these frames are not
# referenced again in the cells visible here.
categorical_columns = [
    'Block', 'Primary Type', 'Description', 'Location Description',
    'Arrest', 'Domestic', 'FBI Code',
]
# NOTE(review): 'IUCR' is grouped as numerical here but is label-encoded
# like a categorical feature later on - confirm the intended treatment.
numerical_columns = ['IUCR', 'Ward', 'District', 'Beat', 'Community Area', 'Year']
date_columns = ['Date', 'Updated On']
location_columns = ['Latitude', 'Longitude', 'Location']

categorical = df[categorical_columns]
numerical = df[numerical_columns]
dates = df[date_columns]
locations = df[location_columns]

In [29]:
# from here, i want to one-hot encode pretty much everything ? that we decide to keep from the above columns
# different iterations of each model could also include adjusting which features we remove, how they're encoded, etc. etc. :3

In [30]:
# getting rid of unnamed: 0, ID, case number, updated on, description, lat, long, loc 
# target variable : community area
# testing : block, IUCR, location description, beat, FBI Code, 

In [31]:
# List the current column names before deciding which ones to remove.
df.columns

Index(['Unnamed: 0', 'ID', 'Case Number', 'Date', 'Block', 'IUCR',
       'Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code',
       'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude',
       'Longitude', 'Location'],
      dtype='object')

In [32]:
# Drop the features that will not be used: row identifiers, the free-text
# description, the update timestamp and every coordinate/location column.
columns_to_remove = [
    'Unnamed: 0', 'ID', 'Case Number',
    'Updated On', 'Description',
    'Latitude', 'Longitude', 'Location',
    'X Coordinate', 'Y Coordinate',
]
df = df.drop(columns_to_remove, axis='columns')

In [33]:
# Preview the first five rows after removing the unwanted columns.
df.head(n=5)

Unnamed: 0,Date,Block,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,Year
0,05/03/2016 11:40:00 PM,013XX S SAWYER AVE,486,BATTERY,APARTMENT,True,True,1022,10.0,24.0,29.0,08B,2016
1,05/03/2016 09:40:00 PM,061XX S DREXEL AVE,486,BATTERY,RESIDENCE,False,True,313,3.0,20.0,42.0,08B,2016
2,05/03/2016 11:31:00 PM,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,STREET,False,False,1524,15.0,37.0,25.0,24,2016
3,05/03/2016 10:10:00 PM,049XX W FULTON ST,460,BATTERY,SIDEWALK,False,False,1532,15.0,28.0,25.0,08B,2016
4,05/03/2016 10:00:00 PM,003XX N LOTUS AVE,820,THEFT,RESIDENCE,False,True,1523,15.0,28.0,25.0,06,2016


In [34]:
# Split the combined 'Date' timestamp into two separate columns:
# a calendar date ('Date_1') and a time of day ('Time').
#
# The raw values look like '05/03/2016 11:40:00 PM' (month/day/year with a
# 12-hour clock). Passing the format explicitly means pandas does not have to
# guess the layout, which keeps the parsing consistent across all rows and
# much faster on a dataset of this size.
DATE_FORMAT = '%m/%d/%Y %I:%M:%S %p'
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)  # Convert to datetime objects

df['Date_1'] = df['Date'].dt.date  # datetime.date objects
df['Time'] = df['Date'].dt.time    # datetime.time objects

  df['Date'] = pd.to_datetime(df['Date'])  # Convert to datetime objects


In [35]:
# The combined timestamp is no longer needed once it has been split, so drop
# it and let the date-only column ('Date_1') take over the name 'Date'.
df = (
    df
    .drop(columns=['Date'])
    .rename(columns={'Date_1': 'Date'})
)

In [36]:
# Preview the first five rows with the separate date and time columns.
df.head(n=5)

Unnamed: 0,Block,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,Year,Date,Time
0,013XX S SAWYER AVE,486,BATTERY,APARTMENT,True,True,1022,10.0,24.0,29.0,08B,2016,2016-05-03,23:40:00
1,061XX S DREXEL AVE,486,BATTERY,RESIDENCE,False,True,313,3.0,20.0,42.0,08B,2016,2016-05-03,21:40:00
2,053XX W CHICAGO AVE,470,PUBLIC PEACE VIOLATION,STREET,False,False,1524,15.0,37.0,25.0,24,2016,2016-05-03,23:31:00
3,049XX W FULTON ST,460,BATTERY,SIDEWALK,False,False,1532,15.0,28.0,25.0,08B,2016,2016-05-03,22:10:00
4,003XX N LOTUS AVE,820,THEFT,RESIDENCE,False,True,1523,15.0,28.0,25.0,06,2016,2016-05-03,22:00:00


In [37]:
# label encoding all categorical variables, as one-hot encoding fried my laptop :)
# Each column is replaced by integer codes (one code per distinct value).
from sklearn.preprocessing import LabelEncoder

features_to_encode = ['Block', 'IUCR', 'Primary Type', 'Location Description', 'Arrest', 'FBI Code']

# Keep one fitted encoder per column. Re-fitting a single shared encoder would
# overwrite the learned classes on every iteration, leaving no way to map the
# integer codes back to the original values (via `inverse_transform` /
# `classes_`) for any column except the last one.
label_encoders = {}

for feature in features_to_encode:
    label_encoder = LabelEncoder()
    df[feature] = label_encoder.fit_transform(df[feature])
    label_encoders[feature] = label_encoder

df.head()

Unnamed: 0,Block,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,Year,Date,Time
0,6452,51,2,17,1,True,1022,10.0,24.0,29.0,10,2016,2016-05-03,23:40:00
1,24340,51,2,109,0,True,313,3.0,20.0,42.0,10,2016,2016-05-03,21:40:00
2,21841,42,27,125,0,False,1524,15.0,37.0,25.0,24,2016,2016-05-03,23:31:00
3,20464,39,2,121,0,False,1532,15.0,28.0,25.0,10,2016,2016-05-03,22:10:00
4,1909,84,31,109,0,True,1523,15.0,28.0,25.0,7,2016,2016-05-03,22:00:00


In [38]:
# Cast the 'Domestic' flag to integers so it is stored as 0/1.
df = df.astype({'Domestic': int})

In [39]:
# Remove the date and time columns from the feature set.
to_drop = ['Date', 'Time']
df = df.drop(to_drop, axis='columns')

In [40]:
# Standardize every feature column with StandardScaler.
# 'Community Area' is intentionally absent from the list, so it keeps its
# original values.
numerical_features = [
    'Block', 'IUCR', 'Primary Type', 'Location Description',
    'Arrest', 'Domestic', 'Beat', 'District', 'Ward',
    'FBI Code', 'Year',
]

scaler = StandardScaler()
# Fit the scaler and transform the same columns in a single step.
scaled_features = scaler.fit_transform(df[numerical_features])
df[numerical_features] = scaled_features

df.head()

Unnamed: 0,Block,IUCR,Primary Type,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,Year
0,-0.968428,-0.851869,-1.078048,-1.920481,1.680029,2.364939,-0.185476,-0.181792,0.083027,29.0,-0.293737,1.510226
1,0.889025,-0.851869,-1.078048,0.379894,-0.595228,2.364939,-1.210577,-1.195622,-0.206959,42.0,-0.293737,1.510226
2,0.629534,-0.967165,1.022525,0.77996,-0.595228,-0.422844,0.540336,0.542373,1.025482,25.0,1.858371,1.510226
3,0.486549,-1.005598,-1.078048,0.679943,-0.595228,-0.422844,0.551903,0.542373,0.373013,25.0,-0.293737,1.510226
4,-1.440164,-0.429117,1.358617,0.379894,-0.595228,2.364939,0.53889,0.542373,0.373013,25.0,-0.754903,1.510226


In [41]:
# Write the cleaned DataFrame to disk (without the index column) so later
# work can start from the clean data.
output_csv_path = 'final_data.csv'
df.to_csv(output_csv_path, index=False)