# Chicago Crimes
This example shows an exploratory data analysis (EDA) of crimes in Chicago.

Original example can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


### Notes on running these queries:

Bodo is used by default; it distributes data chunks across cores automatically.

This example uses the dataset found [here](https://www.kaggle.com/currie32/crimes-in-chicago), which is ~1.5GB.

To run the code:
1. Make sure you [add your AWS account credentials to Saturn Cloud](https://saturncloud.io/docs/examples/python/load-data/qs-load-data-s3/#create-aws-credentials) to access the data.
2. If you want to run a query in regular pandas:
    1. Comment out the lines with the Jupyter parallel magic (`%%px`) and the Bodo decorator (`@bodo.jit`) in all the code cells.
    2. Then, re-run cells from the beginning.


### Start an IPyParallel cluster
Run the following code in a cell to start an IPyParallel cluster. This example uses one engine per physical core, up to a maximum of 8.

In [None]:
import ipyparallel as ipp
import psutil

# One engine per physical core, capped at 8. psutil.cpu_count(logical=False)
# returns None when the physical core count cannot be determined, which would
# make min() raise a TypeError; fall back to a single engine in that case.
n = min(psutil.cpu_count(logical=False) or 1, 8)
# Start an MPI-backed cluster with n engines and connect a client to it;
# activate=True enables the %%px magics used in the cells below.
rc = ipp.Cluster(engines="mpi", n=n).start_and_connect_sync(activate=True)

### Verifying your setup
Run the following code to verify that your IPyParallel cluster is set up correctly:

In [None]:
%%px
import bodo

# Print this process's Bodo rank and the total number of ranks; with %%px the
# cell runs on the cluster engines, so one line per engine is expected.
print(f"Hello World from rank {bodo.get_rank()}. Total ranks={bodo.get_size()}")

## Importing the Packages

These are the main packages we are going to work with:
 - Bodo to parallelize Python code automatically
 - Pandas to work with data

In [None]:
%%px
import json
import os
import time

import numpy as np
import pandas as pd

## Load Crimes Data in Chicago 2005 - 2017

In [None]:
%%px
@bodo.jit(cache=True)
def load_chicago_crimes():
    """Load the Chicago crime records from S3 into a single DataFrame.

    Reads the three CSV files (2005-2007, 2008-2011 and 2012-2017 by file
    name), stacks them row-wise while keeping each file's own index, and
    sorts the rows by the "ID" column. The elapsed time is printed in
    milliseconds.

    Returns:
        The combined DataFrame sorted by "ID".
    """
    start = time.time()
    # The paths stay inline as string literals, exactly as in the file names.
    part_2005_2007 = pd.read_csv(
        "s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2005_to_2007.csv"
    )
    part_2008_2011 = pd.read_csv(
        "s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2008_to_2011.csv"
    )
    part_2012_2017 = pd.read_csv(
        "s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv"
    )
    combined = pd.concat(
        [part_2005_2007, part_2008_2011, part_2012_2017],
        ignore_index=False,
        axis=0,
    )
    sorted_crimes = combined.sort_values(by="ID")
    elapsed_ms = (time.time() - start) * 1000
    print("Reading time: ", elapsed_ms, " (ms)")
    return sorted_crimes


# Load the raw data; only rank 0 prints a preview so the output is not
# repeated once per engine.
crimes1 = load_chicago_crimes()
if bodo.get_rank() == 0:
    print(crimes1.head())

## Preprocessing and Cleaning
 1. Drop duplicated cases, filter unused columns, and add day of week and date of the crime.
 2. Keep only the most frequent crime type categories.


In [None]:
%%px
@bodo.jit(distributed=["crimes"], cache=True)
def data_cleanup(crimes):
    """Clean the raw crime records and add date-derived columns.

    Steps, in order: remove duplicate rows, drop the columns that the
    analysis does not use, parse "Date" into datetimes, add "dow" (day of
    week) and "date only" ("Date" floored to the day), and sort by "ID".
    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame as returned by the loading step.

    Returns:
        The cleaned DataFrame sorted by "ID".
    """
    start = time.time()
    crimes = crimes.drop_duplicates()
    crimes = crimes.drop(
        [
            "Unnamed: 0",
            "Case Number",
            "IUCR",
            "Updated On",
            "Year",
            "FBI Code",
            "Beat",
            "Ward",
            "Community Area",
            "Location",
        ],
        axis=1,
    )
    # Timestamps use a 12-hour clock with an AM/PM marker.
    crimes["Date"] = pd.to_datetime(crimes["Date"], format="%m/%d/%Y %I:%M:%S %p")
    crimes["dow"] = crimes["Date"].dt.dayofweek
    crimes["date only"] = crimes["Date"].dt.floor("D")
    crimes = crimes.sort_values(by="ID")
    elapsed_ms = (time.time() - start) * 1000
    print("Data cleanup time: ", elapsed_ms, " (ms)")
    return crimes


# Clean the loaded data; the preview is printed from rank 0 only.
crimes = data_cleanup(crimes1)
if bodo.get_rank() == 0:
    print(crimes.head())

In [None]:
%%px
@bodo.jit(cache=True)
def get_top_crime_types(crimes):
    """Return the first ten labels of the "Primary Type" value counts.

    pandas' value_counts orders by descending count by default, so these
    are the ten most frequent crime types. The elapsed time is printed in
    milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.

    Returns:
        The index slice holding the first ten crime-type labels.
    """
    start = time.time()
    type_counts = crimes["Primary Type"].value_counts()
    top_crime_types = type_counts.index[0:10]
    elapsed_ms = (time.time() - start) * 1000
    print("Getting top crimes Time: ", elapsed_ms, " (ms)")
    return top_crime_types


# Compute the top crime types, then pass the result through bodo.allgatherv
# before it is used as the filter list in the next cell.
# NOTE(review): presumably this makes the full list available on every rank -
# confirm against the Bodo API documentation.
top_crime_types = get_top_crime_types(crimes)
top_crime_types = bodo.allgatherv(top_crime_types)
if bodo.get_rank() == 0:
    print(top_crime_types)

In [None]:
%%px
@bodo.jit(cache=True)
def filter_crimes(crimes, top_crime_types):
    """Keep only the rows whose "Primary Type" is in ``top_crime_types``.

    The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.
        top_crime_types: Collection of crime-type labels to keep.

    Returns:
        The filtered DataFrame.
    """
    start = time.time()
    is_top_type = crimes["Primary Type"].isin(top_crime_types)
    top_crimes = crimes[is_top_type]
    elapsed_ms = (time.time() - start) * 1000
    print("Filtering crimes Time: ", elapsed_ms, " (ms)")
    return top_crimes


# From here on `crimes` holds only the rows of the top crime types.
crimes = filter_crimes(crimes, top_crime_types)
if bodo.get_rank() == 0:
    print(crimes.head())

## Crime Analysis

### Find Pattern of each crime over the years



In [None]:
%%px
@bodo.jit(cache=True)
def get_crimes_count_date(crimes):
    """Pivot the records into a date-by-crime-type table of counts.

    Rows are indexed by "date only", columns are the "Primary Type" values,
    and each cell is the count of "ID" values for that date and type. The
    elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "date only", "Primary Type" and "ID" columns.

    Returns:
        The pivoted count table.
    """
    start = time.time()
    counts_by_date = crimes.pivot_table(
        index="date only",
        columns="Primary Type",
        values="ID",
        aggfunc="count",
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Computing Crime Pattern Time: ", elapsed_ms, " (ms)")
    return counts_by_date


# Build the per-date, per-type count table consumed by the next cell.
crimes_count_date = get_crimes_count_date(crimes)

In [None]:
%%px
@bodo.jit
def get_crimes_type_date(crimes_count_date):
    """Compute a 365-row rolling sum of the per-date crime counts.

    The index of ``crimes_count_date`` is replaced with a DatetimeIndex
    (the argument is modified), missing counts are filled with 0, a rolling
    window of 365 rows is summed, and the result is sorted by index in
    descending order. The elapsed time is printed in milliseconds.

    Args:
        crimes_count_date: Per-date, per-type count table.

    Returns:
        The rolling sums, latest index value first.
    """
    start = time.time()
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
    filled_counts = crimes_count_date.fillna(0)
    rolling_totals = filled_counts.rolling(365).sum()
    result = rolling_totals.sort_index(ascending=False)
    elapsed_ms = (time.time() - start) * 1000
    print("Computing Crime Pattern Time: ", elapsed_ms, " (ms)")
    return result


# Store the result under its own name: binding it to `get_crimes_type_date`
# would replace the function with its return value, so the function could not
# be called again without re-defining it.
crimes_type_date = get_crimes_type_date(crimes_count_date)
if bodo.get_rank() == 0:
    print(crimes_type_date.head())

## A general view of crime records by time, type and location

### Determining the pattern on daily basis

In [None]:
%%px
@bodo.jit(distributed=["crimes", "crimes_days"], cache=True)
def get_crimes_by_days(crimes):
    """Count crime records per day of week.

    Groups by the "dow" column, counts the "ID" values in each group, and
    sorts the result by "dow". The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "dow" and "ID" columns.

    Returns:
        DataFrame with the columns "dow" and "ID" (the count).
    """
    start = time.time()
    # `crimes_days` is listed in the decorator's `distributed` flag, so the
    # local name is kept in sync with it.
    crimes_days = crimes.groupby("dow", as_index=False)["ID"].count()
    crimes_days = crimes_days.sort_values(by="dow")
    elapsed_ms = (time.time() - start) * 1000
    print("Group by days Time: ", elapsed_ms, " (ms)")
    return crimes_days


# Per-day-of-week counts; the preview is printed from rank 0 only.
crimes_days = get_crimes_by_days(crimes)
if bodo.get_rank() == 0:
    print(crimes_days.head())

### Determining the pattern on monthly basis

In [None]:
%%px
@bodo.jit(distributed=["crimes", "crimes_months"], cache=True)
def get_crimes_by_months(crimes):
    """Count crime records per calendar month.

    Adds a "month" column (taken from the "Date" column) to ``crimes``,
    groups by it, counts the "ID" values in each group, and sorts the
    result by month. The elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with a datetime "Date" column and an "ID" column.

    Returns:
        DataFrame with the columns "month" and "ID" (the count).
    """
    t1 = time.time()
    crimes["month"] = crimes["Date"].dt.month
    # `crimes_months` is listed in the decorator's `distributed` flag, so the
    # local name is kept in sync with it.
    crimes_months = (
        crimes.groupby("month", as_index=False)["ID"].count().sort_values(by="month")
    )
    # The label names this step (grouping by month) so its timing can be told
    # apart from the other group-by cells.
    print("Group by months Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_months


# Per-month counts; the preview is printed from rank 0 only.
crimes_months = get_crimes_by_months(crimes)
if bodo.get_rank() == 0:
    print(crimes_months.head())

### Determining the pattern by crime type

In [None]:
%%px
@bodo.jit(distributed=["crimes", "crimes_type"], cache=True)
def get_crimes_by_type(crimes):
    """Count crime records per crime type, most frequent first.

    Groups by the "Primary Type" column, counts the "ID" values in each
    group, and sorts the result by that count in descending order. The
    elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "Primary Type" and "ID" columns.

    Returns:
        DataFrame with the columns "Primary Type" and "ID" (the count).
    """
    t1 = time.time()
    # `crimes_type` is listed in the decorator's `distributed` flag, so the
    # local name is kept in sync with it.
    crimes_type = (
        crimes.groupby("Primary Type", as_index=False)["ID"]
        .count()
        .sort_values(by="ID", ascending=False)
    )
    # The label names this step (grouping by crime type) so its timing can be
    # told apart from the other group-by cells.
    print("Group by type Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_type


# Per-crime-type counts; the preview is printed from rank 0 only.
crimes_type = get_crimes_by_type(crimes)
if bodo.get_rank() == 0:
    print(crimes_type.head())

### Determining the pattern by location

In [None]:
%%px
@bodo.jit(distributed=["crimes", "crimes_location"], cache=True)
def get_crimes_by_location(crimes):
    """Count crime records per location description, most frequent first.

    Groups by the "Location Description" column, counts the "ID" values in
    each group, and sorts the result by that count in descending order. The
    elapsed time is printed in milliseconds.

    Args:
        crimes: DataFrame with "Location Description" and "ID" columns.

    Returns:
        DataFrame with the columns "Location Description" and "ID" (the
        count).
    """
    t1 = time.time()
    # `crimes_location` is listed in the decorator's `distributed` flag, so
    # the local name is kept in sync with it.
    crimes_location = (
        crimes.groupby("Location Description", as_index=False)["ID"]
        .count()
        .sort_values(by="ID", ascending=False)
    )
    # The label names this step (grouping by location) so its timing can be
    # told apart from the other group-by cells.
    print("Group by location Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_location


# Per-location counts; the preview is printed from rank 0 only.
crimes_location = get_crimes_by_location(crimes)
if bodo.get_rank() == 0:
    print(crimes_location.head())

In [None]:
# Stop the IPyParallel cluster that was started at the top of the notebook.
# This cell has no %%px magic, so it runs in the notebook process where `rc`
# (the connected client) was created.
rc.cluster.stop_cluster_sync()