In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext

In [2]:
# Reuse the active SparkContext if one already exists. A bare SparkContext()
# raises when this cell is re-run in the same kernel, because only one context
# may be active per process.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [3]:
dataPath = '../Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv'

# Explicit schema for the incident-report CSV, listed in column order.
# Every field is declared nullable (third StructField argument is True).
crimeDataSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("IncidntNum", LongType()),
        ("Category", StringType()),
        ("Descript", StringType()),
        ("DayOfWeek", StringType()),
        ("Date", StringType()),
        ("Time", StringType()),
        ("PdDistrict", StringType()),
        ("Resolution", StringType()),
        ("Address", StringType()),
        ("X", DoubleType()),
        ("Y", DoubleType()),
        ("Location", StringType()),
        ("PdId", LongType()),
    ]
])

# Read the comma-delimited file, treating its first line as a header row and
# applying the schema above instead of inferring column types.
crimeDF = (sqlContext.read
           .format('csv')
           .options(delimiter=',', header='true')
           .schema(crimeDataSchema)
           .load(dataPath))

# crimeDF.take(1)

# Visualizations

## 1 Counts of Different Crimes
Let's first understand what types of crimes there are, and the frequencies of each.

### 1.1 Overall Trends

In [None]:
# Number of incidents per crime category.
crime_types = crimeDF.groupBy('Category').count()

# groupBy().count() names its aggregate column 'count'. Alias it to 'Count'
# explicitly so the collected rows expose a `Count` field without relying on
# case-insensitive column resolution. (`row.count` is not usable downstream:
# Row is a tuple subclass, so `count` resolves to the built-in tuple method.)
# Rows are sorted ascending by incident count.
category_rows = (crime_types
                 .select('Category', crime_types['count'].alias('Count'))
                 .orderBy('Count', ascending=True)
                 .collect())

In [None]:
# Flatten the collected rows into plain (category, count) tuples for plotting.
category_counts = [(entry['Category'], entry['Count']) for entry in category_rows]
# print(category_counts)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Split the (category, count) pairs into parallel label / value lists.
cats = [name for name, _ in category_counts]
vals = [total for _, total in category_counts]
bar_positions = np.arange(len(cats))

# Horizontal bar chart: one bar per crime category, labelled on the y axis.
plt.figure(figsize=(10, 15))
plt.barh(bar_positions, vals, align='center')
plt.yticks(bar_positions, cats)
plt.xlabel('Counts')
plt.title('Crime Counts')

# Write each bar's count just past the end of the bar.
for position, total in enumerate(vals):
    plt.text(total + 1000, position - 0.1, str(total))

plt.show()

### 1.2 Most Common Crimes Per District

In [None]:
# Crime categories ordered from most to least frequent. The sort is applied to
# the aggregate 'count' column before it is projected away, so the ordering
# does not depend on a column that is missing from the selected output.
crimes = [row['Category']
          for row in (crimeDF.groupBy('Category')
                      .count()
                      .orderBy('count', ascending=False)
                      .select('Category')
                      .collect())]
print("Crimes: ", crimes)

# Distinct police districts, ignoring rows with no district. Sorted so that the
# list (and anything indexed from it) has the same order on every run.
districts = sorted(row['PdDistrict']
                   for row in crimeDF.select('PdDistrict').distinct().collect()
                   if row['PdDistrict'] is not None)
print("Districts: ", districts)

# (category, district, incident count) triples for every combination present
# in the data. Fields are read by key because `r.count` would resolve to the
# tuple method rather than the aggregate column.
category_district_count = [
    (r['Category'], r['PdDistrict'], r['count'])
    for r in crimeDF.groupBy('Category', 'PdDistrict').count().collect()
]
# print(category_district_count)

In [None]:
# Give every crime / district label a numeric position for the 2D heatmap axes
crime_index = dict(zip(crimes, range(len(crimes))))
# print(crime_index)

district_index = dict(zip(districts, range(len(districts))))
# print(district_index)

In [None]:
# One row per crime category, one column per district; combinations that never
# occur in the data keep their initial value of zero.
heatmap_grid = np.zeros((len(crimes), len(districts)))

for crime, dist, count in category_district_count:
    if dist is None:
        # Incidents without a district have no column in the grid.
        continue
    heatmap_grid[crime_index[crime], district_index[dist]] = count
    
# print(heatmap_grid)    

In [None]:
import matplotlib.patheffects as PathEffects

# Heatmap of incident counts: rows are crime categories, columns are districts.
fig, ax = plt.subplots(figsize=(30, 50))
im = ax.imshow(heatmap_grid, cmap='hot')

ax.set_xticks(np.arange(len(districts)))
ax.set_yticks(np.arange(len(crimes)))

ax.set_xticklabels(districts)
ax.set_yticklabels(crimes)
# Put the district labels along the top edge of the grid.
ax.xaxis.tick_top()

plt.setp(ax.get_xticklabels(), ha="center")

# Black outline behind the white labels keeps them legible on both the dark
# and the bright ends of the 'hot' colormap. Built once and reused per cell.
label_outline = [PathEffects.withStroke(linewidth=5, foreground='black')]

for i in range(len(crimes)):
    for j in range(len(districts)):
        # The grid is a float array but holds whole-number counts; convert to
        # int so a cell reads "123" instead of "123.0".
        text = ax.text(j, i, str(int(heatmap_grid[i, j])),
                       ha="center", va="center", color="w")
        text.set_path_effects(label_outline)

ax.set_title("Crimes committed per district")
fig.tight_layout()
plt.show()

## 2 How Crimes Fluctuate over Time

### 2.1 Yearly Trends

### 2.2 Monthly Trends

### 2.3 Day of the Week Trends

### 2.4 Hourly Trends

In [None]:
# Parse the date and extract the year


## 3 Different Crimes Per Location