In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
import folium
import geopandas as gpd
from shapely.geometry import Point, Polygon
import csv

import warnings
warnings.filterwarnings("ignore")

# Compiled Yellow and Green Taxi Datasets by Month

In [26]:
# Load the compiled taxi trip table for every month of 2019, keyed by the
# three-letter lowercase month abbreviation that appears in the file name.
taxi = defaultdict()  # no default factory: behaves like a plain dict

taxi.update({
    month_name: pd.read_csv(f"../raw_data/taxi_{month_name}_2019.csv")
    for month_name in ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
})

# Preprocessing Taxi Data

In [27]:
# taxi_2019 contains the following data of indexes for each zone each month
taxi_index = ['average trip distance','credit payment','cash payment','no payment','dispute payment','unknown payment', \
              'voided trip','average fare','average tip','pickups']
months = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
month_num = ['01','02','03','04','05','06','07','08','09','10','11','12']

# payment_type code -> name of the per-zone counter that tallies it
payment_counters = {1: 'credit payment', 2: 'cash payment', 3: 'no payment',
                    4: 'dispute payment', 5: 'unknown payment', 6: 'voided trip'}


def _zone_pickup_stats(zone_data):
    """Summarise the trips picked up in a single zone.

    Parameters
    ----------
    zone_data : pandas.DataFrame
        Rows of one monthly trip table that share a PULocationID; may be empty.
        Must provide the payment_type, trip_distance, fare_amount and
        tip_amount columns.

    Returns
    -------
    defaultdict
        One entry per name in taxi_index (plus 'average extra' for an empty
        zone, as before).
    """
    zone_stats = defaultdict()  # no default factory: behaves like a plain dict

    payment_type = zone_data['payment_type']
    for code, counter in payment_counters.items():
        zone_stats[counter] = int((payment_type == code).sum())

    pickups = len(zone_data)
    if pickups != 0:
        distances = zone_data['trip_distance']
        # The distance average is taken over trips with a non-zero distance.
        # A zone whose trips all report 0 used to raise ZeroDivisionError;
        # report 0 for it instead.
        moving_trips = int((distances != 0).sum())
        zone_stats['average trip distance'] = sum(distances) / moving_trips if moving_trips else 0
        zone_stats['average fare'] = sum(zone_data['fare_amount']) / pickups
        zone_stats['average tip'] = sum(zone_data['tip_amount']) / pickups
    else:
        zone_stats['average trip distance'] = 0
        zone_stats['average fare'] = 0
        # 'average extra' is not listed in taxi_index; it is kept only so the
        # contents of taxi_2019 stay the same as before.
        zone_stats['average extra'] = 0
        zone_stats['average tip'] = 0

    zone_stats['pickups'] = pickups
    return zone_stats


taxi_2019 = defaultdict()

for month in months:
    # errors='ignore' lets this cell be re-run: on a second run the column is
    # already gone and a plain drop() would raise KeyError.
    taxi[month] = taxi[month].drop(columns=['congestion_surcharge'], errors='ignore')

for month in months:
    trips = taxi[month]
    month_stats = defaultdict()

    # Start every zone id 1..265 from the "no pickups" summary ...
    no_trips = trips.iloc[0:0]
    for i in range(1, 266):
        month_stats[i] = _zone_pickup_stats(no_trips)

    # ... then overwrite the zones that do have pickups. One groupby pass
    # replaces 265 separate full-table filters per month.
    for zone_id, zone_data in trips.groupby('PULocationID'):
        if zone_id in month_stats:
            month_stats[int(zone_id)] = _zone_pickup_stats(zone_data)

    taxi_2019[month] = month_stats

# Reading Arrests, Shootings, and Complaints Datasets

In [18]:
# Read the three 2019 crime tables (arrests, shootings, complaints) in one go.
arrests, shootings, complaints = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../raw_data/arrests_2019.csv",
        "../raw_data/shootings_2019.csv",
        "../raw_data/complaints_2019.csv",
    )
)

In [19]:
# One arrests frame per month, keyed by the abbreviations in `months`.
arrests_months = defaultdict()

for month_key, month_prefix in zip(months, month_num):
    # rows whose ARREST_DATE text starts with the two-digit month number
    in_month = arrests['ARREST_DATE'].str.startswith(month_prefix)
    # reset_index() renumbers the rows from 0 and keeps the old labels in an
    # 'index' column
    arrests_months[month_key] = arrests.loc[in_month].reset_index()

In [20]:
# One shootings frame per month, keyed by the abbreviations in `months`.
shootings_months = defaultdict()

for month_key, month_prefix in zip(months, month_num):
    # rows whose OCCUR_DATE text starts with the two-digit month number
    in_month = shootings['OCCUR_DATE'].str.startswith(month_prefix)
    # reset_index() renumbers the rows from 0 and keeps the old labels in an
    # 'index' column
    shootings_months[month_key] = shootings.loc[in_month].reset_index()

In [21]:
# One complaints frame per month, keyed by the abbreviations in `months`.
complaints_months = defaultdict()

for month_key, month_prefix in zip(months, month_num):
    # rows whose RPT_DT text starts with the two-digit month number
    in_month = complaints['RPT_DT'].str.startswith(month_prefix)
    # reset_index() renumbers the rows from 0 and keeps the old labels in an
    # 'index' column
    complaints_months[month_key] = complaints.loc[in_month].reset_index()

In [22]:
# The taxi-zone shapefile supplies the zone polygons that each crime's
# location is tested against; the lookup CSV is loaded alongside it.
sf = gpd.read_file("../raw_data/taxi_zones/taxi_zones.shp")
zone = pd.read_csv("../raw_data/taxi_zones/taxi_zone_lookup.csv")

# Reproject the zone geometry to longitude/latitude on the WGS84 datum, so the
# polygons can be compared with points built from longitude/latitude values.
# adapted from MAST30034 Lab 2 Python Notebook from https://github.com/akiratwang/MAST30034_Python/blob/main/tutorials/Lab2_Python.ipynb
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")

In [23]:
# NaN test that also accepts non-numeric values such as strings
def isNaN(string):
    """Return True when `string` is NaN.

    Relies on NaN being the value that does not compare equal to itself, so
    ordinary strings and numbers give False.
    """
    unequal_to_itself = string != string
    return unequal_to_itself

# Preprocessing Arrests Data

In [24]:
# arrests_months_zone contains the following data of indexes for each zone each month
arrests_index = ['a.num','a.p.age: <18','a.p.age: 18-24','a.p.age: 25-44','a.p.age: 45-64','a.p.age: 65+', \
    'a.p.sex: M','a.p.sex: F','a.p.race: BLACK','a.p.race: WHITE','a.p.race: BLACK HISPANIC', \
    'a.p.race: AMERICAN INDIAN/ALASKAN NATIVE','a.p.race: WHITE HISPANIC','a.p.race: ASIAN / PACIFIC ISLANDER', \
    'a.law: F','a.law: M','a.law: V','a.law: I','a.law: U']

arrests_months_zone = defaultdict()

# The zone polygons never change inside the loops, so fetch them from the
# GeoDataFrame once. The polygon at position j is credited to zone id j + 1.
zone_shapes = [sf['geometry'][j] for j in range(263)]

for month in arrests_months:
    # one counter table per zone id 1..265; a missing counter reads as 0.0
    stats = defaultdict()
    for i in range(1,266):
        stats[i] = defaultdict(float)

    month_arrests = arrests_months[month]
    # Walk the needed columns in parallel instead of indexing the frame
    # cell by cell for every row.
    arrest_rows = zip(month_arrests['Lon_Lat'], month_arrests['AGE_GROUP'], month_arrests['PERP_SEX'],
                      month_arrests['PERP_RACE'], month_arrests['LAW_CAT_CD'])

    for lon_lat, perp_age_group, perp_sex, perp_race, law_cat in arrest_rows:
        # An arrest with no recorded location cannot be placed in a zone.
        # Count it under zone 265 like any other unmatched arrest instead of
        # crashing on .split().
        if isNaN(lon_lat):
            stats[265]['a.num'] += 1
            continue

        # Lon_Lat is split on whitespace; the 2nd and 3rd tokens carry the two
        # coordinates wrapped in an opening and a closing bracket character.
        split = lon_lat.split()
        point = Point(float(split[1][1:]), float(split[2][:-1]))

        matched_zone = None
        for j, shape in enumerate(zone_shapes):
            if point.within(shape):
                matched_zone = j + 1
                break

        if matched_zone is None:
            # outside every polygon: only the total is recorded, under zone 265
            stats[265]['a.num'] += 1
            continue

        zone_stats = stats[matched_zone]
        zone_stats['a.num'] += 1
        zone_stats['a.p.age: ' + perp_age_group] += 1
        zone_stats['a.p.sex: ' + perp_sex] += 1
        zone_stats['a.p.race: ' + perp_race] += 1

        # a missing law category is tallied as 'U'
        if (not isNaN(law_cat)):
            zone_stats['a.law: ' + law_cat] += 1
        else:
            zone_stats['a.law: U'] += 1

    arrests_months_zone[month] = stats

# Preprocessing Shootings Data

In [25]:
# shootings_months_zone contains the following data of indexes for each zone each month
shootings_index = ['s.num','s.murder','s.p.age: <18','s.p.age: 18-24','s.p.age: 25-44','s.p.age: 45-64', \
    's.p.age: 65+','s.p.age: UNKNOWN','s.p.sex: M','s.p.sex: F','s.p.sex: U','s.p.race: BLACK','s.p.race: WHITE', \
    's.p.race: BLACK HISPANIC','s.p.race: AMERICAN INDIAN/ALASKAN NATIVE','s.p.race: UNKNOWN', \
    's.p.race: WHITE HISPANIC','s.p.race: ASIAN / PACIFIC ISLANDER','s.v.age: <18','s.v.age: 18-24', \
    's.v.age: 25-44','s.v.age: 45-64','s.v.age: 65+','s.v.age: UNKNOWN','s.v.sex: M','s.v.sex: F','s.v.sex: U', \
    's.v.race: BLACK','s.v.race: WHITE','s.v.race: BLACK HISPANIC','s.v.race: UNKNOWN','s.v.race: WHITE HISPANIC', \
    's.v.race: ASIAN / PACIFIC ISLANDER','s.v.race: AMERICAN INDIAN/ALASKAN NATIVE']

shootings_months_zone = defaultdict()

# The zone polygons never change inside the loops, so fetch them from the
# GeoDataFrame once. The polygon at position j is credited to zone id j + 1.
zone_shapes = [sf['geometry'][j] for j in range(263)]

for month in shootings_months:
    # one counter table per zone id 1..265; a missing counter reads as 0.0
    stats = defaultdict()
    for i in range(1,266):
        stats[i] = defaultdict(float)

    month_shootings = shootings_months[month]
    # Walk the needed columns in parallel instead of indexing the frame
    # cell by cell for every row.
    shooting_rows = zip(month_shootings['Lon_Lat'], month_shootings['STATISTICAL_MURDER_FLAG'],
                        month_shootings['PERP_AGE_GROUP'], month_shootings['PERP_SEX'],
                        month_shootings['PERP_RACE'], month_shootings['VIC_AGE_GROUP'],
                        month_shootings['VIC_SEX'], month_shootings['VIC_RACE'])

    for lon_lat, murder, perp_age_group, perp_sex, perp_race, vic_age_group, vic_sex, vic_race in shooting_rows:
        # A shooting with no recorded location cannot be placed in a zone.
        # Count it under zone 265 like any other unmatched shooting instead of
        # crashing on .split().
        if isNaN(lon_lat):
            stats[265]['s.num'] += 1
            continue

        # Lon_Lat is split on whitespace; the 2nd and 3rd tokens carry the two
        # coordinates wrapped in an opening and a closing bracket character.
        split = lon_lat.split()
        point = Point(float(split[1][1:]), float(split[2][:-1]))

        matched_zone = None
        for j, shape in enumerate(zone_shapes):
            if point.within(shape):
                matched_zone = j + 1
                break

        if matched_zone is None:
            # outside every polygon: only the total is recorded, under zone 265
            stats[265]['s.num'] += 1
            continue

        zone_stats = stats[matched_zone]
        zone_stats['s.num'] += 1

        # NOTE(review): assumes the flag was parsed as a boolean; a text value
        # such as 'false' would be truthy here -- confirm against the CSV.
        if (murder):
            zone_stats['s.murder'] += 1

        # perpetrator fields may be missing; each falls back to its own
        # "unknown" label
        for prefix, value, unknown in (('s.p.age: ', perp_age_group, 'UNKNOWN'),
                                       ('s.p.sex: ', perp_sex, 'U'),
                                       ('s.p.race: ', perp_race, 'UNKNOWN')):
            zone_stats[prefix + (unknown if isNaN(value) else value)] += 1

        # victim fields are used as recorded
        zone_stats['s.v.age: ' + vic_age_group] += 1
        zone_stats['s.v.sex: ' + vic_sex] += 1
        zone_stats['s.v.race: ' + vic_race] += 1

    shootings_months_zone[month] = stats

# Preprocessing Complaints Data

In [28]:
# complaints_months_zone contains the following data of indexes for each zone each month
# (suspect fields are exported under the 'c.p.' prefix, victim fields under 'c.v.')
complaints_index = ['c.num','c.p.age: <18','c.p.age: 18-24','c.p.age: 25-44','c.p.age: 45-64','c.p.age: 65+', \
    'c.p.age: UNKNOWN','c.p.sex: M','c.p.sex: F','c.p.sex: U','c.p.race: BLACK','c.p.race: WHITE', \
    'c.p.race: BLACK HISPANIC','c.p.race: AMERICAN INDIAN/ALASKAN NATIVE','c.p.race: UNKNOWN', \
    'c.p.race: WHITE HISPANIC','c.p.race: ASIAN / PACIFIC ISLANDER','c.v.age: <18','c.v.age: 18-24', \
    'c.v.age: 25-44','c.v.age: 45-64','c.v.age: 65+','c.v.age: UNKNOWN','c.v.sex: M','c.v.sex: F','c.v.sex: U', \
    'c.v.race: BLACK','c.v.race: WHITE','c.v.race: BLACK HISPANIC','c.v.race: UNKNOWN','c.v.race: WHITE HISPANIC', \
    'c.v.race: ASIAN / PACIFIC ISLANDER','c.v.race: AMERICAN INDIAN/ALASKAN NATIVE']

complaints_months_zone = defaultdict()

# The zone polygons never change inside the loops, so fetch them from the
# GeoDataFrame once. The polygon at position j is credited to zone id j + 1.
zone_shapes = [sf['geometry'][j] for j in range(263)]

for month in complaints_months:
    # one counter table per zone id 1..265; a missing counter reads as 0.0
    stats = defaultdict()
    for i in range(1,266):
        stats[i] = defaultdict(float)

    month_complaints = complaints_months[month]
    # Walk the needed columns in parallel instead of indexing the frame
    # cell by cell for every row.
    complaint_rows = zip(month_complaints['Lat_Lon'], month_complaints['SUSP_AGE_GROUP'],
                         month_complaints['SUSP_SEX'], month_complaints['SUSP_RACE'],
                         month_complaints['VIC_AGE_GROUP'], month_complaints['VIC_SEX'],
                         month_complaints['VIC_RACE'])

    for lat_lon, susp_age_group, susp_sex, susp_race, vic_age_group, vic_sex, vic_race in complaint_rows:

        # Only the coordinate parsing is allowed to fail. The previous blanket
        # `except:` also hid a NameError raised for every matched complaint
        # (an undefined `offense` counter), which double-counted the complaint
        # under zone 265 and skipped all of its demographic counters.
        try:
            # Lat_Lon is split on the comma: first part (minus its leading
            # character) and second part (minus its trailing character) are
            # the two coordinates; Point takes them in swapped order.
            split = lat_lon.split(',')
            point = Point(float(split[1][:-1]), float(split[0][1:]))
        except (AttributeError, IndexError, ValueError):
            # missing or malformed location: count under zone 265
            stats[265]['c.num'] += 1
            continue

        matched_zone = None
        for j, shape in enumerate(zone_shapes):
            if point.within(shape):
                matched_zone = j + 1
                break

        if matched_zone is None:
            # outside every polygon: only the total is recorded, under zone 265
            stats[265]['c.num'] += 1
            continue

        zone_stats = stats[matched_zone]
        zone_stats['c.num'] += 1

        # Suspect counters are stored under 'c.p.' so they match the names in
        # complaints_index; they used to be stored under 'c.s.', which no
        # index entry refers to, leaving those exported columns at 0.
        # Every field may be missing and falls back to its "unknown" label.
        for prefix, value, unknown in (('c.p.age: ', susp_age_group, 'UNKNOWN'),
                                       ('c.p.sex: ', susp_sex, 'U'),
                                       ('c.p.race: ', susp_race, 'UNKNOWN'),
                                       ('c.v.age: ', vic_age_group, 'UNKNOWN'),
                                       ('c.v.sex: ', vic_sex, 'U'),
                                       ('c.v.race: ', vic_race, 'UNKNOWN')):
            zone_stats[prefix + (unknown if isNaN(value) else value)] += 1

    complaints_months_zone[month] = stats

# Compiling All Wanted Features into CSVs

In [31]:
# compiles data indexes in order of [arrests, shootings, complaints, taxi] indexes as shown above for each zone each month
# Output: one '<month>_data.csv' per month with a header row followed by one
# row for each zone id 1..263, in zone order (the zone id itself is not a column).

header = arrests_index + shootings_index + complaints_index + taxi_index

for month in months:

    # newline='' is what the csv module expects for files it writes to;
    # without it every row is followed by a blank line on Windows.
    with open('../preprocessed_data/' + month + '_data.csv', 'w', newline='') as w:

        writer = csv.writer(w)
        writer.writerow(header)

        # `zone_id` rather than `zone`, so the `zone` lookup table loaded
        # earlier is not overwritten by the loop variable
        for zone_id in range(1, 264):

            data = [arrests_months_zone[month][zone_id][stat] for stat in arrests_index]
            data += [shootings_months_zone[month][zone_id][stat] for stat in shootings_index]
            data += [complaints_months_zone[month][zone_id][stat] for stat in complaints_index]
            data += [taxi_2019[month][zone_id][stat] for stat in taxi_index]

            writer.writerow(data)