# Load tables and create basic derived fields and tables

In [None]:
# Imports and constants

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
import numpy as np
%matplotlib inline

# Default every figure in this notebook to a wide, short canvas.
plt.rcParams.update({'figure.figsize': [20, 5]})

# Lookup from a detailed race/ethnicity label to the broader group used in the
# analysis. Labels in `_STANDALONE_LABELS` map to themselves; every label in
# `_ASIAN_PACIFIC_LABELS` is folded into 'ASIAN/PACIFIC ISLANDER'.
_STANDALONE_LABELS = (
    'AFRICAN AMERICAN',
    'CAUCASIAN',
    'HISPANIC/LATIN/MEXICAN',
    'OTHER',
    'UNKNOWN',
)
_ASIAN_PACIFIC_LABELS = (
    'ASIAN INDIAN',
    'CAMBODIAN',
    'CHINESE',
    'FILIPINO',
    'GUAMANIAN',
    'HAWAIIAN',
    'JAPANESE',
    'KOREAN',
    'LAOTIAN',
    'OTHER ASIAN',
    'PACIFIC ISLANDER',
    'SAMOAN',
    'VIETNAMESE',
)

# Keys are inserted in alphabetical order.
race_groups = {
    label: (label if label in _STANDALONE_LABELS else 'ASIAN/PACIFIC ISLANDER')
    for label in sorted(_STANDALONE_LABELS + _ASIAN_PACIFIC_LABELS)
}

### About `ARREST BOOKING NO_PK` and `GO NO`

Each `ARREST BOOKING NO_PK` value represents a single arrest/citation event. An event may have multiple `GO NO` values associated with it, but should have at most one unique value in every other column (e.g. only a single person (`PIN`) will be listed for a given arrest/citation event).

In [None]:
# Arrest table and derived fields

# Read the arrest/booking export with explicit column types.
arrests = pd.read_csv(
    '../input/sjpd-incidents-arrests-charges/AB Data_010115-093020_v2.csv',
    dtype={
        'ARREST BOOKING NO_PK': 'str',
        'GO NO': 'str',
        'ARREST TIME': 'str',
        'AGE': 'Int64',
        # Every remaining typed column is loaded as a categorical.
        **dict.fromkeys(
            [
                'PIN',
                'SEX',
                'RACE',
                'ETHNICITY',
                'LOCATION OF ARREST_BLOCK',
                'ARREST REASON',
                'ARREST TYPE',
                'BEAT',
                'SUMMARY OF FACTS',
                'ARREST OFFICER',
                'officer_name',
                'CURRENT STATUS',
                'YOUNG OFFENDER',
            ],
            'category',
        ),
    },
    parse_dates=['ARREST DATE'],
)

# With `GO NO` removed, rows that differed only in that column become exact
# duplicates and are dropped.
arrests = arrests.drop(columns=['GO NO']).drop_duplicates()

# DISTRICT is taken as the first character of BEAT.
arrests['DISTRICT'] = arrests['BEAT'].str[0].astype('category')

# Map detailed labels to their broad group with a vectorised lookup rather
# than a row-wise `apply`. Casting to object first keeps the result a plain
# object column (mapping a categorical can itself yield a categorical, which
# would change how later group-bys treat unobserved groups). Missing values and
# labels absent from `race_groups` come out as NaN.
arrests['RACE_GROUP'] = arrests['RACE'].astype(object).map(race_groups)
arrests['ETHNIC_GROUP'] = arrests['ETHNICITY'].astype(object).map(race_groups)

# True where ARREST OFFICER is populated, False where it is missing.
arrests['OFFICER_KNOWN'] = arrests['ARREST OFFICER'].notna()

# NOTE(review): `datetime_is_numeric` is only accepted by pandas 1.1-1.x; it
# must be dropped if this notebook is run on pandas >= 2.0.
arrests.describe(include='all', datetime_is_numeric=True)

In [None]:
# Suspects listed in the arrests table (Person ID Number)
def _first_mode(values):
    """Return the first mode of `values`, or None when there is no mode.

    `Series.mode` returns a Series that may hold several tied values, or be
    empty when there is nothing to count, so a bare `[0]` lookup can raise.
    `.get(0)` returns None in the empty case instead.
    """
    return values.mode().get(0)


# One row per PIN: the most frequent RACE and RACE_GROUP, and the largest AGE
# recorded for that person.
people = (
    arrests
    .groupby('PIN')
    .agg(
        RACE=('RACE', _first_mode),
        RACE_GROUP=('RACE_GROUP', _first_mode),
        AGE=('AGE', pd.Series.max)
    )
)

In [None]:
# Read the charge export: BOND AMT as a float, the other typed columns as
# categoricals, and OFFENSE DATE parsed as a date.
charges = pd.read_csv(
    '../input/sjpd-incidents-arrests-charges/Charge Data_010115-093020_v2.csv',
    dtype={
        **dict.fromkeys(
            [
                'GO NO',
                'PIN',
                'STATUTE',
                'CLASS',
                'CHARGE DESCRIPTION',
                'OFFENSE TIME',
                'WARRANT NUMBER',
                'SEVERITY',
            ],
            'category',
        ),
        'BOND AMT': 'float',
    },
    parse_dates=['OFFENSE DATE'],
)

# Summary of every column, as the cell's displayed output.
charges.describe(include='all', datetime_is_numeric=True)

# What sorts of arrests are missing the 'officer' field?

In [None]:
# For arrests with and without a recorded officer: the number of rows and the
# most frequent SUMMARY OF FACTS value, largest group first.
(
    arrests
    .groupby('OFFICER_KNOWN')
    .agg(
        Count=pd.NamedAgg(column='ARREST BOOKING NO_PK', aggfunc='count'),
        TopSummary=pd.NamedAgg(column='SUMMARY OF FACTS', aggfunc=pd.Series.mode),
    )
    .sort_values('Count', ascending=False)
)

# How consistent is bond for similar charges?

In [None]:
# Bond statistics per charge description, restricted to descriptions with more
# than 15 bond values and a non-zero spread, most variable first.
(
    charges
    .groupby('CHARGE DESCRIPTION')['BOND AMT']
    .agg(['min', 'mean', 'median', 'max', 'std', 'count'])
    .query('(count > 15) & (std > 0)')
    .sort_values('std', ascending=False)
)

In [None]:
# Index of charge descriptions that have more than 15 bond values and whose
# bond amounts are not all identical (standard deviation above zero).
deviating_bond = (
    charges
    .groupby('CHARGE DESCRIPTION')
    .agg(
        Count=pd.NamedAgg(column='BOND AMT', aggfunc='count'),
        Std=pd.NamedAgg(column='BOND AMT', aggfunc='std'),
    )
    .loc[lambda stats: (stats['Count'] > 15) & (stats['Std'] > 0)]
    .index
)

# How likely are people to be arrested (instead of cited)... 

## ...overall?

In [None]:
# Share of rows for each CURRENT STATUS value (fractions rather than counts).
arrests.loc[:, 'CURRENT STATUS'].value_counts(normalize=True)

## ...based on their race?

In [None]:
def percent_arrested(series):
    """Return the fraction of values in `series` equal to 'CHARGED/BOOKED'.

    Falls back to 0 when that label is not among the value counts.
    """
    shares = series.value_counts(normalize=True)
    return shares.get('CHARGED/BOOKED', 0)

# Booked fraction and number of status values per race group, ordered by
# sample size and then by arrest rate (both descending).
(
    arrests
    .groupby('RACE_GROUP')
    .agg(
        ArrestRate=pd.NamedAgg(column='CURRENT STATUS', aggfunc=percent_arrested),
        SampleSize=pd.NamedAgg(column='CURRENT STATUS', aggfunc='count'),
    )
    .sort_values(['SampleSize', 'ArrestRate'], ascending=False)
)

## ...based on their sex?

In [None]:
# Booked fraction and number of status values for each SEX value.
(
    arrests
    .groupby(['SEX'])
    .agg(
        ArrestRate=pd.NamedAgg(column='CURRENT STATUS', aggfunc=percent_arrested),
        SampleSize=pd.NamedAgg(column='CURRENT STATUS', aggfunc='count'),
    )
)

## ...based on their age?

In [None]:
# Booked fraction, stop count and most frequent SUMMARY OF FACTS per age band.
# `pd.cut` bins are right-closed by default, so age 18 falls in (15, 18].
stops_by_age = (
    arrests
    .groupby(
        pd.cut(arrests['AGE'], bins=[0, 5, 12, 15, 18, 25, 35, 50, 65, 80, 100])
    )
    .agg(
        ArrestRate=pd.NamedAgg(column='CURRENT STATUS', aggfunc=percent_arrested),
        StopCount=pd.NamedAgg(column='CURRENT STATUS', aggfunc='count'),
        TopStopReason=pd.NamedAgg(column='SUMMARY OF FACTS', aggfunc=pd.Series.mode),
    )
)
stops_by_age

In [None]:
# AGE values split by outcome: rows that were booked vs. rows that were cited.
booked = arrests.loc[arrests['CURRENT STATUS'] == 'CHARGED/BOOKED', 'AGE']
cited = arrests.loc[arrests['CURRENT STATUS'] == 'CITED', 'AGE']

In [None]:
# Stops by District and Race
import functools

def find_token_rate(s, token):
    """Return the share of values in `s` equal to `token`, rounded to 2 places.

    Yields 0 when `token` does not occur among the value counts of `s`.
    """
    shares = s.value_counts(normalize=True)
    rate = shares.get(token, 0)
    return round(rate, 2)

#race_beats = (
#    arrests
#    .groupby('DISTRICT')
#    .agg(
#        TotalStops=('DISTRICT', 'count'),
#        HispanicStops=('RACE_GROUP', functools.partial(find_token_rate, token='HISPANIC/LATIN/MEXICAN')),
#        BlackStops=('RACE_GROUP', functools.partial(find_token_rate, token='AFRICAN AMERICAN')),
#        WhiteStops=('RACE_GROUP', functools.partial(find_token_rate, token='CAUCASIAN')),
#        AsianStops=('RACE_GROUP', functools.partial(find_token_rate, token='ASIAN/PACIFIC ISLANDER')),
#    )
#    .query('TotalStops > 2')
#    .sort_values(by=['TotalStops'], ascending=False)
#)
#race_beats
#race_beats.sort_values(by='TotalStops').drop(columns='TotalStops').plot(kind='bar', stacked=True)


# Most common reasons for stops

## Overall

In [None]:
# The twenty most frequent SUMMARY OF FACTS values across all rows of `arrests`.
(
    arrests
    .groupby('SUMMARY OF FACTS')
    .agg(count=pd.NamedAgg(column='SUMMARY OF FACTS', aggfunc='count'))
    .sort_values('count', ascending=False)
    .iloc[:20]
)

## For children

In [None]:
# The twenty most frequent SUMMARY OF FACTS values among rows with AGE below 18.
(
    arrests
    .query('AGE < 18', engine='python')
    .groupby('SUMMARY OF FACTS')
    .agg(count=pd.NamedAgg(column='SUMMARY OF FACTS', aggfunc='count'))
    .sort_values('count', ascending=False)
    .iloc[:20]
)

## By race and sex

In [None]:
# Stop count and most frequent SUMMARY OF FACTS for each race group / sex
# pair, keeping only pairs with more than 15 stops.
(
    arrests
    .groupby(['RACE_GROUP', 'SEX'])
    .agg(
        TotalStops=pd.NamedAgg(column='ARREST BOOKING NO_PK', aggfunc='count'),
        TopStop=pd.NamedAgg(column='SUMMARY OF FACTS', aggfunc=pd.Series.mode),
    )
    .query('TotalStops > 15')
)

# Bail for children charged with vandalism (without a warrant)

In [None]:
# Vandalism charges joined to the per-person attributes in `people`, limited
# to people whose AGE there is under 18 and to charges with no warrant number.
child_vandalism = (
    charges
    .query("`CHARGE DESCRIPTION` == 'VANDALISM'")
    .merge(people, on='PIN', how='left')
    # Column name corrected from 'WARRRANT NUMBER' (three R's) so that it
    # matches the 'WARRANT NUMBER' column typed when `charges` is loaded.
    # NOTE(review): confirm the spelling against the CSV header.
    .assign(Warrant=lambda x: x['WARRANT NUMBER'].notna())
    .query('(AGE < 18) & (Warrant == False)', engine='python')
)
# Bond statistics and row counts per race group for the filtered charges.
child_vandalism.groupby('RACE_GROUP').agg({
    'BOND AMT': ['min', 'median', 'mean', 'max'],
    'RACE_GROUP': 'count',
})
#sns.kdeplot(data=child_vandalism, x='BOND AMT')

# Average stops per person based on race and sex

In [None]:
# Stops divided by distinct people (PIN) for each race group, drawn as bars.
(
    arrests
    .groupby(['RACE_GROUP'])  # a further split by 'SEX' is disabled here
    .agg(
        TotalStops=pd.NamedAgg(column='ARREST BOOKING NO_PK', aggfunc='count'),
        PeopleStopped=pd.NamedAgg(column='PIN', aggfunc='nunique'),
    )
    .assign(AvgStopsPerPerson=lambda totals: totals['TotalStops'] / totals['PeopleStopped'])
    #.sort_values(by=['AvgStopsPerPerson','TotalStops'], ascending=False)
    .plot(kind='bar', y='AvgStopsPerPerson')
)

In [None]:
# Row counts for each AGE against every (race group, sex) pair.
sns.heatmap(
    pd.crosstab(
        index=arrests['AGE'],
        columns=[arrests['RACE_GROUP'], arrests['SEX']],
    )
)

In [None]:
# Row counts for each AGE against each CURRENT STATUS value.
sns.heatmap(
    pd.crosstab(
        index=arrests['AGE'],
        columns=arrests['CURRENT STATUS'],
    )
)

In [None]:
# Row counts for each DISTRICT against each race group.
sns.heatmap(
    pd.crosstab(
        index=arrests['DISTRICT'],
        columns=arrests['RACE_GROUP'],
    )
)

# Repeated stops by race

In [None]:
# Per person (PIN): the most frequent race group, the number of distinct race
# groups recorded, and the number of stops. Only people with exactly one
# recorded race group are kept.
repeated_stops_by_race = (
    arrests
    .groupby('PIN')
    .agg(
        RaceGroup=pd.NamedAgg(column='RACE_GROUP', aggfunc=pd.Series.mode),
        RaceCount=pd.NamedAgg(column='RACE_GROUP', aggfunc=pd.Series.nunique),
        TotalStops=pd.NamedAgg(column='ARREST BOOKING NO_PK', aggfunc='count'),
    )
    .query('RaceCount == 1')
)

# Race group against number of stops per person, coloured on a log scale.
sns.heatmap(
    pd.crosstab(
        index=repeated_stops_by_race['RaceGroup'],
        columns=repeated_stops_by_race['TotalStops'],
    ),
    norm=LogNorm(),
)