In [60]:
import pandas as pd
import numpy as np
from datetime import datetime

from astral import Observer
from astral.sun import sun
from pytz import timezone
from timezonefinder import TimezoneFinder
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.interpolate import BSpline

In [6]:
def validate_df(df):
    assert 'date' in df.columns
    assert 'time' in df.columns
    assert 'lat' in df.columns
    assert 'lng' in df.columns
    assert 'subject_race' in df.columns

In [7]:
def time_to_mins(time):    
    # Return nan if time is nan
    if time != time:
        return time
    
    # Return time in mins
    mins = int(time.hour * 60 + time.minute + time.second / 60)
    return mins

In [38]:
def clean_df(df):
    df['date'] = pd.to_datetime(df['date'])
    if isinstance(df.time.values[0], str):
        df['time'] = pd.to_datetime(df.time).dt.time
    return df

In [9]:
def filter_for_intertwilight_zone(df):
    """Assumes dusk & time are column names in df.
    """
    earliest_dusk = df.dusk.min()
    latest_dusk = df.dusk.max()
    result = df[df.time >= earliest_dusk].copy()
    result = result[result.time <= latest_dusk]
    return result

In [79]:
def black_white_stops_only(df):
    """Stanford OPP only compares black vs white stops."""
    df = df.copy()
    is_black = df.subject_race == 'black'
    is_white = df.subject_race == 'white'
    return df[np.logical_or(is_black, is_white)]

In [10]:
def remove_grey_area_stops(df):
    """Remove stops that occurred after sun started setting and before sun finally vanishing.
    It's not clear whether it's light out or dark out, so OPP removes these stops.
    
    Assumes sunset, dusk, and time are columns in df.
    
    Notes: It probably isn't necessary to remove these stops. You could model 'lighting' as a
    continuous value in [0, 1], instead of as a boolean. You get more data, but you do have to
    assume that the ability to racial profile is linear with the amount of lighting.
    """
    after_sunset = df.sunset < df.time
    before_dusk = df.time < df.dusk
    df = df[~np.logical_and(after_sunset, before_dusk)].copy()
    return df

In [11]:
def get_sunset_df(stops_df):
    """Return a dataframe where each row is a unique date and the columns denote 
    the sunset & dusk time for that date
    """
    dates = pd.to_datetime(stops_df['date']).unique()
    center_lat, center_lng = stops_df['lat'].mean(), stops_df['lng'].mean()
    tf = TimezoneFinder()
    tz_name = tf.timezone_at(lng=center_lng, lat=center_lat)
    tz = timezone(tz_name)
    city = Observer(center_lat, center_lng)
    suns = np.array([sun(city, pd.Timestamp(date), tzinfo=tz) for date in dates])
    sunsets = np.array([sun['sunset'].time() for sun in suns])
    dusks = np.array([sun['dusk'].time() for sun in suns])
    sunset_df = pd.DataFrame({'date': dates, 'sunset': sunsets, 'dusk': dusks})
    return sunset_df

In [94]:
def add_mutations(df):
    """Add columns like sunset, dusk, is_dark, time_mins, is_black, is_white."""
    sunset_df = get_sunset_df(df)
    df = df.merge(sunset_df, on='date')
    df['time_mins'] = df.time.apply(time_to_mins)
    df['time_mins'] = pd.to_numeric(df.time_mins, errors='coerce')
    df['is_dark'] = df['time'] > df['dusk']
    df['is_black'] = df.subject_race == 'black'
    df['is_white'] = df.subject_race == 'white'
    
    # Bin time into 5-minute intervals
    n_bins = (df.time_mins.max() - df.time_mins.min()) / 5
    df['time_bins'] = pd.cut(df['time_mins'], int(n_bins))
    
    return df

In [95]:
def preprocess(df):
    df = df.copy()
    df = clean_df(df)
    df = black_white_stops_only(df)
    df = add_mutations(df)
    df = filter_for_intertwilight_zone(df)
    df = remove_grey_area_stops(df)
    return df

In [96]:
sf = pd.read_csv('../data/raw_data/san_francisco.csv', low_memory=False)
sf = preprocess(sf)

In [84]:
sf.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'district',
       'subject_age', 'subject_race', 'subject_sex', 'type', 'arrest_made',
       'search_conducted', 'search_vehicle', 'search_basis', 'reason_for_stop',
       'raw_search_vehicle_description', 'raw_result_of_contact_description',
       'sunset', 'dusk', 'time_mins', 'is_dark', 'is_black', 'is_white'],
      dtype='object')

In [99]:
mod = smf.glm(formula='is_black ~ is_dark + time_bins + district + subject_sex + subject_age', 
              family=sm.families.Binomial(),
              data=sf)

In [100]:
mod = mod.fit(maxiter=20)

In [101]:
mod.summary()

0,1,2,3
Dep. Variable:,"['is_black[False]', 'is_black[True]']",No. Observations:,93780.0
Model:,GLM,Df Residuals:,93719.0
Model Family:,Binomial,Df Model:,60.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-47634.0
Date:,"Sun, 20 Jun 2021",Deviance:,95269.0
Time:,17:29:04,Pearson chi2:,94000.0
No. Iterations:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1394,0.045,25.192,0.000,1.051,1.228
is_dark[T.True],0.0897,0.019,4.796,0.000,0.053,0.126
"time_bins[T.Interval(5.014, 10.028, closed='right')]",-2.062e-11,1.11e-08,-0.002,0.999,-2.18e-08,2.18e-08
"time_bins[T.Interval(10.028, 15.042, closed='right')]",-5.859e-11,3.16e-08,-0.002,0.999,-6.2e-08,6.19e-08
"time_bins[T.Interval(15.042, 20.056, closed='right')]",1.087e-11,5.86e-09,0.002,0.999,-1.15e-08,1.15e-08
"time_bins[T.Interval(20.056, 25.07, closed='right')]",1.344e-11,7.25e-09,0.002,0.999,-1.42e-08,1.42e-08
"time_bins[T.Interval(25.07, 30.084, closed='right')]",-6.743e-12,3.64e-09,-0.002,0.999,-7.14e-09,7.13e-09
"time_bins[T.Interval(30.084, 35.098, closed='right')]",-8.71e-12,4.7e-09,-0.002,0.999,-9.22e-09,9.2e-09
"time_bins[T.Interval(35.098, 40.111, closed='right')]",-2.088e-11,1.13e-08,-0.002,0.999,-2.21e-08,2.21e-08
