In [3]:
import pandas as pd
import numpy as np
from datetime import datetime

from astral import Observer
from astral.sun import sun
from pytz import timezone
from timezonefinder import TimezoneFinder
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [4]:
def validate_df(df):
    """Assert that `df` has every column the preprocessing functions read.

    The required columns are checked in a fixed order: date, time, lat, lng,
    subject_race.

    Raises:
        AssertionError: at the first required column that is absent.
    """
    required_columns = ('date', 'time', 'lat', 'lng', 'subject_race')
    for column in required_columns:
        assert column in df.columns

In [5]:
def time_to_mins(time):
    """Convert a time-of-day value into whole minutes since midnight.

    `time` is expected to expose `hour`, `minute` and `second` attributes
    (e.g. a `datetime.time`). The fractional minute contributed by the seconds
    is truncated away by the final `int()`.

    Missing values are passed through unchanged: anything that does not
    compare equal to itself (such as NaN) is returned as-is.
    """
    # Only NaN-like values are unequal to themselves.
    if time != time:
        return time

    hours, minutes, seconds = time.hour, time.minute, time.second
    return int(60 * hours + minutes + seconds / 60)

In [6]:
def clean_df(df):
    """Parse the `date` and `time` columns of `df`.

    * `date` is always converted with `pd.to_datetime`.
    * `time` is converted to time-of-day objects (`.dt.time`) only when the
      column holds strings. That is decided from the first *non-missing*
      entry, so a missing value in the first row does not cause parsing to be
      skipped, and an empty frame does not raise.

    Note: both columns are overwritten on the frame that is passed in, and
    that same frame is returned. Pass a copy if the caller's frame must stay
    untouched.

    Args:
        df: DataFrame with `date` and `time` columns.

    Returns:
        The same DataFrame object, with the columns parsed.
    """
    df['date'] = pd.to_datetime(df['date'])

    # Look past leading missing values when sniffing the column's value type.
    present_times = df['time'].dropna()
    if not present_times.empty and isinstance(present_times.iloc[0], str):
        df['time'] = pd.to_datetime(df['time']).dt.time
    return df

In [7]:
def filter_for_intertwilight_zone(df):
    """Keep only the rows whose `time` lies between the earliest and latest `dusk`.

    Assumes `dusk` and `time` are columns of `df`. The bounds are the minimum
    and maximum of the `dusk` column, and both ends are inclusive. The input
    frame is not modified; a new frame is returned.
    """
    lower_bound, upper_bound = df.dusk.min(), df.dusk.max()
    in_window = (df.time >= lower_bound) & (df.time <= upper_bound)
    return df[in_window].copy()

In [8]:
def black_white_stops_only(df):
    """Return a new frame holding only stops whose subject_race is 'black' or 'white'.

    Stanford OPP only compares black vs white stops, so every other row is
    dropped. The input frame is left untouched.
    """
    race = df.subject_race
    keep = (race == 'black') | (race == 'white')
    return df[keep].copy()

In [9]:
def remove_grey_area_stops(df):
    """Drop stops made strictly after sunset and strictly before dusk.

    In that window it is ambiguous whether it counts as light or dark, so OPP
    discards those stops. Stops exactly at sunset or exactly at dusk are kept.

    Assumes sunset, dusk, and time are columns in df. Returns a new frame.

    Notes: dropping these stops is probably not strictly required. 'Lighting'
    could instead be modelled as a continuous value in [0, 1] rather than a
    boolean. That keeps more data, at the cost of assuming that the ability
    to racially profile varies linearly with the amount of light.
    """
    in_twilight = (df.sunset < df.time) & (df.time < df.dusk)
    return df[~in_twilight].copy()

In [10]:
def get_sunset_df(stops_df):
    """Build a per-date lookup table of sunset and dusk times.

    Every unique value in `stops_df['date']` becomes one row. Sun times are
    computed for a single observer placed at the mean `lat` / mean `lng` of
    all stops, and expressed in the timezone that TimezoneFinder reports for
    that point.

    Returns:
        DataFrame with columns `date`, `sunset` and `dusk`, where `sunset` and
        `dusk` hold the time-of-day part of the computed events.
    """
    unique_dates = pd.to_datetime(stops_df['date']).unique()

    # A single representative location (the centroid of the stops) is used
    # for every date.
    mean_lat = stops_df['lat'].mean()
    mean_lng = stops_df['lng'].mean()
    local_tz = timezone(TimezoneFinder().timezone_at(lng=mean_lng, lat=mean_lat))
    observer = Observer(mean_lat, mean_lng)

    sunset_times = []
    dusk_times = []
    for day in unique_dates:
        events = sun(observer, pd.Timestamp(day), tzinfo=local_tz)
        sunset_times.append(events['sunset'].time())
        dusk_times.append(events['dusk'].time())

    return pd.DataFrame({
        'date': unique_dates,
        'sunset': np.array(sunset_times),
        'dusk': np.array(dusk_times),
    })

In [20]:
def add_mutations(df):
    """Add the derived columns used by the analysis.

    Columns added: sunset, dusk (merged in per date from `get_sunset_df`),
    time_mins, is_dark, is_black, is_white and time_bins.

    Args:
        df: stops DataFrame with `date`, `time`, `lat`, `lng` and
            `subject_race` columns.

    Returns:
        A new DataFrame (the result of the merge) with the extra columns.
    """
    sunset_df = get_sunset_df(df)
    # Default (inner) merge: each stop picks up the sunset/dusk of its date.
    df = df.merge(sunset_df, on='date')

    # Minutes since midnight; anything non-numeric is coerced to NaN.
    df['time_mins'] = pd.to_numeric(df.time.apply(time_to_mins), errors='coerce')
    df['is_dark'] = df['time'] > df['dusk']
    df['is_black'] = df.subject_race == 'black'
    df['is_white'] = df.subject_race == 'white'

    # Cut the observed time range into equal-width bins of roughly 15 minutes
    # each. pd.cut needs a positive bin count, so a range shorter than
    # 15 minutes falls back to a single bin instead of raising.
    span_mins = df.time_mins.max() - df.time_mins.min()
    n_bins = max(1, int(span_mins / 15))
    df['time_bins'] = pd.cut(df['time_mins'], n_bins)

    return df

In [21]:
def preprocess(df):
    """Run the full preprocessing pipeline on a raw stops DataFrame.

    Steps, in order:
      1. check the required columns are present (`validate_df`);
      2. work on a copy so the caller's frame is not modified;
      3. parse the date/time columns (`clean_df`);
      4. keep black and white stops only (`black_white_stops_only`);
      5. add the derived columns (`add_mutations`);
      6. keep stops between the earliest and latest dusk
         (`filter_for_intertwilight_zone`);
      7. drop stops between sunset and dusk (`remove_grey_area_stops`).

    Raises:
        AssertionError: if a required column is missing from `df`.

    Returns:
        The preprocessed DataFrame.
    """
    # Fail fast with a column check rather than deep inside a later step.
    validate_df(df)

    df = df.copy()
    df = clean_df(df)
    df = black_white_stops_only(df)
    df = add_mutations(df)
    df = filter_for_intertwilight_zone(df)
    df = remove_grey_area_stops(df)
    return df

In [22]:
# Load the raw San Francisco stops CSV (path is relative to the notebook's
# working directory), then run it through the preprocessing pipeline.
# NOTE(review): low_memory=False is presumably there to avoid mixed-dtype
# inference while parsing — confirm against the pandas read_csv docs.
sf = pd.read_csv('../data/raw_data/san_francisco.csv', low_memory=False)
sf = preprocess(sf)

In [23]:
# Show which columns are available after preprocessing.
sf.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'district',
       'subject_age', 'subject_race', 'subject_sex', 'type', 'arrest_made',
       'search_conducted', 'search_vehicle', 'search_basis', 'reason_for_stop',
       'raw_search_vehicle_description', 'raw_result_of_contact_description',
       'sunset', 'dusk', 'time_mins', 'is_dark', 'is_black', 'is_white',
       'time_bins'],
      dtype='object')

In [36]:
# Cast the is_black flag to integers; it is the response variable of the
# GLM formula used in the following cell.
sf['is_black'] = sf['is_black'].astype(int)

In [37]:
# Binomial GLM of is_black on is_dark, with time_bins, district, subject_sex
# and subject_age as additional terms.
mod = smf.glm(formula='is_black ~ is_dark + time_bins + district + subject_sex + subject_age', 
              family=sm.families.Binomial(),
              data=sf)

In [38]:
# Fit the model, capped at 20 iterations.
# NOTE(review): this rebinds `mod` from the unfitted model to the fit result,
# so re-running this cell on its own presumably fails — verify.
# NOTE(review): the printed summary reports "No. Iterations: 20", i.e. the cap
# was reached; confirm the fit actually converged (e.g. via the result's
# `converged` attribute) before interpreting the coefficients.
mod = mod.fit(maxiter=20)

In [39]:
# Print the plain-text regression summary of the fitted model.
print(mod.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               is_black   No. Observations:                93780
Model:                            GLM   Df Residuals:                    93742
Model Family:                Binomial   Df Model:                           37
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47652.
Date:                Sun, 20 Jun 2021   Deviance:                       95304.
Time:                        19:33:23   Pearson chi2:                 9.40e+04
No. Iterations:                    20                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------