In [17]:
import hashlib
import io
import os
import sqlite3
import pdfkit
import tempfile

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from star.analysis import Analysis
from star.models import VODModel, load_csv_as_dataframe, Location, EmptyModel, \
    pickle_dataframe, clear_pickle

from flask import Blueprint, flash, request, current_app, render_template, \
    redirect, url_for, session, g, Response, make_response, send_file


In [14]:

# Instantiate the application object by calling the project's `create_app`.
# NOTE(review): this import sits outside the notebook's main import cell —
# consider moving it there so all dependencies are declared in one place.
from star import create_app
app = create_app()


In [19]:
# Display the value stored under the app's 'DATABASE' configuration key.
app.config['DATABASE']

'/tmp/star.sqlite3'

In [39]:
# Names of the CSV columns used throughout this notebook.
date_column = 'stopdate'
time_column = 'Time'
target_column = 'race'
officer_id_column = 'officerid'

# Date and time in separate columns: request a combined "__datetime" column.
# Date and time in the same column: parse that single column as-is.
if date_column != time_column:
    datetime_column, datetime_columns = (
        "__datetime",
        {"__datetime": [date_column, time_column]},
    )
else:
    datetime_column, datetime_columns = date_column, [date_column]

# Column mapping handed to the CSV loader and to the model.
cols = dict(
    datetime_column=datetime_column,
    datetime_columns=datetime_columns,
    date_column=date_column,
    time_column=time_column,
    target_column=target_column,
    officer_id_column=officer_id_column,
)

# NOTE(review): only a cell's final expression is displayed; more code follows
# in this cell, so this bare expression shows nothing.
cols
# Load the traffic-stop CSV, letting pandas parse the date/time column(s)
# described by cols['datetime_columns'].
# NOTE(review): newer pandas releases deprecate combining columns through a
# dict-valued `parse_dates` — confirm against the pandas version in use.
df = pd.read_csv(
    './DPD_Traffic_2008-2015_filtered.csv',
    parse_dates=cols['datetime_columns'],
    # Parse the file in one pass instead of in chunks; this avoids the
    # mixed-type inference that triggers DtypeWarning.
    low_memory=False,
)
df.head()

Unnamed: 0.1,__datetime,Unnamed: 0,officerid,race,Black,ethnic,sex,age
0,2010-03-19 00:06:17,0,1746,A,0,N,F,23
1,2010-03-22 05:15:57,1,1599,B,1,N,F,50
2,2010-03-29 03:30:30,2,1653,B,1,N,F,34
3,2010-03-30 00:16:16,3,1749,B,1,N,M,30
4,2010-04-01 00:39:46,4,1749,W,0,H,M,17


In [34]:
# Options passed to the model.
options = dict(
    # Field value, taken from the target field, that marks the target group.
    target_group='B',
    # Daylight saving time: True restricts the data to that range rather than
    # using the entire dataset.
    dst_restrict=True,
)


# Look up candidate locations for the query string and keep the first
# candidate, converted to a plain dict, as the analysis location.
# NOTE(review): indexing [0] assumes geolocate returned at least one match —
# confirm how an empty result should be handled.
possible_locations = Location.geolocate("Durham, NC")
location = possible_locations[0].as_dict()
location

{'name': 'Durham, NC, USA',
 'region': None,
 'latitude': 35.9940329,
 'longitude': -78.898619,
 'timezone': 'America/New_York',
 'elevation': 120}

In [40]:
# Build the model from the loaded frame, the chosen location, the column
# mapping and the options, then display the frame it exposes.
model = VODModel(
    df,
    location=location,
    columns=cols,
    options=options,
)
model.data_frame

Before dropping na rows 0.0004947185516357422
After dropping na rows 0.023861169815063477
Before setting index 0.023974180221557617
After setting index 0.024515151977539062
Before stripping non-evening hours 0.02458024024963379
After stripping non-evening hours 0.6527855396270752
Before stripping non-seasonal days 0.6528782844543457
After stripping non-seasonal days 1.354276180267334
Before converting to target group 1.3543696403503418
After converting to target group 1.3582146167755127
Before converting to light 1.3582501411437988
After converting to light 2.0307581424713135
Before stripping twilight hours 2.030851125717163
After stripping twilight hours 2.033127784729004
Before creating year column 2.033186674118042
After creating year column 2.0472309589385986


Unnamed: 0_level_0,datetime,Unnamed: 0,officerid,original_target,Black,ethnic,sex,age,target,light,dusk,year,month,day_of_week,time_in_seconds
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-03-26 17:30:12,2015-03-26 17:30:12,804,1515,B,1,N,F,35,1,1,2015-03-26 19:57:53-04:00,2015,3,4,63012
2010-11-13 19:27:42,2010-11-13 19:27:42,805,1450,B,1,N,F,27,1,0,2010-11-13 17:36:32-05:00,2010,11,6,70062
2014-11-05 18:19:01,2014-11-05 18:19:01,806,2094,B,1,N,F,40,1,0,2014-11-05 17:42:37-05:00,2014,11,3,65941
2012-12-02 19:20:09,2012-12-02 19:20:09,1647,2077,B,1,N,F,42,1,0,2012-12-02 17:29:31-05:00,2012,12,7,69609
2012-10-09 18:37:59,2012-10-09 18:37:59,2142,2023,W,0,H,M,22,0,1,2012-10-09 19:12:53-04:00,2012,10,2,67079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-12-04 18:18:42,2010-12-04 18:18:42,210317,436,B,1,N,M,26,1,0,2010-12-04 17:29:28-05:00,2010,12,6,65922
2010-12-05 18:16:15,2010-12-05 18:16:15,210318,436,W,0,H,M,28,0,0,2010-12-05 17:29:29-05:00,2010,12,7,65775
2010-12-05 18:17:20,2010-12-05 18:17:20,210319,436,B,1,N,M,42,1,0,2010-12-05 17:29:29-05:00,2010,12,7,65840
2011-03-09 18:43:50,2011-03-09 18:43:50,210507,302,B,1,,F,29,1,0,2011-03-09 18:43:09-05:00,2011,3,3,67430


In [41]:
# Run the project's Analysis over the model's data frame and display whatever
# analyze() returns.
analysis = Analysis()
results = analysis.analyze(model.data_frame)
results



{'model_summary': <class 'statsmodels.iolib.summary.Summary'>
 """
                                GEE Regression Results                              
 Dep. Variable:                      target   No. Observations:                 5676
 Model:                                 GEE   No. clusters:                      472
 Method:                        Generalized   Min. cluster size:                   1
                       Estimating Equations   Max. cluster size:                 266
 Family:                           Binomial   Mean cluster size:                12.0
 Dependence structure:         Exchangeable   Num. iterations:                     7
 Date:                     Fri, 23 Jul 2021   Scale:                           1.000
 Covariance type:                    robust   Time:                         15:07:28
                                 coef    std err          z      P>|z|      [0.025      0.975]
 ------------------------------------------------------------------------

In [42]:
# Twilight bounds reported by the model, plus an "HH:MM:SS - HH:MM:SS" label.
min_twilight, max_twilight = model.find_twilight_range()
itp_range = " - ".join(
    bound.strftime("%H:%M:%S") for bound in (min_twilight, max_twilight)
)

# Date bounds reported by the model, plus a locale-formatted ("%x") label.
min_date, max_date = model.find_date_range()
date_range = " - ".join(
    bound.strftime("%x") for bound in (min_date, max_date)
)

# Show every derived value at once as the cell's output.
dict(
    min_twilight=min_twilight,
    max_twilight=max_twilight,
    itp_range=itp_range,
    min_date=min_date,
    max_date=max_date,
    date_range=date_range,
)

{'min_twilight': datetime.time(17, 29, 28),
 'max_twilight': datetime.time(21, 6, 35),
 'itp_range': '17:29:28 - 21:06:35',
 'min_date': datetime.date(2008, 2, 8),
 'max_date': datetime.date(2015, 10, 31),
 'date_range': '02/08/08 - 10/31/15'}

In [None]:
# The original snippet was `return render_template("analyze.html", ...)`, which
# cannot run here: a bare `return` at the top level of a cell is a SyntaxError.
# The same keyword arguments are collected in a dict instead, so the values
# that would reach the "analyze.html" template can be inspected in the notebook.

# NOTE(review): the original snippet referenced `filename`, which is not
# defined anywhere in this notebook. The name of the CSV loaded earlier is used
# as a stand-in — confirm what the app passes here.
filename = 'DPD_Traffic_2008-2015_filtered.csv'

analyze_context = {
    # NOTE: datetime.now() is naive, so the "%Z" directive formats as an empty
    # string; the format is kept as in the original snippet.
    "datetime": datetime.now().strftime("%x %X %Z"),
    "location": location,
    "original_filename": filename,
    # Row counts before and after the model's processing.
    "original_record_count": len(df.index),
    "final_record_count": len(model.data_frame.index),
    "date_range": date_range,
    "itp_range": itp_range,
    "light_count": model.light_count(),
    "dark_count": model.dark_count(),
    "results": results,
}
analyze_context