In [None]:
import pandas as pd

from src.acsdata import AcsData

# Build the AcsData object from the test CSV path; it is passed to Stop(acs=...)
# in a later cell, where its `summary` and `races` attributes are used.
acs = AcsData('tests/data/acs_test.csv')

In [120]:
import datetime
import sys

import pandas as pd


class Stop:
    """Per-county summary of stops by driver race, optionally joined to ACS data.

    Constructing an instance runs the whole pipeline: read the stops CSV
    (all at once or in chunks), count stops per (county_fips, driver_race),
    compute each race's share of the county's stops, flatten the result to one
    row per county with ``<race>_stops`` / ``<race>_stop_percentage`` columns,
    optionally merge ``acs.summary`` and add ``<race>_difference`` columns, and
    finally write the summary to CSV.

    Parameters
    ----------
    stop_filepath : str
        Path of the stops CSV. Must contain ``id``, ``county_fips`` and
        ``driver_race`` columns.
    acs : object, optional
        Object exposing ``summary`` (a DataFrame mergeable on ``county_fips``)
        and ``races`` (iterable of race name strings). When falsy, the ACS
        merge and difference columns are skipped.
    chunk : optional
        Flag: any value other than ``None``/``False`` makes the CSV be read in
        pieces of ``chunksize`` rows instead of all at once.
    chunksize : int
        Rows per chunk in chunked mode.
    export_dir : str or None
        Directory the summary CSV is written to (created if missing). The file
        keeps the base name of ``stop_filepath``. ``None`` disables the export.

    Attributes
    ----------
    df : DataFrame
        The most recently cleaned frame (in chunked mode: the last chunk).
    chunked_summary : DataFrame or None
        Stop counts aggregated over all chunks (chunked mode only).
    summary : DataFrame
        Final one-row-per-county summary.
    """

    def __init__(self, stop_filepath=None, acs=None, chunk=None, chunksize=1000000,
                 export_dir='tests/data/summaries/'):
        self.chunked_summary = None
        self.acs = acs
        self.filepath = stop_filepath
        self.chunk = chunk
        self.chunksize = chunksize

        if self._is_chunked():
            # Do NOT call load_dataframe() here: it would read the entire file
            # into memory, which is exactly what chunked mode is meant to avoid.
            self.chunked_summary = self.create_chunked_summary()
        else:
            self.load_dataframe()

        self.summary = self.create_summary()
        self.add_acs_data_to_summary()
        self.add_differences()
        self._export_summary(export_dir)

    def _is_chunked(self):
        """Return True when the CSV should be processed chunk by chunk."""
        return self.chunk is not None and self.chunk is not False

    def _export_summary(self, export_dir):
        """Write ``self.summary`` to ``export_dir`` under the input file's base name."""
        if export_dir is None:
            return
        import os  # local import: only needed for the export path handling

        if export_dir:
            os.makedirs(export_dir, exist_ok=True)
        export_filename = os.path.basename(self.filepath)
        self.summary.to_csv(os.path.join(export_dir, export_filename))

    def load_dataframe(self, chunk=None):
        """Clean a stops frame and store it on ``self.df``.

        Parameters
        ----------
        chunk : DataFrame, optional
            Frame to clean. When ``None`` the whole file at ``self.filepath``
            is read instead.

        Rows lacking ``county_fips`` or ``driver_race`` are dropped,
        ``driver_race`` is lower-cased, ``county_fips`` becomes an integer
        string, and the raw helper columns are removed when present.
        """
        df = pd.read_csv(self.filepath) if chunk is None else chunk

        usable = df['county_fips'].notna() & df['driver_race'].notna()
        # .copy() so the assignments below never write into a view of the
        # caller's frame (avoids SettingWithCopyWarning).
        df = df.loc[usable].copy()

        # astype(str) is a no-op for the remaining (non-null) strings, but keeps
        # .str usable when a chunk's column was parsed as float (all NaN) and is
        # now empty.
        df['driver_race'] = df['driver_race'].astype(str).str.lower()
        df['county_fips'] = df['county_fips'].astype(int).astype(str)
        cols_to_drop = ['location_raw', 'county_name', 'driver_race_raw']

        if 'officer_id' in df.columns:
            # NOTE: astype(int) raises if officer_id has missing or non-numeric
            # values; behaviour intentionally left unchanged.
            df['state_officer_id'] = df['state'].str.lower() + df['officer_id'].astype(int).astype(str)
            cols_to_drop.append('officer_id')

        # errors='ignore': a file without one of the optional raw columns should
        # not abort the summary.
        df = df.drop(columns=cols_to_drop, errors='ignore')

        self.df = df

    def create_summary(self):
        """Return the one-row-per-county table of stop counts and stop shares.

        In chunked mode the counts come from ``self.chunked_summary`` (falling
        back to ``self.summary`` if that is unset); otherwise they are computed
        from ``self.df``.
        """
        if self._is_chunked():
            summary = self.chunked_summary
            if summary is None:
                summary = self.summary
        else:
            summary = self.add_stop_percentage_to_summary_table()

        # Level 0 of the index is county_fips; transform('sum') broadcasts each
        # county's total back onto its (county, race) rows.
        county_totals = summary['stops'].groupby(level=0).transform('sum')
        summary = summary.assign(stop_percentage=summary['stops'] / county_totals)
        return self.create_single_columns_from_summary_table(summary)

    def create_single_columns_from_summary_table(self, summary):
        """Flatten a (county_fips, driver_race)-indexed table to one row per county.

        Parameters
        ----------
        summary : DataFrame
            Indexed by (county_fips, driver_race) with ``stops`` and
            ``stop_percentage`` columns.

        Returns
        -------
        DataFrame
            Indexed by county_fips with flat columns named
            ``<race>_stops`` and ``<race>_stop_percentage``.
        """
        long_form = summary.reset_index().melt(
            id_vars=['county_fips', 'driver_race'],
            value_vars=['stops', 'stop_percentage'],
        )
        pivot = long_form.pivot_table(
            index=['county_fips'],
            columns=['driver_race', 'variable'],
            values='value',
        )
        # Collapse the (race, variable) column pairs into single flat names.
        pivot.columns = ['_'.join(col).strip() for col in pivot.columns.values]
        return pivot

    def add_stop_percentage_to_summary_table(self):
        """Count stops in ``self.df`` per (county_fips, driver_race).

        Returns a DataFrame with a single ``stops`` column holding the number
        of non-null ``id`` values in each group.
        """
        counts = self.df.groupby(['county_fips', 'driver_race'])['id'].count()
        return counts.to_frame('stops')

    def add_acs_data_to_summary(self):
        """Merge ``self.acs.summary`` into ``self.summary`` on county_fips.

        Does nothing when no ACS object was supplied. The merge is an inner
        join, so counties absent from either side are dropped.
        """
        if not self.acs:
            return

        self.summary = pd.merge(self.summary, self.acs.summary, on='county_fips', how='inner')

    def add_differences(self):
        """Add ``<race>_difference`` = stop share minus population share.

        A column is added only for races in ``self.acs.races`` that have both
        ``<race>_stop_percentage`` and ``<race>_percentage`` in the summary.
        Returns ``None`` when no ACS object was supplied, ``True`` otherwise.
        """
        if not self.acs:
            return

        columns = self.summary.columns
        for race in self.acs.races:
            stop_percentage_name = race + "_stop_percentage"
            pop_percentage_name = race + "_percentage"
            if pop_percentage_name in columns and stop_percentage_name in columns:
                self.summary[race + "_difference"] = (
                    self.summary[stop_percentage_name] - self.summary[pop_percentage_name]
                )

        return True

    def create_chunked_summary(self):
        """Stream the CSV in chunks and return total stops per (county, race).

        Each chunk is cleaned via ``load_dataframe`` and counted separately;
        the partial counts are concatenated once and summed. A progress line
        is printed per chunk.

        Raises
        ------
        ValueError
            If the reader produced no chunks at all.
        """
        partial_counts = []
        reader = pd.read_csv(self.filepath, chunksize=self.chunksize)
        for counter, chunk in enumerate(reader, start=1):
            now = datetime.datetime.now()
            print(self.filepath + " - " + str(self.chunksize * counter) + " - " + now.strftime("%H:%M:%S"))
            self.load_dataframe(chunk=chunk)
            partial_counts.append(self.add_stop_percentage_to_summary_table())

        if not partial_counts:
            raise ValueError("no rows could be read from " + str(self.filepath))

        # Single concat instead of growing a frame inside the loop.
        combined = pd.concat(partial_counts)
        return combined.groupby(level=['county_fips', 'driver_race']).sum()


In [121]:
# Run the full Stop pipeline in chunked mode (chunk=True) with a chunksize of
# 10 rows; one progress line is printed per chunk, as shown in the output below.
stops = Stop(stop_filepath="tests/data/stops_test_no_officer_id.csv",chunk=True,acs=acs,chunksize=10)

tests/data/stops_test_no_officer_id.csv - 10 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 20 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 30 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 40 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 50 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 60 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 70 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 80 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 90 - 18:06:25
tests/data/stops_test_no_officer_id.csv - 100 - 18:06:26
tests/data/stops_test_no_officer_id.csv - 110 - 18:06:26


In [110]:
# Display the per-county summary built by Stop.
# NOTE(review): this cell's execution count (110) is lower than the cells above
# (120, 121), so the output shown may predate them — restart and run all to confirm.
stops.summary

Unnamed: 0_level_0,asian_stop_percentage,asian_stops,black_stop_percentage,black_stops,hispanic_stop_percentage,hispanic_stops,white_stop_percentage,white_stops,total_population,white,...,other,white_percentage,black_percentage,hispanic_percentage,asian_percentage,other_percentage,white_difference,black_difference,hispanic_difference,asian_difference
county_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56001,0.02,2.0,0.04,4.0,0.03,3.0,0.91,91.0,37836,31506,...,316,0.832699,0.013453,0.097183,0.030104,0.008352,0.077301,0.026547,-0.067183,-0.010104
