# **DS 6001: Live Coding 9**
## **Randa Ampah**

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the case-records sample; the path is relative to the notebook's working directory.
cases = pd.read_csv('./data/data100k.csv')

In [3]:
# Transpose the first three rows so that every column shows up as its own line.
cases.head(3).T

Unnamed: 0,0,1,2
person_id,102090000000110,343221000000125,343221000000125
HearingDate,2019-02-28,2009-12-07,2011-01-20
CodeSection,A.46.2-862,B.46.2-301,A.46.2-707
codesection,covered elsewhere,covered elsewhere,covered elsewhere
ChargeType,Misdemeanor,Misdemeanor,Misdemeanor
chargetype,Misdemeanor,Misdemeanor,Misdemeanor
Class,1,1,3
DispositionCode,Guilty,Guilty,Guilty
disposition,Conviction,Conviction,Conviction
Plea,,,


## What code sections are most frequent?

In [4]:
# Number of rows per code section (value_counts sorts most frequent first).
cases['CodeSection'].value_counts()

CodeSection
A.46.2-862    26379
B.46.2-301    25967
46.2-300      17934
C.46.2-862    11728
18.2-250.1    10573
              ...  
42-51             1
14.26             1
26-51(11)         1
A46.2-870         1
18.2-95/26        1
Name: count, Length: 4207, dtype: int64

The most frequent non-traffic code section in this sample of the data is 18.2-250.1, which covers marijuana possession.


## Which ones most often lead to convictions?

In [5]:
# Inspect the distinct disposition outcomes and how often each occurs.
cases['DispositionCode'].value_counts()

DispositionCode
Guilty                     156563
Nolle Prosequi              54680
Dismissed                   42520
Guilty In Absentia          31958
Not Guilty                   5807
Not Guilty/Acquitted         1623
Not True Bill                 250
No Indictment Presented       178
Dismissed/Other                19
Name: count, dtype: int64

In [6]:
# Flag a row as a conviction when its disposition is one of the two guilty outcomes.
# Series.isin performs the membership test in one vectorized pass instead of a
# Python-level loop over every row; missing dispositions come out as False.
cases['convicted'] = cases['DispositionCode'].isin(['Guilty', 'Guilty In Absentia'])
cases['convicted'].value_counts()

convicted
True     188521
False    105077
Name: count, dtype: int64

In [7]:
# Conviction rate (mean of the boolean flag) and row count for every code section.
cases_convict_rate = (
    cases
    .groupby('CodeSection')['convicted']
    .agg(convict_rate='mean', count='count')
    .reset_index()
)
# Keep code sections with at least 30 rows -- presumably so a rate is not driven by a
# handful of cases. Bracket indexing is required here because the attribute
# `cases_convict_rate.count` resolves to the DataFrame method, not the column.
cases_convict_rate = cases_convict_rate[cases_convict_rate['count'] >= 30]
cases_convict_rate.sort_values('convict_rate', ascending=False)

Unnamed: 0,CodeSection,convict_rate,count
1806,23-55,0.981818,55
1633,21-336,0.960000,50
1755,23-22.1(A),0.954198,131
2103,29-17(C),0.942857,70
4111,G.18.2-266,0.930233,43
...,...,...,...
2321,3.2-6503.1,0.071429,42
140,11.1-2,0.052632,38
2481,35-416,0.033333,30
1433,19.2-100,0.000000,238


## Which ones have the most severe racial disparities?

In [8]:
# List the distinct raw Race labels to see which spellings need to be harmonised.
cases['Race'].unique()

array(['Black(Non-Hispanic)', 'Hispanic', 'White Caucasian(Non-Hispanic)',
       'MISSING', 'Asian Or Pacific Islander', 'Black (Non-Hispanic)',
       'White Caucasian (Non-Hispanic)',
       'Other(Includes Not Applicable.. Unknown)',
       'Other (Includes Not Applicable.. Unknown)', 'Black', 'White',
       'Unknown (Includes Not Applicable.. Unknown)', 'American Indian',
       'Unknown', 'Asian or Pacific Islander',
       'American Indian Or Alaskan Native'], dtype=object)

In [9]:
# Canonical race label -> every raw spelling found in the data that collapses into it.
race_variants = {
    'Black (Non-Hispanic)': [
        'Black(Non-Hispanic)',
        'Black (Non-Hispanic)',
        'Black',
    ],
    'Hispanic': [
        'Hispanic',
    ],
    'White Caucasian (Non-Hispanic)': [
        'White Caucasian(Non-Hispanic)',
        'White Caucasian (Non-Hispanic)',
        'White',
    ],
    'Other, unknown, or missing': [
        'MISSING',
        'Other(Includes Not Applicable.. Unknown)',
        'Other (Includes Not Applicable.. Unknown)',
        'Unknown (Includes Not Applicable.. Unknown)',
        'Unknown',
    ],
    'Asian or Pacific Islander': [
        'Asian Or Pacific Islander',
        'Asian or Pacific Islander',
    ],
    'American Indian or Alaskan Native': [
        'American Indian',
        'American Indian Or Alaskan Native',
    ],
}

# Invert into the raw -> canonical lookup that Series.replace expects.
replace_map = {
    raw_label: canonical
    for canonical, raw_labels in race_variants.items()
    for raw_label in raw_labels
}

cases['Race'] = cases['Race'].replace(replace_map)
cases['Race'].value_counts()

Race
White Caucasian (Non-Hispanic)       159627
Black (Non-Hispanic)                 115627
Hispanic                               9319
Other, unknown, or missing             5928
Asian or Pacific Islander              2794
American Indian or Alaskan Native       303
Name: count, dtype: int64

## In what localities (fips) are these disparities most severe?

### Bringing Census Data into Dataset

In [10]:
# Shared-link URLs of the two Excel workbooks that are loaded below with pd.read_excel.
census_race_url = 'https://virginia.box.com/shared/static/i8i5onrkveks849pkky0gwgxlax8d8fe.xlsx'
census_hisp_url = 'https://virginia.box.com/shared/static/fegrn0p0igzl95snji3ku6edwu0hy3dj.xlsx'

In [19]:
# Sheet rows 0-3 and 5-7 are skipped, which leaves row 4 (0-based) as the header row.
census_race = pd.read_excel(census_race_url, skiprows=[0,1,2,3,5,6,7])

# Discard the placeholder columns that pandas labelled "Unnamed: N".
todrop = [col_name for col_name in census_race.columns if 'Unnamed' in col_name]
census_race = census_race.drop(columns=todrop)
census_race

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White
0,1,Accomack County,33246,441,370,9859,79,609,23125
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210
2,5,Alleghany County,14986,126,92,906,17,283,14136
3,7,Amelia County,13268,177,138,2759,15,259,10445
4,9,Amherst County,31273,495,339,6475,55,828,24796
...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018
130,820,Waynesboro city,22550,309,522,3665,35,792,18840
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055


In [23]:
# Sheet rows 0-3 and 5-9 are skipped, which leaves row 4 (0-based) as the header row.
# NOTE(review): the Hispanic count is picked up by its auto-generated label
# 'Unnamed: 6'; confirm that column still holds it if the workbook layout changes.
census_hisp = (
    pd.read_excel(census_hisp_url, skiprows=[0,1,2,3,5,6,7,8,9])
    .loc[:, ['FIPS', 'Unnamed: 6']]
    .rename(columns={'Unnamed: 6': 'Hispanic'})
)
census_hisp.head()

  warn(f"Print area cannot be set to Defined name: {defn.value}.")


Unnamed: 0,FIPS,Hispanic
0,1,3170
1,3,6750
2,5,265
3,7,507
4,9,849


### Merging the 2 Census Datasets

In [26]:
# validate='one_to_one': raise a MergeError unless FIPS is unique in both frames.
# indicator='matched':   add a column saying whether each row's key was found in the
#                        left frame only, the right frame only, or both -- e.g. it
#                        would expose "USA" on one side vs. "United States" on the
#                        other so the labels can be fixed before relying on the merge.
census = census_race.merge(
    census_hisp,
    on='FIPS',
    how='outer',
    validate='one_to_one',
    indicator='matched',
)
census

Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White,Hispanic,matched
0,1,Accomack County,33246,441,370,9859,79,609,23125,3170,both
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210,6750,both
2,5,Alleghany County,14986,126,92,906,17,283,14136,265,both
3,7,Amelia County,13268,177,138,2759,15,259,10445,507,both
4,9,Amherst County,31273,495,339,6475,55,828,24796,849,both
...,...,...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977,4684,both
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018,40525,both
130,820,Waynesboro city,22550,309,522,3665,35,792,18840,2244,both
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055,1183,both


In [27]:
# Tally the merge outcomes; any count outside 'both' would mean a FIPS code present in only one census table.
census['matched'].value_counts()

matched
both          133
left_only       0
right_only      0
Name: count, dtype: int64

In [28]:
# Remove the merge-diagnostic column now that it has been inspected.
# errors='ignore' keeps the cell idempotent: because it reassigns `census` from
# itself, a second run would otherwise raise KeyError once 'matched' is gone.
census = census.drop(columns='matched', errors='ignore')
census

Unnamed: 0,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White,Hispanic
0,1,Accomack County,33246,441,370,9859,79,609,23125,3170
1,3,Albemarle County,113535,1069,7925,12581,242,3210,95210,6750
2,5,Alleghany County,14986,126,92,906,17,283,14136,265
3,7,Amelia County,13268,177,138,2759,15,259,10445,507
4,9,Amherst County,31273,495,339,6475,55,828,24796,849
...,...,...,...,...,...,...,...,...,...,...
128,800,Suffolk city,96194,1052,3115,43068,232,2975,51977,4684
129,810,Virginia Beach city,457672,6241,44491,104827,1846,20837,324018,40525
130,820,Waynesboro city,22550,309,522,3665,35,792,18840,2244
131,830,Williamsburg city,15590,202,1164,2754,37,591,12055,1183


### Aggregate Cases by Race, reshaped

In [29]:
# Show the case table as it stands now, including the added 'convicted' column.
cases

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime,convicted
0,102090000000110,2019-02-28,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
1,343221000000125,2009-12-07,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,False,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
2,343221000000125,2011-01-20,A.46.2-707,covered elsewhere,Misdemeanor,Misdemeanor,3,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
3,343221000000125,2011-07-01,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
4,343221000000125,2012-10-15,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293593,247061000000309,2019-10-04,14.2-81,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
293594,247061000000309,2019-10-18,14.2-81,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
293595,295161000000000,2016-10-04,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
293596,5120000001160,2017-10-04,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True


In [41]:
# Keep only the rows whose boolean 'convicted' flag is True.
convicted = cases.loc[cases['convicted']]
convicted

Unnamed: 0,person_id,HearingDate,CodeSection,codesection,ChargeType,chargetype,Class,DispositionCode,disposition,Plea,...,within10,class1_2,class3_4,expungable,old_expungable,expungable_no_lifetimelimit,reason,sameday,lifetime,convicted
0,102090000000110,2019-02-28,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
1,343221000000125,2009-12-07,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,False,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
2,343221000000125,2011-01-20,A.46.2-707,covered elsewhere,Misdemeanor,Misdemeanor,3,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
3,343221000000125,2011-07-01,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
4,343221000000125,2012-10-15,B.46.2-301,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty,Conviction,,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293593,247061000000309,2019-10-04,14.2-81,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Not eligible,False,Not eligible,Conviction of misdemeanor charges that are not...,False,False,True
293594,247061000000309,2019-10-18,14.2-81,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty In Absentia,Conviction,Tried In Absentia,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
293595,295161000000000,2016-10-04,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,,Guilty,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True
293596,5120000001160,2017-10-04,A.46.2-862,covered elsewhere,Misdemeanor,Misdemeanor,1,Guilty In Absentia,Conviction,,...,True,False,False,Automatic (pending),False,Automatic (pending),Conviction of misdemeanor charges listed in 19...,False,False,True


In [42]:
# Number of conviction rows for every (code section, locality, race) combination,
# returned in long form with the tally in a column named 'count'.
convicted_race = (
    convicted
    .groupby(['CodeSection', 'fips', 'Race'])
    .size()
    .reset_index(name='count')
)
convicted_race

Unnamed: 0,CodeSection,fips,Race,count
0,01-2007,51,White Caucasian (Non-Hispanic),1
1,1,550,Black (Non-Hispanic),3
2,1,550,White Caucasian (Non-Hispanic),1
3,1-12,650,Black (Non-Hispanic),27
4,1-12,650,White Caucasian (Non-Hispanic),6
...,...,...,...,...
27500,Z.18.2-91,840,White Caucasian (Non-Hispanic),2
27501,Z.18.2-91; 26,700,Black (Non-Hispanic),1
27502,Z.18.2-95,67,Black (Non-Hispanic),1
27503,Z.18.2-95,83,Black (Non-Hispanic),1


In [43]:
# Reshape long -> wide: one row per (CodeSection, fips), one column per race,
# with 0 where a race has no convictions for that pair.
# aggfunc='sum' is stated explicitly because pivot_table defaults to 'mean', which
# would silently average counts if a key ever appeared more than once. With the
# unique keys produced by the preceding groupby the values are numerically the same
# (the result may be integer- rather than float-typed depending on pandas version).
convicted_race = convicted_race.pivot_table(index=['CodeSection', 'fips'],
                                            columns='Race',
                                            values='count',
                                            aggfunc='sum',
                                            fill_value=0).reset_index()
convicted_race

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,"Other, unknown, or missing",White Caucasian (Non-Hispanic)
0,01-2007,51,0.0,0.0,0.0,0.0,0.0,1.0
1,1,550,0.0,0.0,3.0,0.0,0.0,1.0
2,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0
3,1-200,29,0.0,0.0,1.0,0.0,0.0,0.0
4,1-200,105,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
18700,Z.18.2-91,840,0.0,0.0,0.0,0.0,0.0,2.0
18701,Z.18.2-91; 26,700,0.0,0.0,1.0,0.0,0.0,0.0
18702,Z.18.2-95,67,0.0,0.0,1.0,0.0,0.0,0.0
18703,Z.18.2-95,83,0.0,0.0,1.0,0.0,0.0,0.0


In [44]:
# List the pivoted column labels (the race names are used verbatim when summing counts below).
convicted_race.columns

Index(['CodeSection', 'fips', 'American Indian or Alaskan Native',
       'Asian or Pacific Islander', 'Black (Non-Hispanic)', 'Hispanic',
       'Other, unknown, or missing', 'White Caucasian (Non-Hispanic)'],
      dtype='object', name='Race')

In [46]:
# Total convictions per (CodeSection, fips) row = sum across the race count columns.
# The columns are spelled out rather than derived from the frame so that re-running
# the cell never folds an already-present 'total_convict' column into the sum.
race_count_cols = [
    'American Indian or Alaskan Native',
    'Asian or Pacific Islander',
    'Black (Non-Hispanic)',
    'Hispanic',
    'Other, unknown, or missing',
    'White Caucasian (Non-Hispanic)',
]
convicted_race['total_convict'] = convicted_race[race_count_cols].sum(axis=1)
convicted_race

Race,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic,"Other, unknown, or missing",White Caucasian (Non-Hispanic),total_convict
0,01-2007,51,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,550,0.0,0.0,3.0,0.0,0.0,1.0,4.0
2,1-12,650,0.0,0.0,27.0,0.0,0.0,6.0,33.0
3,1-200,29,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,1-200,105,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
18700,Z.18.2-91,840,0.0,0.0,0.0,0.0,0.0,2.0,2.0
18701,Z.18.2-91; 26,700,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18702,Z.18.2-95,67,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18703,Z.18.2-95,83,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Merge Reshaped Case Data with Census Data

In [47]:
# Attach each locality's census counts to the per-code-section conviction counts.
# - how='outer' keeps localities that appear in only one frame; 'matched' labels them.
# - validate='many_to_one' raises a MergeError if FIPS is not unique in `census`.
# - Both frames carry a 'Hispanic' column, so pandas' default suffixes turn them into
#   'Hispanic_x' (conviction count) and 'Hispanic_y' (census population).
cases_census = convicted_race.merge(
    census,
    left_on='fips',
    right_on='FIPS',
    how='outer',
    validate='many_to_one',
    indicator='matched',
)

In [48]:
# Tally the merge outcomes: 'left_only' = conviction rows without census data, 'right_only' = census localities without convictions.
cases_census['matched'].value_counts()

matched
both          17691
left_only      1014
right_only        8
Name: count, dtype: int64

In [49]:
# Locality codes that occur in the conviction data but have no census row.
cases_census.loc[cases_census['matched'] == 'left_only', 'fips'].unique()

array([122., 560., 701., 702., 711., 712., 761., 762., 764.])

In [50]:
# Census locality codes that have no conviction rows in this sample.
cases_census.loc[cases_census['matched'] == 'right_only', 'FIPS'].unique()

array([ 95., 580., 660., 678., 683., 685., 720., 735.])

In [51]:
# Show the merged conviction + census table.
cases_census

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic_x,"Other, unknown, or missing",White Caucasian (Non-Hispanic),total_convict,FIPS,Jurisdiction,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White,Hispanic_y,matched
0,10-127,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both
1,10-2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both
2,18.2-102,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both
3,18.2-103,1.0,0.0,0.0,7.0,0.0,0.0,9.0,16.0,1.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both
4,18.2-104,1.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,1.0,Accomack County,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18708,D.18.2-266,840.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,840.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both
18709,G.46.2-878,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both
18710,J.18.2-266,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both
18711,Z.18.2-89,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,Winchester city,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both


In [56]:
# Over-representation ratio: Black share of convictions divided by Black share of the
# locality's population. A value of 1 means proportional; above 1 means over-represented.
# Rows without census data or without convictions come out as NaN.
black_share_of_convictions = cases_census['Black (Non-Hispanic)'] / cases_census['total_convict']
black_share_of_population = cases_census['Black'] / cases_census['Total Population']
cases_census['black_overrep'] = black_share_of_convictions / black_share_of_population

In [57]:
# Show the merged table again, now with the 'black_overrep' column appended.
cases_census

Unnamed: 0,CodeSection,fips,American Indian or Alaskan Native,Asian or Pacific Islander,Black (Non-Hispanic),Hispanic_x,"Other, unknown, or missing",White Caucasian (Non-Hispanic),total_convict,FIPS,...,Total Population,American Indian,Asian,Black,Pacific Islander,Two or more races,White,Hispanic_y,matched,black_overrep
0,10-127,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,0.000000
1,10-2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,0.000000
2,18.2-102,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0,...,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,1.686074
3,18.2-103,1.0,0.0,0.0,7.0,0.0,0.0,9.0,16.0,1.0,...,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,1.475314
4,18.2-104,1.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,1.0,...,33246.0,441.0,370.0,9859.0,79.0,609.0,23125.0,3170.0,both,3.372147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18708,D.18.2-266,840.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,840.0,...,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,0.000000
18709,G.46.2-878,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,...,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,0.000000
18710,J.18.2-266,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,...,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,0.000000
18711,Z.18.2-89,840.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,840.0,...,28136.0,479.0,928.0,3908.0,62.0,1000.0,23797.0,5322.0,both,0.000000
