In [1]:
import numpy as np
import pandas as pd
import re 
import datetime

# DOL Data Exploration

## H2A Violation Data

In [2]:
# Download the zipped WHD enforcement extract and load it into a DataFrame.
url = "https://enfxfr.dol.gov/data_catalog/WHD/whd_whisard_20210415.csv.zip"
# The column at position 7 is read as pandas 'string' dtype rather than inferred.
raw_dol = pd.read_csv(url, index_col=None, dtype={7: 'string'})

# Parse both findings-period columns; values that cannot be parsed become NaT.
for date_col in ('findings_start_date', 'findings_end_date'):
    raw_dol[date_col] = pd.to_datetime(raw_dol[date_col], errors='coerce')

n_rows, n_cols = raw_dol.shape
print(f'raw dataframe has {n_cols} columns and {n_rows} rows')

raw dataframe has 110 columns and 313928 rows


In [3]:

# Subset the raw extract to cases that (a) are in the catchment states,
# (b) have at least one H2A violation, and (c) were loaded in 2016 or later.
list_states= ['TX', 'MS', 'LA', 'KY', 'AL', 'TN']

in_catchment = raw_dol["st_cd"].isin(list_states)
has_h2a_violation = raw_dol["h2a_violtn_cnt"] > 0
h2a_candidates = raw_dol[in_catchment & has_h2a_violation]

# ld_dt has not been parsed yet, so comparing it to "2016-1-1" would be a
# lexicographic string comparison: a zero-padded value such as "2016-03-01"
# sorts *before* "2016-1-1" and would be dropped. Parse the dates and compare
# on the calendar year instead (year >= 2016 is the same cut as >= 2016-01-01).
# Values that cannot be parsed become NaT and are excluded by the comparison.
load_year = pd.to_datetime(h2a_candidates["ld_dt"], errors="coerce").dt.year
raw_dol_states = h2a_candidates[load_year >= 2016]
print(f'When we subset to catchment state H2A violations and post 2016, we have {raw_dol_states.shape[0]} rows')


When we subset to catchment state H2A violations and post 2016, we have 329 rows


In [4]:
# Build the analysis frame from the filtered subset.
# Take an explicit copy: raw_dol_states is a filtered slice of raw_dol, and
# assigning columns through a plain alias raises SettingWithCopyWarning and
# silently alters the frame produced by the previous cell.
h2a_states = raw_dol_states.copy()

# Parse the load date once; keep the calendar date in ld_dt and its year in `year`.
load_dates = pd.to_datetime(h2a_states['ld_dt'])
h2a_states['ld_dt'] = load_dates.dt.date
h2a_states['year'] = load_dates.dt.year

# Single display/grouping key "street, city, state". The parts are concatenated
# as-is (no case or punctuation normalisation), so spelling variants of one
# location count as different addresses; a missing part makes the address NaN.
h2a_states['address'] = h2a_states.street_addr_1_txt + ', ' + h2a_states.cty_nm + ', ' + h2a_states.st_cd
h2a_states.shape

(329, 112)

In [5]:
# Per-state summary of the H2A subset, ordered by distinct case count (largest first).
state_summary_spec = {
    'cases': pd.NamedAgg(column='case_id', aggfunc='nunique'),
    'addresses': pd.NamedAgg(column='address', aggfunc='nunique'),
    'violations': pd.NamedAgg(column='h2a_violtn_cnt', aggfunc='sum'),
    'cmp_dollars': pd.NamedAgg(column='h2a_cmp_assd_amt', aggfunc='sum'),
    'first_load_date': pd.NamedAgg(column='ld_dt', aggfunc='min'),
    'last_load_date': pd.NamedAgg(column='ld_dt', aggfunc='max'),
}
agg_by_state = (
    h2a_states
    .groupby('st_cd')
    .agg(**state_summary_spec)
    .sort_values('cases', ascending=False)
)
agg_by_state

Unnamed: 0_level_0,cases,addresses,violations,cmp_dollars,first_load_date,last_load_date
st_cd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KY,78,76,1285,421809.2,2016-11-02,2021-04-15
MS,70,65,3182,598784.56,2016-11-07,2021-04-15
TX,58,58,1111,447777.15,2016-11-02,2021-04-15
LA,54,54,1389,320911.03,2016-11-02,2021-04-15
TN,40,39,863,238350.2,2016-11-02,2021-01-27
AL,29,27,230,83295.6,2017-07-18,2021-04-15


In [6]:
# Per-year summary of the H2A subset, most recent year first.
year_summary_spec = {
    'cases': pd.NamedAgg(column='case_id', aggfunc='nunique'),
    'addresses': pd.NamedAgg(column='address', aggfunc='nunique'),
    'violations': pd.NamedAgg(column='h2a_violtn_cnt', aggfunc='sum'),
    'cmp_dollars': pd.NamedAgg(column='h2a_cmp_assd_amt', aggfunc='sum'),
}
yearly_groups = h2a_states.groupby('year')
agg_by_year = yearly_groups.agg(**year_summary_spec).sort_values('year', ascending=False)

agg_by_year

Unnamed: 0_level_0,cases,addresses,violations,cmp_dollars
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021,26,26,304,116322.28
2020,89,89,2527,775809.96
2019,44,44,1133,112069.55
2018,74,73,1430,569280.65
2017,63,62,1560,227482.8
2016,33,33,1106,309962.5


In [7]:
# Emit both summary tables as LaTeX source: the state table first, then the year table.
for summary_table in (agg_by_state, agg_by_year):
    print(summary_table.to_latex())

\begin{tabular}{lrrrrll}
\toprule
{} &  cases &  addresses &  violations &  cmp\_dollars & first\_load\_date & last\_load\_date \\
st\_cd &        &            &             &              &                 &                \\
\midrule
KY    &     78 &         76 &        1285 &    421809.20 &      2016-11-02 &     2021-04-15 \\
MS    &     70 &         65 &        3182 &    598784.56 &      2016-11-07 &     2021-04-15 \\
TX    &     58 &         58 &        1111 &    447777.15 &      2016-11-02 &     2021-04-15 \\
LA    &     54 &         54 &        1389 &    320911.03 &      2016-11-02 &     2021-04-15 \\
TN    &     40 &         39 &         863 &    238350.20 &      2016-11-02 &     2021-01-27 \\
AL    &     29 &         27 &         230 &     83295.60 &      2017-07-18 &     2021-04-15 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrr}
\toprule
{} &  cases &  addresses &  violations &  cmp\_dollars \\
year &        &            &             &              \\
\midrule
2021 &   

In [8]:
# Count distinct cases and distinct addresses per employer legal name,
# then show only the employers that appear in more than one case.
employer_count = (
    h2a_states
    .groupby('legal_name')
    .agg(cases=('case_id', 'nunique'), addresses=('address', 'nunique'))
)
has_multiple_cases = employer_count['cases'] > 1
employer_count.loc[has_multiple_cases].sort_values('cases', ascending=False)


Unnamed: 0_level_0,cases,addresses
legal_name,Unnamed: 1_level_1,Unnamed: 2_level_1
"H2A Complete II, Inc.",3,3
"Black Gold Farms, Inc",2,2
"Bonnie Plants, Inc.",2,2
"Eubanks Produce, Inc.",2,1
Martin's Nursery Inc.,2,2
"Servico, Inc.",2,2
Southside Tobacco Growers Association,2,2
Tims Family Farm,2,1


In [9]:
# Show the case rows recorded under one of the repeat employer names.
is_black_gold = h2a_states['legal_name'] == 'Black Gold Farms, Inc'
h2a_states.loc[is_black_gold, ['case_id', 'address', 'legal_name', 'ld_dt']]

Unnamed: 0,case_id,address,legal_name,ld_dt
171860,1759387,"2305 HWY 17, Delhi, LA","Black Gold Farms, Inc",2016-11-02
287608,1864368,"2305 Hwy. 17, Delhi, LA","Black Gold Farms, Inc",2020-01-27


In [10]:
# Count distinct cases and distinct employer names per address,
# then list the first ten addresses that appear in more than one case.
address_count = (
    h2a_states
    .groupby('address')
    .agg(cases=('case_id', 'nunique'), employers=('legal_name', 'nunique'))
)
shared_addresses = address_count.loc[address_count['cases'] > 1]
shared_addresses.sort_values('cases', ascending=False).head(10)

Unnamed: 0_level_0,cases,employers
address,Unnamed: 1_level_1,Unnamed: 2_level_1
"124 Jimmy Beckley Drive, Bruce, MS",3,3
"100 Moseley Rd., Abbeville, AL",2,2
"110 Tims Lane, Ripley, TN",2,1
"222 Highway 341 South, Vardaman, MS",2,2
"3266 Lower Jackstown Road, Carlisle, KY",2,2
"331 Produce Road, Lucedale, MS",2,1
"5375 MS Hwy 404, Gore Springs, MS",2,2
"5746 County Road 40, Shorter, AL",2,2
"9408 Mulligan Road, Owensboro, KY",2,2


In [11]:
# All cases recorded at the address with the most distinct cases.
at_jimmy_beckley = h2a_states['address'] == '124 Jimmy Beckley Drive, Bruce, MS'
Jimmy_Beckley = h2a_states.loc[at_jimmy_beckley, ['case_id', 'address', 'legal_name', 'ld_dt']]
Jimmy_Beckley

Unnamed: 0,case_id,address,legal_name,ld_dt
260315,1708970,"124 Jimmy Beckley Drive, Bruce, MS","Lewis M. Bailey, IV Farms, Inc.",2017-07-18
265906,1775153,"124 Jimmy Beckley Drive, Bruce, MS","Lewis M. Bailey, IV Farm Inc.",2018-03-24
284763,1884517,"124 Jimmy Beckley Drive, Bruce, MS","Lewis M. Bailey Farms, Inc.",2020-03-17


In [12]:
# All cases recorded at an address shared by two different employer names.
at_mulligan = h2a_states['address'] == '9408 Mulligan Road, Owensboro, KY'
Mulligan = h2a_states.loc[at_mulligan, ['case_id', 'address', 'legal_name', 'ld_dt']]
Mulligan

Unnamed: 0,case_id,address,legal_name,ld_dt
277617,1826254,"9408 Mulligan Road, Owensboro, KY","Cecil Tobacco Company, LLC",2017-11-06
309020,1828480,"9408 Mulligan Road, Owensboro, KY",Los Villatoros Harvesting LLC,2020-07-16


In [13]:
# Emit the two address case-history tables as LaTeX source, in the order they were built.
for address_table in (Jimmy_Beckley, Mulligan):
    print(address_table.to_latex())

\begin{tabular}{lrlll}
\toprule
{} &  case\_id &                             address &                       legal\_name &       ld\_dt \\
\midrule
260315 &  1708970 &  124 Jimmy Beckley Drive, Bruce, MS &  Lewis M. Bailey, IV Farms, Inc. &  2017-07-18 \\
265906 &  1775153 &  124 Jimmy Beckley Drive, Bruce, MS &    Lewis M. Bailey, IV Farm Inc. &  2018-03-24 \\
284763 &  1884517 &  124 Jimmy Beckley Drive, Bruce, MS &      Lewis M. Bailey Farms, Inc. &  2020-03-17 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrlll}
\toprule
{} &  case\_id &                            address &                     legal\_name &       ld\_dt \\
\midrule
277617 &  1826254 &  9408 Mulligan Road, Owensboro, KY &    Cecil Tobacco Company,  LLC &  2017-11-06 \\
309020 &  1828480 &  9408 Mulligan Road, Owensboro, KY &  Los Villatoros Harvesting LLC &  2020-07-16 \\
\bottomrule
\end{tabular}

