# Debarment Data Extraction

In [1]:
import tabula
import pandas as pd

In [2]:
# URLs of the two source PDFs: the expired H-2A debarments file (path1)
# and the debarment list file (path2)
path1, path2 = (
    "https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/H-2A_Expired_Debarments_OFLC_webpage_Final_Draft.pdf",
    "https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/Debarment_List.pdf",
)

### Pulling out H-2A debarment data

In [3]:
# extract the tables from every page of the PDF at path1 (lattice mode);
# header=None keeps the first row of each table as data rather than column labels
debar_h2a = tabula.read_pdf(
    path1,
    lattice=True,
    pages="all",
    pandas_options={'header': None},
)
# the third row of the first extracted table is used as the column labels
headers = debar_h2a[0].iloc[2].tolist()
# discard the five leading rows of the first table
debar_h2a[0] = debar_h2a[0].iloc[5:]
# give every extracted table the same column labels
for table in debar_h2a:
    table.columns = headers
# stack all extracted tables into a single frame
h2a_concat = pd.concat(debar_h2a)

In [4]:
# flag rows whose Name contains 'FY'; a missing Name counts as "no match"
is_fy_row = h2a_concat["Name"].str.contains('FY', na=False)
h2a_concat = h2a_concat[~is_fy_row]
# drop rows that are empty in every column, then renumber the index from zero
h2a_final = h2a_concat.dropna(how='all').reset_index(drop=True)

In [5]:
# display the cleaned H-2A frame built in the previous cell
h2a_final

Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date
0,Wagner Farm,"East Windsor, CT",Failure to respond to audit (partial\rresponse),2 years,1/19/2014,1/18/2016
1,J&J Harvesting,"Leads, ND",Failure to respond to audit (partial\rresponse),2 years,1/19/2014,1/18/2016
2,"Stahlman Apiaries, Inc","Selby, SD",Failure to respond to audit (partial\rresponse),1 year,2/19/2015,2/14/2016
3,Trust Nursery,"Pulaski, NY",Failure to respond to audit (partial\rresponse),1 year,3/21/2014,3/20/2015
4,Anton Fertilizer Inc.,"Dighton, KS",Failure to respond to audit (no\rresponse),2 years,3/30/2014,3/29/2016
...,...,...,...,...,...,...
124,Glendale Fruit Farms,"Waterport, New York",Failure to comply with the employer's\robligat...,3 months,4/14/2020,7/7/2020
125,J.C. Castro dba Castro Harvesting*,"Vidalia, Georgia",WHD Debarment,3 years,2/27/2018,2/27/2021
126,Marisa Garcia Pineda- H2A FLC*,"Statonsburg, North Carolina",WHD Debarment,3 years,3/5/2018,3/4/2021
127,"Vasquez Citrus & Hauling, Inc.*","Lake Placid, Florida",WHD Debarment,3 years,3/7/2018,3/6/2021


In [6]:
# count of missing values in each column
h2a_final.isna().sum()

Name           0
City, State    1
Violation      0
Duration       0
Start date     0
End date       0
dtype: int64

In [7]:
# uncomment to save to csv
#h2a_final.to_csv("h2a_debarment.csv")

### Pulling out debarment data

In [51]:
# extract the tables from every page of the PDF at path2 (lattice mode);
# header=None keeps the first row of each table as data rather than column labels
debar = tabula.read_pdf(path2, lattice=True, pages="all", pandas_options={'header': None})
# the third row of the first extracted table is used as the column labels
headers2 = debar[0].values.tolist()[2]
# Cut the excess rows at the beginning: the first four rows of each of the
# first six extracted tables are dropped (presumably title/header rows --
# verify against the current PDF layout). Slicing the list, rather than
# indexing 0..5, avoids an IndexError when fewer than six tables are returned.
n_leading_tables = 6
n_preamble_rows = 4
debar[:n_leading_tables] = [df.iloc[n_preamble_rows:] for df in debar[:n_leading_tables]]
# assign headers to each df in the list of dfs
for df in debar:
    df.columns = headers2
# concat all df
debar_concat = pd.concat(debar)
# cleaning and index: turn carriage returns inside cell text into spaces,
# drop rows that are empty in every column, and renumber the index from zero
debar_final = (
    debar_concat
    .replace('\r', ' ', regex=True)
    .dropna(how='all')
    .reset_index(drop=True)
)

In [53]:
# count of missing values in each column
debar_final.isna().sum()

Entity                 0
Entity Type            0
Employer Location      0
Start of Debarment     0
End of Debarment       0
Violation              0
CFR Citation          73
dtype: int64

In [54]:
# display the cleaned debarment frame built in the cell above
debar_final

Unnamed: 0,Entity,Entity Type,Employer Location,Start of Debarment,End of Debarment,Violation,CFR Citation
0,Grace Yu,Agent,"Duluth, Georgia","May 13, 2020","May 12, 2023",Failure to respond to audit request,20 CFR §§ 656.20 and 656.31(f)(1)(iv)
1,"Harrison Poultry, Inc.",Employer,"Bethlehem, Georgia","June 29, 2018","June 29, 2021",Failure to respond to audit request,20 CFR §§ 656.20 and 656.31(f)(1)(iv)
2,Raul G. Sebazco,Agent,"Miami, Florida","September 2, 2020","September 1, 2023",Participated or facilitated prohibited actions,20 CFR 656.31(f)(1)(i)
3,"Lorenzo Construction, LLC",Employer,"Crofton, Maryland","February 26, 2021","February 26, 2022",Failure to respond to audit request,20 CFR §§ 656.20 and 656.31(f)(1)(iv)
4,"Victory Processing, LLC",Employer,"Gainesville, Georgia","March 31, 2021","March 21, 2022",Failure to respond to audit request,20 CFR §§ 656.20 and 656.31(f)(1)(iv)
...,...,...,...,...,...,...,...
107,"Ph Construction, Inc.",Employer,"Georgetown, Texas","June 25, 2020","June 24, 2023",Failure to Pay the Prevailing Wage,20 CFR 655.73(a)(2) and (f)(1)
108,"Pitts Construction, Inc",Employer,"Austin, Texas","April 3, 2020","April 2, 2023",Failure to Pay the Prevailing Wage,20 CFR §655.73(a)(2) and 20 CFR §655.73(f)(1)
109,Samjosh Excavations & Construction LLC,Employer,"Haledon, New Jersey","August 1, 2019","August 1, 2022",Failure to respond to audit request,20 CFR §655.73(f)(8)
110,"Valdez Lawn Care & Snow Removal, LLC*",Employer,"Hastings, Minnesota","January 27, 2021","January 27, 2024",WHD Debarment,


In [55]:
# uncomment to save to csv
#debar_final.to_csv("debarment_data.csv")