# Debarment Data Extraction

In [28]:
import tabula
import pandas as pd
import os

In [6]:
# Source PDFs published by the Department of Labor.
# If these URLs ever move, point the variables at local copies
# in the raw data directory instead.
# path1: expired H-2A debarments; path2: debarment list.
path1 = "https://www.foreignlaborcert.doleta.gov/pdf/H-2A_Expired_Debarments_OFLC_webpage_Final_Draft.pdf"
path2 = "https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/Debarment_List.pdf"

### Pulling out expired H-2A debarment data

In [7]:
# Extract the tables from every page of the expired H-2A PDF.
# header=None keeps pandas from promoting a data row to column names,
# so each returned frame starts with positional columns.
debar_h2a = tabula.read_pdf(
    path1,
    lattice=True,
    pages="all",
    pandas_options={'header': None},
)

# The column names sit in the third row (index 2) of the first table.
first_table = debar_h2a[0]
headers = first_table.values.tolist()[2]

# Drop the five leading rows of the first table so only records remain.
debar_h2a[0] = first_table.iloc[5:]

# Give every table the same column names so they line up when stacked.
for table in debar_h2a:
    table.columns = headers

# Stack all per-table frames into one frame.
h2a_concat = pd.concat(debar_h2a)

In [16]:
# Let the notebook display up to 200 rows of a frame before truncating.
pd.options.display.max_rows = 200


In [18]:
# Remove non-record rows and fully empty rows, then renumber.
# Rows whose Name contains 'FY' are dropped (missing names count as no match).
# NOTE(review): presumably these are fiscal-year divider rows; the substring
# match would also drop any real entity whose name contains "FY" -- confirm.
is_fy_row = h2a_concat.Name.str.contains('FY', na=False)
h2a_concat = h2a_concat[~is_fy_row]

# Discard rows where every column is missing and rebuild a 0..n-1 index.
h2a_final = h2a_concat.dropna(how='all').reset_index(drop=True)

In [20]:
# Count the missing values remaining in each column of the H-2A table.
h2a_final.isna().sum()

Name           0
City, State    1
Violation      0
Duration       0
Start date     0
End date       0
dtype: int64

### Pulling out current debarment data

In [21]:
# Extract the tables from every page of the debarment list PDF.
# header=None keeps pandas from promoting a data row to column names.
debar = tabula.read_pdf(
    path2,
    lattice=True,
    pages="all",
    pandas_options={'header': None},
)

# The column names sit in the third row (index 2) of the first table.
headers2 = debar[0].values.tolist()[2]

# Trim the four leading rows from each of the first six tables.
# NOTE(review): the count of six is hard-coded; if fewer than six tables
# are extracted this raises IndexError -- confirm against the current PDF.
for idx in range(6):
    debar[idx] = debar[idx].iloc[4:]

# Give every table the same column names so they line up when stacked.
for table in debar:
    table.columns = headers2

# Stack all per-table frames into one frame.
debar_concat = pd.concat(debar)

# Replace carriage returns inside cells with spaces, drop rows where every
# column is missing, and rebuild a 0..n-1 index.
debar_final = (
    debar_concat
    .replace('\r', ' ', regex=True)
    .dropna(how='all')
    .reset_index(drop=True)
)

In [22]:
# Count the missing values remaining in each column of the debarment table.
debar_final.isna().sum()

Entity                 0
Entity Type            0
Employer Location      0
Start of Debarment     0
End of Debarment       0
Violation              0
CFR Citation          73
dtype: int64

In [33]:
# Check whether any entity appears in both debarment tables.
# Names are upper-cased first so the comparison ignores letter case.
debar_ent_cap = debar_final["Entity"].str.upper()
debar_name_cap = h2a_final["Name"].str.upper()

# Number of names shared by the two tables. The names are only partially
# cleaned (upper-casing only), so this is an exact-string comparison.
shared_names = set(debar_ent_cap).intersection(debar_name_cap)
len(shared_names)

0

In [30]:
# Save both cleaned tables to CSV.
# Set WRITE_CSV to False to re-run the notebook without rewriting the files.
WRITE_CSV = True
if WRITE_CSV:
    current_csv = "../data/intermediate/current_debarment.csv"
    expired_csv = "../data/intermediate/expired_h2a_debarment.csv"

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    for csv_path in (current_csv, expired_csv):
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    # index=False: the 0..n-1 index carries no information worth saving.
    debar_final.to_csv(current_csv, index=False)
    h2a_final.to_csv(expired_csv, index=False)
