In [1]:
# we're going to be doing this the ocky way
# first we put the bacon on the grill

# Converting the 86R dataset to usable .txt files

In [2]:
#we first need to import the packages we are going to use
import os
from ftplib import FTP
import pandas as pd
from pdfminer.high_level import extract_text

In [3]:
# now we access the FTP directory using the link given by the TX legislature website
ftp = FTP('ftp.legis.state.tx.us')
ftp.login() # since this is a public server we don't need credentials

# we now change working directories to the house_bills folder for the 86R
# this contains the folders which have 100 pdf files
ftp.cwd('bills/86R/billtext/pdf/house_bills')

# store the folder names of all the house_bills along with raw ftp data
raw_ftp_house_bills_folder_names = []
ftp.dir(raw_ftp_house_bills_folder_names.append)

print(raw_ftp_house_bills_folder_names[0:2])

['05-27-19  04:30PM       <DIR>          HB00001_HB00099', '05-27-19  12:20PM       <DIR>          HB00100_HB00199']


In [4]:
# Obviously the ftp.dir() is getting some data we don't need since we just want the folder names
# So we truncate the information we don't need and just save the folder name
house_bill_folder_names = []

for folder in raw_ftp_house_bills_folder_names:
    # get the start of the file name
    start_index = folder.find("HB")
    # add just the folder name to our new array by truncating the string
    house_bill_folder_names.append(folder[start_index: ])

print(house_bill_folder_names[0:2])

['HB00001_HB00099', 'HB00100_HB00199']


In [5]:
# we will need a csv to store some of the information about the bills
# let's create a pandas dataframe with a column for the link/directory each .txt file is located at
df = pd.DataFrame({"Link":[]})

# we need to keep track of the location of each file so we can store it in our pandas dataframe
txt_file_links = []

In [7]:
# Now we can start navigating into the folder and downloading the files
# first we want to save our current path as the root for our house bills
os.chdir(r'C:\Users\hurle\TGIF_project\notebooks')
for folder in house_bill_folder_names:
    # change our working directory to the folder being iterated over
    with FTP('ftp.legis.state.tx.us') as ftp:
        ftp.login()
        ftp.cwd('bills/86R/billtext/pdf/house_bills')

        house_bill_root = ftp.pwd()
        print(house_bill_root)
        #house_bill_folder_names = house_bill_folder_names[1:]

        curr_directory = house_bill_root +'/' + folder
        print(curr_directory)
        ftp.cwd(curr_directory)

        #store all the filenames in the current directory in files
        files = []
        ftp.dir(files.append)

        #make array to store files we want
        i_bills = []

        for file in files:
            if "I.pdf" in file:
                start_index = file.find("HB") #truncate the file the ftp.dir() gets that we don't want
                i_bills.append(file[start_index:])
        
        # we want to check if the files have already been downloaded and processed, if so, then 
        # we can skip downloading and converting them
        i_bills_temp = []
        
        # we need to check if the .txt file already exists in /txt_files
        os.chdir("txt_files")
        txt_files = os.listdir(os.getcwd())
        os.chdir("..")
        
        for filename in i_bills:
            if filename[:len(filename) - 4] + ".txt" not in txt_files:
               i_bills_temp.append(filename)
               
        i_bills = i_bills_temp
        
        #now we try to download the files
        for filename in i_bills:
            #we open the file to write to
            with open(filename, "wb") as file:
                #and use ftp's retrbinary to write to the file
                ftp.retrbinary(f"RETR {filename}", file.write)



        # we iterate over each .pdf and store save it as a .txt file in the /txt_files folder
        for file in i_bills:
            #extract and save the text from the file
            text = extract_text(file)

            #make new file in /txt_files
            os.chdir('txt_files')
            txt_filename = file[:len(file) - 4] + ".txt"
            with open(txt_filename, "w") as f:
                try:
                    f.write(text)
                    f.close()
                    # add the .txt file name to our txt_file_links folder
                    txt_file_links.append(os.getcwd() + '/' + txt_filename)
                except:
                    print("%s could not be added" %(txt_filename))
                    os.chdir("..")
                    continue


            os.chdir("..")

        # remove files and prepare to download new files
        for file in i_bills:
            os.remove(file)

    ftp.close()


/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00001_HB00099
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00100_HB00199
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00200_HB00299
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00300_HB00399
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00400_HB00499
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00500_HB00599
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00600_HB00699
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00700_HB00799
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00800_HB00899
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB00900_HB00999
/bills/86R/billtext/pdf/house_bills
/bills/86R/billtext/pdf/house_bills/HB01000_HB01099
/bills/86R/billtext/pdf/house_bi

In [8]:
# update our pandas dataframe with the file locations
df = pd.DataFrame({"Link":txt_file_links})

In [9]:
# add columns for the legislative body and the legislative session
df['Legislative Body'] = "House"
df['Session'] = '86R'

In [10]:
# print our current dataframe
df.head()

Unnamed: 0,Link,Legislative Body,Session
0,C:\Users\hurle\TGIF_project\notebooks\txt_file...,House,86R
1,C:\Users\hurle\TGIF_project\notebooks\txt_file...,House,86R
2,C:\Users\hurle\TGIF_project\notebooks\txt_file...,House,86R
3,C:\Users\hurle\TGIF_project\notebooks\txt_file...,House,86R
4,C:\Users\hurle\TGIF_project\notebooks\txt_file...,House,86R


In [11]:
# modify the Link so that the root is the project directory
for index, row in df.iterrows():
    row['Link'] = row['Link'][row['Link'].find("TGIF"):]

df.head()

Unnamed: 0,Link,Legislative Body,Session
0,TGIF_project\notebooks\txt_files/HB01000I.txt,House,86R
1,TGIF_project\notebooks\txt_files/HB01001I.txt,House,86R
2,TGIF_project\notebooks\txt_files/HB01002I.txt,House,86R
3,TGIF_project\notebooks\txt_files/HB01003I.txt,House,86R
4,TGIF_project\notebooks\txt_files/HB01004I.txt,House,86R


In [12]:
# save our dataframe as a csv to the txt_files directory
df.to_csv('txt_files/directory.csv')

In [20]:
# remove all pdf files from the txt_files directory
txt_files = os.listdir(os.getcwd() + '\\txt_files')
os.chdir("txt_files")
files_to_be_removed = []

print(txt_files)

for file in txt_files:
    if "pdf" in file:
        files_to_be_removed.append(file)

for file in files_to_be_removed:
    os.remove(file)
    
os.chdir("..")

['directory.csv', 'HB00001I.txt', 'HB00002I.txt', 'HB00003I.txt', 'HB00004I.txt', 'HB00005I.txt', 'HB00006I.txt', 'HB00007I.txt', 'HB00008I.txt', 'HB00009I.txt', 'HB00010I.txt', 'HB00011I.txt', 'HB00012I.txt', 'HB00013I.txt', 'HB00014I.txt', 'HB00015I.txt', 'HB00016I.txt', 'HB00017I.txt', 'HB00018I.txt', 'HB00019I.txt', 'HB00020I.txt', 'HB00021I.txt', 'HB00022I.txt', 'HB00023I.txt', 'HB00024I.txt', 'HB00025I.txt', 'HB00026I.txt', 'HB00027I.txt', 'HB00028I.txt', 'HB00029I.txt', 'HB00030I.txt', 'HB00031I.txt', 'HB00032I.txt', 'HB00033I.txt', 'HB00034I.txt', 'HB00035I.txt', 'HB00036I.txt', 'HB00037I.txt', 'HB00038I.txt', 'HB00039I.txt', 'HB00040I.txt', 'HB00041I.txt', 'HB00042I.txt', 'HB00043I.txt', 'HB00044I.txt', 'HB00045I.txt', 'HB00046I.txt', 'HB00047I.txt', 'HB00048I.txt', 'HB00049I.txt', 'HB00050I.txt', 'HB00051I.txt', 'HB00052I.txt', 'HB00053I.txt', 'HB00054I.txt', 'HB00055I.txt', 'HB00056I.txt', 'HB00057I.txt', 'HB00058I.txt', 'HB00059I.txt', 'HB00060I.txt', 'HB00061I.txt', 'HB000