In [1]:
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
import re 
import pandas as pd
from pathlib import Path

In [2]:
def pdf_to_text(filename):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    Every page is read with PyPDF2 and the page texts are concatenated in page
    order. If that yields an empty string (the original comments attribute
    this to scanned/image-only PDFs), the same file is passed to textract with
    the tesseract method instead.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text. It is returned unprocessed, so it may still
        contain layout whitespace and newline characters.

    Notes
    -----
    Uses the legacy PyPDF2 names (``PdfFileReader``, ``numPages``, ``getPage``,
    ``extractText``) exactly as the rest of this notebook expects.
    """
    # Keep all page access inside the `with` block: the reader is built on the
    # open stream, and the handle is closed even if parsing raises.
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        text = "".join(
            pdf_reader.getPage(page_number).extractText()
            for page_number in range(pdf_reader.numPages)
        )

    if text == "":
        # OCR fallback. The previous version passed an undefined name
        # (`fileurl`) here, so this branch could never succeed.
        text = textract.process(filename, method='tesseract', language='eng')
        # Normalise to str so both code paths return the same type and the
        # caller can split the result with str separators.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

    return text

In [3]:
def subset_text_doc(text):
    """Split the profile text into sections and pull out every other table cell.

    The text is cut on the long '=' rule line. The fourth section
    (``splits[3]``) is cut again on " |", the dashed rule string is removed
    from each piece, and the pieces at even positions are kept (the odd
    positions, the "right" tables, are discarded).

    Returns a tuple ``(left_tables, splits)``: the kept pieces and the full
    list of sections.
    """
    section_rule = "==================================================================================================================================================="
    row_rule = '|---------------------------------------------------------------------------------|'

    splits = text.split(section_rule)

    # Cut the fourth section into cells and drop the dashed rule from each one.
    cells = [cell.replace(row_rule, '') for cell in splits[3].split(" |")]

    # Even positions are the left-hand tables; odd positions are not returned.
    left_tables = cells[0::2]

    return left_tables, splits

In [6]:
def get_clean_summary_table(left_tables, splits):
    '''
    Build a one-row-per-county summary DataFrame from the parsed profile text.

    Parameters
    ----------
    left_tables : list of str
        Table cells as returned by subset_text_doc. Cells 3-9 are read as the
        population summary and cells 29-38 as the race summary.
    splits : list of str
        The document sections as returned by subset_text_doc. Only splits[0]
        is used, to find the county name.

    Returns
    -------
    pandas.DataFrame
        The population columns merged with the race columns on "County".

    Notes
    -----
    Labels and numbers are extracted by two separate regex passes and paired
    purely by position, so both passes must yield the same number of items.
    The population frame is then renamed positionally with 15 fixed names, so
    the population cells must produce exactly 14 label/number pairs.

    NOTE(review): the regex patterns are ordinary (non-raw) strings containing
    backslash escapes such as "\\,"; newer Python versions warn about these.
    '''

    # Population summary: cells 3..9 of the left tables.
    pop_sum = left_tables[0:10]
    pop_sum_d = pop_sum[3:10]

    pop_sum_clean_index = []

    # Pass 1 (population): collect the text labels.
    for s in pop_sum_d:
        s = s.replace("|", "")
        # Spell out the "18" in the age labels; the label regex below only
        # matches letters and spaces, so digits would otherwise cut the label.
        s = s.replace("Under 18", "Under Eightteen")
        s = s.replace("18 years", "Eighteen years")
        re_s = re.findall("[a-zA-Z ]+", s)
        for string in re_s:
            string = string.rstrip().lstrip()
            # NOTE(review): unlike the race loop further down, empty labels
            # (runs of spaces only) are not filtered out here - confirm intended.
            pop_sum_clean_index.append(string)

    pop_sum_clean_num = []

    # Pass 2 (population): collect the numbers from the same cells.
    for s in pop_sum_d:
        s = s.replace("|", "")
        # Same replacements as above so the "18" in the labels is not picked
        # up as a value.
        s = s.replace("Under 18", "Under Eightteen")
        s = s.replace("18 years", "Eighteen years")
        # Digits with optional thousands commas and an optional decimal point.
        num = re.findall("[0-9]?[0-9]?[0-9]?\,?[0-9]?[0-9]?[0-9]?\,?\.?[0-9]+", s) 
        for number in num:
            number = number.replace(",","")
            pop_sum_clean_num.append(float(number))
    
    # Race summary: cells 29..38 of the left tables.
    race_sum = left_tables[29:39]
            
    race_sum_clean_index = []

    # Pass 1 (race): collect the labels. Inside the character class the "?"
    # is a literal, so labels may contain letters, spaces and , ? / ( ).
    for s in race_sum:
        s = s.replace("|", "")
        re_s = re.findall("[a-zA-Z\,?\/?\(?\)? ]+", s)
        for string in re_s:
            string = string.rstrip().lstrip()
            # Drop matches that are only a comma or only whitespace.
            if string != ',' and string != '':
                race_sum_clean_index.append(string)
                
    race_sum_clean_num = []

    # Pass 2 (race): collect the numbers. This pattern needs at least two
    # digits to match.
    for s in race_sum:
        s = s.replace("|", "")
        num = re.findall("[0-9]+\,?[0-9]?[0-9]?[0-9]?\,?\.?[0-9]+", s)
        #print(num)
        # Only the first number found in each cell is kept.
        for number in num[:1]:
            #print(number)
            number = number.replace(",","")
            race_sum_clean_num.append(float(number))

    # First "<name> County" match in the first section; [:-7] removes the
    # trailing " County" (7 characters), leaving just the name.
    county = re.findall("[a-zA-Z]+\ ?[a-zA-Z]+ County", splits[0])[0][:-7]
    # Two-column frame (labels, values) transposed into a single data row:
    # the label row becomes the header and the county name ends up in the
    # "index" column created by reset_index().
    pop_dict = {"county":pop_sum_clean_index,county:pop_sum_clean_num}
    pop_df = pd.DataFrame(pop_dict)
    pop_df = pop_df.T
    pop_df.columns = pop_df.iloc[0]
    pop_df = pop_df.iloc[1:]
    pop_df = pop_df.reset_index()
    
    # Same reshaping for the race figures, keeping the extracted labels as
    # the column names.
    race_dict = {"county":race_sum_clean_index,county:race_sum_clean_num}
    race_df = pd.DataFrame(race_dict)
    race_df = race_df.T
    race_df.columns = race_df.iloc[0]
    race_df = race_df.iloc[1:]
    race_df = race_df.reset_index()
    race_df = race_df.rename(columns = {"index":"County"})
    # The index was already reset above, so this second reset does not
    # change the frame.
    race_df.reset_index(drop=True, inplace=True)

    # Positional rename: the extracted population labels are replaced by
    # these 15 fixed names (the first one names the county column).
    # NOTE(review): the last three names repeat names used earlier in the
    # list, so the frame ends up with duplicate column labels - confirm what
    # those trailing columns are meant to be called.
    pop_df.columns = ["County",'Total Population', 'Total Families', 'Total Households',
       'Population in Families', 'Population in Households',
       'Family: Under Eightteen years','House: Under Eightteen years',
       'Family: Eighteen years and over', 'House: Eighteen years and over',
       'Average Family Size', 'Average Household Size',
       'Family: Under Eightteen years', 'House: Under Eightteen years',
       'Family: Eighteen years and over']
    
    # Join the population and race columns on the county name.
    race_pop_sum_df = pop_df.merge(race_df, left_on = "County", right_on = "County")

    return race_pop_sum_df

In [7]:
# Folder that holds the county profile PDFs. This is the only place the
# location is spelled out, so point it elsewhere to run on another machine.
CENSUS_DIR = Path("/Users/tsbloxsom/Google Drive/2020 Projects/Census Project")

# Every PDF below CENSUS_DIR, including sub-folders.
pathlist = CENSUS_DIR.rglob('*.pdf')

# One summary frame per PDF. Kept under its own name so that `appended_data`
# is only ever the final DataFrame and re-running this cell is safe.
county_frames = []

for path in pathlist:
    # open() inside pdf_to_text wants a string, not a Path object.
    path_in_str = str(path)
    print(path_in_str)

    # Pass the full path. The earlier version sliced off a fixed 59-character
    # prefix, which only worked when the working directory was the data
    # folder and the folder path had exactly that length.
    text = pdf_to_text(path_in_str)

    left_tables, splits = subset_text_doc(text)

    county_frames.append(get_clean_summary_table(left_tables, splits))

# Stack the per-county frames into the combined table used by later cells.
appended_data = pd.concat(county_frames)

\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Anderson_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Andrews_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Angelina_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Aransas_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Archer_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Armstrong_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Atascosa_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Austin_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Bailey_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Bandera_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Bastrop_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\

\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hale_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hall_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hamilton_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hansford_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hardeman_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hardin_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Harrison_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Harris_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hartley_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Haskell_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Hays_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Proj

\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Presidio_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Rains_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Randall_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Reagan_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Real_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Red_River_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Reeves_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Refugio_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Robertson_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Roberts_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\2020 Projects\Census Project\Rockwall_2010_SF1_Profile.pdf
\Users\tsbloxsom\Google Drive\202

In [None]:
#appended_data = pd.concat(appended_data)

In [8]:
# Show the combined per-county table built by pd.concat above as this cell's output.
appended_data

Unnamed: 0,County,Total Population,Total Families,Total Households,Population in Families,Population in Households,Family: Under Eightteen years,House: Under Eightteen years,Family: Eighteen years and over,House: Eighteen years and over,...,Family: Eighteen years and over.1,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian/Other Pacific Islander alone,Some Other Race alone,Two or More Races,Hispanic or Latino (Any race),"White alone, not Hispanic"
0,Anderson County,58458,12158,17218,37417,44241,11297,11471,26120,32770,...,2.15,13660,2437,92,68,22,735,220,1534,12958
0,Andrews County,14786,3976,5259,12922,14704,4263,4312,8659,10392,...,2.18,4315,76,51,31,1,707,78,2117,2973
0,Angelina County,86771,22429,31090,71618,83818,22794,23170,48824,60648,...,2.18,23686,4624,149,243,15,2045,338,4108,21841
0,Aransas County,23158,6536,9795,18281,22699,4405,4519,13876,18180,...,2.12,8826,106,80,154,1,465,163,1692,7684
0,Archer County,9054,2597,3538,7786,8998,2153,2185,5633,6813,...,2.17,3376,10,25,15,3,83,37,173,3295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Wood County,41964,12149,17118,34315,40861,8295,8507,26020,32354,...,2.14,15733,655,98,45,15,419,163,861,15346
0,Yoakum County,7879,2120,2643,7112,7826,2480,2501,4632,5325,...,2.18,2062,30,26,12,2,457,55,1280,1299
0,Young County,18550,5223,7343,15436,18271,4399,4458,11037,13813,...,2.11,6819,83,54,21,0,275,91,818,6337
0,Zapata County,14018,3423,4297,12729,13988,4768,4805,7961,9183,...,2.33,4014,28,16,11,0,225,23,3820,448


In [9]:
# Save the combined table to a CSV file; index=False leaves the row index out of the file.
appended_data.to_csv('5. County Stats.csv', index = False)