In [3]:
import numpy as np
import pandas as pd
import tabula

In [31]:
def pdf_to_csv_2016(pdf_file):
    """Extract the 2016 claims tables from a PDF into one cleaned DataFrame.

    All tables tabula finds on all pages are stacked into a single frame, the
    first stacked row is promoted to the column header, and every row that
    merely repeats the header is removed.  Rows without a ``Claim Number`` are
    treated as fragments of a record that tabula split in two: each row that
    does have a claim number gets the ``Airport Name`` text of the row directly
    above it prepended (space separated), and the fragment rows are dropped.

    Parameters
    ----------
    pdf_file
        Passed unchanged to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame
        Only the rows that carry a ``Claim Number``.  The index is not reset,
        so it keeps the row positions of the stacked tables.

    Raises
    ------
    KeyError
        If the promoted header has no ``Claim Number`` or ``Airport Name``
        column.
    """
    page_tables = tabula.read_pdf(pdf_file, multiple_tables=True, pages='all')
    df = pd.concat(page_tables, axis=0, ignore_index=True)

    # Promote the first stacked row to the header, then discard every row that
    # just repeats the header text.
    df.columns = df.iloc[0]
    df = df[df['Claim Number'] != 'Claim Number']

    # More than 11 columns: drop the last one (described by the original author
    # as an empty trailing column).
    if len(df.columns) > 11:
        df = df.iloc[:, :-1]

    # Take an explicit copy so the column assignment below writes to this frame
    # rather than to a view of the filtered/sliced intermediate (the original
    # chained ``df[col].iloc[i] = ...`` writes are not guaranteed to stick).
    df = df.copy()

    # ``notna()`` recognises every NaN; the previous ``is not np.nan`` identity
    # test treated any NaN that was not the np.nan singleton as a real value.
    has_claim_number = df['Claim Number'].notna().tolist()

    # Airport names as plain text so they can be concatenated; a missing name
    # becomes the literal text 'nan'.
    airport_names = [str(name) for name in df['Airport Name']]

    # Sequential on purpose: row i reads the (possibly already merged) text of
    # row i-1, exactly like the original in-place loop.
    # NOTE(review): the row above is prepended even when it is itself a complete
    # record; presumably every record in the source PDF is preceded by a
    # fragment row -- confirm against the data.
    for i in range(1, len(airport_names)):
        if has_claim_number[i]:
            airport_names[i] = airport_names[i - 1] + ' ' + airport_names[i]
    df['Airport Name'] = airport_names

    # Fragment rows have served their purpose; keep only rows with a claim number.
    return df.dropna(subset=['Claim Number'])

In [81]:
def pdf_to_csv_2017(pdf_file):
    """Extract the 2017 claims tables from a PDF into one cleaned DataFrame.

    tabula is run with ``guess=False, stream=True``; all tables from all pages
    are stacked into a single frame, the first stacked row is promoted to the
    column header, and rows that merely repeat the header are removed.  Rows
    without a ``Claim Type`` are treated as fragments of a split record: every
    row that does have a claim type takes its ``Claim Number`` from the row
    directly above it, and the fragment rows are then dropped.

    Parameters
    ----------
    pdf_file
        Passed unchanged to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame
        Only the rows that carry a ``Claim Type``.  The index is not reset, so
        it keeps the row positions of the stacked tables.

    Raises
    ------
    KeyError
        If the promoted header has no ``Claim Number``, ``Airport Name`` or
        ``Claim Type`` column.
    """
    page_tables = tabula.read_pdf(
        pdf_file, multiple_tables=True, pages='all', guess=False, stream=True
    )
    df = pd.concat(page_tables, axis=0, ignore_index=True)

    # Promote the first stacked row to the header, then discard every row that
    # just repeats the header text.  The explicit copy makes the column
    # assignments below write to this frame rather than to a view of the
    # filtered intermediate (the original chained ``df[col].iloc[i] = ...``
    # writes are not guaranteed to stick).
    df.columns = df.iloc[0]
    df = df[df['Claim Number'] != 'Claim Number'].copy()

    # Nothing below concatenates airport names; the string cast is kept only so
    # the returned column matches what this function produced before.
    df['Airport Name'] = df['Airport Name'].astype(str)

    # ``notna()`` recognises every NaN; the previous ``is not np.nan`` identity
    # test treated any NaN that was not the np.nan singleton as a real value.
    has_claim_type = df['Claim Type'].notna().tolist()
    claim_numbers = df['Claim Number'].tolist()

    # Sequential on purpose: row i reads the (possibly already replaced) claim
    # number of row i-1, exactly like the original in-place loop.
    # NOTE(review): a claim number already present on the current row is
    # overwritten; presumably every record in the source PDF is preceded by a
    # fragment row holding its claim number -- confirm against the data.
    for i in range(1, len(claim_numbers)):
        if has_claim_type[i]:
            claim_numbers[i] = claim_numbers[i - 1]

    # Rebuild with the column's existing dtype so the values are not re-inferred.
    df['Claim Number'] = pd.Series(
        claim_numbers, index=df.index, dtype=df['Claim Number'].dtype
    )

    # Keep only completed rows.
    return df.dropna(subset=['Claim Type'])

In [9]:
# Convert the raw 2016 claims PDF to a DataFrame, report its dimensions, and
# save it as CSV (to_csv's default also writes the DataFrame index).
df2016 = pdf_to_csv_2016(pdf_file='data/raw/raw_pdfs/claims-2016.pdf')
print(df2016.shape)
df2016.to_csv(path_or_buf='data/raw/claims-2016.csv')

(8407, 11)


In [82]:
# Convert the raw 2017 claims PDF to a DataFrame, report its dimensions, and
# save it as CSV (to_csv's default also writes the DataFrame index).
df2017 = pdf_to_csv_2017(pdf_file='data/raw/raw_pdfs/claims-2017.pdf')
print(df2017.shape)
df2017.to_csv(path_or_buf='data/raw/claims-2017.csv')

(8418, 11)
