In [137]:
import numpy as np
import pandas as pd
import tabula

In [138]:
def _merge_split_airport_names(df):
    """Return the airport names of ``df`` with spilled-over fragments re-attached.

    A row without a ``claim_number`` is treated as a fragment: tabula emitted it
    because a cell wrapped onto an extra line. The airport-name text of every
    fragment row is prepended (space-separated, in order) to the next row that
    does carry a claim number. Complete rows that directly follow another
    complete row are left untouched.

    Returns a list with one entry per row of ``df``. Entries for fragment rows
    are passed through unchanged (the caller drops those rows); a complete row
    with no name text at all yields NaN.
    """
    merged = []
    pending = []  # name fragments collected since the last complete record
    for claim, name in zip(df['claim_number'], df['airport_name']):
        piece = [] if pd.isna(name) else [str(name)]
        if pd.isna(claim):
            # Fragment row: remember its text (if any) for the next real record.
            pending += piece
            merged.append(name)
            continue
        parts = pending + piece
        merged.append(' '.join(parts) if parts else np.nan)
        pending = []
    return merged


def pdf_to_csv(pdf_file):
    """Extract the claims table from a PDF into a single cleaned DataFrame.

    Every page of ``pdf_file`` is read with tabula, the per-page tables are
    stacked, the columns are renamed positionally, the first row (the header
    text) is removed, and rows that tabula split in two are merged back
    together via their ``airport_name`` pieces.

    Despite its name, this function does not write a CSV; it returns the
    frame and leaves saving to the caller.

    Parameters
    ----------
    pdf_file : path of the PDF to read, passed straight to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame with the eleven columns listed in ``col_names`` and only
    rows that have a ``claim_number``. The index keeps the labels of the
    stacked frame (it is not reset).

    NOTE(review): the positional rename only takes effect when tabula labels
    the columns 0..N and delivers the header text as the first data row --
    confirm this holds for the installed tabula-py version.
    """
    # Column names appropriate for all categories, in table order
    col_names = ['claim_number', 'date_received', 'incident_date', 'airport_code', 'airport_name',
                 'airline', 'claim_type', 'claim_site', 'item_category', 'close_amount', 'disposition']

    tables = tabula.read_pdf(pdf_file, multiple_tables=True, pages='all')
    df = pd.concat(tables, axis=0, ignore_index=True)

    # Drop empty final column
    if len(df.columns) > len(col_names):
        df = df.iloc[:, :-1]

    # Fix headers: map position -> name, then discard the row holding the header text
    df = df.rename(columns=dict(enumerate(col_names))).iloc[1:, :]

    # Clean up broken row separation as computed by tabula. `assign` builds a
    # new frame, so nothing is written through a chained-indexing view.
    df = df.assign(airport_name=_merge_split_airport_names(df))

    return df.dropna(subset=['claim_number'])

In [None]:
# Convert the 2016 claims PDF and save the cleaned table alongside it.
df2016 = pdf_to_csv('raw/claims-2016.pdf')
print(df2016.shape)
# Write the frame produced above; no module-level `df` is defined in this
# notebook section, so the previous `df.to_csv` call could not run.
df2016.to_csv('raw/claims-2016.csv')

In [None]:
# Convert the 2017 claims PDF and save the cleaned table alongside it.
# Stored under its own name so the 2016 frame is not overwritten.
df2017 = pdf_to_csv('raw/claims-2017.pdf')
print(df2017.shape)
# Write the frame produced above; no module-level `df` is defined in this
# notebook section, so the previous `df.to_csv` call could not run.
df2017.to_csv('raw/claims-2017.csv')