In [3]:
from tabula import read_pdf
import tabula
import pandas as pd
import re
import math
import warnings
import PyPDF2
from tqdm import tqdm_notebook as tqdm
warnings.filterwarnings(action='always')

pd.options.display.max_rows = 4000


In [26]:
def page_transform(page, pdf):
    """Parse one table page of ``pdf`` into a tidy availability DataFrame.

    The page is read with ``tabula.read_pdf`` and handled according to how
    many columns were extracted:

    * 3 columns: column 0 holds the unit id, column 1 holds
      ``"<station words> <line>"`` and column 2 holds
      ``"<availability%> <comment text>"`` (comments may continue on
      following rows).
    * 4 columns: as above, but availability (column 2) and comment
      (column 3) are usually separate cells.
    * more than 4 columns: the last column is dropped first; the result is
      processed only if exactly 4 columns remain.

    Parameters
    ----------
    page
        Page selector forwarded unchanged to ``tabula.read_pdf(pages=...)``.
    pdf
        PDF path forwarded unchanged to ``tabula.read_pdf``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``unit``, ``station``, ``line``, ``availability``
        and ``comments``; ``None`` when the extracted table has a column
        count this function does not handle.

    Raises
    ------
    ValueError
        Raised by the ``pandas.DataFrame`` constructor when the five
        collected lists do not all have the same length.
    """
    # NOTE(review): the code below assumes read_pdf returns a single
    # DataFrame; newer tabula-py releases may return a list of frames by
    # default -- confirm against the installed version.
    df = tabula.read_pdf(pdf, pages=page)

    df_revision = df.copy()
    df_null = df.isna()

    # Turn both escaped ("\\n") and real ("\n") tab/newline/CR characters into
    # spaces so that comment text can be split on single spaces.
    df_revision.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=[" "," "], regex=True, inplace=True)

    unit = []
    station = []
    line = []
    availability = []
    comment = ''       # comment currently being assembled across rows
    comments = []
    df_output = None   # stays None when the column layout is not supported

    if len(df.columns) == 3:
        unit_col = df[df.columns[0]]
        location_col = df[df.columns[1]]
        text_col = df[df.columns[2]]
        clean_text = df_revision[df_revision.columns[2]]
        location_missing = df_null[df_revision.columns[1]]

        # Per-row flags. str.contains yields NaN for missing cells, so the
        # flags are compared with `== True` below rather than tested for
        # truthiness (NaN is truthy).
        df_comment = pd.DataFrame(
            {'full comment': text_col.str.contains(r'^(Th).*$'),
             'starts percent': text_col.str.contains(r'\d{1,2}\.\d{2}\%'),
             'continued comment': text_col.str.contains(r'^[a-zA-SU-Z]*[\s]'),
            })

        for i in range(len(df)):
            if re.match(string=str(unit_col[i]), pattern=r'E[SL]\d{3}'):
                unit.append(unit_col[i])

            # The last space-separated token of the location cell is the line;
            # everything before it is the station name.
            if re.match(string=str(location_col[i]).split(' ')[-1], pattern=r'[ABCDEFGJLMNRQWZ1-7]'):
                if re.match(string=str(location_col[i]).split(' ')[-1], pattern=r'^((?!Location).)*$'):
                    line.append(location_col[i].split(' ')[-1])
                station_name = ' '.join(location_col[i].split(' ')[:-1])
                # Value comparison; the original used `is not ''`, which only
                # worked through CPython's interning of the empty string.
                if station_name != '':
                    station.append(station_name)

            if re.match(string=str(text_col[i]).split(' ')[0], pattern=r'\d{1,2}\.\d{2}\%'):
                availability.append(text_col[i].split(' ')[0])

            # A row containing a percentage starts a new comment: flush the
            # pending one, then keep everything after the first token.
            if df_comment['starts percent'][i] == True:
                if len(comment) > 0:
                    comments.append(comment)
                    comment = ''
                comment += ' '.join(clean_text[i].split(' ')[1:])

            # "Th..." text on a row without a location continues the pending
            # comment.
            if df_comment['full comment'][i] == True and location_missing[i] == True:
                comment += clean_text[i]

            # "Th..." text on a row that has a location is a complete comment.
            if df_comment['full comment'][i] == True and location_missing[i] == False:
                if len(comment) > 0:
                    comments.append(comment)
                    comment = ''
                comments.append(clean_text[i])

            if df_comment['continued comment'][i] == True and i != 0:
                comment += clean_text[i]

        if len(comment) > 0:
            comments.append(comment)

        df_output = pd.DataFrame(
            {'unit': unit,
             'station': station,
             'line': line,
             'availability': availability,
             'comments' : comments
            })

    if len(df.columns) > 4:
        # Drop the trailing column; df_revision / df_null keep their original
        # columns and are only indexed by position 2 and 3 below.
        df = df[list(df.columns[:-1])]

    if len(df.columns) == 4:
        text_col = df[df.columns[3]]
        clean_text = df_revision[df_revision.columns[3]]
        availability_missing = df_null[df_null.columns[2]]

        df_comment = pd.DataFrame(
            {'full comment': text_col.str.contains(r'^(Th|In an effort to improve reliability).*$'),
             'starts percent': text_col.str.contains(r'\d{1,2}\.\d{2}\%'),
             'continued comment': text_col.str.contains(r'(?!In an effort to improve reliability)[a-zA-SU-Z]*[\s]'),
            })

        for i in range(len(df)):
            # Explicit positional access (row i, column k); avoids the
            # deprecated integer-key fallback of `df.iloc[i][k]`.
            unit_value = df.iloc[i, 0]
            location = df.iloc[i, 1]
            missing_cells = df.iloc[i].isna().sum()

            if location != 'Location':
                # NOTE(review): this pattern reads as "E" OR "[SL]ddd", unlike
                # the 3-column branch's r'E[SL]\d{3}'. Left unchanged because
                # tightening it could alter which rows are collected -- confirm
                # the intended pattern.
                if re.match(string=str(unit_value), pattern=r'E|[SL]\d{3}'):
                    unit.append(unit_value)
                if str(location) != 'nan':
                    station.append(' '.join(str(location).split(' ')[:-1]))
                if re.match(string=str(location).split(' ')[-1], pattern=r'[ABCDEFGJLMNRQWZ1-7]'):
                    line.append(str(location).split(' ')[-1])

            # Fully populated row: availability sits in its own cell.
            if missing_cells == 0:
                if re.match(string=df.iloc[i, 2], pattern=r'\d{1,2}\.\d{2}\%'):
                    availability.append(df.iloc[i, 2])

            # Exactly one empty cell: look for the percentage as the first
            # token of the comment cell instead.
            if missing_cells == 1:
                if re.match(string=df.iloc[i, 3].split(' ')[0], pattern=r'\d{1,2}\.\d{2}\%'):
                    availability.append(df.iloc[i, 3].split(' ')[0])

            if df_comment['starts percent'][i] == True:
                if len(comment) > 0:
                    comments.append(comment)
                    comment = ''
                comment += ' '.join(clean_text[i].split(' ')[1:])

            if df_comment['full comment'][i] == True and availability_missing[i] == True:
                comment += clean_text[i]

            if df_comment['full comment'][i] == True and availability_missing[i] == False:
                if len(comment) > 0:
                    comments.append(comment)
                    comment = ''
                comments.append(clean_text[i])

            if df_comment['continued comment'][i] == True:
                comment += clean_text[i]

        if len(comment) > 0:
            comments.append(comment)

        # The flag patterns overlap, so the same text can be collected more
        # than once; keep only the first occurrence, preserving order.
        comments = [i for n, i in enumerate(comments) if i not in comments[:n]]

        df_output = pd.DataFrame(
            {'unit': unit,
             'station': station,
             'line': line,
             'availability': availability,
             'comments' : comments
            })

    # Returned at function level so the 3-column result is no longer discarded
    # (the return used to sit inside the 4-column branch).
    return df_output


In [27]:
def pdf_transform_v6(pdf):
    """Extract every low-availability table in ``pdf`` into one DataFrame.

    The PDF is scanned with PyPDF2; each page whose extracted text contains
    ``'WITH LESS THAN 85% AVAILABILITY'`` is passed to ``page_transform``
    (with the 0-based page index converted to ``index + 1``), and the
    per-page frames are stacked.

    Parameters
    ----------
    pdf
        Path of the PDF file; opened here in binary mode and also forwarded
        to ``page_transform``.

    Returns
    -------
    pandas.DataFrame
        Columns ``unit``, ``station``, ``line``, ``availability`` and
        ``comments`` with a fresh 0..n-1 index. Empty when no page matches.
        Pages for which ``page_transform`` returns ``None`` are skipped.
    """
    # Empty seed frame: fixes the column set/order of the result even when no
    # page contributes any rows.
    df_new = pd.DataFrame(
        {'unit': [],
         'station': [],
         'line': [],
         'availability': [],
         'comments' : []
        })

    pages = []

    # `with` closes the handle even if PyPDF2 raises part-way through the scan.
    with open(pdf, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        page_count = pdfReader.trailer['/Root']['/Pages']['/Count']

        for i in tqdm(range(page_count), leave = False, desc = 'Scanning PDF Pages'):
            current_page = pdfReader.getPage(i).extractText()
            if 'WITH LESS THAN 85% AVAILABILITY' in current_page:
                pages.append(i)

    # Collect the per-page frames and concatenate once, rather than
    # re-copying the accumulated frame on every iteration.
    frames = [df_new]
    for i in tqdm(pages, leave = False, desc = 'Processing Chart: '):
        df_temp = page_transform(i + 1, pdf)
        if df_temp is not None:
            frames.append(df_temp)

    return pd.concat(frames, axis = 0).reset_index(drop = True)
        

In [29]:
# Report to convert; the CSV is written under CSVs/ using the PDF's file name.
pdf = '171113_1030_Transit.pdf'
csv_path = "CSVs/" + pdf + ".csv"
availability_table = pdf_transform_v6(pdf)
availability_table.to_csv(csv_path)

HBox(children=(IntProgress(value=0, description='Scanning PDF Pages', max=191, style=ProgressStyle(description…



HBox(children=(IntProgress(value=0, description='Processing Chart: ', max=3, style=ProgressStyle(description_w…





