In [23]:
import pandas as pd
import os
import requests
from fastai.vision import *

os.environ['NO_PROXY'] = '127.0.0.1'
pdf_folder = '../weekly_lists/2020-06-22_2020-06-28/PDFs/'
current_folder = '2020-06-22_2020-06-28'
save_filename = "../weekly_lists/2020-06-22_2020-06-28/results/barzooka_preprint_results_" + current_folder + ".csv"

In [3]:
learn = load_learner(path='.', file='export.pkl')



### Functions for bar graph detection per PDF

In [4]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

#re_pg = re.compile(r'Index \d+ out of bounds for length (\d+)')
re_pg = re.compile(r'Index: \d+, Size: (\d+)')


def req_internal(url):
    http = urllib3.PoolManager(cert_reqs='CERT_NONE')
    page = http.request('get', url, timeout=120)
    return page.data.decode('utf-8')

def count_pages(paper_id, folder):
    """cantaloupe iiif server returns the highest page index with an error
    if out of range is requested
    """
    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/500,/0/default.jpg?page=1000"
    url = url.format(folder, paper_id)
    page = req_internal(url)
    try:
        count = re_pg.findall(page)[0]
    except:
        count = 0
        
    return int(count)

In [5]:
def detect_graph_types_from_iiif(paper_id, folder, learner, debug=False):
    """Pull images from iiif server
    """
    pages = count_pages(paper_id, folder)
    if pages == 0:
        return empty_result(paper_id, folder)

    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/560,560/0/default.png?page={}"
    images = [open_image(io.BytesIO(requests.get(url.format(folder, paper_id, pg)).content)) for pg in range(1, pages+1)]
    
    classes_detected = detect_graph_types_from_list(images, learner)
    classes_detected['paper_id'] = paper_id.replace("%2b", "/")
    classes_detected['folder'] = folder
    
    return classes_detected


def empty_result(paper_id, folder):
    classes_detected = dict()  
    classes_detected['bar'] = 0
    classes_detected['pie'] = 0
    classes_detected['hist'] = 0
    classes_detected['bardot'] = 0
    classes_detected['box'] = 0
    classes_detected['dot'] = 0
    classes_detected['violin'] = 0
    classes_detected['paper_id'] = paper_id.replace("%2b", "/")
    classes_detected['folder'] = folder
    
    return classes_detected


def detect_graph_types_from_list(images, learner):
    """Predicts graph types for each image and returns pages with bar graphs
    """
    page_predictions = np.array([predict_graph_type(images[idx], learner) for idx in range(0, len(images))])
    bar_pages = np.where(page_predictions == 'bar')[0] + 1 #add 1 to page idx such that page counting starts at 1
    pie_pages = np.where(page_predictions == 'pie')[0] + 1
    hist_pages = np.where(page_predictions == 'hist')[0] + 1
    bardot_pages = np.where(page_predictions == 'bardot')[0] + 1
    box_pages = np.where(page_predictions == 'box')[0] + 1
    dot_pages = np.where(page_predictions == 'dot')[0] + 1
    violin_pages = np.where(page_predictions == 'violin')[0] + 1
    positive_pages = hist_pages.tolist() + bardot_pages.tolist() + box_pages.tolist() + dot_pages.tolist() + violin_pages.tolist()
    if len(positive_pages) > 0:
        positive_pages = list(set(positive_pages)) #remove duplicates and sort
        positive_pages.sort()


    classes_detected = dict()
    classes_detected['bar'] = len(bar_pages.tolist())
    classes_detected['pie'] = len(pie_pages.tolist())
    classes_detected['hist'] = len(hist_pages.tolist())
    classes_detected['bardot'] = len(bardot_pages.tolist())
    classes_detected['box'] = len(box_pages.tolist())
    classes_detected['dot'] = len(dot_pages.tolist())
    classes_detected['violin'] = len(violin_pages.tolist())

    """
    classes_detected = [len(bar_pages.tolist()),
                        len(pie_pages.tolist()),
                        len(hist_pages.tolist()),
                        len(bardot_pages.tolist()),
                        len(box_pages.tolist()),
                        len(dot_pages.tolist()),
                        len(violin_pages.tolist())                        
                       ]
    """

    return classes_detected


def predict_graph_type(img, learner):
    """Use fastai model on each image to predict types of pages
    """
    class_names = {
        "0": ["approp"],
        "1": ["bar"],
        "2": ["bardot"],
        "3": ["box"],
        "4": ["dot"],
        "5": ["hist"],
        "6": ["other"],
        "7": ["pie"],
        "8": ["text"],
        "9": ["violin"]
    }
    
    pred_class,pred_idx,outputs = learner.predict(img)
    
    if pred_idx.sum().tolist() == 0: #if there is no predicted class 
        #(=no class over threshold) give out class with highest prediction probability
        highest_pred = str(np.argmax(outputs).tolist())
        pred_class = class_names[highest_pred]
    else: 
        pred_class = pred_class.obj #extract class name as text
        
    return(pred_class)



In [6]:
paper_id = "10.1101%2b2020.06.11.20127019"
folder = '06_15-06_21'
detect_graph_types_from_iiif(paper_id, folder, learn)

{'bar': 0,
 'pie': 0,
 'hist': 3,
 'bardot': 0,
 'box': 0,
 'dot': 0,
 'violin': 0,
 'paper_id': '10.1101/2020.06.11.20127019',
 'folder': '06_15-06_21'}

### Predict number of pages with each graph type for all PDFs

In [12]:
paper_list = []
for root, dirs, files in os.walk(pdf_folder):
    for filename in files:
        paper_dict = {"paper_id": filename[:-4].replace("+", "%2b")}
        paper_list.append(paper_dict)
        
paper_table = pd.DataFrame(paper_list)

In [13]:
paper_table

Unnamed: 0,paper_id
0,10.1101%2b2020.06.14.20130732
1,10.1101%2b2020.06.15.150482
2,10.1101%2b2020.06.15.20117747
3,10.1101%2b2020.06.16.155457
4,10.1101%2b2020.06.17.20134262
...,...
288,10.1101%2b2020.06.27.174961
289,10.1101%2b2020.06.27.174979
290,10.1101%2b2020.06.27.175166
291,10.1101%2b2020.06.27.175448


In [14]:
paper_table.iloc[100:].head()

Unnamed: 0,paper_id
100,10.1101%2b2020.06.22.20137273
101,10.1101%2b2020.06.22.20137299
102,10.1101%2b2020.06.22.20137380
103,10.1101%2b2020.06.22.20137406
104,10.1101%2b2020.06.22.20137422


In [16]:
barzooka_results_list = [] 
for index, row in paper_table.iterrows():
    print(row['paper_id'])
    barzooka_result = detect_graph_types_from_iiif(row['paper_id'], current_folder, learn)
    barzooka_results_list.append(barzooka_result)
    
barzooka_results = pd.DataFrame(barzooka_results_list)  
barzooka_results.to_csv(save_filename)

10.1101%2b2020.06.14.20130732
10.1101%2b2020.06.15.150482
10.1101%2b2020.06.15.20117747
10.1101%2b2020.06.16.155457
10.1101%2b2020.06.17.20134262
10.1101%2b2020.06.18.147074
10.1101%2b2020.06.18.156851
10.1101%2b2020.06.18.20115873
10.1101%2b2020.06.18.20130377
10.1101%2b2020.06.18.20132571
10.1101%2b2020.06.18.20132977
10.1101%2b2020.06.18.20134577
10.1101%2b2020.06.18.20134593
10.1101%2b2020.06.18.20134619
10.1101%2b2020.06.18.20134759
10.1101%2b2020.06.18.20134841
10.1101%2b2020.06.18.20134916
10.1101%2b2020.06.18.20135004
10.1101%2b2020.06.18.20135012
10.1101%2b2020.06.18.20135046
10.1101%2b2020.06.18.20135111
10.1101%2b2020.06.18.20135145
10.1101%2b2020.06.18.20135152
10.1101%2b2020.06.18.20135210
10.1101%2b2020.06.19.20109173
10.1101%2b2020.06.19.20128207
10.1101%2b2020.06.19.20134379
10.1101%2b2020.06.19.20135426
10.1101%2b2020.06.19.20135640
10.1101%2b2020.06.19.20135830
10.1101%2b2020.06.19.20136093
10.1101%2b2020.06.20.162701
10.1101%2b2020.06.20.163097
10.1101%2b2020.06.20.2

10.1101%2b2020.06.26.20140905
10.1101%2b2020.06.26.20140921
10.1101%2b2020.06.26.20141044
10.1101%2b2020.06.26.20141077
10.1101%2b2020.06.26.20141085
10.1101%2b2020.06.26.20141135
10.1101%2b2020.06.26.20141150
10.1101%2b2020.06.26.20141242
10.1101%2b2020.06.26.20141341
10.1101%2b2020.06.27.174961
10.1101%2b2020.06.27.174979
10.1101%2b2020.06.27.175166
10.1101%2b2020.06.27.175448
10.1101%2b2020.06.27.20141002


FileNotFoundError: [Errno 2] No such file or directory: 'results/barzooka_preprint_results_2020-06-22_2020-06-28.csv'

In [17]:
barzooka_results = pd.DataFrame(barzooka_results_list) 
barzooka_results

Unnamed: 0,bar,pie,hist,bardot,box,dot,violin,paper_id,folder
0,0,0,0,0,1,0,0,10.1101/2020.06.14.20130732,2020-06-22_2020-06-28
1,0,0,0,0,0,0,0,10.1101/2020.06.15.150482,2020-06-22_2020-06-28
2,0,0,0,0,0,0,0,10.1101/2020.06.15.20117747,2020-06-22_2020-06-28
3,0,0,0,0,0,0,0,10.1101/2020.06.16.155457,2020-06-22_2020-06-28
4,0,0,0,0,0,0,0,10.1101/2020.06.17.20134262,2020-06-22_2020-06-28
...,...,...,...,...,...,...,...,...,...
288,0,0,0,0,0,0,0,10.1101/2020.06.27.174961,2020-06-22_2020-06-28
289,0,0,0,0,0,0,0,10.1101/2020.06.27.174979,2020-06-22_2020-06-28
290,0,0,0,0,0,0,0,10.1101/2020.06.27.175166,2020-06-22_2020-06-28
291,1,0,0,0,0,0,0,10.1101/2020.06.27.175448,2020-06-22_2020-06-28


In [24]:
barzooka_results.to_csv(save_filename)