In [28]:
import pandas as pd
import os
import requests

# Requests to the image server on 127.0.0.1 must not be sent through a proxy.
os.environ.update({'NO_PROXY': '127.0.0.1'})

In [8]:
from fastai.vision import *
# Folder that holds the exported model file.
ml_path = 'S:/Partner/BIH/QUEST/CENTER/1-Research/Improve My Research Button/Publication Bargraph Classifier/webapp/barzooka'
# NOTE(review): 'export.pkl' looks like a pickled export - only load it from a trusted location.
learn = load_learner(file='export.pkl', path=ml_path)

### Functions for graph type detection per PDF

In [97]:
import urllib3
# req_internal requests pages with certificate checks switched off, so the
# matching urllib3 warning is silenced for the whole notebook.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Captures the length reported in an "Index ... out of bounds" error message.
re_pg = re.compile(r'Index \d+ out of bounds for length (\d+)')

def req_internal(url):
    """Fetch ``url`` without certificate verification and return the body as text.

    The request uses a 120 second timeout and the response body is decoded
    as UTF-8.
    """
    pool = urllib3.PoolManager(cert_reqs='CERT_NONE')
    response = pool.request('get', url, timeout=120)
    body = response.data
    return body.decode('utf-8')

def count_pages(paper_id, year):
    """Return the number of pages of a paper's PDF as reported by the IIIF server.

    The cantaloupe iiif server is asked for page 1000. When that page is out
    of range, its error text ("Index ... out of bounds for length N")
    contains the page count N, which is parsed from the response.

    Args:
        paper_id: Paper identifier as used in the IIIF URL.
        year: Year component of the IIIF identifier.

    Returns:
        int: the page count found in the server response.

    Raises:
        IndexError: if the response does not contain the expected
            out-of-bounds message (same exception type as before, but with
            a message that names the paper and shows the response).
    """
    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/500,/0/default.jpg?page=1000"
    url = url.format(year, paper_id)
    page = req_internal(url)

    match = re_pg.search(page)
    if match is None:
        # Without this check the failure is a bare "list index out of range".
        raise IndexError(
            "Could not read the page count for {}:{} from the server response: {!r}"
            .format(year, paper_id, page[:200])
        )
    return int(match.group(1))

In [98]:
def detect_graph_types_from_iiif(paper_id, year, learner, debug=False):
    """Classify every page of one paper and count the pages per graph type.

    Each page of the PDF is requested from the local IIIF server as a
    560x560 PNG and the resulting images are passed to
    ``detect_graph_types_from_list``.

    Args:
        paper_id: Paper identifier as used in the IIIF URL.
        year: Year component of the IIIF identifier.
        learner: Model handed through to ``detect_graph_types_from_list``.
        debug: Currently unused; kept so existing callers keep working.

    Returns:
        dict: the per-type page counts plus ``'paper_id'`` (with every
        "%2b" replaced by "/") and ``'year'``.
    """
    pages = count_pages(paper_id, year)

    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/560,560/0/default.png?page={}"
    images = []
    # One session lets all page requests share a connection. The timeout
    # (same value req_internal uses) keeps a stalled server from blocking
    # the notebook forever; requests has no timeout by default.
    with requests.Session() as session:
        for pg in range(1, pages + 1):
            response = session.get(url.format(year, paper_id, pg), timeout=120)
            images.append(open_image(io.BytesIO(response.content)))

    classes_detected = detect_graph_types_from_list(images, learner)
    classes_detected['paper_id'] = paper_id.replace("%2b", "/")
    classes_detected['year'] = year

    return classes_detected


def detect_graph_types_from_list(images, learner):
    """Count, for each graph type, the pages on which it was predicted.

    Args:
        images: Sequence of page images, one entry per page.
        learner: Model passed on to ``predict_graph_type``.

    Returns:
        dict: number of pages per graph type, with the keys 'bar', 'pie',
        'hist', 'bardot', 'box', 'dot' and 'violin' (in that order). Any
        other predicted label is ignored.
    """
    classes_detected = dict.fromkeys(
        ('bar', 'pie', 'hist', 'bardot', 'box', 'dot', 'violin'), 0)

    for img in images:
        labels = predict_graph_type(img, learner)
        if isinstance(labels, str):
            # A single label given as plain text counts as one label.
            labels = [labels]
        # Pages are tallied one by one because they may carry different
        # numbers of labels; stacking the predictions into one numpy array
        # and comparing it with a string only works when every page has the
        # same number of labels.
        for label in set(labels):
            if label in classes_detected:
                classes_detected[label] += 1

    return classes_detected


def predict_graph_type(img, learner):
    """Return the class label(s) the fastai learner predicts for one page image.

    When the prediction flags no class at all (no class over threshold), the
    class with the highest output value is returned instead, wrapped in a
    one-element list.
    """
    fallback_labels = ("approp", "bar", "bardot", "box", "dot",
                       "hist", "other", "pie", "text", "violin")
    # Keyed by the output position as text, each value a one-element list.
    class_names = {str(pos): [label] for pos, label in enumerate(fallback_labels)}

    prediction, flagged, scores = learner.predict(img)

    # At least one class was flagged: hand back the class name(s) as text.
    if flagged.sum().tolist() != 0:
        return prediction.obj

    # Nothing flagged: fall back to the top-scoring class.
    best = str(np.argmax(scores).tolist())
    return class_names[best]



In [99]:
# Try the detection on a single paper ("%2b" stands in for the "/" of the id).
paper_id, year = '10.1371%2bjournal.pone.0148798', '2016'
detect_graph_types_from_iiif(paper_id=paper_id, year=year, learner=learn)

{'bar': 1,
 'pie': 0,
 'hist': 0,
 'bardot': 0,
 'box': 0,
 'dot': 0,
 'violin': 0,
 'paper_id': '10.1371/journal.pone.0148798',
 'year': '2016'}

### Predict number of pages with each graph type for all PDFs

In [62]:
# Raw string: the backslashes are path separators, not escape sequences
# ("\D", "\c" and "\P" are invalid escapes in a normal string literal).
pdf_folder = r'C:\Datenablage\charite_dashboard\PDFs'

In [105]:
# Build one record per PDF found below pdf_folder.
paper_list = []
for root, dirs, files in os.walk(pdf_folder):
    for filename in files:
        stem, extension = os.path.splitext(filename)
        # Only PDFs can be served under the "<year>:<paper_id>.pdf" identifier
        # used by the detection functions, so other files are skipped instead
        # of turning into paper ids that fail later.
        if extension.lower() != ".pdf":
            continue
        # "+" in the file name is written as "%2b" in the paper id; the year
        # is taken from the last four characters of the folder path.
        paper_dict = {"paper_id": stem.replace("+", "%2b"),
                      "year": root[-4:]}
        paper_list.append(paper_dict)

# Naming the columns keeps the table well-formed even when no PDF was found.
paper_table = pd.DataFrame(paper_list, columns=["paper_id", "year"])

In [106]:
# Preview the first five rows of the paper table.
paper_table.head(n=5)

Unnamed: 0,paper_id,year
0,10.1001%2bjamaoncol.2015.0830,2015
1,10.1002%2bacn3.189,2015
2,10.1002%2bacr.22453,2015
3,10.1002%2badhm.201500503,2015
4,10.1002%2bajh.23952,2015


In [116]:
# Run the detection for every paper in paper_table, printing each paper as it
# is processed, then write the collected per-paper counts to a CSV file.
barzooka_results_list = []
for paper in paper_table.itertuples(index=False):
    print(paper.paper_id, paper.year)
    barzooka_results_list.append(
        detect_graph_types_from_iiif(paper.paper_id, paper.year, learn)
    )

barzooka_results = pd.DataFrame(barzooka_results_list)
barzooka_results.to_csv("..\\results\\Barzooka.csv")

10.1001%2bjamaoncol.2015.0830 2015
10.1002%2bacn3.189 2015
10.1002%2bacr.22453 2015
10.1002%2badhm.201500503 2015
10.1002%2bajh.23952 2015
