In [28]:
import pandas as pd
import os
import requests

# Exempt the local host from any configured HTTP proxy; the IIIF server
# queried later in this notebook is addressed as 127.0.0.1.
os.environ.update({'NO_PROXY': '127.0.0.1'})

In [8]:
from fastai.vision import *
# Folder that holds the exported learner file (export.pkl).
ml_path = (
    'S:/Partner/BIH/QUEST/CENTER/1-Research/Improve My Research Button/'
    'Publication Bargraph Classifier/webapp/barzooka'
)
# NOTE(review): export.pkl is presumably a pickle -- only load exports from a
# trusted location.
learn = load_learner(path=ml_path, file='export.pkl')

### Functions for bar graph detection per PDF

In [141]:
import urllib3
# Silence urllib3's InsecureRequestWarning; req_internal builds its connection
# pool with cert_reqs='CERT_NONE'.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Captures M from a message of the form "Index N out of bounds for length M".
re_pg = re.compile(r'Index \d+ out of bounds for length (\d+)')

def req_internal(url):
    """Fetch ``url`` with urllib3 and return the response body as text.

    Certificate verification is switched off (``cert_reqs='CERT_NONE'``) and
    the request uses a timeout of 120. The HTTP status is not inspected, so
    the body of an error response is returned like any other body.

    Args:
        url: address to request.

    Returns:
        str: the response body decoded as UTF-8.
    """
    pool = urllib3.PoolManager(cert_reqs='CERT_NONE')
    response = pool.request('get', url, timeout=120)
    raw_body = response.data
    return raw_body.decode('utf-8')

def count_pages(paper_id, year):
    """Return the page count of a PDF served by the local IIIF server.

    Page 1000 is requested on purpose: according to the original author's
    note, the Cantaloupe IIIF server answers an out-of-range page request with
    an error message. ``re_pg`` extracts the length reported in that message.

    Args:
        paper_id: identifier of the PDF as used in the IIIF URL.
        year: prefix placed before ``paper_id`` in the IIIF identifier
            (``{year}:{paper_id}.pdf``).

    Returns:
        int: the length reported by the server, or 0 when the response does
        not contain the expected out-of-bounds message.
    """
    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/500,/0/default.jpg?page=1000"
    page = req_internal(url.format(year, paper_id))

    # Explicit no-match check instead of a bare ``except`` around an index
    # lookup, so unrelated errors are never swallowed.
    match = re_pg.search(page)
    if match is None:
        return 0

    return int(match.group(1))

In [146]:
def detect_graph_types_from_iiif(paper_id, year, learner, debug=False):
    """Classify every page of one PDF, pulling page images from the IIIF server.

    The page count is determined with ``count_pages``. Each page is then
    downloaded as a 560x560 PNG, opened with ``open_image`` and passed to
    ``detect_graph_types_from_list``.

    Args:
        paper_id: identifier of the PDF as used in the IIIF URL; ``%2b`` is
            replaced by ``/`` in the returned record.
        year: prefix placed before ``paper_id`` in the IIIF identifier.
        learner: model object forwarded to ``detect_graph_types_from_list``.
        debug: accepted for backward compatibility; not used.

    Returns:
        dict: per-graph-type page counts plus ``'paper_id'`` and ``'year'``.
        When ``count_pages`` reports 0 pages, the all-zero record from
        ``empty_result`` is returned.

    Raises:
        requests.HTTPError: if the server answers a page request with an
            error status.
        requests.Timeout: if a page request does not complete in time.
    """
    pages = count_pages(paper_id, year)
    if pages == 0:
        return empty_result(paper_id, year)

    url = "http://127.0.0.1:8182/iiif/2/{}:{}.pdf/full/560,560/0/default.png?page={}"
    images = []
    # One session reuses the connection for all pages of the paper.
    with requests.Session() as session:
        for pg in range(1, pages + 1):
            # Same timeout as req_internal; without one a stalled server
            # would block the whole batch run indefinitely.
            response = session.get(url.format(year, paper_id, pg), timeout=120)
            # Fail with a clear HTTP error instead of handing an error body
            # to the image decoder.
            response.raise_for_status()
            images.append(open_image(io.BytesIO(response.content)))

    classes_detected = detect_graph_types_from_list(images, learner)
    classes_detected['paper_id'] = paper_id.replace("%2b", "/")
    classes_detected['year'] = year

    return classes_detected


def empty_result(paper_id, year):
    """Build the all-zero result record for a paper.

    Args:
        paper_id: paper identifier; every ``%2b`` is replaced by ``/``.
        year: value stored unchanged under ``'year'``.

    Returns:
        dict: zero counts for 'bar', 'pie', 'hist', 'bardot', 'box', 'dot'
        and 'violin', followed by 'paper_id' and 'year'.
    """
    graph_types = ('bar', 'pie', 'hist', 'bardot', 'box', 'dot', 'violin')
    record = dict.fromkeys(graph_types, 0)
    record.update(paper_id=paper_id.replace("%2b", "/"), year=year)
    return record


def detect_graph_types_from_list(images, learner):
    """Count, per graph type, the pages on which that type was predicted.

    Every image is classified with ``predict_graph_type``. A page adds one to
    the count of each graph type that appears among its predicted labels, so
    pages carrying several labels are counted for each of them.

    Args:
        images: iterable of page images, one per page.
        learner: model object forwarded unchanged to ``predict_graph_type``.

    Returns:
        dict: maps 'bar', 'pie', 'hist', 'bardot', 'box', 'dot' and 'violin'
        (in that order) to the number of pages with that label. All counts
        are 0 for an empty ``images``.
    """
    graph_types = ('bar', 'pie', 'hist', 'bardot', 'box', 'dot', 'violin')
    classes_detected = dict.fromkeys(graph_types, 0)

    for img in images:
        labels = predict_graph_type(img, learner)
        # A bare string is treated as a single label; otherwise the result is
        # taken to be a collection of labels. Working per page avoids building
        # a numpy array from label lists of differing lengths.
        if isinstance(labels, str):
            labels = [labels]
        page_labels = set(labels)
        for graph_type in graph_types:
            if graph_type in page_labels:
                classes_detected[graph_type] += 1

    return classes_detected


def predict_graph_type(img, learner):
    """Predict the page type(s) of a single image with the given learner.

    ``learner.predict(img)`` must return a triple
    ``(pred_class, pred_idx, outputs)``. If ``pred_idx`` sums to zero (no
    class over the threshold), the class with the highest value in
    ``outputs`` is looked up in a fixed index-to-name table and returned as a
    one-element list. Otherwise ``pred_class.obj`` is returned as is.

    Args:
        img: image passed to ``learner.predict``.
        learner: object exposing ``predict``.

    Returns:
        ``pred_class.obj``, or a one-element list holding the name of the
        highest-scoring class.
    """
    ordered_labels = ("approp", "bar", "bardot", "box", "dot",
                      "hist", "other", "pie", "text", "violin")
    # Keys are the class indices rendered as strings; values are one-element lists.
    label_lookup = {str(position): [label]
                    for position, label in enumerate(ordered_labels)}

    prediction, active_mask, scores = learner.predict(img)

    if active_mask.sum().tolist() != 0:
        # At least one class was selected: hand back its name(s).
        return prediction.obj

    # Nothing selected: fall back to the single best-scoring class.
    best_index = np.argmax(scores).tolist()
    return label_lookup[str(best_index)]



In [147]:
# Try the pipeline on a single paper before running the whole batch.
paper_id = "10.1002%2bejhf.1351"
year = '2018'
detect_graph_types_from_iiif(paper_id=paper_id, year=year, learner=learn)

{'bar': 0,
 'pie': 0,
 'hist': 0,
 'bardot': 0,
 'box': 0,
 'dot': 0,
 'violin': 0,
 'paper_id': '10.1002/ejhf.1351',
 'year': '2018'}

### Predict number of pages with each graph type for all PDFs

In [155]:
# Root folder that is walked for PDF files. Raw string: the backslashes are
# literal path separators, not escape sequences.
pdf_folder = r'C:\Datenablage\charite_dashboard\PDFs'

In [156]:
# Build one record per PDF found below pdf_folder. The paper id is the file
# name without its extension ("+" rewritten to "%2b"); the year is taken from
# the last four characters of the containing folder's path.
paper_list = []
for root, dirs, files in os.walk(pdf_folder):
    for filename in files:
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            # Skip stray non-PDF files instead of turning them into bogus paper ids.
            continue
        paper_list.append({"paper_id": stem.replace("+", "%2b"),
                           "year": root[-4:]})

# Fixed columns keep 'paper_id' and 'year' present even when no PDF was found.
paper_table = pd.DataFrame(paper_list, columns=["paper_id", "year"])

In [157]:
# Processing takes very long, so only the papers of one year are handled per run.
year = "2015"
paper_table_filtered = paper_table.loc[paper_table["year"] == year]

In [158]:
# Make sure the output folder exists before the long-running loop starts, so
# the final write cannot fail on a missing directory.
results_dir = os.path.join("..", "results")
os.makedirs(results_dir, exist_ok=True)

# Classify every paper of the selected year; one result record per paper.
barzooka_results_list = []
for row in paper_table_filtered.itertuples(index=False):
    print(row.paper_id, row.year)
    barzooka_result = detect_graph_types_from_iiif(row.paper_id, row.year, learn)
    barzooka_results_list.append(barzooka_result)

barzooka_results = pd.DataFrame(barzooka_results_list)
# os.path.join keeps the path valid on every platform (the same path as before on Windows).
barzooka_results.to_csv(os.path.join(results_dir, "Barzooka_" + year + ".csv"))

10.1001%2bjamaoncol.2015.0830 2015
10.1002%2bacn3.189 2015
10.1002%2bacr.22453 2015
10.1002%2badhm.201500503 2015
10.1002%2bajh.23952 2015
10.1002%2bajh.24120 2015
10.1002%2bajmg.a.37365 2015
10.1002%2bajmg.a.37464 2015
10.1002%2bajmg.b.32299 2015
10.1002%2bajmg.b.32312 2015
10.1002%2bana.24336 2015
10.1002%2bana.24398 2015
10.1002%2bana.24554 2015
10.1002%2banie.201502931 2015
10.1002%2banie.201505138 2015
10.1002%2bart.38996 2015
10.1002%2bart.39008 2015
10.1002%2bbdra.23321 2015
10.1002%2bbjs.10050 2015
10.1002%2bbmc.3554 2015
10.1002%2bbrb3.365 2015
10.1002%2bbrb3.421 2015
10.1002%2bbtpr.2182 2015
10.1002%2bca.22650 2015
10.1002%2bcam4.448 2015
10.1002%2bcam4.566 2015
10.1002%2bccd.25834 2015
10.1002%2bccd.25892 2015
10.1002%2bccd.26216 2015
10.1002%2bccd.26257 2015
10.1002%2bccr3.302 2015
10.1002%2bccr3.467 2015
10.1002%2bcjp2.25 2015
10.1002%2bclc.22375 2015
10.1002%2bcmmi.1660 2015
10.1002%2bcne.23865 2015
10.1002%2bcpp.1975 2015
10.1002%2bcyto.a.22626 2015
10.1002%2bcyto.a.2264

10.1007%2bs00380-015-0763-0 2015
10.1007%2bs00380-015-0784-8 2015
10.1007%2bs00381-015-2635-4 2015
10.1007%2bs00381-015-2727-1 2015
10.1007%2bs00381-015-2787-2 2015
10.1007%2bs00384-015-2130-0 2015
10.1007%2bs00391-015-0886-z 2015
10.1007%2bs00391-015-0958-0 2015
10.1007%2bs00391-015-0959-z 2015
10.1007%2bs00392-015-0820-9 2015
10.1007%2bs00392-015-0828-1 2015
10.1007%2bs00392-015-0841-4 2015
10.1007%2bs00392-015-0863-y 2015
10.1007%2bs00392-015-0893-5 2015
10.1007%2bs00392-015-0912-6 2015
10.1007%2bs00393-014-1561-2 2015
10.1007%2bs00399-015-0359-5 2015
10.1007%2bs00399-015-0361-y 2015
10.1007%2bs00399-015-0376-4 2015
10.1007%2bs00399-015-0381-7 2015
10.1007%2bs00399-015-0392-4 2015
10.1007%2bs00399-015-0395-1 2015
10.1007%2bs00401-015-1384-5 2015
10.1007%2bs00401-015-1440-1 2015
10.1007%2bs00401-015-1497-x 2015
10.1007%2bs00401-015-1529-6 2015
10.1007%2bs00402-015-2271-1 2015
10.1007%2bs00402-015-2315-6 2015
10.1007%2bs00402-015-2317-4 2015
10.1007%2bs00402-015-2334-3 2015
10.1007%2b

10.1007%2bs11818-015-0699-8 2015
10.1007%2bs11818-015-0700-6 2015
10.1007%2bs11818-015-0702-4 2015
10.1007%2bs11825-014-0033-z 2015
10.1007%2bs11825-015-0034-6 2015
10.1007%2bs11825-015-0066-y 2015
10.1007%2bs11999-015-4462-z 2015
10.1007%2bs12015-014-9581-5 2015
10.1007%2bs12024-014-9641-9 2015
10.1007%2bs12024-015-9671-y 2015
10.1007%2bs12035-015-9390-0 2015
10.1007%2bs12038-015-9520-x 2015
10.1007%2bs12149-015-0976-4 2015
10.1007%2bs12181-014-0620-x 2015
10.1007%2bs12181-014-0636-2 2015
10.1007%2bs12181-015-0014-8 2015
10.1007%2bs12181-015-0021-9 2015
10.1007%2bs12181-015-0023-7 2015
10.1007%2bs12181-015-0654-8 2015
10.1007%2bs12402-014-0164-8 2015
10.1007%2bs12603-015-0589-6 2015
10.1007%2bs13277-015-4031-9 2015
10.1007%2bs13555-015-0089-y 2015
10.1007%2bs15010-015-0780-z 2015
10.1007%2bs15010-015-0813-7 2015
10.1007%2bs15010-015-0865-8 2015
10.1007%2bs40120-015-0034-0 2015
10.1007%2bs40121-015-0075-9 2015
10.1007%2bs40520-015-0444-y 2015
10.1007%2bs40620-015-0223-z 2015
10.1007%2b

10.1016%2bj.gene.2015.05.043 2015
10.1016%2bj.gene.2015.07.073 2015
10.1016%2bj.gene.2015.09.035 2015
10.1016%2bj.humimm.2015.09.028 2015
10.1016%2bj.ijcard.2009.06.061 2015
10.1016%2bj.ijcard.2015.02.009 2015
10.1016%2bj.ijcard.2015.03.062 2015
10.1016%2bj.ijcard.2015.03.261 2015
10.1016%2bj.ijcard.2015.05.110 2015
10.1016%2bj.ijcard.2015.06.067 2015
10.1016%2bj.ijcard.2015.07.044 2015
10.1016%2bj.ijcard.2015.08.089 2015
10.1016%2bj.ijcard.2015.10.115 2015
10.1016%2bj.ijcard.2015.11.178 2015
10.1016%2bj.ijgo.2015.01.013 2015
10.1016%2bj.ijgo.2015.08.018 2015
10.1016%2bj.ijmm.2015.04.002 2015
10.1016%2bj.ijmm.2015.08.009 2015
10.1016%2bj.ijmm.2015.08.022 2015
10.1016%2bj.ijmm.2015.08.033 2015
10.1016%2bj.ijmm.2015.08.034 2015
10.1016%2bj.ijnurstu.2014.12.009 2015
10.1016%2bj.ijpharm.2015.03.016 2015
10.1016%2bj.ijpharm.2015.03.063 2015
10.1016%2bj.ijporl.2015.10.003 2015
10.1016%2bj.ijrobp.2014.12.034 2015
10.1016%2bj.ijrobp.2015.10.027 2015
10.1016%2bj.ijsu.2015.07.011 2015
10.1016%2b

10.1016%2bj.wneu.2015.08.037 2015
10.1016%2bj.wneu.2015.10.024 2015
10.1016%2bj.yebeh.2015.02.041 2015
10.1016%2bj.yebeh.2015.04.006 2015
10.1016%2bj.yebeh.2015.05.036 2015
10.1016%2bj.yebeh.2015.11.009 2015
10.1016%2bj.yexcr.2015.02.008 2015
10.1016%2bj.ygyno.2015.07.007 2015
10.1016%2bj.ygyno.2015.07.010 2015
10.1016%2bj.ygyno.2015.12.018 2015
10.1016%2bj.yjmcc.2015.03.012 2015
10.1016%2bj.yjmcc.2015.07.029 2015
10.1016%2bj.yjmcc.2015.09.018 2015
10.1016%2bj.ymgme.2015.07.003 2015
10.1016%2bj.ymgmr.2015.02.001 2015
10.1016%2bj.zefq.2015.01.009 2015
10.1016%2bj.zefq.2015.03.008 2015
10.1016%2bj.zefq.2015.10.004 2015
10.1016%2bS0140-6736%2814%2961682-2 2015
10.1016%2bS0140-6736%2815%2960692-4 2015
10.1016%2bS0140-6736%2815%2961340-X 2015
10.1016%2bS0140-6736(14)62053-5 2015
10.1016%2bS0140-6736(15)00127-0 2015
10.1016%2bS0140-6736(15)00388-8 2015
10.1016%2bS0140-6736(15)00817-X 2015
10.1016%2bS0140-6736(15)01035-1 2015
10.1016%2bS0140-6736(15)60076-9 2015
10.1016%2bS0140-6736(15)61107-

10.1055%2bs-0035-1554991 2015
10.1055%2bs-0035-1555649 2015
10.1055%2bs-0035-1555771 2015
10.1055%2bs-0035-1555785 2015
10.1055%2bs-0035-1555793 2015
10.1055%2bs-0035-1555943 2015
10.1055%2bs-0035-1555946 2015
10.1055%2bs-0035-1557858 2015
10.1055%2bs-0035-1558059 2015
10.1055%2bs-0035-1558067 2015
10.1055%2bs-0035-1559639 2015
10.1055%2bs-0035-1559649 2015
10.1055%2bs-0035-1559707 2015
10.1055%2bs-0035-1559727 2015
10.1055%2bs-0035-1563609 2015
10.1055%2bs-0035-1563788 2015
10.1055%2bs-0035-1564276 2015
10.1055%2bs-0035-1564277 2015
10.1055%2bs-0035-1564278 2015
10.1055%2bs-0035-1564279 2015
10.1055%2bs-0035-1565082 2015
10.1055%2bs-0035-1565130 2015
10.1055%2bs-0035-1565208 2015
10.1055%2bs-0035-1565235 2015
10.1055%2bs-0040-100413 2015
10.1055%2bs-0041-100443 2015
10.1055%2bs-0041-100564 2015
10.1055%2bs-0041-100608 2015
10.1055%2bs-0041-100777 2015
10.1055%2bs-0041-100846 2015
10.1055%2bs-0041-101118 2015
10.1055%2bs-0041-101292 2015
10.1055%2bs-0041-101298 2015
10.1055%2bs-0041-10

10.1111%2bajt.13175 2015
10.1111%2bajt.13181 2015
10.1111%2bajt.13241 2015
10.1111%2bajt.13252 2015
10.1111%2bajt.13315 2015
10.1111%2bajt.13364 2015
10.1111%2bajt.13380 2015
10.1111%2bajt.13488 2015
10.1111%2ball.12531 2015
10.1111%2ball.12548 2015
10.1111%2ball.12576 2015
10.1111%2ball.12588 2015
10.1111%2ball.12614 2015
10.1111%2ball.12626 2015
10.1111%2ball.12630 2015
10.1111%2ball.12637 2015
10.1111%2ball.12640 2015
10.1111%2ball.12658 2015
10.1111%2ball.12666 2015
10.1111%2ball.12676 2015
10.1111%2ball.12686 2015
10.1111%2ball.12714 2015
10.1111%2ball.12801 2015
10.1111%2ball.12818 2015
10.1111%2bane.12414 2015
10.1111%2banec.12301 2015
10.1111%2baor.12634 2015
10.1111%2bapa.12914 2015
10.1111%2bapa.13280 2015
10.1111%2bapha.12399 2015
10.1111%2bapha.12439 2015
10.1111%2bapha.12451 2015
10.1111%2bapha.12457 2015
10.1111%2bapha.12464 2015
10.1111%2bapha.12472 2015
10.1111%2bapha.12492 2015
10.1111%2bapha.12509 2015
10.1111%2bapha.12518 2015
10.1111%2bapha.12543 2015
10.1111%2bapha

10.1152%2bajplung.00265.2014 2015
10.1152%2bajpregu.00154.2015 2015
10.1152%2bajpregu.00273.2015 2015
10.1152%2bajpregu.00388.2015 2015
10.1152%2bajprenal.00617.2014 2015
10.1152%2bjapplphysiol.00374.2015 2015
10.1152%2bjapplphysiol.00458.2014 2015
10.1152%2bjn.00249.2015 2015
10.1152%2bjn.00260.2015 2015
10.1152%2bjn.00577.2015 2015
10.1152%2bjn.00744.2014 2015
10.1152%2bjn.00783.2014 2015
10.1152%2bjn.00832.2014 2015
10.1152%2bjn.00969.2015 2015
10.1152%2bjn.00993.2014 2015
10.1152%2bjn.01012.2015 2015
10.1152%2bphysiolgenomics.00008.2015 2015
10.1155%2b2015%2b134708 2015
10.1155%2b2015%2b143109 2015
10.1155%2b2015%2b145154 2015
10.1155%2b2015%2b278139 2015
10.1155%2b2015%2b308185 2015
10.1155%2b2015%2b318306 2015
10.1155%2b2015%2b318586 2015
10.1155%2b2015%2b380615 2015
10.1155%2b2015%2b462592 2015
10.1155%2b2015%2b471719 2015
10.1155%2b2015%2b490947 2015
10.1155%2b2015%2b530371 2015
10.1155%2b2015%2b579675 2015
10.1155%2b2015%2b604028 2015
10.1155%2b2015%2b608141 2015
10.1155%2b201

10.1186%2b1745-6215-15-412 2015
10.1186%2b2001-1326-3-7 2015
10.1186%2b2197-425X-3-S1-A520 2015
10.1186%2bs10194-015-0539-z 2015
10.1186%2bs11689-015-9128-3 2015
10.1186%2bs12245-015-0053-8 2015
10.1186%2bs12859-015-0730-x 2015
10.1186%2bs12861-015-0089-2 2015
10.1186%2bs12862-015-0407-0 2015
10.1186%2bs12863-015-0204-1 2015
10.1186%2bs12864-015-1262-5 2015
10.1186%2bs12864-015-1322-x 2015
10.1186%2bs12864-015-1785-9 2015
10.1186%2bs12864-015-1973-7 2015
10.1186%2bs12865-015-0085-0 2015
10.1186%2bs12867-015-0031-y 2015
10.1186%2bs12868-015-0146-6 2015
10.1186%2bs12868-015-0198-7 2015
10.1186%2bs12870-015-0615-1 2015
10.1186%2bs12871-015-0043-7 2015
10.1186%2bs12871-015-0139-0 2015
10.1186%2bs12875-015-0249-2 2015
10.1186%2bs12876-015-0271-9 2015
10.1186%2bs12876-015-0292-4 2015
10.1186%2bs12877-015-0130-0 2015
10.1186%2bs12879-014-0738-2 2015
10.1186%2bs12879-015-0767-5 2015
10.1186%2bs12879-015-0882-3 2015
10.1186%2bs12879-015-0912-1 2015
10.1186%2bs12879-015-0952-6 2015
10.1186%2bs12

10.12659%2bMSMBR.894840 2015
10.12659%2bMSMBR.894985 2015
10.12659%2bMSMBR.895003 2015
10.12659%2bMSMBR.895004 2015
10.12659%2bMSMBR.895418 2015
10.12688%2bf1000research.6012.2 2015
10.12945%2bj.aorta.2015.14.064 2015
10.1369%2b0022155415575028 2015
10.1371%2bjournal.pbio.1002033 2015
10.1371%2bjournal.pbio.1002181 2015
10.1371%2bjournal.pbio.1002241 2015
10.1371%2bjournal.pbio.1002315 2015
10.1371%2bjournal.pcbi.1003965 2015
10.1371%2bjournal.pcbi.1004033 2015
10.1371%2bjournal.pcbi.1004206 2015
10.1371%2bjournal.pcbi.1004352 2015
10.1371%2bjournal.pcbi.1004407 2015
10.1371%2bjournal.pgen.1005076 2015
10.1371%2bjournal.pgen.1005218 2015
10.1371%2bjournal.pgen.1005226 2015
10.1371%2bjournal.pmed.1001841 2015
10.1371%2bjournal.pntd.0003651 2015
10.1371%2bjournal.pntd.0003699 2015
10.1371%2bjournal.pntd.0003769 2015
10.1371%2bjournal.pntd.0004126 2015
10.1371%2bjournal.pntd.0004182 2015
10.1371%2bjournal.pntd.0004188 2015
10.1371%2bjournal.pone.0113170 2015
10.1371%2bjournal.pone.0113482

10.3109%2b08880018.2014.949941 2015
10.3109%2b08923973.2014.971964 2015
10.3109%2b08941939.2014.941445 2015
10.3109%2b09273948.2015.1034375 2015
10.3109%2b09540261.2015.1086321 2015
10.3109%2b09546634.2015.1034076 2015
10.3109%2b10253890.2015.1004628 2015
10.3109%2b10253890.2015.1087504 2015
10.3109%2b10428194.2014.961011 2015
10.3109%2b10428194.2014.981175 2015
10.3109%2b10428194.2015.1044449 2015
10.3109%2b10428194.2015.1088650 2015
10.3109%2b14397595.2015.1056993 2015
10.3109%2b14767058.2014.991917 2015
10.3109%2b15360288.2014.1003677 2015
10.3109%2b15622975.2015.1036116 2015
10.3109%2b15622975.2015.1076173 2015
10.3109%2b15622975.2015.1083612 2015
10.3109%2b15622975.2015.1112032 2015
10.3111%2b13696998.2015.1014090 2015
10.3174%2bajnr.A4320 2015
10.3201%2beid2103.140927 2015
10.3201%2beid2112.150891 2015
10.3205%2b000213 2015
10.3205%2b000219 2015
10.3205%2b000224 2015
10.3205%2bcto000117 2015
10.3205%2biprs000061 2015
10.3205%2biprs000066 2015
10.3205%2biprs000075 2015
10.3205%2bi