In [131]:
import sys, getopt, re
from lxml import etree
from utils import UsageError, ConfigError
from pdf2xml import pdf2etree

In [132]:
# Module-level patterns used by split_refs(), tag_ref() and pdf2refs().

# One numbered reference: a bracketed number such as "[12]", one whitespace
# character, then the shortest run of characters (not crossing a newline,
# since DOTALL is not set) that ends in two digits followed by a full stop.
# NOTE(review): the trailing "(?#...)" group is a regex *comment* and matches
# nothing; it looks like a disabled lookahead -- confirm. With no active ^ or $
# in the pattern, re.MULTILINE currently has no effect.
ref_re = re.compile("\[\d+?\]\s.+?[0-9]{2}\.(?#=\s[[$])", re.MULTILINE)
# A URL: one of the listed schemes, ":", one or more "//" or backslash
# separators, then any run of the characters in the trailing class.
# The pattern has capture groups, so findall() yields tuples whose first item
# is the whole URL.
# NOTE(review): inside the character class "\+-=" is parsed as the range
# '+'..'=' (which also covers ',', '.', '/', digits, ':', ';' and '<') --
# confirm that this is intended rather than three literal characters.
url_re = re.compile("((https?|ftp|gopher|telnet|file|notes|ms-help):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)")
# A quoted title: opening quote (group 1), shortest possible text that may
# span lines because of DOTALL (group 2), closing quote (group 3).
# The pattern is handed to re.compile as UTF-8 encoded bytes.
# NOTE(review): the curly-quote alternative is listed twice on each side;
# presumably one of them was meant to be a different quote character.
title_re = re.compile(u'("|“|“)(.+?)("|”|”)'.encode('utf8'), re.DOTALL)
# Volume: "Vol."/"vol."/"Volume"/"volume" plus one whitespace character
# (group 1), then the volume number (group 2).
vol_re = re.compile('([Vv]ol(?:\.|ume)\s)([0-9]+)')
# Edition: a number (group 1) followed by whitespace, "ed" and one non-word
# character (group 2).
ed_re = re.compile('([0-9]+)(\sed\W)')
# Page range introduced by "Pp"/"pp" with optional full stop and optional
# whitespace (group 1); the range uses a hyphen or an en dash (group 2).
pages_re1 = re.compile(u'([Pp]p\.?\s?)([0-9]+[-\u2013][0-9]+)')
# Page range introduced by a colon (group 1), e.g. ":975-978" (group 2).
pages_re2 = re.compile(u'(:)([0-9]+[-\u2013][0-9]+)')
# Year written as ", 1999" followed by any non-digit character.
year_re1 = re.compile("(, )([1-2][0-9]{3})(\D)")
# Year written in parentheses, e.g. "(1999)".
year_re2 = re.compile("\(([1-2][0-9]{3})\)")
# Year immediately followed by a semicolon, e.g. "1999;".
year_re3 = re.compile("([1-2][0-9]{3})(;)")

In [133]:

def split_refs(txt):
    """Break a chunk of reference text into a list of single references.

    The text is scanned with the module-level ``ref_re`` pattern first.  If
    that produces more than one match, the matches are returned; otherwise
    the text is simply split on newline characters.
    """
    matches = ref_re.findall(txt)
    return matches if len(matches) > 1 else txt.split('\n')


In [134]:
def tag_ref(txt, highlight=False):
    """Mark up the recognisable parts of a single reference string.

    With ``highlight`` set, the first quoted title and every volume, edition,
    page range and year found by the module-level patterns are wrapped in
    tags written with ANSI colour escapes, URLs are coloured, and the whole
    string is enclosed in a coloured reference tag.

    Without ``highlight``, only the title pattern is applied (its replacement
    re-emits the three matched groups unchanged) and the string is wrapped in
    tab characters; volume, edition, page and year tagging is not performed
    in this mode.

    Returns ``txt.encode('utf-8')`` of the resulting string.
    """
    if highlight:
        # The title is tagged once only; every later rule replaces all matches.
        txt = title_re.sub("\g<1>\033[0;32m<title>\033[0m\g<2>\033[0;32m</title>\033[0m\g<3>", txt, 1)
        # Applied strictly in this order.
        colour_rules = (
            (vol_re, "\g<1>\033[0;32m<volume>\033[0m\g<2>\033[0;32m</volume>\033[0m"),
            (ed_re, "\033[0;32m<edition>\033[0m\g<1>\033[0;32m</edition>\033[0m\g<2>"),
            (pages_re1, "\g<1>\033[0;32m<pages>\033[0m\g<2>\033[0;32m</pages>\033[0m"),
            (pages_re2, "\g<1>\033[0;32m<pages>\033[0m\g<2>\033[0;32m</pages>\033[0m"),
            (year_re1, "\g<1>\033[0;32m<year>\033[0m\g<2>\033[0;32m</year>\033[0m\g<3>"),
            (year_re2, "(\033[0;32m<year>\033[0m\g<1>\033[0;32m</year>\033[0m)"),
            (year_re3, "\033[0;32m<year>\033[0m\g<1>\033[0;32m</year>\033[0m\g<2>"),
            (url_re, "\033[0;34m\g<0>\033[0m"),
        )
        for pattern, replacement in colour_rules:
            txt = pattern.sub(replacement, txt)
        txt = "\033[0;32m<reference>\033[0m" + txt + "\033[0;32m</reference>\033[0m"
    else:
        txt = title_re.sub("\g<1>\g<2>\g<3>", txt, 1)
        txt = "\t" + txt + "\t"
    return txt.encode('utf-8')

In [135]:
import sys, commands, getopt, os, tempfile
import shlex
import shutil
import subprocess

from lxml import etree

from utils import UsageError, ConfigError
from config import pdf2xmlexe

def pdf2etree(name):
    """Convert a PDF to XML, parse it with lxml and return the tree.

    The converter configured as ``pdf2xmlexe`` is run as
    ``<pdf2xmlexe> -q -blocks <name> <tmpdir>/<basename>.xml``.  The XML is
    written into a freshly created temporary directory, which is removed
    again before this function returns or raises.

    Args:
        name: Path of the PDF file to convert.

    Returns:
        The object returned by ``etree.parse`` for the generated XML file.

    Raises:
        UsageError: If the converter cannot be started or the XML file it
            should have produced cannot be opened.
    """
    pdffn = os.path.split(name)[-1]
    tmpdir = tempfile.mkdtemp(suffix='.d', prefix=pdffn)
    try:
        tmppath = os.path.join(tmpdir, "{0}.xml".format(pdffn))
        # Arguments are passed as a list (no shell), so paths containing
        # spaces or shell metacharacters reach the converter intact.
        # shlex.split keeps a configured "executable plus options" string
        # working the way it did when the command went through the shell.
        cmd = shlex.split(pdf2xmlexe) + ['-q', '-blocks', name, tmppath]
        try:
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
            proc.communicate()  # converter output is read and discarded
            with open(tmppath, 'r') as fh:
                # etree.parse builds the tree in memory, so the temporary
                # file can be deleted once parsing has finished.
                return etree.parse(fh)
        except (IOError, OSError):
            raise UsageError("Could not convert to XML. Are you sure you provided the name of a valid PDF?")
    finally:
        # Always clean up; previously one directory was left behind per PDF.
        shutil.rmtree(tmpdir, ignore_errors=True)

In [136]:
def _block_text(block):
    """Join the stripped text and tail fragments found under ``block`` with single spaces."""
    parts = []
    for node in block.iter():
        if node.text is not None:
            parts.append(node.text.strip())
        # The tail of the block element itself is skipped; only tails of the
        # nodes nested inside it are collected.
        if node != block and node.tail is not None:
            parts.append(node.tail.strip())
    return ' '.join(parts)


def _mentions_reference_heading(text):
    """Return True if ``text`` starts with, or contains, a "Reference"/"REFERENCE" marker."""
    return (text.strip().startswith(('Reference', 'REFERENCE', 'Reference:'))
            or text.find('Reference') > 0
            or text[:20].find('REFERENCE') > 0)


def pdf2refs(args):
    """Extract the reference entries of a PDF and return them as a list.

    The PDF is converted with ``pdf2etree`` and every ``BLOCK`` element is
    flattened to text.  Blocks mentioning a "Reference" marker are treated as
    section headings and skipped; every non-empty block after the first such
    heading is collected.  The collected text is split with ``split_refs``
    and each entry is passed through ``tag_ref``.

    Args:
        args: Path of the PDF file (handed straight to ``pdf2etree``).

    Returns:
        A list with one tagged entry per reference, or an empty list when no
        text was found after a reference heading.
    """
    xmltag = True
    highlight = False

    tree = pdf2etree(args)

    in_refs = False
    ref_blocks = []
    for block in tree.xpath('//BLOCK'):
        text = _block_text(block)
        if not text:
            continue
        if _mentions_reference_heading(text):
            # Heading blocks are never stored, even once the section started.
            in_refs = True
            continue
        if in_refs:
            ref_blocks.append(text)

    # Without this guard an empty string would be split into [''] and come
    # back as a single blank, tab-wrapped "reference".
    if not ref_blocks:
        return []

    # NOTE: URLs and italic tokens are intentionally not collected here; the
    # function only ever returned the tagged reference strings.
    refs = split_refs('\n'.join(ref_blocks))
    if xmltag:
        refs = [tag_ref(ref, highlight) for ref in refs]
    return refs

In [None]:
import csv
import os

#5entries/bcc/pdfs/books/Documents/Breast/
#/home/shray/pdf_ref/5entries/bcc/pdfs/books/Documents/Breast

# Folder whose PDFs are scanned.  ``dir_name`` is the same folder written
# relative to /home/shray/pdf_ref; it is kept for reference but the loop
# below builds paths from ``directory`` only.
dir_name='5entries/bcc/pdfs/family-oncology-network-site/Documents/'

directory='/home/shray/pdf_ref/5entries/bcc/pdfs/family-oncology-network-site/Documents'

# Each reference found is appended as a one-column, fully quoted row to
# reference.csv in the current working directory and echoed to the output.
# The file is opened in append mode, so re-running the cell adds rows again.
for filename in sorted(os.listdir(directory)):  # sorted: stable row order
    if not filename.endswith(".pdf"):
        continue
    # Build the path from the directory that was actually listed, so the
    # loop does not depend on the kernel's working directory.
    pdf_path = os.path.join(directory, filename)
    found_refs = pdf2refs(pdf_path)
    with open('reference.csv', 'a') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for ref in found_refs:
            wr.writerow([ref])
            print(ref)
            print('\n')

# To try a single file instead:
#   args = 'chemotherapy-protocols-site/Documents/Breast/BRAJAC_Protocol.pdf'
#   for ref in pdf2refs(args):
#       print(ref)

		


		


		


	Anderson GH, Boyes DA, Benedet JL, Le Riche JC, Matisic JP, Suen KC, Worth AJ, Millner A, Bennett OM (1988) Organisation and results of the cervical cytology screening programme in British Columbia, 1955-85. Br Med J (Clin Res Ed) 296(6627): 975–978. Arbyn M, Ronco G, Anttila A, Meijer CJ, Poljak M, Ogilvie G, Koliopoulos G, Naucler P, Sankaranarayanan R, Peto J (2012) Evidence regarding human papillomavirus testing in secondary prevention of cervical cancer. Vaccine 30: F88–F99. Arbyn M, Castellsague X, de Sanjose S, Bruni L, Saraiya M, Bray F, Ferlay J (2011) Worldwide burden of cervical cancer in 2008. Ann Oncol 22(12): 2675–2686. Australian Government (2014) National Cervical Screening Program Renewal. Available at http://www.cancerscreening.gov.au/internet/screening/ publishing.nsf/Content/overview-of-the-renewal (Accessed on 19 August 2014). BC Cancer Agency (2014) Cervical Cancer Screening Program 2013 Annual Report. Boyes DA, Morrison B, Knox EG, Draper GJ, Mill

	1. Zhang Y. Epidemiology of esophageal cancer. World J Gastroenterol. 2013 [cited 2016 Jan 30];19(34):5598-606. Available from: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3769895/	


	2. Boeing H, Dietrich T, Hoffmann K, Pischon T, Ferrari P, Lahmann P, et al. Intake of fruits and vegetables and risk of cancer of the upper aero-digestive tract: the prospective EPIC-study. Cancer Causes Control. 2006 [cited 2016 Jan 30];17(7):957-69. Available from Medline: http://www.ncbi.nlm.nih.gov/pubmed/16841263	


	3. Freedman ND, Park Y, Subar AF, Hollenbeck AR, Leitzmann MF, Schatzkin A, et al. Fruit and vegetable intake and esophageal cancer in a large prospective cohort study. Int J Cancer. 2007 [cited 2016 Jan 30];121(12):2753-60. Available from Medline: http://www.ncbi.nlm.nih.gov/pubmed/17691111	


	4. Yoon H, Kim N. Diagnosis and management of high risk group for gastric cancer. Gut Liver. 2015 [cited 2016 Jan 30];9(1):5-17. Available from Medline: http://www.ncbi.nlm.nih.gov/pubmed/2554

1. /home/shray/pdf_ref/5entries/bcc/pdfs/books/Documents/Gastrointestinal
2. /home/shray/pdf_ref/5entries/bcc/pdfs/books/Documents/Genitourinary
3. /home/shray/pdf_ref/5entries/bcc/pdfs/books/Documents/Gynecology
4. /home/shray/pdf_ref/5entries/bcc/pdfs/cancer-management-guidelines-site/Documents
5. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-for-the-north-site/Documents
6. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-for-the-southern-interior-site/Documents
7. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-for-the-southern-interior-site/Pages
8. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-fraser-valley-centre-site/Documents
9. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-vancouver-centre-site/Documents
10. /home/shray/pdf_ref/5entries/bcc/pdfs/centre-vancouver-island-centre-site/Documents
11. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Breast
12. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Gastrointestinal
13. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Genitourinary
14. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Gynecology
15. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Head%20and%20Neck
16. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Kaposi%27s%20Sarcoma
17. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Leukemia-BMT
18. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Lung
19. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Lymphoma-Myeloma
20. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Melanoma
21. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Miscellaneous%20Origin
22. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Neuro-Oncology
23. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Ocular
24. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Primary%20Unknown
25. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Sarcoma
26. /home/shray/pdf_ref/5entries/bcc/pdfs/chemotherapy-protocols-site/Documents/Supportive%20Care
27. /home/shray/pdf_ref/5entries/bcc/pdfs/communities-oncology-network-site/Documents
28. /home/shray/pdf_ref/5entries/bcc/pdfs/coping-and-support-site/Documents **ERROR**
29. /home/shray/pdf_ref/5entries/bcc/pdfs/coping-and-support-site/Documents/Colorectal-Forum/Presentations
30. /home/shray/pdf_ref/5entries/bcc/pdfs/coping-and-support-site/Documents/Hereditary%20Cancer%20Program
31. /home/shray/pdf_ref/5entries/bcc/pdfs/coping-and-support-site/Documents/MedicalCannabis
32. /home/shray/pdf_ref/5entries/bcc/pdfs/coping-and-support-site/Documents/Support%20Programs
33. /home/shray/pdf_ref/5entries/bcc/pdfs/Documents
34. /home/shray/pdf_ref/5entries/bcc/pdfs/drug-database-site/Documents
35. /home/shray/pdf_ref/5entries/bcc/pdfs/drug-database-site/Drug%20Index
36. /home/shray/pdf_ref/5entries/bcc/pdfs/facts4teens/Documents
37. /home/shray/pdf_ref/5entries/bcc/pdfs/family-oncology-network-site/Documents
38.
39.
40.