In [68]:
import PyPDF2
import re
import os
from chemdataextractor import Document
import spacy
from spacy import displacy

In [42]:
nlp = spacy.load('en')

In [43]:
list_of_files = os.listdir('./sample_methods')

In [44]:
list_of_files

['100011.pdf',
 '100024.pdf',
 '100077.pdf',
 '101212-E17D.pdf',
 '101931-E07B.pdf',
 '102452-E10B.pdf',
 '102557-E11A.pdf',
 '102668-E18B.pdf',
 '102819-E17A.pdf']

In [45]:
def pdf_parser(filepath):
    filename = filepath
    pdfFileObj = open(filename,'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    num_pages = pdfReader.numPages
    count = 0
    text = ""
    while count < num_pages:
        pageObj = pdfReader.getPage(count)
        count +=1
        text += pageObj.extractText()
        
    if text != "":
        text = text
    
#     else:
#         text = textract.process(fileurl, method='tesseract', language='eng')
    
    return text


In [46]:
text_list = [pdf_parser('sample_methods/' + i) for i in list_of_files]



* __Deleted file no. 5__

In [47]:
for idx, i in enumerate(text_list):
    if i == '':
        del text_list[idx]

## Document ID

In [48]:
doc_ids = [re.search(r'\d{6,7}\s{0,5}-\s{0,5}\w{0,1}\s{0,5}\d{0,2}\s{0,5}\w{0,2}\s{0,5}B{0,1}', i, flags=0).group() 
           if re.search(r'\d{6,7}\s{0,5}-\s{0,5}\w{0,1}\s{0,5}\d{0,2}\s{0,5}\w{0,1}\s{0,5}B{0,1}', i, flags=0) != None 
           else re.search(r'\d{6,7}', i, flags=0).group() for i in text_list]

doc_ids = [re.sub(r'[\n\s+]', '', i) for i in doc_ids]

doc_ids

['100011',
 '100024',
 '101212-E17D',
 '101931-E07B',
 '102452-E10B',
 '102557-E11A',
 '102668-E18B',
 '102819-E17A']

## Extract Scope

In [49]:
def scope_extractor(text):
    scope_pattern = re.search(r'this\smethod\sis', text.lower(), flags=0)
    next_sec_pattern = re.search(r'\d.\s+\w+.', text[scope_pattern.start():], flags=0)
    return text[scope_pattern.start():scope_pattern.start() + next_sec_pattern.start()]

In [50]:
extracted_scopes = [scope_extractor(i) for i in text_list]

In [51]:
print(extracted_scopes[1])

This method is applicable to the assay of o-benzyl-p-chlorophenol (OBCP).  It has 
been validated over the concentration range of 90 - 100%. 
 
 


In [52]:
cleaned_doc = Document(re.sub(r'\n', r'', extracted_scopes[1]))

cleaned_doc.cems

[Span('OBCP', 67, 71), Span('o-benzyl-p-chlorophenol', 42, 65)]

## Extract Matrix Formulation

In [53]:
def matrix_formulation_extractor(text):
    extracted_scope = scope_extractor(text)
    cleaned_doc = Document(re.sub(r'\n', r'', extracted_scope))
    return list(set([i.text for i in cleaned_doc.cems]))

In [54]:
matrix_formulation_extractor(text_list[1])

['o-benzyl-p-chlorophenol', 'OBCP']

In [55]:
mform_extracted = [matrix_formulation_extractor(i) for i in text_list]

mform_extracted

[['DPO', 'diphenyl oxide'],
 ['o-benzyl-p-chlorophenol', 'OBCP'],
 ['B-staged divinylsiloxane-bis(benzocyclobutene)'],
 ['Chlorine', 'polyolefin', 'polyethylene'],
 ['polyethylene'],
 ['low density polyethylene', 'benzyl alcohol'],
 [],
 []]

## Extract Precision

In [56]:
def precision_extractor(text):
    precision_pattern = re.search(r'\d{1,2}.\s+Pre\s{0,1}cision', text, flags=0)
    next_sec_pattern = re.search(r'\d{1,2}\.\s+\w+.', text[precision_pattern.end():], flags=0)
    return text[precision_pattern.start():precision_pattern.end() + next_sec_pattern.start()]

In [58]:
a = re.sub(r'\n', '', precision_extractor(text_list[7]))

In [59]:
a

'12. Precision 12.1 Precision has been determined from multiple analyses of two powder batches [n = 20] and one pellet batch [n = 10].  The average volatiles content ranged from 0.60 to 0.81% (w/w).  The precision data indicate a pooled standard deviation [spooled] of  0.022% and pooled relative standard deviation (RSD) [RSDpooled] of   3.1%.  12.2 At the 95% confidence level, individual measurements on similar samples may vary from the long-term average by  0.045% [t × spooled, where t = t-value of 2.05 at 27 degrees of freedom] and  6.4% relative [t × RSDpooled, where t = t-value of '

In [61]:
prec_doc = Document(a)

In [65]:
prec_spacy = nlp(a)

In [66]:
prec_spacy

12. Precision 12.1 Precision has been determined from multiple analyses of two powder batches [n = 20] and one pellet batch [n = 10].  The average volatiles content ranged from 0.60 to 0.81% (w/w).  The precision data indicate a pooled standard deviation [spooled] of  0.022% and pooled relative standard deviation (RSD) [RSDpooled] of   3.1%.  12.2 At the 95% confidence level, individual measurements on similar samples may vary from the long-term average by  0.045% [t × spooled, where t = t-value of 2.05 at 27 degrees of freedom] and  6.4% relative [t × RSDpooled, where t = t-value of 

In [72]:
for ent in prec_spacy.ents:
    print(ent.label, ent.label_, ' '.join(t.orth_ for t in ent))

394 CARDINAL 12
381 ORG Precision 12.1 Precision
394 CARDINAL two
394 CARDINAL 20
394 CARDINAL one
394 CARDINAL 10
390 PERCENT 0.60 to
390 PERCENT 0.81 %
390 PERCENT 0.022 %
381 ORG RSD
381 ORG RSDpooled
381 ORG   
390 PERCENT 3.1 %
394 CARDINAL 12.2
390 PERCENT 95 %
390 PERCENT 0.045 %
394 CARDINAL ×
394 CARDINAL 2.05
392 QUANTITY 27 degrees
390 PERCENT 6.4 %
394 CARDINAL ×
381 ORG RSDpooled


In [67]:
for chunk in prec_spacy.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

Precision 12.1 Precision Precision nsubjpass determined
multiple analyses analyses pobj from
two powder batches batches pobj of
one pellet batch batch conj batches
The average volatiles content content nsubj ranged
0.60 to 0.81% % pobj from
The precision data data nsubj indicate
a pooled standard deviation deviation dobj indicate
0.022% % appos deviation
relative standard deviation deviation dobj pooled
RSD RSD appos deviation
[RSDpooled RSDpooled appos deviation
3.1% % pobj of
the 95% confidence level level pobj At
individual measurements measurements nsubj vary
similar samples samples pobj on
the long-term average average pobj from
[t × spooled, where t = t-value value ROOT value
27 degrees degrees pobj at
freedom freedom pobj of
t × × ROOT ×
where t = t-value value appos RSDpooled


In [69]:
displacy.render(prec_spacy, style='dep', jupyter=True)

In [158]:
extracted_precision = [precision_extractor(i) for i in text_list]

In [160]:
extracted_precision[5]

'12. Precision    Precision has been determined \nfrom multiple analyses [n = \n20] of two \nsynthetic \nprepared samples of \nbenzyl alcohol in water\n.  The synthetic samples were prepared fresh daily (Section 4.1). \nThe analyses \nwere performed on two separate days at an average benzyl alcohol\n concentration\n [x] of \n0.09 ppm \n(g/g).  The precision data indicate a standard deviation [\ns] of \n0.01\n ppm (\ng/g\n), where s = standard \ndeviation of the validation data.\n  \n The estimated prediction interval at the 95% confidence level of a future final result determined on a \nsimilar sample [±\n t(n-1) × s; where t\n(n-1)   = 2.093 =\n t-value at (n\n-1) degrees of freedom] is ± \n0.01 ppm \n(g/g)\n.  This assumes a normal distribution of results and equal variability between locations.\n  Any future final result obtained on a similar sample (with a true \nbenz\nyl alcohol concentration\n [x] of \n0.090 ppm (\ng/g)\n) is expected to range from \n0.08 to 0.10 \nppm (\ng/g) \n