In [4]:
import re
import pdfplumber
import spacy
import pickle

In [2]:
def clean_text(text):
    """Normalise whitespace and strip a few extraction artefacts from ``text``.

    Every run of whitespace is first collapsed into a single space; afterwards
    a fixed list of literal substitutions is applied in the order given.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r"\s+", " ", text)

    # (needle, replacement) pairs, applied top to bottom.
    substitutions = (
        (' Œ', '-'),
        ('\n', ''),  # no-op after the whitespace collapse above; kept for parity
        (' •', ''),
    )
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)

    return cleaned

In [3]:
def clean_text_2(text):
    """Apply three literal clean-up substitutions to ``text``, in order.

    1. ``'..'`` is removed.
    2. ``'▶'`` becomes ``'-'``.
    3. ``'- '`` is removed (this also consumes dashes produced by step 2).

    The original author noted that rules 1 and 2 are needed for inputs
    0, 1, 4, 5 and rule 3 for input 4 (presumably document indices -- confirm).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_dot_pairs = text.replace('..', '')
    with_plain_dashes = without_dot_pairs.replace('▶', '-')
    return with_plain_dashes.replace('- ', '')

In [None]:
def clean_text_3(text):
    """Clean ``text`` with the combined rules of ``clean_text`` and ``clean_text_2``.

    Whitespace runs are collapsed to a single space first, then the literal
    substitutions below are applied one after another in the listed order.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    result = re.sub(r"\s+", " ", text)

    # Order matters: '▶' is turned into '-' before '- ' is stripped.
    rules = (
        (' Œ', '-'),
        ('\n', ''),
        (' •', ''),
        ('..', ''),
        ('▶', '-'),
        ('- ', ''),
        ('¼', '`'),
    )
    for old, new in rules:
        result = result.replace(old, new)

    return result

In [4]:
def is_correct_sentence_german(sent):
    """Heuristically decide whether ``sent`` is a complete German sentence.

    A sentence passes when it has more than three tokens and contains both a
    verb-like token (``pos_ == "VERB"`` or tag ``VAFIN``/``VMFIN``) and a
    different token whose dependency label is ``"sb"``. A token counted as a
    verb is never also counted as the subject.

    Args:
        sent: A sized iterable of tokens exposing ``pos_``, ``tag_`` and
            ``dep_`` (e.g. a spaCy span).

    Returns:
        ``True`` if both a verb and a subject were found, ``False`` otherwise.
    """
    if len(sent) < 4:
        return False

    # The tag set is specific to German models; English needs different tags.
    finite_verb_tags = ("VAFIN", "VMFIN")

    verb_seen = False
    subject_seen = False
    for tok in sent:
        if tok.pos_ == "VERB" or tok.tag_ in finite_verb_tags:
            verb_seen = True
        elif tok.dep_ == "sb":
            subject_seen = True

        if verb_seen and subject_seen:
            return True

    return False

In [10]:
def document_processing(document, page_start, page_end):
    """Extract cleaned running text and tables from a page range of a PDF.

    Args:
        document: Whatever ``pdfplumber.open`` accepts (path or file object).
        page_start: First page to process, 1-based and inclusive.
        page_end: Last page to process, 1-based and inclusive.

    Returns:
        A dict with two keys:

        * ``'text'``: the concatenated, ``clean_text_3``-cleaned text of all
          words that do not lie strictly inside a detected table.
        * ``'tables'``: one list per processed page that contains at least
          one table; each list holds dicts ``{'table': rows, 'doctop': top}``
          where ``rows`` are the extracted cells (cleaned, ``None`` kept) and
          ``top`` is the second component of the table's bounding box.
    """

    def check_bboxes(word, table_bbox):
        """
        Check whether word is strictly inside a table bbox.
        """
        l = word['x0'], word['top'], word['x1'], word['bottom']
        r = table_bbox
        return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

    tables = []
    text_parts = []

    with pdfplumber.open(document) as pdf:
        for page in pdf.pages[page_start - 1:page_end]:
            found_tables = page.find_tables()
            table_bboxes = [table.bbox for table in found_tables]

            # NOTE(review): 'doctop' is filled from the table bbox, which looks
            # page-relative, while words carry a document-relative 'doctop';
            # confirm the two are comparable on pages after the first.
            page_tables = [
                {
                    'table': [
                        [clean_text_3(cell) if cell is not None else cell for cell in row]
                        for row in table.extract()
                    ],
                    'doctop': table.bbox[1],
                }
                for table in found_tables
            ]
            tables.append(page_tables)

            non_table_words = [
                word for word in page.extract_words()
                if not any(check_bboxes(word, bbox) for bbox in table_bboxes)
            ]

            for cluster in pdfplumber.utils.cluster_objects(
                    non_table_words + page_tables, 'doctop', tolerance=5):
                # Table dicts have no 'text' key. A cluster may hold words AND
                # a table (words come first), so pick the word texts explicitly
                # instead of indexing every item, which raised KeyError.
                line_words = [item['text'] for item in cluster if 'text' in item]
                if line_words:
                    text_parts.append(clean_text_3(' '.join(line_words) + ' '))

    return {
        'text': ''.join(text_parts),
        'tables': [page_tables for page_tables in tables if page_tables != []],
    }

In [6]:
def sentence_catcher(text, phrase):
    """Return the sentences of ``text`` that mention ``phrase``.

    ``text`` is split into sentences with spaCy's ``en_core_web_lg`` pipeline
    (loaded on every call). Only the phrases ``'tooling adaptor'`` and
    ``'robot adaptor'`` are handled; any other phrase yields an empty list.
    A sentence matches when it contains both words of the phrase (or the
    listed variants) and none of the "assembly" words, case-insensitively.

    After matching, fixed positions of the result are trimmed or dropped.
    NOTE(review): these positions look tuned to one specific document --
    confirm. With fewer matches than the positions require an ``IndexError``
    is raised.

    Args:
        text: The raw text to search.
        phrase: The phrase to look for.

    Returns:
        A list of spaCy spans (whole sentences or trimmed slices of them).
    """
    nlp = spacy.load("en_core_web_lg")
    sentence_spans = list(nlp(text).sents)

    phraseParts = phrase.split()

    def sentences_matching(pattern):
        # Keep document order; one search per sentence is sufficient.
        return [
            span for span in sentence_spans
            if re.search(pattern, span.text, re.IGNORECASE)
        ]

    hits = []

    if phrase == 'tooling adaptor':
        phraseRegex = r'(?=.*\b(' + phraseParts[0] + r'|tool)\b)(?=.*\b(' + phraseParts[1] + r'|adaptors)\b)(?!.*\b(assembly|assemblies|assembly`s)\b)'
        hits = sentences_matching(phraseRegex)

        # Fixed-position clean-up for the tooling-adaptor matches.
        hits[0] = hits[0][4:]
        hits[5] = hits[5][2:]
        hits[6] = hits[6][4:]
        hits = hits[:-1]

    elif phrase == 'robot adaptor':
        phraseRegex = r'(?=.*\b(' + phraseParts[0] + r')\b)(?=.*\b(' + phraseParts[1] + r'|adaptors)\b)(?!.*\b(assembly|assemblies|assembly`s)\b)'
        hits = sentences_matching(phraseRegex)

        # Fixed-position clean-up for the robot-adaptor matches. The order of
        # these statements matters because every ``del`` shifts later indices.
        hits[0] = hits[0][17:]
        del hits[2]
        hits[5] = hits[5][23:]
        hits[7] = hits[7][2:]
        del hits[4]
        del hits[5]
        del hits[6]

    return hits

In [2]:
def ontology_transformer(sentence):
    """Label the words of each sentence with a coarse ontology category.

    Args:
        sentence: Iterable of sentences. Each sentence is iterated to yield
            tokens exposing ``text`` and ``pos_`` (presumably spaCy spans such
            as those produced by ``sentence_catcher`` -- confirm with caller).

    Returns:
        A list with one dict per input sentence. Each dict maps a word, or a
        space-joined pair of neighbouring words of the same category, to one
        of ``'object'``, ``'action'``, ``'positioning'`` or ``'determinant'``.
    """
    # spacy.explain() description of the coarse POS tag -> ontology category.
    category_by_pos = {
        'noun': 'object',
        'pronoun': 'object',
        'proper noun': 'object',
        'particle': 'action',
        'auxiliary': 'action',
        'verb': 'action',
        'adposition': 'positioning',
        'subordinating conjunction': 'positioning',
        'coordinating conjunction': 'positioning',
        'determiner': 'determinant',
        'adverb': 'determinant',
        'adjective': 'determinant',
        'numeral': 'determinant',
    }

    def decision_maker(sent):
        """Classify the tokens of one sentence and merge same-category neighbours."""
        # Keyed by token text: a repeated word keeps its first position and
        # takes the category of its last occurrence. Tokens whose POS is not
        # listed above (e.g. punctuation) are skipped.
        wordsInSents = {}
        for token in sent:
            category = category_by_pos.get(spacy.explain(token.pos_))
            if category is not None:
                wordsInSents[token.text] = category

        # Snapshot once instead of rebuilding the lists on every access.
        keys = list(wordsInSents)
        values = list(wordsInSents.values())

        # With fewer than two classified words there is nothing to merge; the
        # look-back at [-2] below used to raise IndexError in this case.
        if len(keys) < 2:
            return wordsInSents

        merged = {}
        absorbed = []  # indices of words that became the tail of a merged pair
        for i in range(1, len(keys)):
            if values[i] != values[i - 1]:
                merged[keys[i - 1]] = values[i - 1]
            else:
                # Pairwise merge; runs longer than two yield overlapping pairs.
                merged[keys[i - 1] + ' ' + keys[i]] = values[i]
                absorbed.append(i)

        # The loop only ever emits the left neighbour, so handle the last word.
        if values[-1] == values[-2]:
            merged[keys[-2] + ' ' + keys[-1]] = values[-1]
            merged.pop(keys[-2], None)
        else:
            merged[keys[-1]] = values[-1]

        # Drop stand-alone entries of words already covered by a merged pair.
        for i in absorbed:
            merged.pop(keys[i], None)

        return merged

    return [decision_maker(item) for item in sentence]

In [None]:
def phrase_namer(phrase):
    """Build a hyphenated name from a whitespace-separated phrase.

    Args:
        phrase: The phrase, e.g. ``'tooling adaptor'``.

    Returns:
        The words of ``phrase`` joined with ``'-'``; an empty string when the
        phrase contains no words.
    """
    return '-'.join(phrase.split())