In [23]:
%reset -f

In [24]:
import spacy
import PyPDF2
import pandas as pd
import numpy as np
import torch
from transformers import Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [28]:
# Loading coreference resolution transformer model


# Loading core transformer model
spacy_core = spacy.load('en_core_web_trf')


def resolve_references(doc):
    token_mention_mapper = {}
    output_string = ''
    clusters = [val for key, val in doc.spans.items() if key.startswith('coref_cluster')]

    for cluster in clusters:
        first_mention = cluster[0]

        for mention_span in list(cluster)[1:]:
            if len(mention_span) == 0:
                continue
            else:
                token_mention_mapper[mention_span[0].idx] = first_mention.text + mention_span[0].whitespace_
                for token in mention_span[1:]:
                    token_mention_mapper[token.idx] = ''

    for token in doc:
        if token.idx in token_mention_mapper:
            output_string += token_mention_mapper[token.idx]
        else:
            output_string += token.text + token.whitespace_

    return output_string

# Defining PyTorch Dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


def pipeline(input_text):
    # First stage - Coreference Resolution
    coref_model = spacy.load('en_coreference_web_trf')
    coref_model.max_length = 2000000

    doc = coref_model(input_text)
    resolved_string = resolve_references(doc)

    # Manual garbage collection
    del coref_model, doc

    spacy_core = spacy.load('en_core_web_trf')
    doc = spacy_core(resolved_string)
    
    
    # Second and third stage - NER and POS tagging
    action_statements = list()
    flag = False
    list_of_sent = list(doc.sents)
    for sent in list_of_sent:
        flag = False
        sent = list(sent)
        for word in sent:
            if word.ent_type_ == 'ORG':
                location = sent.index(word)
                if (location+1 != len(sent)) and sent[location+1].tag_ in ('VB', 'VBG', 'VBP', 'VBZ', 'MD', 'VV', 'VP', 'VERB', 'VAFIN', 'VMFIN', 'VVFIN', 'VE'):
                    flag = True
                    break
        if flag:
            str_sent = list(map(lambda x: str(x), sent))
            action_statements.append(' '.join(str_sent).replace(' - ', ' '))

    # Manual garbage collection
    del spacy_core, doc
    
    
    
    base_model = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    
    
    # Fourth stage - Inference
    action_statements_tokenized = tokenizer(action_statements, padding=True, truncation=True, max_length=512)


    # Converting tokenized sentences to PyTorch Dataset
    test_dataset = Dataset(action_statements_tokenized)

    # Loading fine-tune BERT model
    model_path = 'models/mar11/checkpoint-180'
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
    test_trainer = Trainer(model)

    # Inference step
    raw_pred, _, _ = test_trainer.predict(test_dataset)
    y_pred = np.argmax(raw_pred, axis=1)

    # Converting list to pandas Series
    y_pred = pd.Series(y_pred)
    goal_indices = list(y_pred[y_pred==1].index)

    c = 0

    for i in goal_indices:
        c += 1
        print(f'Goal {c}: ')
        print(action_statements[i])
        print('')


    print(f'Total no. of goals: {c}')
    print(f'Total no. of action_statements: {len(action_statements)}')
    

loading configuration file /var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpmgt9find/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpmgt9find/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_c

Goal 1: 
Still , Bosch intends to increase workforce diversity by 40 % .

Total no. of goals: 1
Total no. of action_statements: 1


In [27]:
# # Creating a pdf reader object
# reader = PyPDF2.PdfReader('2018.pdf')
# input_text = ''

# for i in reader.pages:
#     input_text += i.extract_text()

# input_text = input_text.replace('\n', ' ')
input_text = 'At Bosch, diversity is a fundamental pillar. We ensured a diverse workforce by taking necessary measures in the past. Still, The company intends to increase workforce diversity by 40%. It would enable a healthy and balanced workforce that would be warm and welcoming for people from diverse backgrounds.'
input_text_3 = 'Diversity in workforce is important. We are aware that legal and cultural requirements can vary in a global market. Continental expects all of our suppliers to be guided by fairness, honesty and responsibility in all aspects of their business. Our supplier code establishes important standards that match the Continental corporate values. Every supplier must comply strictly with these standards. We use them to define requirements for good working conditions, then check compliance with these requirements through our on-site audits.'
input_text_2 = 'A growing need for food, energy and clean water, limited resources and a booming world population – reconciling all these factors is the greatest challenge of our time. Innovations based on chemistry play a key role here, as they contribute decisively to new solutions. Effective and efficient research and development is a prerequisite for innovation as well as an important growth engine for BASF. To ensure our long-term business success with chemistry-based solutions for almost all sectors of industry, we develop innovative processes and products for a sustainable future and drive forward digitalization in research worldwide.'

list_of_inputs = [input_text, input_text_2, input_text_3]

for i in list_of_inputs:
    pipeline(i)

loading configuration file /var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpznvmper3/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpznvmper3/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_c

Goal 1: 
Still , Bosch intends to increase workforce diversity by 40 % .

Total no. of goals: 1
Total no. of action_statements: 1


loading configuration file /var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpvy85s_0u/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpvy85s_0u/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_c

Goal 1: 
To ensure BASF long term business success with chemistry based solutions for almost all sectors of industry , BASF develop innovative processes and products for a sustainable future and drive forward digitalization in research worldwide .

Total no. of goals: 1
Total no. of action_statements: 1


loading configuration file /var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmppw13fob6/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmppw13fob6/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_c

IndexError: list index out of range