In [1]:
# Clear the interactive namespace (forced, no confirmation) so the notebook starts from a clean state.
%reset -f

In [2]:
import spacy
import PyPDF2
import pandas as pd
import numpy as np
import torch
from transformers import Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Input text for the pipeline.
# The commented-out lines below are a disabled alternative that would read
# '2018.pdf' with PyPDF2, concatenate the extracted text of every page and
# replace newlines with spaces. They are inactive: the hard-coded sample
# paragraph assigned at the end of this cell is what is actually processed.
# # Creating a pdf reader object
# reader = PyPDF2.PdfReader('2018.pdf')
# input_text = ''

# for i in reader.pages:
#     input_text += i.extract_text()

# input_text = input_text.replace('\n', ' ')
input_text = 'At Bosch, diversity is a fundamental pillar. We ensured a diverse workforce by taking necessary measures in the past. Still, The company intends to increase workforce diversity by 40%. It would enable a healthy and balanced workforce that would be warm and welcoming for people from diverse backgrounds.'

In [4]:
# First stage - Coreference Resolution

# Loading coreference resolution transformer model
coref_model = spacy.load('en_coreference_web_trf')
# Raise the pipeline's max_length setting to 2,000,000 before processing the text.
# NOTE(review): presumably needed when a full PDF report is used as input - confirm.
coref_model.max_length = 2000000
# Run the coreference pipeline; resolve_references() reads doc.spans from this result.
doc = coref_model(input_text)

# Manual garbage collection
# (disabled: `reader` only exists when the PDF-loading code in the input cell is enabled)
# del reader, input_text

def resolve_references(doc):
    """Return the text of ``doc`` with later coreferent mentions replaced by the first one.

    Every entry of ``doc.spans`` whose key starts with ``'coref_cluster'`` is
    treated as one coreference cluster. The first span of a cluster is the
    antecedent; each later, non-empty span of that cluster is replaced by the
    antecedent's text. Tokens outside any replaced mention are emitted
    unchanged together with their trailing whitespace.

    Args:
        doc: Parsed document. It must expose ``spans`` (mapping of key to a
            sequence of spans) and iterate over tokens that provide ``idx``,
            ``text`` and ``whitespace_``.

    Returns:
        str: The rebuilt text with mentions resolved.
    """
    # Maps a token's character offset (token.idx) to the text emitted in its place.
    replacement_by_offset = {}
    clusters = [group for key, group in doc.spans.items() if key.startswith('coref_cluster')]

    for cluster in clusters:
        mentions = list(cluster)
        if not mentions:
            # An empty cluster has no antecedent to substitute; nothing to do.
            continue

        first_mention = mentions[0]
        for mention_span in mentions[1:]:
            if len(mention_span) == 0:
                continue

            # The whitespace that followed the original mention belongs to its
            # LAST token. Using the first token's whitespace would insert a
            # stray space before punctuation for multi-token mentions
            # (e.g. "the company." -> "Bosch ." instead of "Bosch.").
            replacement_by_offset[mention_span[0].idx] = first_mention.text + mention_span[-1].whitespace_
            # The remaining tokens of the mention are swallowed by the replacement.
            for token in mention_span[1:]:
                replacement_by_offset[token.idx] = ''

    # Collect the pieces and join once instead of repeated string concatenation.
    pieces = []
    for token in doc:
        if token.idx in replacement_by_offset:
            pieces.append(replacement_by_offset[token.idx])
        else:
            pieces.append(token.text + token.whitespace_)

    return ''.join(pieces)

# Replace coreferent mentions in the parsed document; the result is a plain string.
resolved_string = resolve_references(doc)

# Manual garbage collection
# (removes the names coref_model and doc; doc is rebound further down)
del coref_model, doc

# Loading core transformer model
spacy_core = spacy.load('en_core_web_trf')
# NOTE(review): max_length was raised on coref_model but is left at its default here -
# confirm whether long inputs need the same setting on this pipeline.
# Re-parse the resolved text; the next cell reads doc.sents, ent_type_ and tag_ from this doc.
doc = spacy_core(resolved_string)

In [5]:
# Second and third stage - NER and POS tagging
# A sentence is kept as an "action statement" when some token whose entity
# type is ORG is immediately followed by a token whose fine-grained tag is in
# VERB_LIKE_TAGS.
VERB_LIKE_TAGS = ('VB', 'VBG', 'VBP', 'VBZ', 'MD', 'VV', 'VP', 'VERB', 'VAFIN', 'VMFIN', 'VVFIN', 'VE')

action_statements = list()
for sent in doc.sents:
    tokens = list(sent)
    # Walk adjacent (token, next token) pairs. This replaces the per-token
    # list.index() scan and naturally skips the last token, which has no successor.
    org_followed_by_verb = any(
        current.ent_type_ == 'ORG' and following.tag_ in VERB_LIKE_TAGS
        for current, following in zip(tokens, tokens[1:])
    )
    if org_followed_by_verb:
        # Tokens are joined with single spaces; a free-standing ' - ' is collapsed to one space.
        action_statements.append(' '.join(str(token) for token in tokens).replace(' - ', ' '))

# Manual garbage collection
del spacy_core, doc

In [6]:
# Fourth stage - Inference
# The tokenizer is loaded by the base model name, not from the fine-tuned checkpoint directory.
base_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Tokenize all action statements in one call with padding and truncation enabled (max_length=512).
action_statements_tokenized = tokenizer(action_statements, padding=True, truncation=True, max_length=512)

# Defining PyTorch Dataset class
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence. It must contain the key ``'input_ids'``, which is used
            to determine the dataset length.
        labels: Optional per-example labels (list, array or tensor). Leave as
            ``None`` for inference-only use.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one tensor per encoding field and, when labels were
        supplied, an additional ``'labels'`` tensor.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if self.labels:`
        # raises "truth value is ambiguous" for numpy arrays and tensors with
        # more than one element. Empty label containers are still skipped.
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])

# Wrap the tokenized sentences so the Trainer can index them example by example
test_dataset = Dataset(action_statements_tokenized)

# Fine-tuned BERT checkpoint with a two-label classification head
model_path = 'models/mar11/checkpoint-180'
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
test_trainer = Trainer(model)

# Inference: the first element of the prediction result holds the raw scores
raw_pred = test_trainer.predict(test_dataset)[0]

# Predicted class per sentence (argmax over the label axis), held as a pandas Series
y_pred = pd.Series(np.argmax(raw_pred, axis=1))

# Positions of the sentences predicted as class 1
goal_indices = y_pred[y_pred == 1].index.tolist()

# Print every sentence classified as a goal, numbered from 1.
for goal_number, i in enumerate(goal_indices, start=1):
    print(f'Goal {goal_number}: ')
    print(action_statements[i])
    print('')

# Report the real number of goals. The previous running counter started at 1
# and was incremented after each goal, so it reported one more than were printed.
print(f'Total no. of goals: {len(goal_indices)}')
print(f'Total no. of action_statements: {len(action_statements)}')

***** Running Prediction *****
  Num examples = 525
  Batch size = 8


Goal 1: 
believes is   why ALLIANZ GROUP aim to communicate openly and   comprehensively about ALLIANZ GROUP corporate   responsibility commitments , approach and   performance against ALLIANZ GROUP commitments .

Goal 2: 
And by 2050 , ALLIANZ GROUP want all   ALLIANZ GROUP assets to be climate neutral .

Goal 3: 
believes includes   screening investments for climate related   risks and excluding certain sectors , as well   as improving transparency around climate- related disclosures . ALLIANZ GROUP have an industry- leading threshold for coal and ALLIANZ GROUP have   committed to bring an industry- leading threshold for coal down to zero thereby   completely phasing out coal based business   models from ALLIANZ GROUP business no later than 2040 .

Goal 4: 
ALLIANZ GROUP are working to further align ALLIANZ GROUP strategy   and reporting with the recommendations   developed by the G20 Financial Stability   Board ’s Task Force on Climate related   Financial Disclosures ( TCFD ) .

Goa