In [23]:
# Clear every user-defined name from the interactive namespace before the run;
# -f skips the confirmation prompt.
%reset -f

In [24]:
import spacy
from spacy.tokens import Doc
import re
import PyPDF2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import EarlyStoppingCallback

In [25]:
# Open the report and collect the extracted text of each page, in page order
reader = PyPDF2.PdfReader('test/2019.pdf')
page_texts = [page.extract_text() for page in reader.pages]

# Concatenate the pages and drop every line-break character from the result
input_text = ''.join(page_texts).replace('\n', '')

In [26]:
# First stage - Coreference Resolution
# Load the 'en_coreference_web_trf' pipeline and run it over the text extracted from the PDF.
# resolve_references() below reads the span groups in doc.spans whose key starts with
# 'coref_cluster'.
coref_model = spacy.load('en_coreference_web_trf')
doc = coref_model(input_text)

def resolve_references(doc):
    """Return the text of ``doc`` with coreferent mentions replaced by their first mention.

    Every span group in ``doc.spans`` whose key starts with ``coref_cluster`` is
    treated as one cluster. The first span of a cluster supplies the replacement
    text; every later, non-empty span of that cluster is replaced by it.

    Args:
        doc: Object exposing ``spans`` (a mapping of key -> sequence of spans) and
            iterating over tokens that carry ``idx``, ``text`` and ``whitespace_``.

    Returns:
        str: The rebuilt text with the replacements applied.
    """
    # Maps the character offset (token.idx) of a token to the text emitted in its place.
    replacement_by_offset = {}

    for key, cluster in doc.spans.items():
        if not key.startswith('coref_cluster'):
            continue

        mentions = list(cluster)
        if not mentions:
            # Nothing to resolve against; indexing the first mention would fail.
            continue

        antecedent_text = mentions[0].text

        for mention in mentions[1:]:
            if len(mention) == 0:
                continue
            # The whole mention collapses into the antecedent, so the whitespace that
            # follows it is the one after the mention's LAST token. Using the first
            # token's whitespace inserted a stray space whenever a multi-token mention
            # was directly followed by punctuation.
            replacement_by_offset[mention[0].idx] = antecedent_text + mention[-1].whitespace_
            # The remaining tokens of the mention are swallowed by the replacement.
            for token in mention[1:]:
                replacement_by_offset[token.idx] = ''

    pieces = []
    for token in doc:
        if token.idx in replacement_by_offset:
            pieces.append(replacement_by_offset[token.idx])
        else:
            pieces.append(token.text + token.whitespace_)

    return ''.join(pieces)

# Rebuild the text with later mentions replaced by the first mention of their cluster.
resolved_string = resolve_references(doc)

# Parse the resolved text with the 'en_core_web_trf' pipeline.
# NOTE: `doc` is rebound here; from this point on it refers to the parse of
# resolved_string, not to the coreference result.
spacy_core = spacy.load('en_core_web_trf')
doc = spacy_core(resolved_string)

loading configuration file /var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpn25w3dop/config.json
Model config RobertaConfig {
  "_name_or_path": "/var/folders/lf/r0_x20qj11jc_bvp_yglr3700000gn/T/tmpn25w3dop/config.json",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_c

In [27]:
# Second and third stage - NER and POS tagging
# Tags accepted for the token that immediately follows an ORG-labelled token.
verb_like_tags = ('VB', 'VBG', 'VBP', 'VBZ', 'MD', 'VV', 'VP', 'VERB', 'VAFIN', 'VMFIN', 'VVFIN', 'VE')

action_statements = []
list_of_sent = list(doc.sents)
for sentence in list_of_sent:
    tokens = list(sentence)
    # Pair each token with its successor; the final token has no successor and is
    # therefore never tested. any() stops at the first matching pair.
    org_followed_by_verb = any(
        current.ent_type_ == 'ORG' and following.tag_ in verb_like_tags
        for current, following in zip(tokens, tokens[1:])
    )
    if org_followed_by_verb:
        joined = ' '.join(str(token) for token in tokens)
        action_statements.append(joined.replace(' - ', ' '))


In [28]:
# Fourth stage - Inference
# Tokenize all candidate statements in one call with the 'bert-base-uncased' tokenizer.
# NOTE(review): presumably the checkpoint loaded further down was trained with this same
# tokenizer - confirm.
base_model = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(base_model)
# padding/truncation are enabled and the sequence length is capped at 512 tokens.
action_statements_tokenized = tokenizer(action_statements, padding=True, truncation=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping of field name -> per-example sequences. It must contain
            the key ``"input_ids"``, which defines the dataset length.
        labels: Optional per-example labels (list, array or tensor). When ``None``
            or empty, items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check: plain truthiness (`if self.labels:`) raises for
        # multi-element numpy arrays and tensors because their truth value is ambiguous.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized statements (no labels are passed) so the Trainer can consume them
test_dataset = Dataset(action_statements_tokenized)

# Load the two-label sequence classifier from the local checkpoint directory
model_path = "checkpoint-1000"
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
test_trainer = Trainer(model)

# predict() returns a 3-tuple; only its first element (the raw predictions) is used
raw_pred = test_trainer.predict(test_dataset)[0]

# Highest-scoring class per statement, kept as a Series so its index gives positions
y_pred = pd.Series(np.argmax(raw_pred, axis=1))
goal_indices = list(y_pred[y_pred == 1].index)

# Print every statement predicted as class 1, numbered from 1
for goal_number, statement_index in enumerate(goal_indices, start=1):
    print(f'Goal{goal_number}')
    print(action_statements[statement_index])
    print('')

loading configuration file config.json from cache at /Users/shubham/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/shubham/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.t

Goal1
That will not change going forward – ALLIANZ GROUP will continue to secure the future of ALLIANZ GROUP customers and society sustainably .

Goal2
the Encouraging Future Generations Program includes an ambition to increase the number of children and   youth benefited by 20 percent by 2020 ( baseline 2018 ) .

Goal3
ALLIANZ GROUP is committed to completely phasing out all coal based risks from P&C insurance portfolios by 2040 at the latest .

Goal4
ALLIANZ GROUP is working to provide more certainty to ALLIANZ GROUP customers on what level of cyber risk is covered by each insurance product .

Goal5
ALLIANZ GROUP are also improving ALLIANZ GROUP privacy cover to be as client orientated as possible .

Goal6
ALLIANZ GROUP aims to create a lasting positive legacy for society through ALLIANZ GROUP investments , believing that sustainable business practices improve the financial performance of companies .

Goal7
As a founding member of the U.N. convened Net Zero Asset Owner Alliance , ALL

In [29]:
# df = pd.Series(action_statements)
# df.to_csv('temp.csv', index=False)

In [30]:

# input_text = 'Diversity in workforce is important. We are aware that legal and cultural requirements can vary in a global market. Continental expects all of our suppliers to be guided by fairness, honesty and responsibility in all aspects of their business. Our supplier code establishes important standards that match the Continental corporate values. Every supplier must comply strictly with these standards. We use them to define requirements for good working conditions, then check compliance with these requirements through our on-site audits.'
# input_text = 'A growing need for food, energy and clean water, limited resources and a booming world population – reconciling all these factors is the greatest challenge of our time. Innovations based on chemistry play a key role here, as they contribute decisively to new solutions. Effective and efficient research and development is a prerequisite for innovation as well as an important growth engine for BASF. To ensure our long-term business success with chemistry-based solutions for almost all sectors of industry, we develop innovative processes and products for a sustainable future and drive forward digitalization in research worldwide.'
# input_text = 'At Bosch, diversity is a fundamental pillar. We ensured a diverse workforce by taking necessary measures in the past. Still, The company intends to increase workforce diversity by 40%. It would enable a healthy and balanced workforce that would be warm and welcoming for people from diverse backgrounds.'
# input_text = 'adidas increases sales for first time since the pandemic'
# input_text = 'As a global financial institution, Deutsche Bank operates in various countries, each of which imposes its own regulations (often with extra- territorial implications). These define how we operate, as well as our conduct, behavior, and standards to which we must adhere. Our strategy and execution model is affected by different political environments and a large number of regulatory requirements. We remain continually aware of these forces that influence our business, and we engage in political and regulatory decisions. This is fundamental to understanding wider political developments and the evolution of the regulatory environment, as well as fostering stakeholder trust. In recent times, international and national political systems have shown signs of fragmentation. This directly affects our business model. In 2017 alone, we saw crucial elections in France, the UK, and Germany, as well as a new US government whose Con- gress passed a comprehensive tax reform that had an immediate impact on our US tax position. The UK has formally declared its exit from the EU, and negotiations are under way. This will have repercussions on our structure, operations, client relationships, and staffing. Furthermore, wider political developments in the Euro zone (such as important national elections) will impact the stability of financial markets, market prices, and long-term investment decisions by companies. All of this affects our entire value chain.'


In [31]:
# Second stage
# ner_op_string = resolved_string
# def show_ents(doc):
#     global ner_op_string
    
#     if doc.ents:
        
#         for ent in doc.ents:
#             # print(ent.text+' - ' +str(ent.start_char) +' - '+ str(ent.end_char) +' - '+ent.label_+ ' - '+str(spacy.explain(ent.label_)))
            
#             if ent.label_ == 'ORG':
#                 word_length = ent.end_char - ent.start_char
#                 ner_op_string = ner_op_string[:ent.start_char] + 'O'*(word_length) + ner_op_string[ent.end_char:]

#     else:
#         print('No named entities found.')
    
#     return ner_op_string
# show_ents(doc)
# ner_op_string = re.sub(r'O{2,}', 'ORG',ner_op_string)
# print(ner_op_string)

In [32]:
# def get_action_statements(ner_op_string):
#     temp_list = ner_op_string.split('. ')
#     return '. '.join(list(filter(lambda x: 'ORG' in x, temp_list)))

# print(get_action_statements(ner_op_string))

In [33]:
# # print(f'{token.text:{8}} {token.ent_type_:{8}} {token.pos_:{6}} {token.tag_:{6}} {token.dep_:{6}} {spacy.explain(token.pos_):{20}} {spacy.explain(token.tag_)}')
# for token in doc:
#     print(f'{token.text:{15}} {token.ent_type_:{8}} {token.pos_:{6}} {token.tag_:{6}}  {list(token.morph)}')

In [34]:
# action_statements = list()
# flag = False
# list_of_sent = list(doc.sents)
# for sent in list_of_sent:
#     flag = False
#     sent = list(sent)
#     for word in sent:
#         if word.ent_type_ == 'ORG':
#             location = sent.index(word)
#             if (location+1 != len(sent)) and sent[location+1].tag_ in ('VB', 'VBG', 'VBP', 'VBZ', 'MD', 'VV', 'VP', 'VERB', 'VAFIN', 'VMFIN', 'VVFIN', 'VE'):
#                 flag = True
#                 break
#     if flag:
#         str_sent = list(map(lambda x: str(x), sent))
#         action_statements.append(' '.join(str_sent))
# action_statements