In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", output_attentions=True, output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [8]:
# reading data from file
with open('html_ouptut.txt', 'r') as input_file:
  content = input_file.readlines()

# selecting sample lines for initial testing
x = 712
content = content[x:x+5]

# print out sample lines
for line in content:
  print(f"{line}")

We continue to ramp production, build new manufacturing capacity and expand our operations to enable increased deliveries and deployments of our products and further revenue growth.

In 2022, our net income attributable to common stockholders was $12.56 billion, representing a favorable change of $7.04 billion, compared to the prior year.

We continue to focus on improving our profitability through production and operational efficiencies.

We ended 2022 with $22.19 billion in cash and cash equivalents and investments, representing an increase of $4.48 billion from the end of 2021.

Our cash flows provided by operating activities during 2022 and 2021 were $14.72 billion and $11.50 billion, respectively, representing an increase of $3.23 billion.



In [9]:
# creating transformer input via tokenization
inputs = tokenizer(content, padding = True, truncation = True, return_tensors='pt')

# check size of the input
print((inputs["input_ids"].shape))

torch.Size([5, 41])


In [10]:
# create output layer
outputs = model(**inputs)

# print shape of output layer
print(outputs.logits.shape)

torch.Size([5, 3])


In [11]:
# obtaining last hidden state in outputs
last_hidden_states = outputs.hidden_states[-1]

In [12]:
# print shape of hidden state
print(last_hidden_states.shape)

torch.Size([5, 41, 768])


In [13]:
import torch

In [18]:
# perform prediction with pytorch using softmax function
predictions = torch.nn.functional.softmax(outputs.logits,dim=1)

print(predictions)

tensor([[0.9447, 0.0123, 0.0430],
        [0.8845, 0.0931, 0.0224],
        [0.9492, 0.0127, 0.0382],
        [0.9501, 0.0253, 0.0246],
        [0.9491, 0.0241, 0.0268]], grad_fn=<SoftmaxBackward0>)


In [19]:
import pandas as pd
import numpy as np

In [20]:
# check number of attention heads in model
num_heads = model.config.num_attention_heads
print(num_heads)

12


In [21]:
# map predictions to labels
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {"Sentence" : content,
         "Positive" : positive,
         "Negative" : negative,
         "Neutral" : neutral}

labels = ["Positive", "Negative", "Neutral"]

df = pd.DataFrame(table, columns = ["Sentence", "Positive", "Negative" , "Neutral"])
df['Predicted Label'] = np.argmax(df[['Positive', 'Negative', 'Neutral']].values, axis=1)
df['Predicted Label'] = df['Predicted Label'].apply(lambda x: labels[x])

df.head(5)

Unnamed: 0,Sentence,Positive,Negative,Neutral,Predicted Label
0,"We continue to ramp production, build new manu...",0.944715,0.012315,0.04297,Positive
1,"In 2022, our net income attributable to common...",0.884498,0.09311,0.022392,Positive
2,We continue to focus on improving our profitab...,0.949182,0.012664,0.038154,Positive
3,We ended 2022 with $22.19 billion in cash and ...,0.950136,0.025291,0.024573,Positive
4,Our cash flows provided by operating activitie...,0.949141,0.024077,0.026782,Positive


In [22]:
# get attention from output
attention = outputs.attentions

In [23]:
# validate that number of attention tensors are equal to length of content
print(len(attention[0]))

5


In [24]:
# check
print(len(attention[0][0,0,0]))

41


In [25]:
import nltk
from nltk.corpus import stopwords
import string

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')

# Get the stopwords list
stop_words = set(stopwords.words('english'))

#print(stop_words)

punctuation = set(string.punctuation)

# Combine stopwords and punctuation to get all unwanted words
unwanted_words = stop_words.union(punctuation)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
import matplotlib.pyplot as plt
from IPython.core.display import HTML

In [28]:
def visualize_sum_of_attention_layers(attention, content, tokenizer, sentence_idx ,layer_idx, head_idx):
  sentence = content[sentence_idx]
  tokens = tokenizer.tokenize(sentence)

  # initialize overall attention vector
  overall_attention = np.array([0.0] * len(attention[0][sentence_idx, head_idx, 0]))

  # add all vector to overall attention vector
  for i in range(0, len(attention)):
    temp_token_attention = attention[i][sentence_idx, head_idx, 0].detach().cpu().numpy()

    # normalize curr layer attention values by attention.max()
    normalized_token_attention = temp_token_attention / temp_token_attention.max()

    overall_attention += normalized_token_attention

  # Divide by len of attention to find average level
  overall_attention = overall_attention / len(attention)

  # instantiate array for shortlisting top k values
  shortlist_arr = []

  # visualization of weights with HTML
  html_text = "<p>"
  for token, weight in zip(tokens, overall_attention):
    if token not in unwanted_words:
      shortlist_arr.append((weight, token))
    html_text += f"<span style='background-color:rgba(255,0,0,{weight})'>{token}</span> "
  html_text += "</p>"
  display(HTML(html_text))

  # sort shortlisting array in reverse order
  shortlist_arr = sorted(shortlist_arr, key= lambda x:x[0], reverse = True)
  print(shortlist_arr[:5])

In [29]:
visualize_sum_of_attention_layers(attention, content, tokenizer, 0, 0, 0)

[(0.1976448315908783, 'continue'), (0.19515672001216444, 'enable'), (0.1578126060339855, 'ramp'), (0.14296176709225014, 'build'), (0.14170819174402519, 'increased')]
