In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Intialize BERT Model

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# setting up the negative keywords related to economic downturn that are used for analyzing FEARS index
keywords = [
    "Inflation", "Recession", "Bankruptcy", "Unemployment", "Crisis", "Decline",
    "Debt", "Deficit", "Depreciation", "Devaluation", "Downgrade", "Downturn",
    "Crash", "Collapse", "Stagnation", "Insolvency", "Layoff", "Loss",
    "Risk", "Slump", "Turbulence", "Volatility", "Bubble", "Default",
    "Panic", "Scarcity", "Underperformance", "Bearish", "Liquidation", "Stagflation"
]


In [10]:
#Tokenize and encode the keywords
inputs = tokenizer(keywords, padding=True, truncation=True, return_tensors="pt")

In [11]:
# pass the encoded keywords through BERT
with torch.no_grad():
    outputs = model(**inputs)

In [12]:
# the outputs will contain the embeddings for the keywords. we can use these embeddings for various downstream tasks like clustering, similarity analysis, or as features in a machine learning model.
embeddings = outputs.last_hidden_state
print(embeddings)

tensor([[[ 2.3770e-02,  4.5828e-01,  2.0908e-01,  ..., -5.5151e-01,
          -1.4852e-01,  6.9389e-01],
         [-7.4408e-01,  1.8233e-01,  7.0369e-02,  ..., -8.6355e-02,
           4.4981e-01, -9.1600e-02],
         [ 8.5303e-01,  4.4853e-02, -1.3271e-01,  ...,  8.4435e-02,
          -9.2435e-01, -8.5228e-02],
         [ 3.8595e-01,  3.0324e-02,  5.0227e-01,  ..., -2.6491e-01,
          -9.6082e-02,  9.7598e-02],
         [ 2.3694e-01, -1.0007e-01,  4.0225e-01,  ..., -2.8127e-01,
          -1.5125e-01, -3.5505e-02],
         [ 4.0635e-01, -9.5907e-02,  4.4301e-01,  ..., -3.3833e-01,
          -1.8810e-01,  8.9839e-05]],

        [[-4.0819e-01,  9.5364e-02, -1.1065e-01,  ..., -6.0680e-02,
           8.3833e-04,  1.1663e-01],
         [-8.8832e-01, -6.1526e-01, -4.1771e-01,  ...,  4.3892e-01,
           2.9164e-01, -3.6309e-01],
         [ 1.0423e+00,  1.2335e-01, -2.1827e-01,  ...,  1.4559e-01,
          -9.1235e-01, -1.4132e-01],
         [-6.3869e-01,  1.7216e-02,  2.1222e-01,  ...