In [22]:
import numpy as np
import torch 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax



In [23]:
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key="d1801a0f067c43d9a6ca322e42617703")


In [38]:
Model_Name = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(Model_Name)
model = AutoModelForSequenceClassification.from_pretrained(Model_Name)

model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [39]:
def chunk_text(text, tokenizer, max_length=512):
    token_ids = tokenizer(
        text,
        add_special_tokens=False,
        return_attention_mask=False
    )["input_ids"]

    chunk_size = max_length - 2  
    chunks = []

    for i in range(0, len(token_ids), chunk_size):
        chunk = token_ids[i:i + chunk_size]
        chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
        assert len(chunk) <= max_length 
        chunks.append(chunk)

    return chunks




In [54]:
labels = ["negative", "neutral", "positive"]

def predict_chunk_sentiment(token_ids):
    input_ids = torch.tensor([token_ids])
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

    probs = softmax(outputs.logits.numpy()[0])
    return dict(zip(labels, probs))


In [55]:
def finbert_long_document_sentiment(text):
    chunks = chunk_text(text, tokenizer)
    
    allscores = []
    for chunk in chunks:
        scores = predict_chunk_sentiment(chunk)
        allscores.append(scores)

    avg_scores = {
        label: np.mean([score[label] for score in allscores])
        for label in labels
    }

    return avg_scores


In [56]:
articles = newsapi.get_everything(
    q="stock market OR earnings OR finance",
    language="en",
    sort_by="publishedAt",
    page_size=5
)


In [57]:
docs = []

for article in articles["articles"]:
    text = f"""
    {article['title']}
    {article['description']}
    {article['content']}
    """
    docs.append(text)

long_document = " ".join(docs)



In [58]:
result = finbert_long_document_sentiment(long_document)
result


{'negative': np.float32(0.028408596),
 'neutral': np.float32(0.3247344),
 'positive': np.float32(0.64685696)}