In [1]:
# only needed in Colab or fresh env
!pip install -q transformers datasets accelerate

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import pandas as pd




In [2]:
MODEL_NAME = "ProsusAI/finbert"

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)

label_map = {0: "negative", 1: "neutral", 2: "positive"}

print("Device:", device)
print("Loaded tokenizer/model:", MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device: cpu
Loaded tokenizer/model: ProsusAI/finbert


In [3]:
def finbert_zero_shot(text, return_probs=True):
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    cls = int(np.argmax(probs))
    return {
        "text": text,
        "predicted_class": cls,
        "predicted_label": label_map[cls],
        "confidence": float(probs[cls]),
        "probs": {"negative": float(probs[0]), "neutral": float(probs[1]), "positive": float(probs[2])}
    }

# quick test
print(finbert_zero_shot("Revenue increased significantly, beating analyst expectations"))


{'text': 'Revenue increased significantly, beating analyst expectations', 'predicted_class': 0, 'predicted_label': 'negative', 'confidence': 0.9567899703979492, 'probs': {'negative': 0.9567899703979492, 'neutral': 0.01961829699575901, 'positive': 0.023591840639710426}}


In [4]:
from torch.utils.data import DataLoader, Dataset

class SimpleTextDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts
    def __len__(self): return len(self.texts)
    def __getitem__(self, i): return self.texts[i]

def finbert_predict_batch(texts, batch_size=32):
    ds = SimpleTextDataset(texts)
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=lambda batch: batch)
    results = []
    model.eval()
    for batch in loader:
        inputs = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
        for t, p in zip(batch, probs):
            cls = int(np.argmax(p))
            results.append((label_map[cls], float(p[cls]), p.tolist()))
    return results


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]