In [17]:
!pip install -q transformers datasets accelerate scikit-learn

import os
import torch
import numpy as np
import pandas as pd
import transformers
print(transformers.__version__)
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


4.57.3


In [18]:
!pip install -U transformers accelerate datasets


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [19]:
import kagglehub
path = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
csv_path = os.path.join(path, "all-data.csv")

df = pd.read_csv(csv_path, encoding="latin-1", sep=",", header=None, names=["label","text"])
df["label"] = df["label"].astype(str).str.strip().str.lower()
label_map_str2int = {"positive": 0, "negative": 1, "neutral": 2}
df["label_id"] = df["label"].map(label_map_str2int)
df = df.dropna(subset=["text","label_id"]).reset_index(drop=True)

train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df["label_id"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, stratify=temp_df["label_id"], random_state=42)

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))


Train/Val/Test sizes: 3392 727 727


In [20]:
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

train_ds = Dataset.from_pandas(train_df[["text","label_id"]]).rename_column("label_id","label")
val_ds   = Dataset.from_pandas(val_df[["text","label_id"]]).rename_column("label_id","label")
test_ds  = Dataset.from_pandas(test_df[["text","label_id"]]).rename_column("label_id","label")

train_ds = train_ds.map(tokenize_batch, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(tokenize_batch, batched=True, remove_columns=["text"])
test_ds  = test_ds.map(tokenize_batch, batched=True, remove_columns=["text"])


Map:   0%|          | 0/3392 [00:00<?, ? examples/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

training_args = TrainingArguments(
    output_dir="./finbert_finetuned",
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100
)



In [24]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if preds.ndim == 2:
        preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, average="weighted", zero_division=0)
    rec = recall_score(labels, preds, average="weighted", zero_division=0)
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)


In [25]:
trainer.train()
# Save the final model
trainer.save_model("./finbert_finetuned/best_model")




Step,Training Loss
100,0.3424
200,0.3111
300,0.1496
400,0.1679
500,0.0888
600,0.0692


In [26]:
metrics = trainer.evaluate(test_ds)
print("Test metrics:", metrics)
# Optionally get predictions and confusion matrix
preds = trainer.predict(test_ds)
preds_label = np.argmax(preds.predictions, axis=1)
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(test_df["label_id"], preds_label, target_names=["positive","negative","neutral"]))
print("Confusion matrix:\n", confusion_matrix(test_df["label_id"], preds_label))




Test metrics: {'eval_loss': 0.3674461245536804, 'eval_accuracy': 0.8954607977991746, 'eval_precision': 0.8967924660958725, 'eval_recall': 0.8954607977991746, 'eval_f1': 0.8958172764500236, 'eval_runtime': 22.9731, 'eval_samples_per_second': 31.646, 'eval_steps_per_second': 1.001, 'epoch': 3.0}




              precision    recall  f1-score   support

    positive       0.84      0.87      0.86       204
    negative       0.87      0.92      0.89        91
     neutral       0.93      0.90      0.91       432

    accuracy                           0.90       727
   macro avg       0.88      0.90      0.89       727
weighted avg       0.90      0.90      0.90       727

Confusion matrix:
 [[178   3  23]
 [  0  84   7]
 [ 33  10 389]]


In [27]:
trainer.save_model("./finbert_finetuned/best_model")
tokenizer.save_pretrained("./finbert_finetuned/best_model")

# also save test_df for task 3
test_df.to_pickle("finbert_test_df.pkl")
