In [1]:
# Environment settings for tokenizers and wandb, set before the libraries below are imported.
# NOTE(review): presumably turns off tokenizer-level parallelism to avoid fork-related warnings — confirm.
%env TOKENIZERS_PARALLELISM=false
# NOTE: the TrainingArguments cell output reports that WANDB_DISABLED is deprecated
# (removal announced for v5) and recommends the `report_to` setting instead.
%env WANDB_DISABLED=true

# Install required packages
# %pip install numpy pandas scikit-learn transformers datasets evaluate torch tqdm

# Now import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModel,
    AdamW,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification
)

from datasets import Dataset
from evaluate import load

import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

# Report the PyTorch version in use. A missing torch would already have raised
# ImportError at `import torch` above, so wrapping this print in try/except
# ImportError could never catch anything.
print(f"PyTorch version: {torch.__version__}")

env: TOKENIZERS_PARALLELISM=false
env: WANDB_DISABLED=true


  from .autonotebook import tqdm as notebook_tqdm


PyTorch version: 2.5.1


In [2]:
class args:
    """Plain namespace for notebook settings; attributes are read directly off the class."""

    # Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls.
    model = 'ProsusAI/finbert'

In [3]:
# Column names are supplied explicitly: the label column comes first, then the text.
df = pd.read_csv(
    'data/all-data.csv',
    encoding='ISO-8859-1',
    names=['labels', 'messages'],
)

In [4]:
# Put the text column first, then the label, and preview the first rows.
column_order = ['messages', 'labels']
df = df[column_order]
df.head()

Unnamed: 0,messages,labels
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [5]:
# Replace the string sentiment labels with integer class ids (overwrites the
# 'labels' column in place), then show how many rows fall into each class.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['labels'])
df['labels'] = encoded_labels
df['labels'].value_counts()

labels
1    2879
2    1363
0     604
Name: count, dtype: int64

In [6]:
# Fixed seed so that re-running the notebook yields the same train/valid/test
# partition (without it, every run shuffles differently and metrics drift).
SEED = 42

X, y = df['messages'].values, df['labels'].values

# train : test = 0.9 : 0.1
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SEED
)

# train : valid = 0.8 : 0.2
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain, ytrain, test_size=0.2, stratify=ytrain, random_state=SEED
)

# train : valid : test = 0.72 : 0.18 : 0.10 (stratified on 'labels')

In [7]:
# Wrap the train and validation splits as datasets with 'text' and 'labels' columns.
train_dataset_raw = Dataset.from_dict(dict(text=xtrain, labels=ytrain))
valid_dataset_raw = Dataset.from_dict(dict(text=xvalid, labels=yvalid))

In [8]:
def tokenize_fn(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with truncation enabled.

    The module-level ``tokenizer`` is looked up when the function is called,
    so it only needs to be assigned before the first ``.map(...)`` call.

    Args:
        examples: mapping that provides the input under the ``'text'`` key.

    Returns:
        Whatever ``tokenizer(..., truncation=True)`` returns for that input.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

In [9]:
# Load the tokenizer for the checkpoint named in `args.model`; `tokenize_fn` reads this global.
tokenizer = AutoTokenizer.from_pretrained(args.model)

In [10]:
# Display the raw training split (feature names and row count) as a sanity check.
train_dataset_raw

Dataset({
    features: ['text', 'labels'],
    num_rows: 3488
})

In [11]:
# Tokenize both splits in batches; the train split is mapped first, then validation.
train_dataset, valid_dataset = (
    raw_split.map(tokenize_fn, batched=True)
    for raw_split in (train_dataset_raw, valid_dataset_raw)
)

# Collator built from the tokenizer; it is handed to the Trainer for batching.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 3488/3488 [00:00<00:00, 17647.98 examples/s]
Map: 100%|██████████| 873/873 [00:00<00:00, 18644.29 examples/s]


In [12]:
# Training uses the integer ids produced by the LabelEncoder (`le`), so record that same
# id <-> label mapping in the model config. Otherwise the saved fine-tuned model keeps
# whatever label names the checkpoint shipped with, which may not match these ids.
# NOTE(review): the pretrained checkpoint has its own id2label ordering — confirm whether it
# differs from the encoder's ordering; the weights loaded here are the same either way.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    args.model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [13]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_preds):
    """Compute the accuracy metric for an evaluation pass.

    Args:
        eval_preds: object that unpacks into a ``(logits, labels)`` pair.

    Returns:
        A dict with a single ``'accuracy'`` entry, computed by ``accuracy_score``
        from the labels and the argmax of the logits over the last axis.
    """
    scores, references = eval_preds
    predicted_ids = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    return {'accuracy': accuracy}

In [14]:
# Hyper-parameters and run configuration for the Trainer.
train_args = TrainingArguments(
    './Finbert Trained/',                # output directory for the run
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2*16,     # evaluation uses twice the training batch size
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    do_eval=True,
    do_train=True,
    do_predict=True,
    evaluation_strategy='epoch',         # evaluate on the validation set after every epoch
    save_strategy="no",                  # no intermediate checkpoints; the model is saved explicitly later
    # Disable logging integrations explicitly: the WANDB_DISABLED environment variable is
    # deprecated, and the library's warning recommends `report_to` "none" instead.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
# Assemble the Trainer from the model, run configuration, tokenized splits,
# padding collator, tokenizer and accuracy metric defined in the cells above.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [16]:
# Run fine-tuning; the training arguments request an evaluation pass at the end of each epoch.
trainer.train()

                                                  
 20%|██        | 219/1090 [00:45<16:24,  1.13s/it]

{'eval_loss': 0.39340025186538696, 'eval_accuracy': 0.843069873997709, 'eval_runtime': 3.2965, 'eval_samples_per_second': 264.825, 'eval_steps_per_second': 8.494, 'epoch': 1.0}


                                                  
 40%|████      | 437/1090 [01:21<08:26,  1.29it/s]

{'eval_loss': 0.30324220657348633, 'eval_accuracy': 0.8854524627720504, 'eval_runtime': 2.0961, 'eval_samples_per_second': 416.488, 'eval_steps_per_second': 13.358, 'epoch': 2.0}


 46%|████▌     | 501/1090 [01:31<01:34,  6.23it/s]

{'loss': 0.5708, 'grad_norm': 1.5953829288482666, 'learning_rate': 1.2028542303771661e-05, 'epoch': 2.29}


                                                  
 60%|██████    | 655/1090 [01:57<05:44,  1.26it/s]

{'eval_loss': 0.42314985394477844, 'eval_accuracy': 0.8694158075601375, 'eval_runtime': 2.136, 'eval_samples_per_second': 408.71, 'eval_steps_per_second': 13.109, 'epoch': 3.0}


                                                  
 80%|████████  | 873/1090 [02:32<02:50,  1.27it/s]

{'eval_loss': 0.4458830952644348, 'eval_accuracy': 0.8843069873997709, 'eval_runtime': 2.1248, 'eval_samples_per_second': 410.868, 'eval_steps_per_second': 13.178, 'epoch': 4.0}


 92%|█████████▏| 1001/1090 [02:51<00:13,  6.48it/s]

{'loss': 0.0779, 'grad_norm': 0.14419539272785187, 'learning_rate': 1.8348623853211011e-06, 'epoch': 4.59}


                                                   
100%|██████████| 1090/1090 [03:07<00:00,  5.82it/s]

{'eval_loss': 0.44380295276641846, 'eval_accuracy': 0.8900343642611683, 'eval_runtime': 2.1185, 'eval_samples_per_second': 412.088, 'eval_steps_per_second': 13.217, 'epoch': 5.0}
{'train_runtime': 187.2477, 'train_samples_per_second': 93.139, 'train_steps_per_second': 5.821, 'train_loss': 0.3001344844835614, 'epoch': 5.0}





TrainOutput(global_step=1090, training_loss=0.3001344844835614, metrics={'train_runtime': 187.2477, 'train_samples_per_second': 93.139, 'train_steps_per_second': 5.821, 'total_flos': 533404898383392.0, 'train_loss': 0.3001344844835614, 'epoch': 5.0})

In [17]:
# Persist the fine-tuned model.
# NOTE(review): `Trainer.save_model` appears to take an output *directory*, so despite the
# `.bin` suffix this path is likely created as a folder rather than a single file — confirm.
trainer.save_model('finbert_finetuned.bin')

In [18]:
# Test texts only; the test labels (`ytest`) stay outside the dataset and are
# compared against the predictions afterwards.
pred_dataset_raw = Dataset.from_dict(dict(text=xtest))
pred_dataset_raw

Dataset({
    features: ['text'],
    num_rows: 485
})

In [19]:
# Tokenize the test texts in batches, consistent with how the train and validation
# splits are mapped; the per-example token ids are the same, only faster to produce.
pred_dataset = pred_dataset_raw.map(tokenize_fn, batched=True)

Map: 100%|██████████| 485/485 [00:00<00:00, 11236.77 examples/s]


In [20]:
# Number of token ids in the first tokenized test example (sanity check).
len(pred_dataset[0]['input_ids'])

22

In [21]:
# Run inference on the tokenized test texts; `output.predictions` is read in the next cells.
output = trainer.predict(pred_dataset)

100%|██████████| 16/16 [00:01<00:00,  9.86it/s]


In [22]:
# Take the highest-scoring class id per row and map it back to its original label string.
le.inverse_transform(np.argmax(output.predictions, axis=-1))

array(['neutral', 'negative', 'neutral', 'neutral', 'positive',
       'positive', 'positive', 'negative', 'neutral', 'positive',
       'positive', 'neutral', 'neutral', 'negative', 'positive',
       'neutral', 'positive', 'neutral', 'neutral', 'positive', 'neutral',
       'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral',
       'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'positive',
       'neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral',
       'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral',
       'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral',
       'positive', 'neutral', 'neutral', 'neutral', 'neutral', 'positive',
       'negative', 'neutral', 'neutral', 'positive', 'negative',
       'positive', 'neutral', 'positive', 'neutral', 'neutral',
       'positive', 'negative', 'positive', 'neutral', 'negative',
       'posit

In [23]:
# Predicted class id for every test example, as a list (one argmax per prediction row).
preds = list(np.argmax(output.predictions, axis=-1))

In [24]:
# Accuracy of the predicted class ids against the held-out test labels.
accuracy_score(ytest, preds)

0.8989690721649485