<a href="https://colab.research.google.com/github/rajdeepbasu/Transformers/blob/main/Fine_Tuning_FinBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files, drive
# Mount Google Drive so the dataset folder is reachable from this Colab runtime.
mount_point = '/content/gdrive'
drive.mount(mount_point)
# Alternative to Drive: upload the workbooks from the local machine.
# files.upload()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install transformers



In [None]:
%matplotlib inline
#Importing Transformers
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments
from torch.utils.data import DataLoader

#Importing Pandas
import pandas as pd
import glob

#Importing Numpy
import numpy as np

import torch
import json
import nltk
import random
import seaborn as sns
import re
import string
import matplotlib.pyplot as plt
import operator
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [None]:
# Show the installed transformers version so the run can be reproduced later.
transformers.__version__

'4.4.0'

In [None]:
# Integer class ids used for fine-tuning in this notebook; they mirror the
# values returned by `to_sentiment` (neutral=0, positive=1, negative=2).
ID2LABEL = {0: "neutral", 1: "positive", 2: "negative"}
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

# Override the label maps stored in the checkpoint's config so that the config
# saved with the fine-tuned model names each class id the way this notebook
# encodes it.
# NOTE(review): the published checkpoint ships its own id2label ordering;
# confirm it differs from the mapping above before relying on the
# pretrained classification head as a starting point.
config = AutoConfig.from_pretrained(
    "ProsusAI/finbert",
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", config=config)

In [None]:
# Path to the Drive folder containing the annotated Excel workbooks.
datapath = "/content/gdrive/MyDrive/StockMarket/Shared Task/News Article/"
list_df = []

# Collect every .xlsx file in the folder. glob returns matches in arbitrary
# order, so sort them: the row order feeds the seeded train/val/test split
# further down and must be stable between runs.
allfiles = sorted(glob.glob(datapath + "*.xlsx"))
if not allfiles:
  # Fail with an actionable message instead of pd.concat's
  # "No objects to concatenate".
  raise FileNotFoundError(f"No .xlsx files found in {datapath!r}")

# Read each workbook, dropping the columns whose name starts with 'Unnamed'.
for excelfiles in allfiles:
  raw_excel = pd.read_excel(excelfiles)
  raw_excel = raw_excel.loc[:, ~raw_excel.columns.str.contains('^Unnamed')]
  list_df.append(raw_excel)

# Stack all workbooks into one frame with a fresh 0..n-1 index.
merged = pd.concat(list_df, ignore_index=True)

# Keep only rows that carry a usable sentiment label. `.copy()` makes the
# result an independent frame so the assignment below does not trigger
# SettingWithCopyWarning.
has_label = merged['Sentiment'].notna() & (merged['Sentiment'] != 'Only title')
merged = merged.loc[has_label].copy()

# Normalise label casing; a non-string label raises here rather than passing
# through unnoticed.
merged['Sentiment'] = merged['Sentiment'].map(str.lower)

In [None]:
# Class balance of the labelled articles.
merged.Sentiment.value_counts()

positive    201
neutral      93
negative     70
Name: Sentiment, dtype: int64

In [None]:
def to_sentiment(sentiment):
  """Convert a sentiment name into the integer class id used for training.

  Mapping: "neutral" -> 0, "positive" -> 1, "negative" -> 2. String input is
  matched case-insensitively and with surrounding whitespace ignored.

  Args:
    sentiment: the sentiment name, e.g. "positive".

  Returns:
    The integer class id (0, 1 or 2).

  Raises:
    ValueError: if `sentiment` is not one of the three known names. This
      includes NaN and values that are already integer ids, which would
      otherwise be silently counted as "negative".
  """
  label_ids = {"neutral": 0, "positive": 1, "negative": 2}
  key = sentiment.strip().lower() if isinstance(sentiment, str) else sentiment
  try:
    return label_ids[key]
  except (KeyError, TypeError):
    # KeyError: unknown name; TypeError: unhashable input.
    raise ValueError(
        f"Unknown sentiment label {sentiment!r}; expected one of {sorted(label_ids)}"
    ) from None

In [None]:
# Replace the sentiment names with integer class ids.
# Guarded on dtype so that re-running this cell is a no-op: feeding the
# already-encoded integers through `to_sentiment` a second time would not
# reproduce the original labels.
if merged['Sentiment'].dtype.kind not in 'iu':
  merged['Sentiment'] = merged['Sentiment'].apply(to_sentiment)

In [None]:
# Plain Python lists of the model inputs (article titles) and their class ids.
train_texts = merged.Title.to_list()
train_labels = merged.Sentiment.to_list()

In [None]:
# Step 1: hold out 20% of all samples as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)
# Step 2: take a quarter of the remaining 80% for validation
# (0.25 x 0.8 = 0.2 of the full data), leaving the rest for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest,
    y_rest,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Number of samples in the train, validation and test splits (in that order).
for split in (X_train, X_val, X_test):
    print(len(split))

218
73
73


In [None]:
# Tokenise each split separately, truncating over-long titles and padding
# within the split.
train_encodings, val_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train, X_val, X_test)
)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is a sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, with the label under 'labels'."""
        item = {}
        for field, column in self.encodings.items():
            item[field] = torch.tensor(column[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a dataset.
train_dataset = CustomDataset(encodings=train_encodings, labels=y_train)
val_dataset = CustomDataset(encodings=val_encodings, labels=y_val)
test_dataset = CustomDataset(encodings=test_encodings, labels=y_test)

In [None]:
# Run-length bookkeeping used to size the learning-rate warm-up.
NUM_EPOCHS = 20
TRAIN_BATCH_SIZE = 8
steps_per_epoch = (len(train_dataset) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE  # ceil division
total_steps = NUM_EPOCHS * steps_per_epoch

# Warm up for roughly the first 10% of the optimiser steps. A fixed 500-step
# warm-up is almost the whole of this run (560 steps on the recorded data),
# which leaves the learning rate ramping up for nearly all of training.
# NOTE(review): assumes a single device and no gradient accumulation; with more
# devices the real step count is lower, so the warm-up share grows.
warmup_steps = total_steps // 10

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=NUM_EPOCHS,     # total number of training epochs
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=warmup_steps,       # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Step,Training Loss
10,2.838
20,2.9319
30,2.9025
40,1.9626
50,1.801
60,1.3446
70,1.1961
80,1.0195
90,0.8389
100,0.8542


TrainOutput(global_step=560, training_loss=0.381618039474623, metrics={'train_runtime': 58.2042, 'train_samples_per_second': 9.621, 'total_flos': 120292861479840.0, 'epoch': 20.0, 'init_mem_cpu_alloc_delta': 350453, 'init_mem_gpu_alloc_delta': 439075328, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 586990, 'train_mem_gpu_alloc_delta': 1317479936, 'train_mem_cpu_peaked_delta': 94560620, 'train_mem_gpu_peaked_delta': 248897024})

In [None]:
# Write the fine-tuned model to the Trainer's output directory.
trainer.save_model()
# Store the tokenizer files next to the model so the directory can be reloaded
# on its own, without going back to the original checkpoint for the tokenizer.
# The trailing semicolon hides the returned file paths.
tokenizer.save_pretrained(training_args.output_dir);

In [None]:
# Load the fine-tuned model back from the directory it was saved to.
checkpoint_dir = "./results"
resampled_model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_dir,
    config=config,
)

In [None]:
# One ad-hoc sentence to sanity-check the reloaded model.
text_sample = "Hey! How are you doing?"
encode_options = dict(truncation=True, padding=True, return_tensors="pt")
sample_encodings = tokenizer(text_sample, **encode_options)

In [None]:
# Inference only: disable gradient tracking so the forward pass does not
# record an autograd graph for the returned logits.
with torch.no_grad():
    output = resampled_model(**sample_encodings)

In [None]:
# Logits of the single sample in the batch, turned into class probabilities.
logit = output.logits[0]
softmax_score = torch.softmax(logit, dim=-1)

In [None]:
# Index of the most probable class for the sample sentence.
softmax_score.argmax()

tensor(0)