<a href="https://colab.research.google.com/github/rajdeepbasu/Transformers/blob/main/Fine_Tuning_FinBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the data-loading cell reads its
# Excel files from a folder under this mount point.
from google.colab import files, drive
drive.mount('/content/gdrive')
# Disabled alternative: upload files through the browser instead of using Drive.
# files.upload()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Install the two third-party packages this notebook imports:
# transformers (model, tokenizer, Trainer) and transformers-interpret (explainer).
# NOTE(review): neither package is version-pinned; the recorded output of the
# version cell shows transformers 4.5.1 — consider pinning so reruns are reproducible.
!pip install transformers
!pip install transformers-interpret



In [None]:
%matplotlib inline
#Importing Transformers
import transformers
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments
from torch.utils.data import DataLoader

#Importing Pandas
import pandas as pd
import glob

#Importing Numpy
import numpy as np

import torch
import json
import nltk
import random
import seaborn as sns
import re
import string
import matplotlib.pyplot as plt
import operator
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [None]:
# Show which transformers release is installed in this runtime.
transformers.__version__

'4.5.1'

In [None]:
# Load configuration, tokenizer and classification model from one checkpoint id
# so the three objects cannot drift apart.
FINBERT_CHECKPOINT = "ProsusAI/finbert"

config = AutoConfig.from_pretrained(FINBERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    config=config,
)

In [None]:
# Folder on the mounted Drive that contains the annotated Excel workbooks.
datapath = "/content/gdrive/MyDrive/StockMarket/Shared Task/News Article/"

# Collect every .xlsx workbook in the folder. glob returns paths in arbitrary
# order, so sort them: row order feeds the seeded train/test split further down.
allfiles = sorted(glob.glob(datapath + "*.xlsx"))
if not allfiles:
  # Fail with a clear message instead of pd.concat's "No objects to concatenate".
  raise FileNotFoundError(f"No .xlsx files found in {datapath!r}")

# Read each workbook, dropping the unnamed index-like columns Excel exports add.
list_df = []
for excel_path in allfiles:
  raw_excel = pd.read_excel(excel_path)
  raw_excel = raw_excel.loc[:, ~raw_excel.columns.str.contains('^Unnamed')]
  list_df.append(raw_excel)

merged = pd.concat(list_df).reset_index(drop=True)

# Keep only rows that carry a usable sentiment annotation.
merged = merged[merged['Sentiment'].notna()]
# .copy() makes the filtered frame independent, so the column assignments below
# modify `merged` itself rather than a view of the concatenated frame.
merged = merged[merged['Sentiment'] != 'Only title'].copy()

# Normalise label casing, then repair the known misspelling in the annotations.
merged['Sentiment'] = merged['Sentiment'].str.lower()
# Use .loc instead of chained indexing (merged.Sentiment[mask] = ...): the chained
# form triggers SettingWithCopyWarning and is a no-op under pandas copy-on-write.
merged.loc[merged['Sentiment'] == "poisitve", 'Sentiment'] = "positive"

In [None]:
# Number of rows per sentiment label after cleaning (class balance check).
merged['Sentiment'].value_counts()

positive    220
neutral     110
negative     75
Name: Sentiment, dtype: int64

In [None]:
def to_sentiment(sentiment):
  """Convert a sentiment label into its integer class id.

  "positive" maps to 0 and "neutral" maps to 2. Every other value,
  including "negative", maps to 1.
  """
  for label, class_id in (("positive", 0), ("neutral", 2)):
    if sentiment == label:
      return class_id
  # Fallback: anything that is neither "positive" nor "neutral".
  return 1

In [None]:
# Replace the string labels with integer class ids.
# NOTE: this overwrites the column, so the cell is not idempotent — running it a
# second time sends the already-converted integers through to_sentiment's
# fallback branch and turns every label into 1.
merged['Sentiment'] = merged['Sentiment'].map(to_sentiment)

In [None]:
# Plain Python lists, row-aligned: labels from 'Sentiment', texts from 'Title'.
train_labels = merged.loc[:, 'Sentiment'].tolist()
train_texts = merged.loc[:, 'Title'].tolist()

In [None]:
# Hold out 20% of all rows as the test set, then 20% of the remainder as the
# validation set (0.2 x 0.8 = 0.16 of all rows).
# Both splits are stratified on the label: the classes are imbalanced, and with
# only a few hundred rows an unstratified split can leave the small validation
# and test sets with a skewed class mix.
X_train, X_test, y_train, y_test = train_test_split(
    train_texts, train_labels,
    test_size=0.2, random_state=42, stratify=train_labels)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=42, stratify=y_train)

In [None]:
# Sizes of the train, validation and test splits, one per line.
print(len(X_train), len(X_val), len(X_test), sep="\n")

259
65
81


In [None]:
# Tokenize each split separately with the same truncation/padding settings.
train_encodings, val_encodings, test_encodings = [
    tokenizer(texts, truncation=True, padding=True)
    for texts in (X_train, X_val, X_test)
]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    `encodings` must expose `.items()` yielding (field name, per-example
    sequence) pairs; `labels` is an indexable sequence whose length defines
    the size of the dataset.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors; the label is stored under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset, val_dataset, test_dataset = [
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
]

In [None]:
# Hyper-parameters for the Trainer run.
# NOTE(review): warmup_steps=500 spans most of the run — the recorded TrainOutput
# reports global_step=660 — so the learning rate is still warming up for the
# majority of training. Confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=20,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # report the training loss every 10 steps
)


# NOTE(review): eval_dataset is supplied, but no evaluation schedule or metric
# function is configured here and the recorded log shows only training loss —
# confirm whether the validation set is actually evaluated.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Run the training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
10,1.0803
20,1.3282
30,1.1872
40,0.9317
50,1.0054
60,0.7371
70,0.8604
80,0.6796
90,0.5412
100,0.6755


TrainOutput(global_step=660, training_loss=0.18321500110376457, metrics={'train_runtime': 195.1635, 'train_samples_per_second': 3.382, 'total_flos': 142916748271920.0, 'epoch': 20.0, 'init_mem_cpu_alloc_delta': 1038954496, 'init_mem_gpu_alloc_delta': 439075328, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 105480192, 'train_mem_gpu_alloc_delta': 1317479936, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 247815168})

In [None]:
# Persist the fine-tuned model. No path is given; the next cell reloads it from
# './results', the output_dir configured in TrainingArguments.
trainer.save_model()

In [None]:
# Reload the saved checkpoint from ./results as a separate model object used for
# inference and explanation below.
resampled_model = AutoModelForSequenceClassification.from_pretrained("./results", config=config)

In [None]:
# Example passage used for a single prediction and for the attribution cells.
# NOTE(review): training used the 'Title' column, while this sample is a
# multi-sentence passage — confirm it is representative of the training inputs.
text_sample = "The country's largest carmaker Maruti Suzuki India on Wednesday announced that it will be increasing prices across its model range from January 2021. In November, Maruti Suzuki said its total production increased 5.91 per cent to 150221 units. The company had produced total 141834 units in the same month last year, Maruti Suzuki India (MSI) said in a regulatory filing. Passenger vehicles production stood at 146577 units last month as compared with 139084 units in November 2019, a growth of 5.38 per cent. MSI said production of its light commercial vehicle Super Carry stood at 3644 units as against 2750 units in the year-ago month."

# Tokenize into PyTorch tensors so the result can be unpacked straight into the model.
sample_encodings = tokenizer(
    text_sample,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

In [None]:
# Inference only: put the model in evaluation mode and skip autograd bookkeeping,
# so no gradient graph is built for a forward pass that is never back-propagated.
resampled_model.eval()
with torch.no_grad():
    output = resampled_model(**sample_encodings)

In [None]:
# Logits of the only example in the batch, converted to class probabilities.
logit = output.logits[0]
softmax_score = torch.softmax(logit, dim=-1)

In [None]:
# Index of the class with the highest probability.
softmax_score.argmax()

tensor(0)

In [None]:
from transformers_interpret import SequenceClassificationExplainer

# Build an explainer around the reloaded fine-tuned model and its tokenizer.
cls_explainer = SequenceClassificationExplainer(
    resampled_model,
    tokenizer)

# Explain the same passage that was classified above. Reusing `text_sample`
# (instead of repeating the literal) keeps prediction and explanation in sync.
word_attributions = cls_explainer(text_sample)

In [None]:
# Display the (token, attribution score) pairs returned by the explainer.
word_attributions

[('[CLS]', 0.0),
 ('the', -0.05640547403247152),
 ('country', 0.07300394119441185),
 ("'", 0.00849292641762541),
 ('s', -0.004927281040473205),
 ('largest', 0.037477754476840694),
 ('car', 0.029750636558720955),
 ('##maker', 0.007454008483239137),
 ('maru', -0.007033180948050675),
 ('##ti', 0.03151686417189309),
 ('suzuki', 0.044007320853369405),
 ('india', -0.005128812045585003),
 ('on', 0.02092205103222193),
 ('wednesday', -0.02848210684813108),
 ('announced', -0.025289362835133694),
 ('that', 0.06144834502987568),
 ('it', 0.04956191374893187),
 ('will', 0.0858342605828336),
 ('be', 0.4385605527773697),
 ('increasing', 0.4046884835177239),
 ('prices', -0.047958418091171226),
 ('across', 0.18355991569859934),
 ('its', 0.2551222858174247),
 ('model', -0.09165726796602228),
 ('range', 0.06836526936622082),
 ('from', -0.037464415936788245),
 ('january', -0.031445883948960446),
 ('2021', -0.01813474557247328),
 ('.', -0.043268125895807266),
 ('in', 0.05265862006909612),
 ('november', 0.01

In [None]:
# Render the attribution visualisation for the last explained text.
# NOTE(review): the argument looks like an output HTML file name — confirm that
# the visualisation is also saved to finbert_viz.html.
cls_explainer.visualize("finbert_viz.html")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,positive (1.00),positive,3.84,"[CLS] the country ' s largest car ##maker maru ##ti suzuki india on wednesday announced that it will be increasing prices across its model range from january 2021 . in november , maru ##ti suzuki said its total production increased 5 . 91 per cent to 150 ##22 ##1 units . the company had produced total 141 ##8 ##34 units in the same month last year , maru ##ti suzuki india ( ms ##i ) said in a regulatory filing . passenger vehicles production stood at 146 ##57 ##7 units last month as compared with 139 ##0 ##8 ##4 units in november 2019 , a growth of 5 . 38 per cent . ms ##i said production of its light commercial vehicle super carry stood at 36 ##44 units as against 275 ##0 units in the year - ago month . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,positive (1.00),positive,3.84,"[CLS] the country ' s largest car ##maker maru ##ti suzuki india on wednesday announced that it will be increasing prices across its model range from january 2021 . in november , maru ##ti suzuki said its total production increased 5 . 91 per cent to 150 ##22 ##1 units . the company had produced total 141 ##8 ##34 units in the same month last year , maru ##ti suzuki india ( ms ##i ) said in a regulatory filing . passenger vehicles production stood at 146 ##57 ##7 units last month as compared with 139 ##0 ##8 ##4 units in november 2019 , a growth of 5 . 38 per cent . ms ##i said production of its light commercial vehicle super carry stood at 36 ##44 units as against 275 ##0 units in the year - ago month . [SEP]"
,,,,
