## Sentiment Analysis on Finance Headlines / Tweets

Following the structure of this guide from HuggingFace: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb

Dataset used: https://www.kaggle.com/datasets/sbhatti/financial-sentiment-analysis 

https://www.kaggle.com/datasets/ankurzing/aspect-based-sentiment-analysis-for-financial-news

Just download the final data file and add it to the `/content` directory on Colab to run the notebook.

Adapted for the dataset and task of classifying Finance related tweets.


~Samyukt Sriram

In [None]:
import pandas as pd
import time

In [None]:
#Loading and Cleaning, EDA

filepath = '/content/sentfin_aspect.csv'
df = pd.read_csv(filepath)

print(df.head())



#Creating integer labels for sentiments: positive -> 0, neutral -> 1, anything else -> 2.
#Vectorised mapping instead of assigning one cell at a time through df.loc in a Python loop;
#values that are neither 'positive' nor 'neutral' (including missing ones) come back from
#map() as NaN and are filled with 2, which matches the old if/else chain.
df['labels'] = df['Sentiment'].map({'positive': 0, 'neutral': 1}).fillna(2).astype(int)

#This is for sentfin_aspect.csv, from the aspect based sentiment analysis datasets linked above.
#The text column is called 'Title' there; the rest of the notebook expects 'Sentence'.
if 'sentfin_aspect.csv' in filepath:
  df = df.rename(columns = {'Title':'Sentence'})


print(df.head())
df.info()

   Unnamed: 0  S No.                                              Title  \
0           0      1  SpiceJet to issue 6.4 crore warrants to promoters   
1           1      2                  MMTC Q2 net loss at Rs 10.4 crore   
2           2      3  Mid-cap funds can deliver more, stay put: Experts   
3           3      4             Mid caps now turn into market darlings   
4           4      5  Market seeing patience, if not conviction: Pra...   

                       Decisions  Words         Entity Sentiment  
0        {"SpiceJet": "neutral"}      8       SpiceJet   neutral  
1            {"MMTC": "neutral"}      8           MMTC   neutral  
2  {"Mid-cap funds": "positive"}      8  Mid-cap funds  positive  
3       {"Mid caps": "positive"}      7       Mid caps  positive  
4          {"Market": "neutral"}      8         Market   neutral  
   Unnamed: 0  S No.                                           Sentence  \
0           0      1  SpiceJet to issue 6.4 crore warrants to promoters 

In [None]:
#Installing packages

!pip install transformers
!pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
import scipy
import sklearn
import numpy as np

from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.callbacks import TensorBoard

import datasets

import tensorflow as tf
from datasets import load_dataset, load_metric

In [None]:
#Loading into the datasets library from Huggingface

#80/20 train/test split. The fixed seed makes the shuffle (and therefore every metric
#reported further down) reproducible after a kernel restart; without it each run
#trains and evaluates on a different partition.
dataset = datasets.Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

dataset


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'S No.', 'Sentence', 'Decisions', 'Words', 'Entity', 'Sentiment', 'labels'],
        num_rows: 8602
    })
    test: Dataset({
        features: ['Unnamed: 0', 'S No.', 'Sentence', 'Decisions', 'Words', 'Entity', 'Sentiment', 'labels'],
        num_rows: 2151
    })
})

In [None]:
#Task and model configuration.

#'sst2' (Stanford Sentiment Treebank, from GLUE) is the closest match to this
#problem statement: supervised sentiment classification.
task = "sst2"

#The checkpoint must be compatible with classification tasks.
#Some options: roberta-base, roberta-large, ProsusAI/finbert
model_checkpoint = "roberta-base"

#(for reference: vocab for distilbert = 30522)
#The batch size may need to be tweaked depending on the task and the model.
batch_size = 16

In [None]:
#Metric for scoring the sentiment classifier.
#For the configured GLUE task this is accuracy; a different metric could be swapped in
#(the 'mrpc' config, for instance, also includes f1).
metric = load_metric(
    'glue',
    task,
)
metric

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [None]:
#Look at one raw training example, then display the split summary.
sixth_example = dataset['train'][5]
print(sixth_example)
dataset

{'Unnamed: 0': 7552, 'S No.': 7553, 'Sentence': 'Kalpataru Power surges to 5-year high on order win', 'Decisions': '{"Kalpataru Power": "positive"}', 'Words': 9, 'Entity': 'Kalpataru Power', 'Sentiment': 'positive', 'labels': 0}


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'S No.', 'Sentence', 'Decisions', 'Words', 'Entity', 'Sentiment', 'labels'],
        num_rows: 8602
    })
    test: Dataset({
        features: ['Unnamed: 0', 'S No.', 'Sentence', 'Decisions', 'Words', 'Entity', 'Sentiment', 'labels'],
        num_rows: 2151
    })
})

In [None]:
#Preprocessing: load the tokenizer that belongs to the chosen checkpoint.

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
  """Tokenize the 'Sentence' entries of a batch with truncation enabled."""
  sentences = examples['Sentence']
  return tokenizer(sentences, truncation = True)


encoded_dataset = dataset.map(preprocess_function, batched = True)


#The data collator has to be told which columns are tokenized inputs: those are the
#columns present after tokenization that were not there before it.
pre_tokenizer_columns = set(dataset['train'].features)
post_tokenizer_columns = set(encoded_dataset['train'].features)
tokenizer_columns = list(post_tokenizer_columns.difference(pre_tokenizer_columns))


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
#Inspect the train split's columns after tokenization (original columns plus the tokenizer's outputs).
encoded_dataset['train'].features

{'Decisions': Value(dtype='string', id=None),
 'Entity': Value(dtype='string', id=None),
 'S No.': Value(dtype='int64', id=None),
 'Sentence': Value(dtype='string', id=None),
 'Sentiment': Value(dtype='string', id=None),
 'Unnamed: 0': Value(dtype='int64', id=None),
 'Words': Value(dtype='int64', id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Value(dtype='int64', id=None)}

In [None]:
#Collates examples into padded batches and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors ='tf')

#batch_size comes from the configuration cell instead of a hard-coded 16: the number of
#training steps for the learning-rate schedule is computed from that same variable, so a
#literal here would silently get out of sync as soon as batch_size is tuned.
tf_train_dataset = encoded_dataset['train'].to_tf_dataset(
    columns = tokenizer_columns,
    label_cols = ['labels'],
    shuffle = True,
    batch_size = batch_size,
    collate_fn = data_collator,
)

tf_validation_dataset = encoded_dataset['test'].to_tf_dataset(
    columns = tokenizer_columns,
    label_cols = ['labels'],
    shuffle = False, #No shuffling: the model does not learn from this split, so it would only add computation.
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [None]:
#Defining Loss and Model

#Number of sentiment classes for the tweets.
num_labels = 3

#from_logits=True: the loss is fed the model's raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels = num_labels,
)


Downloading tf_model.h5:   0%|          | 0.00/627M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Compiling the model

num_epochs = 2
#Whole batches per epoch, and the resulting total number of optimizer steps.
batches_per_epoch = len(encoded_dataset['train']) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)


#create_optimizer() is AdamW with weight and learning rate decay
optimizer, schedule = create_optimizer(
    init_lr = 3e-5,
    num_train_steps = total_train_steps,
    num_warmup_steps = 0,
)

model.compile(
    loss = loss,
    optimizer = optimizer,
    metrics = ['accuracy'],
)

In [None]:
#KerasMetricCallback setup. It can handle any metric computation (like BLEU, ROUGE)
#and is useful together with other callbacks like TensorBoard, EarlyStopping.


def compute_metrics(eval_predictions):
  """Score a (predictions, labels) pair with the loaded metric.

  The predictions are reduced to class ids with an argmax over axis 1 before
  being compared with the reference labels.
  """
  scores, references = eval_predictions
  class_ids = np.argmax(scores, axis=1)
  return metric.compute(predictions=class_ids, references = references)

metric_callback = KerasMetricCallback(
    metric_fn = compute_metrics,
    eval_dataset = tf_validation_dataset,
)

In [None]:
#Evaluation
#Evaluates the model on the validation split; this cell sits before the training cell.
#NOTE(review): the stored output of this cell is identical to the output of the evaluate
#cell that follows training, which suggests it was last executed after model.fit - re-run
#the notebook top to bottom to confirm the pre-training numbers.

model.evaluate(tf_validation_dataset)



[0.3493361175060272, 0.8795908689498901]

In [None]:

#Training

tensorboard_callback = TensorBoard(
    log_dir = "./text_classification_model_save/logs",
)
callbacks = [
    metric_callback,
    tensorboard_callback,
]


#Warn, then wait 20 seconds so the run can still be aborted before training starts.
print('STOP THIS if you are running on masternode! Hit command+c!')
time.sleep(20)

#num_epochs is defined in the compile cell a few cells above
model.fit(
    tf_train_dataset,
    epochs = num_epochs,
    validation_data = tf_validation_dataset,
    callbacks = callbacks,
)
#Observation noted for a 5-epoch run: it might be overfitting - val_loss gets significantly
#worse as epochs go on, with very little improvement in accuracy.

STOP THIS if you are running on masternode! Hit command+c!
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd8c26ca690>

In [None]:
#Tweet Classifier
#This cell used to be a disabled `while False:` loop: it never ran, it would have looped
#forever if the guard were flipped, and it passed un-batched token ids to the model.
#It is now a function that is only defined here (nothing is executed by this cell).

def classify_tweet(input_tweet, classifier = None):
  """Print the logits and predicted sentiment for a single tweet / headline.

  Args:
    input_tweet: the text to classify.
    classifier: model to run; defaults to the notebook's fine-tuned `model`.

  Returns:
    The predicted sentiment as one of 'positive', 'neutral', 'negative'.
  """
  #Order matches the integer labels created during cleaning: 0, 1, 2.
  sentiments = ['positive', 'neutral', 'negative']
  if classifier is None:
    classifier = model

  #Calling the tokenizer with return_tensors='tf' yields batched tensors (batch size 1).
  inputs = tokenizer(input_tweet, return_tensors = 'tf')
  logits = classifier(**inputs).logits
  predicted_class_id = int(tf.math.argmax(logits, axis = -1)[0])

  print(f'logits: {logits}')
  print(f'Input tweet: {input_tweet}')
  print(f'Highest probability prediction: {sentiments[predicted_class_id]}')
  return sentiments[predicted_class_id]

#Example: classify_tweet("$HUL's results meet market expectations")
#With three classes the logits are only meaningful relative to each other: the largest
#logit is the most probable class (a logit of 0 meaning p = 0.5 only holds for a binary sigmoid).

In [None]:
#Classify one example headline with the fine-tuned model.
input_tweet = "Microsoft records massive surge in Q1 profits"
#Index i of this list is the sentiment for integer label i.
sentiments = ['positive', 'neutral', 'negative']

inputs = tokenizer(input_tweet, return_tensors="tf")
outputs = model(**inputs)
logits = outputs.logits

#Highest logit of the first (only) row in the batch.
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])

print(
    f'logits: {logits}',
    f'Input tweet: {input_tweet}',
    f'Highest probability prediction: {sentiments[predicted_class_id]}',
    sep='\n',
)


logits: [[ 3.5974314 -1.2441467 -2.4829457]]
Input tweet: Microsoft records massive surge in Q1 profits
Highest probability prediction: positive


In [None]:
#Create the output folder (shell escape) and save the trained model under it.
#NOTE(review): this writes to the current working directory, while the reload cell further
#down reads from /content/drive/MyDrive/roberta_base_2_sentfin/my_model - presumably the
#folder is copied to Drive by hand in between; confirm or save to Drive directly.
!mkdir -p roberta_base_2_sentfin
model.save('roberta_base_2_sentfin/my_model', save_traces = True)



INFO:tensorflow:Assets written to: roberta_base_2_sentfin/my_model/assets


INFO:tensorflow:Assets written to: roberta_base_2_sentfin/my_model/assets


In [None]:
#Evaluate the trained model on the validation split; the result is [loss, accuracy] as configured in compile().
model.evaluate(tf_validation_dataset)



[0.3493361175060272, 0.8795908689498901]

In [None]:
#Colab-only: mount Google Drive at /content/drive so the saved model under MyDrive can be read by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Reload the saved model from Google Drive and compile it with the same settings as the original.
saved_model_dir = '/content/drive/MyDrive/roberta_base_2_sentfin/my_model'
new_model = tf.keras.models.load_model(saved_model_dir)

#NOTE(review): this reuses the optimizer instance that was already used to train `model` - confirm that is intended.
new_model.compile(
    loss = loss,
    optimizer = optimizer,
    metrics = ['accuracy'],
)

In [None]:
#Same sanity check as before, this time with the model reloaded from disk.
input_tweet = "Microsoft records massive surge in Q1 profits"
sentiments = ['positive', 'neutral', 'negative']

inputs = tokenizer(input_tweet, return_tensors="tf")

#NOTE(review): assumes the reloaded model's output exposes `.logits` like the original model does - confirm.
logits = new_model(**inputs).logits

best_per_row = tf.math.argmax(logits, axis=-1)
predicted_class_id = int(best_per_row[0])

report_lines = [
    f'logits: {logits}',
    f'Input tweet: {input_tweet}',
    f'Highest probability prediction: {sentiments[predicted_class_id]}',
]
print('\n'.join(report_lines))
