In [None]:
!pip install datasets evaluate transformers[sentencepiece]

from datasets import load_dataset
data_files = {"train": "sent_train.csv", "test": "sent_valid.csv"}
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment", data_files=data_files)


In [2]:
sentiments = {
    0: "Bearish", 
    1: "Bullish", 
    2: "Neutral"
}


In [3]:
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, TFBertForSequenceClassification, DataCollatorWithPadding

In [4]:
import re

def process_tweet(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    return tweet

In [5]:
test_rawdata = dataset["test"]
train_rawdata = dataset["train"]

test_data = [ process_tweet(twt) for twt in test_rawdata['text'] ]
train_data = [ process_tweet(twt) for twt in train_rawdata['text'] ]

In [6]:
train_rawdata

Dataset({
    features: ['text', 'label'],
    num_rows: 9543
})

In [None]:
#@title Default title text
idx0 = [idx for idx,lbl in enumerate(test_rawdata['label']) if test_rawdata['label'][idx] == 0]
idx1 = [idx for idx,lbl in enumerate(test_rawdata['label']) if test_rawdata['label'][idx] == 1]
idx2 = [idx for idx,lbl in enumerate(test_rawdata['label']) if test_rawdata['label'][idx] == 2]

In [None]:
#@title Default title text
n = 33
sample_idx= idx0[0:n] + idx1[0:n] + idx2[0:n] 

sample_data = []
sample_label = []

for idx in sample_idx:
  sample_data.append(test_data[idx])
  sample_label.append(test_rawdata['label'][idx])

sample_data[-1], sample_label[-1]

In [None]:
#@title Default title text
models = ['ProsusAI/finbert'
    , 'finiteautomata/bertweet-base-sentiment-analysis'
    , 'yiyanghkust/finbert-tone'
    , 'cardiffnlp/twitter-roberta-base-sentiment'
    , 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'
    , 'soleimanian/financial-roberta-large-sentiment'
    , 'ahmedrachid/FinancialBERT-Sentiment-Analysis'
    , 'nickmuchi/finbert-tone-finetuned-fintwitter-classification'
]

In [None]:
#@title Default title text
i =7
raw_inputs = sample_data

checkpoint = models[i]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(raw_inputs, padding='longest', truncation=True, return_tensors="tf")
model = TFBertForSequenceClassification.from_pretrained(checkpoint, from_pt=True)
output =  model(inputs) #logits

In [None]:
#@title Default title text
print(output[0][0])

preds = tf.nn.softmax(output[0:][0:], axis =-1)
#preds

In [None]:
#@title Default title text
preds_labels = np.argmax(preds,axis=2)
preds_labels.shape

In [None]:
#@title Default title text
for i in range(n):
  if sample_label[i] != preds_labels[0][i]:
    print(i, sample_data[i], sample_label[i], preds_labels[0][i])

**Fine-Tuning** 

Comparing 4 models

In [7]:
models = [ 'distilbert-base-uncased-finetuned-sst-2-english'
    , 'cardiffnlp/twitter-roberta-base-sentiment'
    , 'finiteautomata/bertweet-base-sentiment-analysis'
    , 'ProsusAI/finbert'
]

In [31]:
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import datasets

In [42]:
checkpoint = models[2]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

def tokenizer_func(tweet):
  #tweet['text'] = process_tweet(tweet['text']) #pre-process tweet
  return tokenizer(tweet['text'], truncation=True)



emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [51]:
X_train, X_val, y_train, y_val = train_test_split([process_tweet(twt) for twt in train_rawdata['text']], train_rawdata['label'],
                                                    stratify= train_rawdata['label'], 
                                                    test_size=0.20)

train_dict = {
    "text": X_train
    , "label": y_train
}

val_dict = {
    "text": X_val
    , "label": y_val
}

test_dict = {
    "text": [process_tweet(twt) for twt in test_rawdata['text']]
    , "label": test_rawdata['label']
}


In [53]:
train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)
test_dataset = Dataset.from_dict(test_dict)

fintweet_datasetdict = datasets.DatasetDict({"train":train_dataset, "val":val_dataset, "test":test_dataset})

fintweet_datasetdict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7634
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 1909
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [65]:
tokenized_datasets = fintweet_datasetdict.map(tokenizer_func, batched=True)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=64,
)

tf_validation_dataset = tokenized_datasets["val"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=128,
)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [66]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [67]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay


num_epochs = 25

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# so its len() is already num_samples // batch_size.

num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

model.compile(
    optimizer=opt,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = num_epochs
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fb91dfc6f10>

No significant improvement in validation accuracy after epoch 3. 

Training accuracy saturates after epoch 6 and model likely starts to overfit.

Possible solution: larger training set (already 80%), early stopping, different learning rates

Possibility of noise in the dataset since it is tweets. Data not clean enough

In [68]:
preds = model.predict(tf_test_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)



In [77]:
true_vals = [1 if test_dataset['label'][n] == class_preds[n] else 0 for n in range(len(test_dataset['label']))]

#acc
sum(true_vals)/len(true_vals)

0.8848408710217756

In [78]:
results =  model.evaluate(tf_validation_dataset)
dict(zip(model.metrics_names, results))



{'loss': 0.7317471504211426, 'accuracy': 0.8816134333610535}

In [70]:
results =  model.evaluate(tf_test_dataset)



In [72]:
dict(zip(model.metrics_names, results))

{'loss': 0.7282922863960266, 'accuracy': 0.8848408460617065}

Repeating for different checkpoint

In [80]:
checkpoint = models[-1]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tokenized_datasets = fintweet_datasetdict.map(tokenizer_func, batched=True)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=64,
)

tf_validation_dataset = tokenized_datasets["val"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=128,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [84]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

num_epochs = 10

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# so its len() is already num_samples // batch_size.

num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=5e-8, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

model.compile(
    optimizer=opt,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = num_epochs
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbc9d6d10d0>

In [85]:
results =  model.evaluate(tf_validation_dataset)
dict(zip(model.metrics_names, results))



{'loss': 0.581873893737793, 'accuracy': 0.8837087750434875}

In [86]:
results =  model.evaluate(tf_test_dataset)
dict(zip(model.metrics_names, results))



{'loss': 0.5551179051399231, 'accuracy': 0.8819095492362976}

Trying with more training data on same model

In [87]:
X_train, X_val, y_train, y_val = train_test_split([process_tweet(twt) for twt in train_rawdata['text']], train_rawdata['label'],
                                                    stratify= train_rawdata['label'], 
                                                    test_size=0.10)

train_dict = {
    "text": X_train
    , "label": y_train
}

val_dict = {
    "text": X_val
    , "label": y_val
}

test_dict = {
    "text": [process_tweet(twt) for twt in test_rawdata['text']]
    , "label": test_rawdata['label']
}


train_dataset = Dataset.from_dict(train_dict)
val_dataset = Dataset.from_dict(val_dict)
test_dataset = Dataset.from_dict(test_dict)

fintweet_datasetdict = datasets.DatasetDict({"train":train_dataset, "val":val_dataset, "test":test_dataset})


checkpoint = models[-1]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tokenized_datasets = fintweet_datasetdict.map(tokenizer_func, batched=True)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=64,
)

tf_validation_dataset = tokenized_datasets["val"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=128,
)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [88]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

num_epochs = 10

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# so its len() is already num_samples // batch_size.

num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=5e-8, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

model.compile(
    optimizer=opt,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = num_epochs
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fba7eaed790>

In [89]:
results =  model.evaluate(tf_test_dataset)
dict(zip(model.metrics_names, results))



{'loss': 0.5633423924446106, 'accuracy': 0.8865159153938293}

No significant difference

In [105]:
tf.math.confusion_matrix(
    labels = test_dataset['label']
    , predictions = class_preds
    , num_classes = 3
)

"""
predictions ---->
actual 
|
|
v
"""

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[ 283,   12,   52],
       [  19,  377,   79],
       [  50,   59, 1457]], dtype=int32)>

In [90]:
preds = model.predict(tf_test_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)



In [92]:
true_vals = [1 if test_dataset['label'][n] == class_preds[n] else 0 for n in range(len(test_dataset['label']))]

false_idx = [idx for idx, lbl in enumerate(true_vals) if lbl == 0]

In [95]:
len(false_idx), len(true_vals)

(271, 2388)

In [102]:
false_dict ={'text': [],
             'label': [],
             'pred': []}
for idx in false_idx:
  false_dict['text'].append(test_dataset['text'][idx])
  false_dict['label'].append(test_dataset['label'][idx])
  false_dict['pred'].append(class_preds[idx])  


false_df = pd.DataFrame(false_dict, index = false_idx)
false_df.head()

Unnamed: 0,text,label,pred
8,Barclays assigns only a 20% chance that studie...,0,2
9,BTIG points to breakfast pressure for Dunkin' ...,0,2
35,- BMO Capital joins Nike bull camp,1,2
36,"- Buy oil service firms, Bernstein says aft...",1,2
42,AM Best Revises Outlooks to Positive for Pacíf...,1,2


In [122]:
sentiments

{0: 'Bearish', 1: 'Bullish', 2: 'Neutral'}

In [112]:
print('# tweets falsely classified')
false_dict['label'].count(0), false_dict['label'].count(1), false_dict['label'].count(2)

# tweets falsely classified


(64, 98, 109)

In [111]:
print('# tweets falsely classified as')
false_dict['pred'].count(0), false_dict['pred'].count(1), false_dict['pred'].count(2)

# tweets falsely classified as


(69, 71, 131)

In [120]:
false_df[ (false_df.label==2) & (false_df.pred==1)]

Unnamed: 0,text,label,pred
94,Analysts Expect Breakeven For China Online Edu...,2,1
154,Markets bet Fed is pushed to cut rates in coro...,2,1
245,"- Max out Apple's Mac Pro for ,599",2,1
297,Amazon will spend more than billion on shippi...,2,1
318,Boeing announces additional order for 737 MAX ...,2,1
319,Boeing gets 10 additional orders for 737 MAX 8...,2,1
354,Galp Energia : secures new renewable power pur...,2,1
362,Grazitti Interactive Expands Its Presence in N...,2,1
386,Marubeni : 10 Year Extension of Production Sha...,2,1
424,Procter & Gamble looks to get well in healthcare,2,1


In [121]:
false_df[ (false_df.label==2) & (false_df.pred==0)]

Unnamed: 0,text,label,pred
126,Buyback Backlash Begins: Fed Will Limit Buybac...,2,0
258,- Margins suffer at Ituran Location in Q3,2,0
260,(halted pre) La Jolla Pharmaceutical (LJPC) t...,2,0
282,Activision Blizzard’s Revenue To See Double-Di...,2,0
285,AGF Announces Closure and Liquidation of AGFiQ...,2,0
387,Maruti Suzuki Says India Not Ready For EVs As ...,2,0
418,"PG&E to cut power to 150,000 customer for fire...",2,0
467,T-Mobile moves not enough to address concerns ...,2,0
469,Total Halts Production at Normandy Refinery Af...,2,0
472,Two of three SandRidge royalty trusts lose NYS...,2,0


## Conclusion: Model doesnt perform very well on tweets classified as neutral (2)