# Domain Adaptation - Labelled Financial News Data Using Transformer Models

In [25]:
# Importing the required libraries
import pandas as pd
import numpy as np
import math
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## Loading the data

In [26]:
# Read the cleaned financial-news CSV (UTF-8 encoded) into the raw frame
csv_path = 'data/Fin_cleaned.csv'
data = pd.read_csv(csv_path, encoding='utf-8')

# Work on a copy so the raw frame stays available unchanged
news = data.copy()

news

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive
...,...,...,...,...,...
395,2022-06-10,"Banks take a cue from RBI, hike lending rates",These banks raised their respective external b...,"PTIICICI Bank, Bank of Baroda, Punjab National...",Negative
396,2022-06-29,Sebi issues Rs 27 lakh recovery notice to indi...,"In the event of non-payment, it will recover t...",ReutersThe logo of the Securities and Exchange...,Negative
397,2022-06-06,Apollo Hospital shares drop 0.68% as Sensex ...,"A total of 10,105 shares changed hands on the ...",Getty ImagesShrikant Chouhan of Kotak Securiti...,Negative
398,2022-05-16,SBI at Rs 710? What makes analysts see up to 5...,Calling the stock 'attractively valued' analys...,AgenciesThe PSU bank reported a 41.27 per cent...,Positive


## Data Preprocessing

In [27]:
# Rebuild `news` from the raw `data` frame on every run. The previous version
# mutated `news` in place, so re-running the cell failed once 'Synopsis' had
# already been renamed.
news = data.rename(
    columns={'Date_published': 'date', 'Headline': 'headline', 'Synopsis': 'synopsis', 'Full_text': 'text',
             'Final Status': 'label'})

# Fill missing values in EVERY text column that gets concatenated below:
# a single NaN would otherwise turn the whole `full_text` value into NaN.
text_columns = ['headline', 'synopsis', 'text']
news[text_columns] = news[text_columns].fillna('')

# Strip stray whitespace from the labels, then binarise them:
# 'Positive' -> 1, anything else (including missing labels) -> 0
news['label'] = news['label'].str.strip().eq('Positive').astype('int64')

# Combine the headline, synopsis, and text columns into the model input
news['full_text'] = news['headline'] + ' ' + news['synopsis'] + ' ' + news['text']

news

Unnamed: 0,date,headline,synopsis,text,label,full_text
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,0,"Banks holding on to subsidy share, say payment..."
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,1,Digitally ready Bank of Baroda aims to click o...
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,1,Karnataka attracted investment commitment of R...
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,0,Splitting of provident fund accounts may be de...
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,1,Irdai weighs proposal to privatise Insurance I...
...,...,...,...,...,...,...
395,2022-06-10,"Banks take a cue from RBI, hike lending rates",These banks raised their respective external b...,"PTIICICI Bank, Bank of Baroda, Punjab National...",0,"Banks take a cue from RBI, hike lending rates ..."
396,2022-06-29,Sebi issues Rs 27 lakh recovery notice to indi...,"In the event of non-payment, it will recover t...",ReutersThe logo of the Securities and Exchange...,0,Sebi issues Rs 27 lakh recovery notice to indi...
397,2022-06-06,Apollo Hospital shares drop 0.68% as Sensex ...,"A total of 10,105 shares changed hands on the ...",Getty ImagesShrikant Chouhan of Kotak Securiti...,0,Apollo Hospital shares drop 0.68% as Sensex ...
398,2022-05-16,SBI at Rs 710? What makes analysts see up to 5...,Calling the stock 'attractively valued' analys...,AgenciesThe PSU bank reported a 41.27 per cent...,1,SBI at Rs 710? What makes analysts see up to 5...


## Create a Hugging Face dataset

In [28]:
# Keep only the model input text and its target, wrapped as a Hugging Face Dataset
model_columns = ['full_text', 'label']
dataset = Dataset.from_pandas(news[model_columns])

# Show the Dataset summary (features and row count)
dataset

Dataset({
    features: ['full_text', 'label'],
    num_rows: 400
})

## Split the dataset into training and testing sets

In [29]:
# Hold out 20% of the rows; the remaining 80% become the training split.
# The fixed seed keeps the split reproducible.
train_test = dataset.train_test_split(seed=42, test_size=0.2)

In [30]:
# Halve the 20% hold-out: valid_test['train'] serves as the validation split
# and valid_test['test'] as the final test split.
held_out = train_test['test']
valid_test = held_out.train_test_split(seed=42, test_size=0.5)

## Defining models

In [32]:
# Checkpoints to adapt; alternatives stay commented out for quick switching
model_names = [
    # "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    "distilbert-base-uncased",
    # "ProsusAI/finbert"
]

# Hugging Face Hub namespace (with trailing slash) prepended to saved model ids
huggingface_owner = "rnribeiro/"

# Per-checkpoint registry; later cells attach datasets, models and trainers to it
models = {}
for checkpoint in model_names:
    models[checkpoint] = {'tokenizer': AutoTokenizer.from_pretrained(checkpoint)}


## Loading models

DistilRoBERTa is a distilled version of the RoBERTa-base model. It follows the same training procedure as DistilBERT.
The model has 6 layers, a hidden dimension of 768 and 12 attention heads, totaling 82M parameters (compared to 125M parameters for RoBERTa-base). On average, DistilRoBERTa is twice as fast as RoBERTa-base.

In [34]:
# Accuracy metric loaded through the `evaluate` library
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level accuracy ``metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model

## Creating a new dataset for Domain Adaptation

In [None]:
# Give every checkpoint its own DatasetDict holding the three splits
# used for domain adaptation.
for model in model_names:
    da_splits = {
        'train': train_test['train'],
        'validation': valid_test['train'],
        'test': valid_test['test'],
    }
    models[model]['da_dataset'] = DatasetDict(da_splits)

## Tokenization of new dataset

In [None]:
def da_tokenize_function(model, examples):
    """Tokenize a batch of articles for the domain-adaptation (masked-LM) corpus.

    Args:
        model: Entry of the ``models`` registry; only its ``'tokenizer'`` is used.
        examples: Batch mapping with a ``"full_text"`` list of strings.

    Returns:
        The tokenizer output, with a ``"word_ids"`` entry (one list per
        sequence) added when the tokenizer is a fast tokenizer.
    """
    tokenizer = model['tokenizer']
    # Truncate, but do NOT pad to max length here: `group_texts` later
    # concatenates all sequences and cuts them into fixed-size chunks, so
    # max-length padding would fill those chunks with pad tokens. Any
    # batch-level padding is left to the data collator.
    result = tokenizer(examples["full_text"], truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Tokenize every split of each checkpoint's dataset with that checkpoint's
# own tokenizer, dropping the raw text and label columns afterwards.
raw_columns = ['full_text', 'label']
for model in model_names:
    untokenized = models[model]['da_dataset']
    models[model]['da_dataset'] = untokenized.map(
        lambda batch: da_tokenize_function(models[model], batch),
        remove_columns=raw_columns,
        batched=True,
    )

## Group text

In [None]:
chunk_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-cut it into fixed-size chunks.

    Every column of ``examples`` (a mapping of column name -> list of lists)
    is flattened into one long list and split into consecutive pieces of
    ``chunk_size`` items. A trailing remainder shorter than ``chunk_size``
    is dropped.

    Args:
        examples: Batch mapping; must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same columns, chunked, plus a ``"labels"`` column
        that is a copy of the chunked ``"input_ids"`` list.
    """
    # Flatten each column in a single pass. `sum(lists, [])` gives the same
    # result but re-copies the growing accumulator for every sequence.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # The first column's flattened length is used as the reference length
    reference_column = list(examples.keys())[0]
    total_length = len(concatenated_examples[reference_column])
    # Drop the last chunk if it is smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split every column into chunks of chunk_size
    result = {
        k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


# Re-chunk every tokenized split into fixed-size blocks for masked-LM training
for model in model_names:
    tokenized = models[model]['da_dataset']
    models[model]['lm_dataset'] = tokenized.map(group_texts, batched=True)

# Inspect the chunked dataset of the first checkpoint
first_model = model_names[0]
models[first_model]['lm_dataset']

## Defining trainers for Domain Adaptation

In [None]:
import collections
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM

# Not used in this cell.
# NOTE(review): presumably intended for a whole-word-masking collator - confirm or remove.
wwm_probability = 0.2

# Build, for every checkpoint, everything needed for masked-LM domain adaptation:
# Hub repo id, model, data collator, training arguments and Trainer.
for model in model_names:
    entry = models[model]

    # Hub repository id: "<owner>/DA-<checkpoint name with '/' replaced by '-'>"
    entry['da_saving_dir'] = huggingface_owner + "DA-" + model.replace("/", "-")

    # Load model for DA training
    entry['da_model'] = AutoModelForMaskedLM.from_pretrained(model)

    # Collator configured with mlm_probability=0.15 (random token masking per batch)
    entry['da_data_collator'] = DataCollatorForLanguageModeling(tokenizer=entry['tokenizer'],
                                                                mlm_probability=0.15)

    entry['da_training_args'] = TrainingArguments(
        output_dir="./da_training_results/" + model.replace("/", "-"),
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy='epoch',
        num_train_epochs=3,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        push_to_hub=True,
        hub_model_id=entry['da_saving_dir'],
    )

    # Train and evaluate on the chunked `lm_dataset` produced by `group_texts`.
    # The previous version passed the un-chunked `da_dataset`, so the grouped
    # corpus was built but never used.
    entry['da_trainer'] = Trainer(
        model=entry['da_model'],
        args=entry['da_training_args'],
        train_dataset=entry['lm_dataset']['train'],
        eval_dataset=entry['lm_dataset']['validation'],
        data_collator=entry['da_data_collator'],
        tokenizer=entry['tokenizer'],
    )

## Calculate perplexity before domain adaptation

In [None]:
# Baseline: perplexity = exp(evaluation loss), measured before any DA training
for model in model_names:
    eval_results = models[model]['da_trainer'].evaluate()
    perplexity = math.exp(eval_results['eval_loss'])
    print(f"{model} Perplexity: {perplexity:.2f}")

## Domain adaptation training

In [None]:
# Switch for the training step: set to True to actually run domain adaptation.
# The cells below read this flag as well.
da_train = False

if da_train:
    for model in model_names:
        print(f"Training {model}...")
        da_trainer = models[model]['da_trainer']
        da_trainer.train()

## Calculating perplexity after domain adaptation

In [None]:
# Re-measure perplexity = exp(evaluation loss), only when training was run
if da_train:
    for model in model_names:
        eval_results = models[model]['da_trainer'].evaluate()
        perplexity = math.exp(eval_results['eval_loss'])
        print(f"{model} Perplexity: {perplexity:.2f}")

As we can see, the perplexity of the models has decreased significantly after domain adaptation.

## Saving the models to the hub after domain adaptation

In [None]:
# Upload each adapted model through its trainer, only when training was run
if da_train:
    for model in model_names:
        da_trainer = models[model]['da_trainer']
        da_trainer.push_to_hub()