<a href="https://colab.research.google.com/github/sfuller14/MSDS-453-Project/blob/master/Fine_Grained_Financial_Sentiment_Regression_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [9]:
from google.colab import output

In [None]:
# Comment this out for deberta
!pip install -q transformers

In [2]:
!pip install -q datasets

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import json

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping

from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification
from datasets import Dataset

from pprint import pprint

In [4]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.spatial.distance import cosine

# For model evaluation:
def cosine_score(predicted_values, true_values):
    return 1 - cosine(predicted_values, true_values)

In [5]:
import warnings
warnings.filterwarnings("ignore")

# EDA

## Get Data

In [None]:
# Get data from here:
# !git clone https://bitbucket.org/ssix-project/semeval-2017-task-5-subtask-2.git

In [6]:
# Clean up ground truth test set
import json

with open("data/finance/Headlines_Testdata_withscores.json", "r") as file:
    data = json.load(file)

for example in data:
    del example['UniqueID']

with open("data/finance/Headlines_Testdata_withscores_clean.json", "w") as file:
    json.dump(data, file)

In [13]:
raw_train_ds = Dataset.from_json("data/finance/Headline_Trainingdata.json")
raw_val_ds = Dataset.from_json("data/finance/Headline_Validationdata.json")
raw_test_ds = Dataset.from_json("data/finance/Headlines_Testdata_withscores_clean.json")
raw_datasets = {"train": raw_train_ds, "val": raw_val_ds, "test": raw_test_ds}
output.clear()

In [14]:
raw_train_ds

Dataset({
    features: ['id', 'company', 'title', 'sentiment'],
    num_rows: 914
})

In [15]:
raw_val_ds

Dataset({
    features: ['id', 'company', 'title', 'sentiment'],
    num_rows: 228
})

In [16]:
raw_test_ds

Dataset({
    features: ['id', 'company', 'title', 'sentiment'],
    num_rows: 491
})

## Examine data

In [38]:
pd.DataFrame(raw_train_ds.to_dict())

Unnamed: 0,id,company,title,sentiment
0,3,IMI,IMI posts drop in first-quarter organic revenue; warns on full year,-0.344
1,4,Glencore,"Glencore to refinance its short-term debt early, shares rise",0.340
2,5,Ryanair,EasyJet attracts more passengers in June but still lags Ryanair,0.259
3,6,Barclays,Barclays 'bad bank' chief to step down,-0.231
4,7,BP,Bilfinger Industrial Services win Â£100m BP contract extension,0.113
...,...,...,...,...
909,1136,Balfour Beatty plc,Balfour Beatty plc Set To Reinstate Dividend (And Rival National Grid plc And Centrica PLC Once More?),0.413
910,1137,CIB,"UPDATE: CIB, Legal & General Sell Egyptian Life Joint Venture To AXA",0.148
911,1138,BG Group,BG Group appoints new CEO one month early,0.069
912,1140,Barclays,Barclays set to name former JPMorgan banker Staley as new CEO,0.000


In [39]:
pd.DataFrame(raw_val_ds.to_dict())

Unnamed: 0,id,company,title,sentiment
0,155,Starwood,"Intercontinental Hotels, Starwood held early deal talks - FT",0.145
1,71,Dixons Carphone,CompaniesLord Livingston joins Dixons Carphone,0.364
2,336,Severn Trent,"Water utility Severn Trent ups savings forecast, FY profit falls",0.177
3,58,BP,"EXCLUSIVE-BP, China's CNPC to unveil oil alliance - sources",0.326
4,225,BAE Systems,BAE Systems's sales boosted by European Typhoon and currencies,0.390
...,...,...,...,...
223,107,United Utilities,United Utilities FY profit up 3.5%; to boost capex,0.420
224,154,Shell,Shell and BG Shareholders to Vote on Deal at End of January,0.000
225,60,BP,BP sees Q1 earnings slide as low oil prices take their toll,-0.376
226,210,AstraZeneca,AstraZeneca in Talks to Buy Cancer Drug Developer Acerta Pharma,0.210


In [40]:
pd.DataFrame(raw_test_ds.to_dict())

Unnamed: 0,id,company,title,sentiment
0,1144,Ashtead,"Ashtead to buy back shares, full-year profit beats estimates",0.588
1,1145,Shell,EU regulators clear Shell's takeover of BG Group,0.276
2,1146,Prudential,UK's FTSE has worst day so far in 2015 as BG and Prudential fall,-0.651
3,1147,GlaxoSmithKline,GlaxoSmithKline acquires HIV assets,0.390
4,1148,Barclays,Barclays faces another heavy forex fine,-0.834
...,...,...,...,...
486,1630,Barclays,Exclusive: Barclays reins in foreign exchange trading before referendum result - sources,0.152
487,1631,Barclays,Copper Falls With Mining Stocks as Barclays Warns of Pullback,-0.800
488,1632,Glencore,LPC-Glencore launches refinancing of US$8.45bn loan,0.173
489,1633,GSK,GSK and Novartis complete deals to reshape both drugmakers,0.285


## Tokenize datasets

#### Get Max Tokenized Vector Length

In [18]:
def get_max_tokenized_length(ds, tok):
    max_len_raw = 0
    max_len_token_vector = 0
    
    for text_input in ds:
        
        if len(text_input.split(" ")) > max_len_raw:
            max_len_raw = len(text_input.split(" "))
        
        if len(tok(text_input)['input_ids']) > max_len_token_vector:
            max_len_token_vector = len(tok(text_input)['input_ids'])

    return max_len_raw, max_len_token_vector

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
max_len_raw, max_len_token_vector = get_max_tokenized_length(raw_train_ds['title'] + raw_val_ds['title'] + raw_test_ds['title'], tok=tokenizer)
output.clear()

In [42]:
print('For AutoTokenzier associated with "bert-base-uncased":')
print('(Note this will potentially be different for each pretrained model)\n')
print(f'max_len_raw (longest headline)= {max_len_raw}')
print(f'max_len_tokenized (longest tokenized headline)= {max_len_token_vector}')

For AutoTokenzier associated with "bert-base-uncased":
(Note this will potentially be different for each pretrained model)

max_len_raw (longest headline)= 18
max_len_tokenized (longest tokenized headline)= 29


#### Tokenize and save as TF Dataset object

In [20]:
def create_tokenized_tf_datasets(untokenized_datasets_dict, tok, max_len):

    def preprocess_function(examples):#, max_len, tok):
        label = examples["sentiment"] 
        examples = tok(examples["title"], truncation=True, padding="max_length", max_length=max_len)
        
        # Change this to real number
        examples["label"] = float(label)
        return examples


    tokenized_datasets = dict()
    for split in untokenized_datasets_dict:
        tokenized_datasets[split] = untokenized_datasets_dict[split].map(preprocess_function)

    data_collator = DataCollatorWithPadding(tokenizer=tok, return_tensors="tf")

    tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids"],#, "token_type_ids"],
        label_cols=["labels"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=4,
    )

    tf_val_dataset = tokenized_datasets["val"].to_tf_dataset(
        columns=["attention_mask", "input_ids"],#, "token_type_ids"],
        label_cols=["labels"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=4,
    )    

    tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
        columns=["attention_mask", "input_ids"],#, "token_type_ids"],
        label_cols=["labels"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=4,
    )

    return tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset

In [None]:
tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset = create_tokenized_tf_datasets(untokenized_datasets_dict=raw_datasets, tok=tokenizer, max_len=max_len_token_vector)

#### Look at tokenized dataset

In [44]:
# Full training set
np.asarray(tokenized_datasets['train']['input_ids'])

array([[  101, 10047,  2072, ...,     0,     0,     0],
       [  101,  8904, 17345, ...,     0,     0,     0],
       [  101,  3733, 15759, ...,     0,     0,     0],
       ...,
       [  101,  1038,  2290, ...,     0,     0,     0],
       [  101, 23724,  2015, ...,     0,     0,     0],
       [  101,  2332,  7529, ...,     0,     0,     0]])

In [45]:
# First sentence observation -- tokenized representation
np.asarray(tokenized_datasets['train']['input_ids'])[0,:]

array([  101, 10047,  2072,  8466,  4530,  1999,  2034,  1011,  4284,
        7554,  6599,  1025, 19428,  2006,  2440,  2095,   102,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])

In [46]:
# (training_set_rows, max_token_vector_length)
np.asarray(tokenized_datasets['train']['input_ids']).shape

(914, 29)

In [47]:
# Convert back from tokenized to text representation
tokenizer.decode(tokenizer(raw_train_ds['title'][6])['input_ids'])

'[CLS] centrica prepared for takeover approach - chairman [SEP]'

In [48]:
# Same, represented as tokens
# 101 - [CLS]
# 102 - [SEP]
tokenizer(raw_train_ds['title'][6])['input_ids']

[101, 9358, 14735, 4810, 2005, 15336, 3921, 1011, 3472, 102]

In [49]:
# Batch #1:


# [tokenized sentences,
#  N/A (not used by BERT),
#  attention masks,
#  labels]

arr = tf_train_dataset.as_numpy_iterator()
l = list(arr)
l[0]

({'input_ids': array([[  101,  2866, 15187,  2062,  1997,  6746,  2015,  2924,  8406,
           1010,  2260,  1012,  1019,  4551,  7038,  2992,  2061,  2521,
            102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101, 10975, 12672, 19909,  6573,  2044,  9387, 10632,  1037,
          29646,  2509,  1012,  1023, 24700,  2013,  1049,  1004,  1043,
            102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101, 10651,  1019,  1011, 23724,  2015,  3472, 11338, 23511,
          19589,  5766,  2000,  3177,  2039,  6143,  2689,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101,  2866,  2231,  7659,  8406,  1999,  6746,  2015,  2000,
           2917,  2340,  3867,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]),
  'token_

# Create function to fit model

In [26]:
def load_compile_fit_eval_model(checkpoint, train_ds, val_ds, test_df):
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True, from_pt=True) # num_labels=1 --> regression head after BERT layer (linear layer for output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss="mse", metrics=['cosine_proximity'])
    early_stopping_monitor = EarlyStopping(patience=6)
    
    model.fit(train_ds, epochs=50, validation_data=val_ds, callbacks=[early_stopping_monitor])# batch_size=4,

    predicted_sentiments = model.predict(test_df)
    result = cosine_score(predicted_sentiments['logits'][:,0],  np.concatenate([y for x, y in test_df], axis=0))
    return result

In [None]:
def tenfold_cross_val(train_ds, checkpoint):

    all_results = []

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(kfold.split(train_ds['title'], train_ds['sentiment'])):
        
        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Test:  index={test_index}")

        tokenized_cv_train = Dataset.from_dict(train_ds[train_index]).map(preprocess_function, remove_columns=['id', 'company', 'title', 'sentiment'])
        tokenized_cv_test = Dataset.from_dict(train_ds[test_index]).map(preprocess_function, remove_columns=['id', 'company', 'title', 'sentiment'])

        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

        cv_train_dataset_tf = tokenized_cv_train.to_tf_dataset(
            columns=["attention_mask", "input_ids"],# "token_type_ids"],
            label_cols=["labels"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=4,
        )

        cv_test_dataset_tf = tokenized_cv_test.to_tf_dataset(
            columns=["attention_mask", "input_ids"],# "token_type_ids"],
            label_cols=["labels"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=4,
        )
    
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True, from_pt=True) # num_labels=1 --> regression head after BERT layer (linear layer for output)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss="mse", metrics=['cosine_proximity'])
        model.fit(cv_train_dataset_tf, epochs=10)

        predicted_sentiments = model.predict(cv_test_dataset_tf)
        result = cosine_score(predicted_sentiments['logits'][:,0],  np.concatenate([y for x, y in cv_test_dataset_tf], axis=0))
        print(f'Fold {i} cosine_similarity = {result}\n\n')
        all_results.append(result)

    return all_results

# BERT
bert-base-uncased  
Cross Validation cosine_similarity = 74.94  
Competition Test Set cosine_similarity = 79.26

## Preprocessing for input into BERT

In [51]:
current = "bert-base-uncased"

In [52]:
tokenizer = AutoTokenizer.from_pretrained(current)
max_len_raw, max_len_token_vector = get_max_tokenized_length(raw_train_ds['title'] + raw_test_ds['title'], tok=tokenizer)

print(f'For AutoTokenzier associated with "{current}":')
print('(Note this will potentially be different for each pretrained model)\n')
print(f'max_len_raw (longest headline)= {max_len_raw}')
print(f'max_len_tokenized (longest tokenized headline)= {max_len_token_vector}')

For AutoTokenzier associated with "bert-base-uncased":
(Note this will potentially be different for each pretrained model)

max_len_raw (longest headline)= 18
max_len_tokenized (longest tokenized headline)= 29


In [53]:
tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset = create_tokenized_tf_datasets(untokenized_datasets_dict = raw_datasets, 
                                                                                                     tok = tokenizer, 
                                                                                                     max_len = max_len_token_vector)
output.clear()

## Fit and Evaluate BERT

In [56]:
load_compile_fit_eval_model(current, tf_train_dataset, tf_val_dataset, tf_test_dataset)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50


0.7926343679428101

## Perform 10-Fold Cross Validation
(as a proxy for test set - prior to receiving ground truth test set)

In [None]:
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds)
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

In [None]:
cv_raw_data_cosine_similarity_avg

74.94607746601105

# finBERT
ProsusAI/finbert  
Cross Validation cosine_similarity = 77.79  
Competition Test Set cosine_similarity = 80.69

## Preprocessing for input into finBERT

In [57]:
current = "ProsusAI/finbert"

In [58]:
tokenizer = AutoTokenizer.from_pretrained(current)
max_len_raw, max_len_token_vector = get_max_tokenized_length(raw_train_ds['title'] + raw_test_ds['title'], tok=tokenizer)

print(f'For AutoTokenzier associated with "{current}":')
print('(Note this will potentially be different for each pretrained model)\n')
print(f'max_len_raw (longest headline)= {max_len_raw}')
print(f'max_len_tokenized (longest tokenized headline)= {max_len_token_vector}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

For AutoTokenzier associated with "ProsusAI/finbert":
(Note this will potentially be different for each pretrained model)

max_len_raw (longest headline)= 18
max_len_tokenized (longest tokenized headline)= 29


In [59]:
tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset = create_tokenized_tf_datasets(untokenized_datasets_dict = raw_datasets, 
                                                                                                     tok = tokenizer, 
                                                                                                     max_len = max_len_token_vector)
output.clear()

## Fit and Evaluate finBERT

In [60]:
load_compile_fit_eval_model(current, tf_train_dataset, tf_val_dataset, tf_test_dataset)

Downloading (…)"tf_model.h5";:   0%|          | 0.00/438M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


0.8069681525230408

## Perform 10-Fold Cross Validation
(as a proxy for test set - prior to receiving ground truth test set)

In [None]:
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

In [None]:
cv_raw_data_cosine_similarity_avg

77.78641879558563

# roBERTa
roberta-base  
Cross Validation cosine_similarity = 77.61  
Competition Test Set cosine_similarity = 82.18

## Preprocessing for input into roBERTa

In [61]:
current = "roberta-base"

In [65]:
tokenizer = AutoTokenizer.from_pretrained(current)
max_len_raw, max_len_token_vector = get_max_tokenized_length(raw_train_ds['title'] + raw_test_ds['title'], tok=tokenizer)

print(f'For AutoTokenzier associated with "{current}":')
print('(Note this will potentially be different for each pretrained model)\n')
print(f'max_len_raw (longest headline)= {max_len_raw}')
print(f'max_len_tokenized (longest tokenized headline)= {max_len_token_vector}')

For AutoTokenzier associated with "roberta-base":
(Note this will potentially be different for each pretrained model)

max_len_raw (longest headline)= 18
max_len_tokenized (longest tokenized headline)= 32


In [66]:
tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset = create_tokenized_tf_datasets(untokenized_datasets_dict = raw_datasets, 
                                                                                                     tok = tokenizer, 
                                                                                                     max_len = max_len_token_vector)
output.clear()

## Fit and Evaluate RoBERTa

In [67]:
load_compile_fit_eval_model(current, tf_train_dataset, tf_val_dataset, tf_test_dataset)

Downloading (…)"tf_model.h5";:   0%|          | 0.00/657M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


0.8218125104904175

## Perform 10-Fold Cross Validation
(as a proxy for test set - prior to receiving ground truth test set)

In [None]:
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

In [None]:
cv_raw_data_cosine_similarity_avg

77.60886251926422

# DeBERTa
deberta-v3  
Cross Validation cosine_similarity = 80.45  
Competition Test Set cosine_similarity = 84.20

## Imports

In [68]:
!pip uninstall transformers
!pip uninstall sentencepiece

Found existing installation: transformers 4.26.0
Uninstalling transformers-4.26.0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.8/dist-packages/transformers-4.26.0.dist-info/*
    /usr/local/lib/python3.8/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.26.0
[0m

In [17]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Preprocessing for input into DeBERTa

In [21]:
current = "microsoft/deberta-v3-base"

In [22]:
tokenizer = AutoTokenizer.from_pretrained(current)
max_len_raw, max_len_token_vector = get_max_tokenized_length(raw_train_ds['title'] + raw_test_ds['title'], tok=tokenizer)

print(f'For AutoTokenzier associated with "{current}":')
print('(Note this will potentially be different for each pretrained model)\n')
print(f'max_len_raw (longest headline)= {max_len_raw}')
print(f'max_len_tokenized (longest tokenized headline)= {max_len_token_vector}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading (…)"spm.model";:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


For AutoTokenzier associated with "microsoft/deberta-v3-base":
(Note this will potentially be different for each pretrained model)

max_len_raw (longest headline)= 18
max_len_tokenized (longest tokenized headline)= 28


In [24]:
tokenized_datasets, tf_train_dataset, tf_val_dataset, tf_test_dataset = create_tokenized_tf_datasets(untokenized_datasets_dict = raw_datasets, 
                                                                                                     tok = tokenizer, 
                                                                                                     max_len = max_len_token_vector)

  0%|          | 0/914 [00:00<?, ?ex/s]

  0%|          | 0/228 [00:00<?, ?ex/s]

  0%|          | 0/491 [00:00<?, ?ex/s]

## Fit and Evaluate DeBERTa

In [27]:
load_compile_fit_eval_model(current, tf_train_dataset, tf_val_dataset, tf_test_dataset)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/50


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


0.8420573472976685

## Perform 10-Fold Cross Validation
(as a proxy for test set - prior to receiving ground truth test set)

In [None]:
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

In [None]:
cv_raw_data_cosine_similarity_avg

80.44909358024597

# One-Time Validation Set Creation Step

In [None]:
import random

# Load the original .json file
with open("data/finance/Headline_Trainingdata.json", "r") as file:
    original_data = json.load(file)

# Select 228 random entries
selected_entries = random.sample(original_data, 228)

# Remove the selected entries from the original data
for entry in selected_entries:
    original_data.remove(entry)

# Save the remaining entries in the original file
with open("data/finance/Headline_Trainingdata.json", "w") as file:
    json.dump(original_data, file)

# Save the selected entries to a new .json file
with open("data/finance/Headline_Validationdata.json", "w") as file:
    json.dump(selected_entries, file)