## Imports

In [None]:
!pip install transformers
!pip install datasets

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import json

import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification
from datasets import Dataset

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.spatial.distance import cosine

# For model evaluation:
def cosine_score(predicted_values, true_values):
    """Return the cosine similarity between predictions and ground truth.

    SciPy's ``cosine`` yields a distance (1 - similarity), so the value is
    flipped back into a similarity before being returned.
    """
    distance = cosine(predicted_values, true_values)
    return 1 - distance

# EDA

## Get Data

In [None]:
# Get data from here:
!git clone https://bitbucket.org/ssix-project/semeval-2017-task-5-subtask-2.git

## Examine data

In [None]:
def __text_company(all_data):
    """Split unlabelled headline records into parallel lists.

    Args:
        all_data: iterable of dicts, each with 'title', 'company' and 'id' keys.

    Returns:
        Tuple ``(text, company, ids)`` of lists in input order.
    """
    text = []
    company = []
    ids = []
    for data in all_data:
        text.append(data['title'])
        company.append(data['company'])
        ids.append(data['id'])
    return text, company, ids


def __text_sentiment_company(all_data):
    """Split labelled headline records into titles, sentiment scores and companies.

    The score is read from the 'sentiment' key when present, otherwise from
    'sentiment score'. A record carrying neither key contributes no score, so
    the returned sentiment array can be shorter than the text/company lists.

    Args:
        all_data: iterable of dicts, each with 'title' and 'company' keys.

    Returns:
        Tuple ``(text, sentiment, company)`` where ``sentiment`` is a NumPy array.
    """
    text = []
    sentiment = []
    company = []
    for data in all_data:
        text.append(data['title'])
        company.append(data['company'])
        # The score key differs between files and may be missing altogether.
        if 'sentiment' in data:
            sentiment.append(data['sentiment'])
        elif 'sentiment score' in data:
            sentiment.append(data['sentiment score'])
    return text, np.asarray(sentiment), company


def fin_data(data_type, test_data=False):
    """Load one SemEval-2017 Task 5 Subtask 2 JSON file from the cloned repo.

    Args:
        data_type: file name inside the "semeval-2017-task-5-subtask-2/" directory.
        test_data: when True return ``(text, company, ids)`` and ignore any
            scores; otherwise return ``(text, sentiment, company)``.

    Returns:
        The tuple produced by the matching helper above.
    """
    # JSON is UTF-8 by specification; being explicit keeps non-ASCII headline
    # characters (e.g. currency symbols) intact on platforms whose default
    # locale encoding is not UTF-8.
    with open("semeval-2017-task-5-subtask-2/" + data_type, 'r', encoding='utf-8') as fp:
        records = json.load(fp)
    if test_data:
        return __text_company(records)
    return __text_sentiment_company(records)

In [None]:
# Load the scored training and test headlines as parallel sequences.
train_texts, train_sentiments, train_companies = fin_data('Headline_Trainingdata.json')
test_texts, test_sentiments, test_companies = fin_data('Headlines_Testdata_withscores.json')

# One row per training headline; the positional index is surfaced as an
# 'input_ids' column.
df = (
    pd.DataFrame(
        list(zip(train_texts, train_sentiments, train_companies)),
        columns=['text', 'Sentiment', 'company'],
    )
    .reset_index()
    .rename({'index': 'input_ids'}, axis=1)
)
df.head()

Unnamed: 0,input_ids,text,Sentiment,company
0,0,Morrisons book second consecutive quarter of sales growth,0.43,Morrisons
1,1,IMI posts drop in first-quarter organic revenue; warns on full year,-0.344,IMI
2,2,"Glencore to refinance its short-term debt early, shares rise",0.34,Glencore
3,3,EasyJet attracts more passengers in June but still lags Ryanair,0.259,Ryanair
4,4,Barclays 'bad bank' chief to step down,-0.231,Barclays


## Preprocessing data for input into BERT

In [None]:
# Load both JSON files as Hugging Face datasets, keyed by split name.
raw_datasets = {
    "train": Dataset.from_json("semeval-2017-task-5-subtask-2/Headline_Trainingdata.json"),
    "test": Dataset.from_json("semeval-2017-task-5-subtask-2/Headlines_Testdata_withscores.json"),
}
# Direct handles to the untouched splits, used by later cells.
raw_train_ds = raw_datasets["train"]
raw_test_ds = raw_datasets["test"]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-35be68298e7ca5ec/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]



Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-35be68298e7ca5ec/0.0.0. Subsequent calls will reuse this data.
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-891fd242f4daf689/0.0.0...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-891fd242f4daf689/0.0.0. Subsequent calls will reuse this data.


In [None]:
raw_train_ds

Dataset({
    features: ['id', 'company', 'title', 'sentiment'],
    num_rows: 1142
})

In [None]:
raw_train_ds[0]

{'id': 2,
 'company': 'Morrisons',
 'title': 'Morrisons book second consecutive quarter of sales growth',
 'sentiment': 0.43}

## Tokenize datasets

#### Get Max Tokenized Vector Length

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def get_max_tokenized_length(ds):
    """Find the longest text in ``ds`` by word count and by token count.

    Args:
        ds: iterable of strings.

    Returns:
        Tuple ``(max_len_raw, max_len_token_vector)``: the largest number of
        space-separated pieces in any text, and the largest number of
        ``input_ids`` the module-level ``tokenizer`` produces for any text.
        Both are 0 when ``ds`` is empty.
    """
    max_len_raw = 0
    max_len_token_vector = 0

    for text_input in ds:
        # Split and tokenize each text exactly once; tokenizing is the costly step.
        max_len_raw = max(max_len_raw, len(text_input.split(" ")))
        max_len_token_vector = max(max_len_token_vector, len(tokenizer(text_input)['input_ids']))

    return max_len_raw, max_len_token_vector

# Measure both splits so one padding length can cover every headline.
max_len_raw_train, max_len_token_vector_train = get_max_tokenized_length(raw_train_ds['title'])
max_len_raw_test, max_len_token_vector_test = get_max_tokenized_length(raw_test_ds['title'])
max_len_token_vector = max(max_len_token_vector_train, max_len_token_vector_test)

# Report per split (each group followed by a blank line), then the overall maximum.
print(
    f'max_len_raw_train = {max_len_raw_train}',
    f'max_len_token_vector_train = {max_len_token_vector_train}',
    '',
    sep='\n',
)
print(
    f'max_len_raw_test = {max_len_raw_test}',
    f'max_len_token_vector_test = {max_len_token_vector_test}',
    '',
    sep='\n',
)
print(f'max_len_token_vector (BOTH) = {max_len_token_vector}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

max_len_raw_train = 18
max_len_token_vector_train = 29

max_len_raw_test = 12
max_len_token_vector_test = 21

max_len_token_vector (BOTH) = 29


#### Tokenize and save as TF Dataset object

In [None]:
def preprocess_function(examples):
    """Tokenize one record's title and attach its sentiment as a float label.

    Reads the module-level ``tokenizer`` and ``max_len_token_vector``; the
    title is truncated/padded to exactly ``max_len_token_vector`` ids.

    Args:
        examples: a single record with "title" and "sentiment" entries.

    Returns:
        The tokenizer output with an added "label" entry.
    """
    sentiment = examples["sentiment"]
    encoded = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=max_len_token_vector,
    )

    # Regression target: store the score as a real number.
    encoded["label"] = float(sentiment)
    return encoded


# Tokenize every split in place, dropping the raw columns once encoded.
# (Later cells read the encoded columns back out of `raw_datasets`.)
for split in list(raw_datasets):
    raw_datasets[split] = raw_datasets[split].map(
        preprocess_function,
        remove_columns=['id', 'company', 'title', 'sentiment'],
    )

# Shallow copy: both dicts refer to the same tokenized dataset objects.
tokenized_datasets = dict(raw_datasets)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Training batches are shuffled ...
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
)

# ... validation batches keep their original order.
tf_validation_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=8,
    shuffle=False,
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
)

  0%|          | 0/1142 [00:00<?, ?ex/s]

  0%|          | 0/14 [00:00<?, ?ex/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


#### Look at tokenized dataset

In [None]:
# Full training set
np.asarray(raw_datasets['train']['input_ids'])

array([[  101,  9959,  2015, ...,     0,     0,     0],
       [  101, 10047,  2072, ...,     0,     0,     0],
       [  101,  8904, 17345, ...,     0,     0,     0],
       ...,
       [  101,  2332,  7529, ...,     0,     0,     0],
       [  101,  3316,  6494, ...,     0,     0,     0],
       [  101,  3115, 12443, ...,     0,     0,     0]])

In [None]:
# First sentence observation -- tokenized representation
np.asarray(raw_datasets['train']['input_ids'])[0,:]

array([ 101, 9959, 2015, 2338, 2117, 5486, 4284, 1997, 4341, 3930,  102,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [None]:
# (training_set_rows, max_token_vector_length)
np.asarray(raw_datasets['train']['input_ids']).shape

(1142, 29)

In [None]:
# Convert back from tokenized to text representation
tokenizer.decode(tokenizer(raw_train_ds['title'][6])['input_ids'])

'[CLS] bilfinger industrial services win a£100m bp contract extension [SEP]'

In [None]:
# Same, represented as tokens
# 101 - [CLS]
# 102 - [SEP]
tokenizer(raw_train_ds['title'][6])['input_ids']

[101,
 12170,
 10270,
 9912,
 3919,
 2578,
 2663,
 1037,
 29646,
 18613,
 2213,
 17531,
 3206,
 5331,
 102]

In [None]:
# Batch #1:


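# Each batch is a (features, labels) tuple. `features` is a dict holding:
#   input_ids      - tokenized sentences
#   token_type_ids - segment ids (carry no information here: every input is a single sentence)
#   attention_mask - attention masks
# `labels` holds the sentiment targets.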

arr = tf_train_dataset.as_numpy_iterator()
l = list(arr)
l[0]

({'input_ids': array([[  101,  2414,  2851, 27918,  1024, 26236,  9818,  1998,  3115,
          12443,  6661,  4125,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101, 16565, 12058,  2004,  4636,  3208,  8040,  8093, 27381,
           2015,  4311,  1037,  2501,  2095,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101,  6031, 28305,  2102,  1005,  1055, 15570,  6045, 14238,
           4710, 12174,  5618, 14523,  2471,  2028,  2353,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [  101, 10651,  1015,  1011, 11825, 11146, 12386,  1037,  2353,
           1997,  9959,  2015,  6501,  3206,   102,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0],
         [ 

# BERT
bert-base-uncased  
10-fold cross-validation cosine similarity = 74.94 (mean over folds, × 100)

## Preprocessing for input into BERT

In [None]:
# Same as above
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def get_max_tokenized_length(ds):
    """Find the longest text in ``ds`` by word count and by token count.

    Args:
        ds: iterable of strings.

    Returns:
        Tuple ``(max_len_raw, max_len_token_vector)``: the largest number of
        space-separated pieces in any text, and the largest number of
        ``input_ids`` the module-level ``tokenizer`` produces for any text.
        Both are 0 when ``ds`` is empty.
    """
    max_len_raw = 0
    max_len_token_vector = 0

    for text_input in ds:
        # Split and tokenize each text exactly once; tokenizing is the costly step.
        max_len_raw = max(max_len_raw, len(text_input.split(" ")))
        max_len_token_vector = max(max_len_token_vector, len(tokenizer(text_input)['input_ids']))

    return max_len_raw, max_len_token_vector

# Measure the training split first, then the test split.
(max_len_raw_train, max_len_token_vector_train), (max_len_raw_test, max_len_token_vector_test) = (
    get_max_tokenized_length(raw_train_ds['title']),
    get_max_tokenized_length(raw_test_ds['title']),
)

# A single padding length that fits every headline in either split.
max_len_token_vector = max(max_len_token_vector_train, max_len_token_vector_test)

In [None]:
# Same as above
def preprocess_function(examples):
    """Tokenize one record's title and attach its sentiment as a float label.

    Reads the module-level ``tokenizer`` and ``max_len_token_vector``; the
    title is truncated/padded to exactly ``max_len_token_vector`` ids.

    Args:
        examples: a single record with "title" and "sentiment" entries.

    Returns:
        The tokenizer output with an added "label" entry.
    """
    sentiment = examples["sentiment"]
    encoded = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=max_len_token_vector,
    )

    # Regression target: store the score as a real number.
    encoded["label"] = float(sentiment)
    return encoded

## Perform 10-Fold Cross Validation

In [None]:
def tenfold_cross_val(train_ds, checkpoint="bert-base-uncased"):
    """Fine-tune ``checkpoint`` with 10-fold cross validation and score each fold.

    For every fold a fresh model with a single-output regression head is
    trained for 10 epochs on the training part and scored on the held-out
    part with ``cosine_score``.

    Relies on the module-level ``tokenizer`` and ``preprocess_function``.
    NOTE(review): the tokenizer presumably has to match ``checkpoint`` --
    confirm when passing a different model.

    Args:
        train_ds: dataset with 'id', 'company', 'title' and 'sentiment'
            columns that can be indexed with an array of row positions.
        checkpoint: model identifier handed to ``from_pretrained``. Defaults
            to "bert-base-uncased" so the function no longer depends on a
            global ``checkpoint`` that is only assigned later in the notebook.

    Returns:
        List with one cosine similarity per fold, in fold order.
    """
    all_results = []
    raw_columns = ['id', 'company', 'title', 'sentiment']

    # The collator does not depend on the fold, so build it once.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(kfold.split(train_ds['title'], train_ds['sentiment'])):

        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Test:  index={test_index}")

        tokenized_cv_train = Dataset.from_dict(train_ds[train_index]).map(preprocess_function, remove_columns=raw_columns)
        tokenized_cv_test = Dataset.from_dict(train_ds[test_index]).map(preprocess_function, remove_columns=raw_columns)

        cv_train_dataset_tf = tokenized_cv_train.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["labels"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=4,
        )

        cv_test_dataset_tf = tokenized_cv_test.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["labels"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=4,
        )

        # num_labels=1 --> regression head after BERT layer (linear layer for output)
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss="mse", metrics=['cosine_proximity'])
        model.fit(cv_train_dataset_tf, epochs=10)

        predicted_sentiments = model.predict(cv_test_dataset_tf)
        # The test pipeline is not shuffled, so concatenating its label batches
        # yields targets in the same order as the predictions.
        true_sentiments = np.concatenate([y for x, y in cv_test_dataset_tf], axis=0)
        result = cosine_score(predicted_sentiments['logits'][:,0], true_sentiments)
        print(f'Fold {i} cosine_similarity = {result}\n\n')
        all_results.append(result)

    return all_results

In [None]:
# Run the full cross validation (one cosine similarity per fold) ...
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds)
# ... and report the mean over folds, scaled by 100.
cv_raw_data_cosine_similarity_avg = (
    sum(cv_raw_data_cosine_similarity_all_folds)
    / len(cv_raw_data_cosine_similarity_all_folds)
) * 100

Fold 0:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  10   23   31   44   51   56   70   86   88   96  101  109  113  128
  140  158  178  198  199  208  240  243  244  247  277  291  292  296
  298  306  318  319  323  331  333  336  342  344  355  361  362  367
  371  377  388  390  398  422  429  439  442  447  448  451  453  461
  482  493  494  497  514  528  529  530  535  538  558  599  605  622
  644  666  674  701  703  708  711  718  727  733  741  754  757  771
  774  777  793  810  814  827  844  875  893  909  922  924  933  940
  961  970  976 1001 1008 1011 1014 1046 1062 1077 1080 1094 1096 1107
 1119 1128 1133]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 0 cosine_similarity = 0.6663803458213806


Fold 1:
  Train: index=[   0    1    2 ... 1138 1139 1141]
  Test:  index=[   3   12   30   39   49   54   58   59   63   66   67   76   78   83
  100  107  136  138  139  141  156  168  174  192  209  210  218  220
  231  260  270  273  274  275  290  294  299  321  328  346  351  359
  363  382  394  404  405  413  425  436  462  465  479  506  513  519
  534  536  549  552  553  561  567  570  575  581  582  585  587  596
  602  617  661  668  722  728  745  746  755  761  768  781  789  798
  825  855  862  881  896  901  907  916  918  930  936  945  948  965
  977  981  984  986 1005 1006 1022 1061 1069 1076 1088 1090 1106 1111
 1127 1135 1140]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 1 cosine_similarity = 0.7970901131629944


Fold 2:
  Train: index=[   0    1    3 ... 1139 1140 1141]
  Test:  index=[   2    6   25   29   47   55   60   72   92  106  110  120  137  165
  182  184  213  215  237  248  256  261  265  280  286  289  307  308
  309  327  332  334  352  354  380  381  423  424  430  432  433  435
  460  481  485  501  525  526  531  551  560  572  584  597  620  631
  636  649  660  665  667  673  678  693  695  707  715  723  731  736
  737  740  743  758  759  760  784  785  796  813  817  820  822  828
  841  842  843  846  861  872  874  882  904  908  915  917  919  925
  927  934  947  949  969  989  993  998 1027 1047 1057 1059 1065 1089
 1103 1108]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 2 cosine_similarity = 0.758492112159729


Fold 3:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   5    9   33   62   65   71   77   81   82   84   94   97  104  118
  135  144  145  177  196  204  211  221  227  235  238  239  249  250
  254  259  266  281  285  305  310  312  314  350  365  370  389  409
  411  420  427  428  445  449  457  464  467  478  490  523  527  539
  541  542  548  557  566  578  583  591  593  613  614  615  618  650
  657  675  688  690  713  714  717  720  750  752  762  764  773  809
  811  824  830  847  848  877  880  903  910  921  928  939  942  978
 1004 1009 1029 1032 1040 1054 1066 1072 1083 1086 1093 1099 1101 1114
 1117 1129]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 3 cosine_similarity = 0.715449333190918


Fold 4:
  Train: index=[   1    2    3 ... 1139 1140 1141]
  Test:  index=[   0    7   28   41   43   69   73   79   90  108  125  131  132  133
  148  155  163  164  169  172  173  181  185  193  212  214  223  228
  234  251  300  302  311  316  326  329  338  357  360  408  440  450
  458  475  477  486  491  495  499  504  507  516  518  522  532  533
  543  545  554  568  588  589  594  598  621  626  629  634  643  652
  679  682  692  694  696  700  704  706  716  721  724  738  739  753
  765  787  790  816  823  832  834  849  852  868  884  894  898  899
  926  932  941  953  968  979  982 1038 1063 1081 1085 1098 1100 1104
 1105 1136]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 4 cosine_similarity = 0.6492296457290649


Fold 5:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  11   15   18   22   24   42   61   68   74   75   89   93   99  114
  153  167  176  179  188  203  217  222  236  257  264  271  272  278
  284  324  335  340  356  366  368  375  383  393  395  396  412  416
  417  426  431  434  444  446  454  456  468  483  487  498  500  521
  544  547  576  580  590  595  601  604  611  616  628  630  670  677
  689  712  730  732  780  782  783  786  799  801  803  808  812  839
  850  853  857  867  869  879  885  895  931  958  959  966  987 1002
 1013 1023 1031 1034 1043 1049 1050 1058 1067 1068 1071 1091 1097 1102
 1118 1120]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 5 cosine_similarity = 0.7426257729530334


Fold 6:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   4   16   17   19   38   45   46   48   50   57  102  105  115  116
  117  124  126  127  142  149  154  157  171  175  180  190  191  195
  245  255  263  268  287  301  304  313  320  322  341  349  353  369
  372  399  407  443  470  473  476  489  511  512  517  537  559  569
  574  603  606  625  633  635  638  653  655  656  662  664  684  685
  697  726  734  756  767  772  788  792  802  807  837  845  851  858
  859  876  887  888  905  913  920  935  937  943  944  951  954  963
  973  983  992  996 1003 1010 1018 1019 1026 1033 1035 1042 1073 1115
 1121 1138]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 6 cosine_similarity = 0.8135931491851807


Fold 7:
  Train: index=[   0    1    2 ... 1138 1140 1141]
  Test:  index=[   8   26   32   36   37   53  103  111  119  123  143  146  147  150
  151  152  160  162  186  194  207  225  226  229  253  262  283  297
  303  325  345  348  364  374  403  414  419  421  437  452  463  469
  480  488  503  515  546  550  571  579  586  608  610  623  640  651
  658  659  669  672  680  687  705  710  749  751  770  778  800  819
  821  833  864  865  866  873  886  889  890  892  900  902  911  912
  914  923  938  946  950  956  964  985  988  994  997 1000 1030 1036
 1045 1052 1053 1055 1060 1070 1074 1075 1079 1092 1109 1122 1124 1131
 1132 1139]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 7 cosine_similarity = 0.82365882396698


Fold 8:
  Train: index=[   0    2    3 ... 1138 1139 1140]
  Test:  index=[   1   27   35   52   80   85   95  112  122  129  159  170  183  197
  202  219  224  232  233  242  246  258  267  279  282  293  317  339
  347  358  373  376  384  386  400  402  410  415  438  441  471  472
  484  496  505  509  524  540  555  556  573  577  607  609  619  624
  627  632  637  639  641  645  648  654  663  671  676  681  691  698
  709  735  744  748  795  797  806  818  826  829  835  836  838  854
  883  906  952  962  967  971  972  974  980  990  991  999 1007 1012
 1015 1020 1024 1037 1039 1041 1056 1078 1084 1087 1110 1113 1116 1134
 1137 1141]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 8 cosine_similarity = 0.7725008130073547


Fold 9:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  13   14   20   21   34   40   64   87   91   98  121  130  134  161
  166  187  189  200  201  205  206  216  230  241  252  269  276  288
  295  315  330  337  343  378  379  385  387  391  392  397  401  406
  418  455  459  466  474  492  502  508  510  520  562  563  564  565
  592  600  612  642  646  647  683  686  699  702  719  725  729  742
  747  763  766  769  775  776  779  791  794  804  805  815  831  840
  856  860  863  870  871  878  891  897  929  955  957  960  975  995
 1016 1017 1021 1025 1028 1044 1048 1051 1064 1082 1095 1112 1123 1125
 1126 1130]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 9 cosine_similarity = 0.755587637424469




In [None]:
cv_raw_data_cosine_similarity_avg

74.94607746601105

# finBERT
ProsusAI/finbert  
Cross Validation cosine_similarity = 77.79

## Preprocessing for input into finBERT

In [5]:
checkpoint = "ProsusAI/finbert"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def get_max_tokenized_length(ds):
    """Find the longest text in ``ds`` by word count and by token count.

    Args:
        ds: iterable of strings.

    Returns:
        Tuple ``(max_len_raw, max_len_token_vector)``: the largest number of
        space-separated pieces in any text, and the largest number of
        ``input_ids`` the module-level ``tokenizer`` produces for any text.
        Both are 0 when ``ds`` is empty.
    """
    max_len_raw = 0
    max_len_token_vector = 0

    for text_input in ds:
        # Split and tokenize each text exactly once; tokenizing is the costly step.
        max_len_raw = max(max_len_raw, len(text_input.split(" ")))
        max_len_token_vector = max(max_len_token_vector, len(tokenizer(text_input)['input_ids']))

    return max_len_raw, max_len_token_vector

# Measure both splits so one padding length can cover every headline.
max_len_raw_train, max_len_token_vector_train = get_max_tokenized_length(raw_train_ds['title'])
max_len_raw_test, max_len_token_vector_test = get_max_tokenized_length(raw_test_ds['title'])
max_len_token_vector = max(max_len_token_vector_train, max_len_token_vector_test)

# Report per split (each group followed by a blank line), then the overall maximum.
print(
    f'max_len_raw_train = {max_len_raw_train}',
    f'max_len_token_vector_train = {max_len_token_vector_train}',
    '',
    sep='\n',
)
print(
    f'max_len_raw_test = {max_len_raw_test}',
    f'max_len_token_vector_test = {max_len_token_vector_test}',
    '',
    sep='\n',
)
print(f'max_len_token_vector (BOTH) = {max_len_token_vector}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

max_len_raw_train = 18
max_len_token_vector_train = 29

max_len_raw_test = 12
max_len_token_vector_test = 21

max_len_token_vector (BOTH) = 29


In [12]:
tokenizer

BertTokenizerFast(name_or_path='ProsusAI/finbert', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
def preprocess_function(examples):
    """Tokenize one record's title and attach its sentiment as a float label.

    Reads the module-level ``tokenizer`` and ``max_len_token_vector``; the
    title is truncated/padded to exactly ``max_len_token_vector`` ids.

    Args:
        examples: a single record with "title" and "sentiment" entries.

    Returns:
        The tokenizer output with an added "label" entry.
    """
    sentiment = examples["sentiment"]
    encoded = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=max_len_token_vector,
    )

    # Regression target: store the score as a real number.
    encoded["label"] = float(sentiment)
    return encoded

## Perform 10-Fold Cross Validation

In [13]:
def tenfold_cross_val(train_ds, checkpoint, n_splits=10, epochs=10, batch_size=4,
                      learning_rate=2e-5, random_state=42, feature_columns=None):
    """Fine-tune `checkpoint` with K-fold cross validation and score every held-out fold.

    For each fold a fresh model is loaded from `checkpoint` with a single output
    (regression head), trained with an MSE loss on the training split, and its
    predictions on the held-out split are compared with the true values through
    `cosine_score`.

    Relies on the notebook-level `tokenizer` and `preprocess_function`; both must
    already be defined for the same checkpoint.

    Args:
        train_ds: dataset exposing 'id', 'company', 'title' and 'sentiment'
            columns. Indexing it with an array of row indices must yield a dict
            of columns, since the result is passed to `Dataset.from_dict`.
        checkpoint: model identifier handed to
            `TFAutoModelForSequenceClassification.from_pretrained`.
        n_splits: number of folds (default 10).
        epochs: training epochs per fold (default 10).
        batch_size: batch size of the train and evaluation tf datasets (default 4).
        learning_rate: Adam learning rate (default 2e-5).
        random_state: seed for the shuffled `KFold` split (default 42).
        feature_columns: tokenizer output columns fed to the model. Defaults to
            ["attention_mask", "input_ids", "token_type_ids"].

    Returns:
        List with one cosine similarity per fold, in fold order.
    """
    if feature_columns is None:
        feature_columns = ["attention_mask", "input_ids", "token_type_ids"]
    feature_columns = list(feature_columns)

    # Raw columns dropped once preprocess_function has produced model inputs.
    raw_columns = ['id', 'company', 'title', 'sentiment']

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    def _as_tf_dataset(tokenized):
        # NOTE(review): preprocess_function stores the target under "label" while
        # "labels" is requested here; this relies on the collator / to_tf_dataset
        # mapping between the two names - confirm when upgrading the libraries.
        return tokenized.to_tf_dataset(
            columns=feature_columns,
            label_cols=["labels"],
            shuffle=False,  # keep row order so predictions line up with the labels
            collate_fn=data_collator,
            batch_size=batch_size,
        )

    all_results = []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for i, (train_index, test_index) in enumerate(kfold.split(train_ds['title'], train_ds['sentiment'])):

        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Test:  index={test_index}")

        # A new model is built on every fold; reset Keras' global state first so
        # memory from earlier folds does not accumulate.
        tf.keras.backend.clear_session()

        tokenized_cv_train = Dataset.from_dict(train_ds[train_index]).map(preprocess_function, remove_columns=raw_columns)
        tokenized_cv_test = Dataset.from_dict(train_ds[test_index]).map(preprocess_function, remove_columns=raw_columns)

        cv_train_dataset_tf = _as_tf_dataset(tokenized_cv_train)
        cv_test_dataset_tf = _as_tf_dataset(tokenized_cv_test)

        # num_labels=1 --> regression head after the encoder (linear layer for output)
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss="mse",
            metrics=['cosine_proximity'],
        )
        model.fit(cv_train_dataset_tf, epochs=epochs)

        predicted_sentiments = model.predict(cv_test_dataset_tf)
        # Collect the true labels batch by batch, in the same order as the predictions.
        true_sentiments = np.concatenate([y for x, y in cv_test_dataset_tf], axis=0)
        result = cosine_score(predicted_sentiments['logits'][:,0], true_sentiments)
        print(f'Fold {i} cosine_similarity = {result}\n\n')
        all_results.append(result)

        # Drop the reference so the finished model can be freed before the next fold.
        del model

    return all_results

In [14]:
# Run the cross validation on the raw training set; tenfold_cross_val returns one
# cosine similarity per fold.
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)
# Mean of the per-fold scores, scaled by 100 so it reads as a percentage.
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

Fold 0:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  10   23   31   44   51   56   70   86   88   96  101  109  113  128
  140  158  178  198  199  208  240  243  244  247  277  291  292  296
  298  306  318  319  323  331  333  336  342  344  355  361  362  367
  371  377  388  390  398  422  429  439  442  447  448  451  453  461
  482  493  494  497  514  528  529  530  535  538  558  599  605  622
  644  666  674  701  703  708  711  718  727  733  741  754  757  771
  774  777  793  810  814  827  844  875  893  909  922  924  933  940
  961  970  976 1001 1008 1011 1014 1046 1062 1077 1080 1094 1096 1107
 1119 1128 1133]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 0 cosine_similarity = 0.7036332488059998


Fold 1:
  Train: index=[   0    1    2 ... 1138 1139 1141]
  Test:  index=[   3   12   30   39   49   54   58   59   63   66   67   76   78   83
  100  107  136  138  139  141  156  168  174  192  209  210  218  220
  231  260  270  273  274  275  290  294  299  321  328  346  351  359
  363  382  394  404  405  413  425  436  462  465  479  506  513  519
  534  536  549  552  553  561  567  570  575  581  582  585  587  596
  602  617  661  668  722  728  745  746  755  761  768  781  789  798
  825  855  862  881  896  901  907  916  918  930  936  945  948  965
  977  981  984  986 1005 1006 1022 1061 1069 1076 1088 1090 1106 1111
 1127 1135 1140]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 1 cosine_similarity = 0.7829529643058777


Fold 2:
  Train: index=[   0    1    3 ... 1139 1140 1141]
  Test:  index=[   2    6   25   29   47   55   60   72   92  106  110  120  137  165
  182  184  213  215  237  248  256  261  265  280  286  289  307  308
  309  327  332  334  352  354  380  381  423  424  430  432  433  435
  460  481  485  501  525  526  531  551  560  572  584  597  620  631
  636  649  660  665  667  673  678  693  695  707  715  723  731  736
  737  740  743  758  759  760  784  785  796  813  817  820  822  828
  841  842  843  846  861  872  874  882  904  908  915  917  919  925
  927  934  947  949  969  989  993  998 1027 1047 1057 1059 1065 1089
 1103 1108]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 2 cosine_similarity = 0.7570813298225403


Fold 3:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   5    9   33   62   65   71   77   81   82   84   94   97  104  118
  135  144  145  177  196  204  211  221  227  235  238  239  249  250
  254  259  266  281  285  305  310  312  314  350  365  370  389  409
  411  420  427  428  445  449  457  464  467  478  490  523  527  539
  541  542  548  557  566  578  583  591  593  613  614  615  618  650
  657  675  688  690  713  714  717  720  750  752  762  764  773  809
  811  824  830  847  848  877  880  903  910  921  928  939  942  978
 1004 1009 1029 1032 1040 1054 1066 1072 1083 1086 1093 1099 1101 1114
 1117 1129]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 3 cosine_similarity = 0.7281929850578308


Fold 4:
  Train: index=[   1    2    3 ... 1139 1140 1141]
  Test:  index=[   0    7   28   41   43   69   73   79   90  108  125  131  132  133
  148  155  163  164  169  172  173  181  185  193  212  214  223  228
  234  251  300  302  311  316  326  329  338  357  360  408  440  450
  458  475  477  486  491  495  499  504  507  516  518  522  532  533
  543  545  554  568  588  589  594  598  621  626  629  634  643  652
  679  682  692  694  696  700  704  706  716  721  724  738  739  753
  765  787  790  816  823  832  834  849  852  868  884  894  898  899
  926  932  941  953  968  979  982 1038 1063 1081 1085 1098 1100 1104
 1105 1136]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 4 cosine_similarity = 0.7762556672096252


Fold 5:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  11   15   18   22   24   42   61   68   74   75   89   93   99  114
  153  167  176  179  188  203  217  222  236  257  264  271  272  278
  284  324  335  340  356  366  368  375  383  393  395  396  412  416
  417  426  431  434  444  446  454  456  468  483  487  498  500  521
  544  547  576  580  590  595  601  604  611  616  628  630  670  677
  689  712  730  732  780  782  783  786  799  801  803  808  812  839
  850  853  857  867  869  879  885  895  931  958  959  966  987 1002
 1013 1023 1031 1034 1043 1049 1050 1058 1067 1068 1071 1091 1097 1102
 1118 1120]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 5 cosine_similarity = 0.804621160030365


Fold 6:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   4   16   17   19   38   45   46   48   50   57  102  105  115  116
  117  124  126  127  142  149  154  157  171  175  180  190  191  195
  245  255  263  268  287  301  304  313  320  322  341  349  353  369
  372  399  407  443  470  473  476  489  511  512  517  537  559  569
  574  603  606  625  633  635  638  653  655  656  662  664  684  685
  697  726  734  756  767  772  788  792  802  807  837  845  851  858
  859  876  887  888  905  913  920  935  937  943  944  951  954  963
  973  983  992  996 1003 1010 1018 1019 1026 1033 1035 1042 1073 1115
 1121 1138]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 6 cosine_similarity = 0.830838680267334


Fold 7:
  Train: index=[   0    1    2 ... 1138 1140 1141]
  Test:  index=[   8   26   32   36   37   53  103  111  119  123  143  146  147  150
  151  152  160  162  186  194  207  225  226  229  253  262  283  297
  303  325  345  348  364  374  403  414  419  421  437  452  463  469
  480  488  503  515  546  550  571  579  586  608  610  623  640  651
  658  659  669  672  680  687  705  710  749  751  770  778  800  819
  821  833  864  865  866  873  886  889  890  892  900  902  911  912
  914  923  938  946  950  956  964  985  988  994  997 1000 1030 1036
 1045 1052 1053 1055 1060 1070 1074 1075 1079 1092 1109 1122 1124 1131
 1132 1139]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 7 cosine_similarity = 0.8110780119895935


Fold 8:
  Train: index=[   0    2    3 ... 1138 1139 1140]
  Test:  index=[   1   27   35   52   80   85   95  112  122  129  159  170  183  197
  202  219  224  232  233  242  246  258  267  279  282  293  317  339
  347  358  373  376  384  386  400  402  410  415  438  441  471  472
  484  496  505  509  524  540  555  556  573  577  607  609  619  624
  627  632  637  639  641  645  648  654  663  671  676  681  691  698
  709  735  744  748  795  797  806  818  826  829  835  836  838  854
  883  906  952  962  967  971  972  974  980  990  991  999 1007 1012
 1015 1020 1024 1037 1039 1041 1056 1078 1084 1087 1110 1113 1116 1134
 1137 1141]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 8 cosine_similarity = 0.8225487470626831


Fold 9:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  13   14   20   21   34   40   64   87   91   98  121  130  134  161
  166  187  189  200  201  205  206  216  230  241  252  269  276  288
  295  315  330  337  343  378  379  385  387  391  392  397  401  406
  418  455  459  466  474  492  502  508  510  520  562  563  564  565
  592  600  612  642  646  647  683  686  699  702  719  725  729  742
  747  763  766  769  775  776  779  791  794  804  805  815  831  840
  856  860  863  870  871  878  891  897  929  955  957  960  975  995
 1016 1017 1021 1025 1028 1044 1048 1051 1064 1082 1095 1112 1123 1125
 1126 1130]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 9 cosine_similarity = 0.7614390850067139




In [15]:
cv_raw_data_cosine_similarity_avg

77.78641879558563

# RoBERTa
roberta-base  
Cross Validation cosine_similarity = 77.61 (mean over the 10 folds, scaled by 100)

## Preprocessing for input into RoBERTa

In [16]:
checkpoint = "roberta-base"

In [17]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def get_max_tokenized_length(ds):
    """Return the longest raw and tokenized lengths found in `ds`.

    Uses the notebook-level `tokenizer`, which must already be defined.

    Args:
        ds: iterable of strings.

    Returns:
        Tuple `(max_len_raw, max_len_token_vector)`: the largest number of
        pieces obtained by splitting a text on single spaces, and the largest
        number of 'input_ids' the tokenizer produces for a text. Both are 0
        when `ds` is empty.
    """
    max_len_raw = 0
    max_len_token_vector = 0

    for text_input in ds:
        max_len_raw = max(max_len_raw, len(text_input.split(" ")))

        # Tokenize each text exactly once (previously every new maximum
        # triggered a second tokenizer call on the same text).
        token_count = len(tokenizer(text_input)['input_ids'])
        max_len_token_vector = max(max_len_token_vector, token_count)

    return max_len_raw, max_len_token_vector

# Longest headline in each split, measured both in space-separated pieces and
# in tokenizer 'input_ids' (as returned by get_max_tokenized_length).
max_len_raw_train, max_len_token_vector_train = get_max_tokenized_length(raw_train_ds['title'])
max_len_raw_test, max_len_token_vector_test = get_max_tokenized_length(raw_test_ds['title'])

# Largest tokenized length over both splits; preprocess_function reads this
# module-level value as its `max_length`.
max_len_token_vector = max(max_len_token_vector_train, max_len_token_vector_test)

# One print call with a newline separator: same text as printing each line in turn.
print(
    f'max_len_raw_train = {max_len_raw_train}',
    f'max_len_token_vector_train = {max_len_token_vector_train}',
    '',
    f'max_len_raw_test = {max_len_raw_test}',
    f'max_len_token_vector_test = {max_len_token_vector_test}',
    '',
    f'max_len_token_vector (BOTH) = {max_len_token_vector}',
    sep='\n',
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

max_len_raw_train = 18
max_len_token_vector_train = 33

max_len_raw_test = 12
max_len_token_vector_test = 24

max_len_token_vector (BOTH) = 33


In [18]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [19]:
def preprocess_function(examples):
    """Tokenize one example's title to a fixed length and attach a float label.

    Uses the notebook-level `tokenizer` and `max_len_token_vector`, so both
    must be defined before this function is called.

    Args:
        examples: mapping for a single example with a "title" entry (the text
            to tokenize) and a "sentiment" entry (anything `float()` accepts).

    Returns:
        The tokenizer output for the title, truncated/padded to
        `max_len_token_vector`, with the sentiment stored under "label" as a float.
    """
    # Read the target first so a missing "sentiment" fails before tokenizing.
    raw_sentiment = examples["sentiment"]

    encoded = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=max_len_token_vector,
    )

    # Regression target: store it as a real number.
    encoded["label"] = float(raw_sentiment)
    return encoded

## Perform 10-Fold Cross Validation

In [22]:
def tenfold_cross_val(train_ds, checkpoint, n_splits=10, epochs=10, batch_size=4,
                      learning_rate=2e-5, random_state=42, feature_columns=None):
    """Fine-tune `checkpoint` with K-fold cross validation and score every held-out fold.

    For each fold a fresh model is loaded from `checkpoint` with a single output
    (regression head), trained with an MSE loss on the training split, and its
    predictions on the held-out split are compared with the true values through
    `cosine_score`.

    Relies on the notebook-level `tokenizer` and `preprocess_function`; both must
    already be defined for the same checkpoint.

    Args:
        train_ds: dataset exposing 'id', 'company', 'title' and 'sentiment'
            columns. Indexing it with an array of row indices must yield a dict
            of columns, since the result is passed to `Dataset.from_dict`.
        checkpoint: model identifier handed to
            `TFAutoModelForSequenceClassification.from_pretrained`.
        n_splits: number of folds (default 10).
        epochs: training epochs per fold (default 10).
        batch_size: batch size of the train and evaluation tf datasets (default 4).
        learning_rate: Adam learning rate (default 2e-5).
        random_state: seed for the shuffled `KFold` split (default 42).
        feature_columns: tokenizer output columns fed to the model. Defaults to
            ["attention_mask", "input_ids"] (no "token_type_ids").

    Returns:
        List with one cosine similarity per fold, in fold order.
    """
    if feature_columns is None:
        feature_columns = ["attention_mask", "input_ids"]
    feature_columns = list(feature_columns)

    # Raw columns dropped once preprocess_function has produced model inputs.
    raw_columns = ['id', 'company', 'title', 'sentiment']

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    def _as_tf_dataset(tokenized):
        # NOTE(review): preprocess_function stores the target under "label" while
        # "labels" is requested here; this relies on the collator / to_tf_dataset
        # mapping between the two names - confirm when upgrading the libraries.
        return tokenized.to_tf_dataset(
            columns=feature_columns,
            label_cols=["labels"],
            shuffle=False,  # keep row order so predictions line up with the labels
            collate_fn=data_collator,
            batch_size=batch_size,
        )

    all_results = []

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for i, (train_index, test_index) in enumerate(kfold.split(train_ds['title'], train_ds['sentiment'])):

        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Test:  index={test_index}")

        # A new model is built on every fold; reset Keras' global state first so
        # memory from earlier folds does not accumulate.
        tf.keras.backend.clear_session()

        tokenized_cv_train = Dataset.from_dict(train_ds[train_index]).map(preprocess_function, remove_columns=raw_columns)
        tokenized_cv_test = Dataset.from_dict(train_ds[test_index]).map(preprocess_function, remove_columns=raw_columns)

        cv_train_dataset_tf = _as_tf_dataset(tokenized_cv_train)
        cv_test_dataset_tf = _as_tf_dataset(tokenized_cv_test)

        # num_labels=1 --> regression head after the encoder (linear layer for output)
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss="mse",
            metrics=['cosine_proximity'],
        )
        model.fit(cv_train_dataset_tf, epochs=epochs)

        predicted_sentiments = model.predict(cv_test_dataset_tf)
        # Collect the true labels batch by batch, in the same order as the predictions.
        true_sentiments = np.concatenate([y for x, y in cv_test_dataset_tf], axis=0)
        result = cosine_score(predicted_sentiments['logits'][:,0], true_sentiments)
        print(f'Fold {i} cosine_similarity = {result}\n\n')
        all_results.append(result)

        # Drop the reference so the finished model can be freed before the next fold.
        del model

    return all_results

In [23]:
# Run the cross validation on the raw training set; tenfold_cross_val returns one
# cosine similarity per fold.
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)
# Mean of the per-fold scores, scaled by 100 so it reads as a percentage.
cv_raw_data_cosine_similarity_avg = (sum(cv_raw_data_cosine_similarity_all_folds) / len(cv_raw_data_cosine_similarity_all_folds)) * 100

Fold 0:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  10   23   31   44   51   56   70   86   88   96  101  109  113  128
  140  158  178  198  199  208  240  243  244  247  277  291  292  296
  298  306  318  319  323  331  333  336  342  344  355  361  362  367
  371  377  388  390  398  422  429  439  442  447  448  451  453  461
  482  493  494  497  514  528  529  530  535  538  558  599  605  622
  644  666  674  701  703  708  711  718  727  733  741  754  757  771
  774  777  793  810  814  827  844  875  893  909  922  924  933  940
  961  970  976 1001 1008 1011 1014 1046 1062 1077 1080 1094 1096 1107
 1119 1128 1133]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 0 cosine_similarity = 0.7359067797660828


Fold 1:
  Train: index=[   0    1    2 ... 1138 1139 1141]
  Test:  index=[   3   12   30   39   49   54   58   59   63   66   67   76   78   83
  100  107  136  138  139  141  156  168  174  192  209  210  218  220
  231  260  270  273  274  275  290  294  299  321  328  346  351  359
  363  382  394  404  405  413  425  436  462  465  479  506  513  519
  534  536  549  552  553  561  567  570  575  581  582  585  587  596
  602  617  661  668  722  728  745  746  755  761  768  781  789  798
  825  855  862  881  896  901  907  916  918  930  936  945  948  965
  977  981  984  986 1005 1006 1022 1061 1069 1076 1088 1090 1106 1111
 1127 1135 1140]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 1 cosine_similarity = 0.7982765436172485


Fold 2:
  Train: index=[   0    1    3 ... 1139 1140 1141]
  Test:  index=[   2    6   25   29   47   55   60   72   92  106  110  120  137  165
  182  184  213  215  237  248  256  261  265  280  286  289  307  308
  309  327  332  334  352  354  380  381  423  424  430  432  433  435
  460  481  485  501  525  526  531  551  560  572  584  597  620  631
  636  649  660  665  667  673  678  693  695  707  715  723  731  736
  737  740  743  758  759  760  784  785  796  813  817  820  822  828
  841  842  843  846  861  872  874  882  904  908  915  917  919  925
  927  934  947  949  969  989  993  998 1027 1047 1057 1059 1065 1089
 1103 1108]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 2 cosine_similarity = 0.7693243026733398


Fold 3:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   5    9   33   62   65   71   77   81   82   84   94   97  104  118
  135  144  145  177  196  204  211  221  227  235  238  239  249  250
  254  259  266  281  285  305  310  312  314  350  365  370  389  409
  411  420  427  428  445  449  457  464  467  478  490  523  527  539
  541  542  548  557  566  578  583  591  593  613  614  615  618  650
  657  675  688  690  713  714  717  720  750  752  762  764  773  809
  811  824  830  847  848  877  880  903  910  921  928  939  942  978
 1004 1009 1029 1032 1040 1054 1066 1072 1083 1086 1093 1099 1101 1114
 1117 1129]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 3 cosine_similarity = 0.8039254546165466


Fold 4:
  Train: index=[   1    2    3 ... 1139 1140 1141]
  Test:  index=[   0    7   28   41   43   69   73   79   90  108  125  131  132  133
  148  155  163  164  169  172  173  181  185  193  212  214  223  228
  234  251  300  302  311  316  326  329  338  357  360  408  440  450
  458  475  477  486  491  495  499  504  507  516  518  522  532  533
  543  545  554  568  588  589  594  598  621  626  629  634  643  652
  679  682  692  694  696  700  704  706  716  721  724  738  739  753
  765  787  790  816  823  832  834  849  852  868  884  894  898  899
  926  932  941  953  968  979  982 1038 1063 1081 1085 1098 1100 1104
 1105 1136]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 4 cosine_similarity = 0.6739829778671265


Fold 5:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  11   15   18   22   24   42   61   68   74   75   89   93   99  114
  153  167  176  179  188  203  217  222  236  257  264  271  272  278
  284  324  335  340  356  366  368  375  383  393  395  396  412  416
  417  426  431  434  444  446  454  456  468  483  487  498  500  521
  544  547  576  580  590  595  601  604  611  616  628  630  670  677
  689  712  730  732  780  782  783  786  799  801  803  808  812  839
  850  853  857  867  869  879  885  895  931  958  959  966  987 1002
 1013 1023 1031 1034 1043 1049 1050 1058 1067 1068 1071 1091 1097 1102
 1118 1120]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 5 cosine_similarity = 0.8316053152084351


Fold 6:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   4   16   17   19   38   45   46   48   50   57  102  105  115  116
  117  124  126  127  142  149  154  157  171  175  180  190  191  195
  245  255  263  268  287  301  304  313  320  322  341  349  353  369
  372  399  407  443  470  473  476  489  511  512  517  537  559  569
  574  603  606  625  633  635  638  653  655  656  662  664  684  685
  697  726  734  756  767  772  788  792  802  807  837  845  851  858
  859  876  887  888  905  913  920  935  937  943  944  951  954  963
  973  983  992  996 1003 1010 1018 1019 1026 1033 1035 1042 1073 1115
 1121 1138]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 6 cosine_similarity = 0.7768799066543579


Fold 7:
  Train: index=[   0    1    2 ... 1138 1140 1141]
  Test:  index=[   8   26   32   36   37   53  103  111  119  123  143  146  147  150
  151  152  160  162  186  194  207  225  226  229  253  262  283  297
  303  325  345  348  364  374  403  414  419  421  437  452  463  469
  480  488  503  515  546  550  571  579  586  608  610  623  640  651
  658  659  669  672  680  687  705  710  749  751  770  778  800  819
  821  833  864  865  866  873  886  889  890  892  900  902  911  912
  914  923  938  946  950  956  964  985  988  994  997 1000 1030 1036
 1045 1052 1053 1055 1060 1070 1074 1075 1079 1092 1109 1122 1124 1131
 1132 1139]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 7 cosine_similarity = 0.7702275514602661


Fold 8:
  Train: index=[   0    2    3 ... 1138 1139 1140]
  Test:  index=[   1   27   35   52   80   85   95  112  122  129  159  170  183  197
  202  219  224  232  233  242  246  258  267  279  282  293  317  339
  347  358  373  376  384  386  400  402  410  415  438  441  471  472
  484  496  505  509  524  540  555  556  573  577  607  609  619  624
  627  632  637  639  641  645  648  654  663  671  676  681  691  698
  709  735  744  748  795  797  806  818  826  829  835  836  838  854
  883  906  952  962  967  971  972  974  980  990  991  999 1007 1012
 1015 1020 1024 1037 1039 1041 1056 1078 1084 1087 1110 1113 1116 1134
 1137 1141]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 8 cosine_similarity = 0.8150732517242432


Fold 9:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  13   14   20   21   34   40   64   87   91   98  121  130  134  161
  166  187  189  200  201  205  206  216  230  241  252  269  276  288
  295  315  330  337  343  378  379  385  387  391  392  397  401  406
  418  455  459  466  474  492  502  508  510  520  562  563  564  565
  592  600  612  642  646  647  683  686  699  702  719  725  729  742
  747  763  766  769  775  776  779  791  794  804  805  815  831  840
  856  860  863  870  871  878  891  897  929  955  957  960  975  995
 1016 1017 1021 1025 1028 1044 1048 1051 1064 1082 1095 1112 1123 1125
 1126 1130]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 9 cosine_similarity = 0.7856841683387756




In [24]:
cv_raw_data_cosine_similarity_avg

77.60886251926422

# DeBERTa
microsoft/deberta-v3-base  
Cross Validation cosine_similarity = 80.45





## Imports

In [None]:
!pip uninstall transformers
!pip uninstall sentencepiece

[0m

In [None]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Using cached transformers-4.26.0-py3-none-any.whl (6.3 MB)
Installing collected packages: transformers
Successfully installed transformers-4.26.0


## Preprocessing for input into DeBERTa

In [None]:
checkpoint = "microsoft/deberta-v3-base"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def get_max_tokenized_length(ds, tokenizer_fn=None):
    """Return the longest raw and tokenized lengths found among the texts in ``ds``.

    Parameters
    ----------
    ds : iterable of str
        Texts to measure.
    tokenizer_fn : callable, optional
        Callable that maps one string to a mapping containing an
        ``'input_ids'`` sequence. When omitted, the notebook-level
        ``tokenizer`` is used, which is the original behaviour.

    Returns
    -------
    tuple of (int, int)
        ``(max_len_raw, max_len_token_vector)``: the largest number of pieces
        obtained by splitting a text on single spaces, and the largest number
        of ``input_ids`` produced for a text. Both are 0 when ``ds`` is empty.
    """
    tokenize = tokenizer if tokenizer_fn is None else tokenizer_fn

    max_len_raw = 0
    max_len_token_vector = 0

    for text_input in ds:
        max_len_raw = max(max_len_raw, len(text_input.split(" ")))

        # Tokenize each text exactly once; the previous version ran the tokenizer
        # a second time whenever a new maximum was found.
        token_count = len(tokenize(text_input)['input_ids'])
        max_len_token_vector = max(max_len_token_vector, token_count)

    return max_len_raw, max_len_token_vector

# Longest headline, in space-separated words and in tokens, for each split.
max_len_raw_train, max_len_token_vector_train = get_max_tokenized_length(raw_train_ds['title'])
max_len_raw_test, max_len_token_vector_test = get_max_tokenized_length(raw_test_ds['title'])

# Padding length used by the preprocessing step: the longest token vector across both splits.
max_len_token_vector = max(max_len_token_vector_train, max_len_token_vector_test)

# One print call with a newline separator; the empty strings give the blank lines between groups.
print(
    f'max_len_raw_train = {max_len_raw_train}',
    f'max_len_token_vector_train = {max_len_token_vector_train}',
    '',
    f'max_len_raw_test = {max_len_raw_test}',
    f'max_len_token_vector_test = {max_len_token_vector_test}',
    '',
    f'max_len_token_vector (BOTH) = {max_len_token_vector}',
    sep='\n',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


max_len_raw_train = 18
max_len_token_vector_train = 28

max_len_raw_test = 12
max_len_token_vector_test = 20

max_len_token_vector (BOTH) = 28


In [None]:
# Show the tokenizer's repr (class, vocabulary size, special tokens) as the cell output.
tokenizer

DebertaV2TokenizerFast(name_or_path='microsoft/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [None]:
def preprocess_function(examples):
    """Tokenize one record and attach its sentiment as a float regression label.

    Reads ``examples["title"]`` and ``examples["sentiment"]``. The title is
    tokenized with the notebook-level ``tokenizer``, truncated and padded to
    the notebook-level ``max_len_token_vector``.

    Returns the tokenizer's output with an added ``"label"`` entry holding
    ``float(examples["sentiment"])``.
    """
    sentiment = examples["sentiment"]
    encoded = tokenizer(
        examples["title"],
        truncation=True,
        padding="max_length",
        max_length=max_len_token_vector,
    )

    # The target is a real-valued score, so store it as a float.
    encoded["label"] = float(sentiment)
    return encoded

## Perform 10-Fold Cross Validation

In [None]:
def tenfold_cross_val(train_ds, checkpoint, n_splits=10, epochs=10, batch_size=4,
                      learning_rate=2e-5, random_state=42):
    """Fine-tune ``checkpoint`` with K-fold cross-validation and score every fold.

    For each fold a fresh model is loaded from ``checkpoint``, trained on the
    training split with an MSE loss, and evaluated on the held-out split with
    ``cosine_score`` between the predicted logits and the true labels.

    Parameters
    ----------
    train_ds : dataset
        Must support ``train_ds['title']``, ``train_ds['sentiment']`` and
        indexing with an integer index array (the result is passed to
        ``Dataset.from_dict``). The columns ``id``, ``company``, ``title`` and
        ``sentiment`` are removed after preprocessing.
    checkpoint : str
        Model identifier passed to
        ``TFAutoModelForSequenceClassification.from_pretrained``.
    n_splits : int, optional
        Number of folds (default 10).
    epochs : int, optional
        Training epochs per fold (default 10).
    batch_size : int, optional
        Batch size for both the training and evaluation datasets (default 4).
    learning_rate : float, optional
        Adam learning rate (default 2e-5).
    random_state : int, optional
        Seed for the shuffled ``KFold`` split (default 42).

    Returns
    -------
    list
        One cosine score per fold, in fold order.
    """
    all_results = []
    drop_columns = ['id', 'company', 'title', 'sentiment']

    # The collator does not depend on the fold, so it is built once.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    def _as_tf_dataset(tokenized):
        # Shared conversion for the train and held-out splits.
        # "token_type_ids" is deliberately not fed to the model.
        # NOTE(review): preprocess_function stores the target under "label" while
        # "labels" is requested here; presumably the collator / to_tf_dataset maps
        # one to the other (the recorded run completed with this setup) - confirm.
        return tokenized.to_tf_dataset(
            columns=["attention_mask", "input_ids"],
            label_cols=["labels"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=batch_size,
        )

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for i, (train_index, test_index) in enumerate(kfold.split(train_ds['title'], train_ds['sentiment'])):

        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Test:  index={test_index}")

        tokenized_cv_train = Dataset.from_dict(train_ds[train_index]).map(preprocess_function, remove_columns=drop_columns)
        tokenized_cv_test = Dataset.from_dict(train_ds[test_index]).map(preprocess_function, remove_columns=drop_columns)

        cv_train_dataset_tf = _as_tf_dataset(tokenized_cv_train)
        cv_test_dataset_tf = _as_tf_dataset(tokenized_cv_test)

        # num_labels=1 --> regression head (single linear output) on top of the encoder.
        # from_pt=True loads the PyTorch weights of the checkpoint into the TF model.
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True, from_pt=True)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss="mse", metrics=['cosine_proximity'])
        model.fit(cv_train_dataset_tf, epochs=epochs)

        predicted_sentiments = model.predict(cv_test_dataset_tf)
        # The held-out dataset is not shuffled, so iterating it again yields the
        # labels in the same order as the predictions.
        true_sentiments = np.concatenate([labels for _, labels in cv_test_dataset_tf], axis=0)
        result = cosine_score(predicted_sentiments['logits'][:, 0], true_sentiments)
        print(f'Fold {i} cosine_similarity = {result}\n\n')
        all_results.append(result)

        # Release this fold's model and reset Keras' global state before the next
        # fold loads another copy; otherwise memory use grows with every fold.
        del model
        tf.keras.backend.clear_session()

    return all_results

In [None]:
# Run the cross-validation on the raw training set; one cosine score per fold is returned.
cv_raw_data_cosine_similarity_all_folds = tenfold_cross_val(raw_train_ds, checkpoint)

# Mean of the per-fold scores, expressed on a 0-100 scale.
cv_raw_data_cosine_similarity_avg = (
    sum(cv_raw_data_cosine_similarity_all_folds)
    / len(cv_raw_data_cosine_similarity_all_folds)
) * 100

Fold 0:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  10   23   31   44   51   56   70   86   88   96  101  109  113  128
  140  158  178  198  199  208  240  243  244  247  277  291  292  296
  298  306  318  319  323  331  333  336  342  344  355  361  362  367
  371  377  388  390  398  422  429  439  442  447  448  451  453  461
  482  493  494  497  514  528  529  530  535  538  558  599  605  622
  644  666  674  701  703  708  711  718  727  733  741  754  757  771
  774  777  793  810  814  827  844  875  893  909  922  924  933  940
  961  970  976 1001 1008 1011 1014 1046 1062 1077 1080 1094 1096 1107
 1119 1128 1133]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 0 cosine_similarity = 0.7441458702087402


Fold 1:
  Train: index=[   0    1    2 ... 1138 1139 1141]
  Test:  index=[   3   12   30   39   49   54   58   59   63   66   67   76   78   83
  100  107  136  138  139  141  156  168  174  192  209  210  218  220
  231  260  270  273  274  275  290  294  299  321  328  346  351  359
  363  382  394  404  405  413  425  436  462  465  479  506  513  519
  534  536  549  552  553  561  567  570  575  581  582  585  587  596
  602  617  661  668  722  728  745  746  755  761  768  781  789  798
  825  855  862  881  896  901  907  916  918  930  936  945  948  965
  977  981  984  986 1005 1006 1022 1061 1069 1076 1088 1090 1106 1111
 1127 1135 1140]


  0%|          | 0/1027 [00:00<?, ?ex/s]

  0%|          | 0/115 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 1 cosine_similarity = 0.8226367235183716


Fold 2:
  Train: index=[   0    1    3 ... 1139 1140 1141]
  Test:  index=[   2    6   25   29   47   55   60   72   92  106  110  120  137  165
  182  184  213  215  237  248  256  261  265  280  286  289  307  308
  309  327  332  334  352  354  380  381  423  424  430  432  433  435
  460  481  485  501  525  526  531  551  560  572  584  597  620  631
  636  649  660  665  667  673  678  693  695  707  715  723  731  736
  737  740  743  758  759  760  784  785  796  813  817  820  822  828
  841  842  843  846  861  872  874  882  904  908  915  917  919  925
  927  934  947  949  969  989  993  998 1027 1047 1057 1059 1065 1089
 1103 1108]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 2 cosine_similarity = 0.7813006639480591


Fold 3:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   5    9   33   62   65   71   77   81   82   84   94   97  104  118
  135  144  145  177  196  204  211  221  227  235  238  239  249  250
  254  259  266  281  285  305  310  312  314  350  365  370  389  409
  411  420  427  428  445  449  457  464  467  478  490  523  527  539
  541  542  548  557  566  578  583  591  593  613  614  615  618  650
  657  675  688  690  713  714  717  720  750  752  762  764  773  809
  811  824  830  847  848  877  880  903  910  921  928  939  942  978
 1004 1009 1029 1032 1040 1054 1066 1072 1083 1086 1093 1099 1101 1114
 1117 1129]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 3 cosine_similarity = 0.7899244427680969


Fold 4:
  Train: index=[   1    2    3 ... 1139 1140 1141]
  Test:  index=[   0    7   28   41   43   69   73   79   90  108  125  131  132  133
  148  155  163  164  169  172  173  181  185  193  212  214  223  228
  234  251  300  302  311  316  326  329  338  357  360  408  440  450
  458  475  477  486  491  495  499  504  507  516  518  522  532  533
  543  545  554  568  588  589  594  598  621  626  629  634  643  652
  679  682  692  694  696  700  704  706  716  721  724  738  739  753
  765  787  790  816  823  832  834  849  852  868  884  894  898  899
  926  932  941  953  968  979  982 1038 1063 1081 1085 1098 1100 1104
 1105 1136]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 4 cosine_similarity = 0.7690895199775696


Fold 5:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  11   15   18   22   24   42   61   68   74   75   89   93   99  114
  153  167  176  179  188  203  217  222  236  257  264  271  272  278
  284  324  335  340  356  366  368  375  383  393  395  396  412  416
  417  426  431  434  444  446  454  456  468  483  487  498  500  521
  544  547  576  580  590  595  601  604  611  616  628  630  670  677
  689  712  730  732  780  782  783  786  799  801  803  808  812  839
  850  853  857  867  869  879  885  895  931  958  959  966  987 1002
 1013 1023 1031 1034 1043 1049 1050 1058 1067 1068 1071 1091 1097 1102
 1118 1120]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 5 cosine_similarity = 0.8565505743026733


Fold 6:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[   4   16   17   19   38   45   46   48   50   57  102  105  115  116
  117  124  126  127  142  149  154  157  171  175  180  190  191  195
  245  255  263  268  287  301  304  313  320  322  341  349  353  369
  372  399  407  443  470  473  476  489  511  512  517  537  559  569
  574  603  606  625  633  635  638  653  655  656  662  664  684  685
  697  726  734  756  767  772  788  792  802  807  837  845  851  858
  859  876  887  888  905  913  920  935  937  943  944  951  954  963
  973  983  992  996 1003 1010 1018 1019 1026 1033 1035 1042 1073 1115
 1121 1138]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 6 cosine_similarity = 0.8292694687843323


Fold 7:
  Train: index=[   0    1    2 ... 1138 1140 1141]
  Test:  index=[   8   26   32   36   37   53  103  111  119  123  143  146  147  150
  151  152  160  162  186  194  207  225  226  229  253  262  283  297
  303  325  345  348  364  374  403  414  419  421  437  452  463  469
  480  488  503  515  546  550  571  579  586  608  610  623  640  651
  658  659  669  672  680  687  705  710  749  751  770  778  800  819
  821  833  864  865  866  873  886  889  890  892  900  902  911  912
  914  923  938  946  950  956  964  985  988  994  997 1000 1030 1036
 1045 1052 1053 1055 1060 1070 1074 1075 1079 1092 1109 1122 1124 1131
 1132 1139]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 7 cosine_similarity = 0.8419369459152222


Fold 8:
  Train: index=[   0    2    3 ... 1138 1139 1140]
  Test:  index=[   1   27   35   52   80   85   95  112  122  129  159  170  183  197
  202  219  224  232  233  242  246  258  267  279  282  293  317  339
  347  358  373  376  384  386  400  402  410  415  438  441  471  472
  484  496  505  509  524  540  555  556  573  577  607  609  619  624
  627  632  637  639  641  645  648  654  663  671  676  681  691  698
  709  735  744  748  795  797  806  818  826  829  835  836  838  854
  883  906  952  962  967  971  972  974  980  990  991  999 1007 1012
 1015 1020 1024 1037 1039 1041 1056 1078 1084 1087 1110 1113 1116 1134
 1137 1141]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 8 cosine_similarity = 0.8109098672866821


Fold 9:
  Train: index=[   0    1    2 ... 1139 1140 1141]
  Test:  index=[  13   14   20   21   34   40   64   87   91   98  121  130  134  161
  166  187  189  200  201  205  206  216  230  241  252  269  276  288
  295  315  330  337  343  378  379  385  387  391  392  397  401  406
  418  455  459  466  474  492  502  508  510  520  562  563  564  565
  592  600  612  642  646  647  683  686  699  702  719  725  729  742
  747  763  766  769  775  776  779  791  794  804  805  815  831  840
  856  860  863  870  871  878  891  897  929  955  957  960  975  995
 1016 1017 1021 1025 1028 1044 1048 1051 1064 1082 1095 1112 1123 1125
 1126 1130]


  0%|          | 0/1028 [00:00<?, ?ex/s]

  0%|          | 0/114 [00:00<?, ?ex/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDebertaV2ForSequenceClassification: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDebertaV2ForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequen

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 9 cosine_similarity = 0.7991452813148499




In [None]:
# Display the mean per-fold cosine similarity, multiplied by 100 (computed right after the cross-validation call).
cv_raw_data_cosine_similarity_avg

80.44909358024597