In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

References used for this notebook:
https://iq.opengenus.org/sentence-semantic-similarity-bert/
https://www.analyticsvidhya.com/blog/2021/09/an-explanatory-guide-to-bert-tokenizer/
https://mccormickml.com/2019/07/22/BERT-fine-tuning/
https://skimai.com/fine-tuning-bert-for-sentiment-analysis/
https://keras.io/examples/nlp/semantic_similarity_with_bert/

In [None]:
# libraries 
!pip install transformers
import pandas as pd
import numpy as np
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import seaborn as sns
import tensorflow as tf
import torch
from transformers import BertTokenizer, TFBertModel

In [None]:
# Pick the PyTorch device: CUDA when a GPU is visible, otherwise the CPU.
# NOTE(review): `device` does not appear to be used by the TensorFlow code in this
# notebook -- confirm whether this check is still needed.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0) )

In [None]:
import os

# Directory holding the saved tokenizer/model files that the rest of the notebook
# loads with `from_pretrained(PATH, local_files_only=True)`.
PATH = '/kaggle/input/bertbaseuncasedv'
'''
BertTokenizer.from_pretrained("bert-base-uncased").save_pretrained(PATH)
TFBertModel.from_pretrained("bert-base-uncased").save_pretrained(PATH)
'''
# List every file stored under PATH.
for root, _, names in os.walk(PATH):
    for name in names:
        print(os.path.join(root, name))


In [None]:
# Read the training CSV into a DataFrame and show its (rows, columns) shape.
train_df = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv'
)
train_df.shape

In [None]:
# Training configuration: padded token length of each encoded phrase pair,
# batch size used by the data generators, and number of epochs passed to fit().
max_length, batch_size, epochs = 128, 32, 10

# Report how many values are missing in each column, then drop every row
# that contains at least one NaN.
print("number of missing values")
missing_per_column = train_df.isnull().sum()
print(missing_per_column)
train_df.dropna(axis=0, inplace = True)

In [None]:
# Number of training rows per CPC context code.
print("Train Target Distribution")
print(train_df.context.value_counts())

Grouping the cpc codes into classes of A,B,C,D,E,F,G,H,Y according to CPC classification version 2021.05

In [None]:
# CPC labels-configuration


# CPC section A codes: each class (letter + two digits) followed by its subclasses.
# Fixes: malformed "A022D" (extra digit) -> "A22D"; duplicated "A01F" removed.
# NOTE(review): confirm "A22D" against the CPC scheme version this list was copied from.
A_CPC_labels = ["A01", "A01B", "A01C", "A01D", "A01F", "A01G", "A01H", "A01J", "A01K", "A01L", "A01M", "A01N",
                "A21", "A21B", "A21C", "A21D",
                "A22", "A22B", "A22C", "A22D",
                "A23", "A23B", "A23C", "A23D", "A23F", "A23G", "A23J", "A23K", "A23L", "A23N", "A23P", "A23V", "A23Y",
                "A24", "A24B", "A24C", "A24D", "A24F",
                "A41", "A41B", "A41C", "A41D", "A41F", "A41G", "A41J",
                "A42", "A42B", "A42C",
                "A43", "A43B", "A43C", "A43D", "A43F",
                "A44", "A44B", "A44C", "A44D",
                "A45", "A45B", "A45C", "A45D", "A45F",
                "A46", "A46B", "A46D",
                "A47", "A47B", "A47C", "A47D", "A47F", "A47G", "A47H", "A47J", "A47K", "A47L",
                "A61", "A61B", "A61C", "A61D", "A61F", "A61G", "A61H", "A61J", "A61K", "A61L", "A61M", "A61N", "A61P", "A61Q",
                "A62", "A62B", "A62C", "A62D",
                "A63", "A63B", "A63C", "A63D", "A63F", "A63G", "A63H", "A63J", "A63K",
                "A99", "A99Z"]

# CPC section B codes: each class (letter + two digits) followed by its subclasses.
# Fixes: the B23 row repeated "B21G", "B21H", "B21J", "B21K" (copy-paste from the
# B21 row) where the B23 subclasses "B23K", "B23P", "B23Q" belong; the B68 row was
# headed by a second "B67", so class "B68" itself was missing.
B_CPC_labels = ["B01", "B01B", "B01D", "B01F", "B01J", "B01L",
                "B02", "B02B", "B02C",
                "B03", "B03B", "B03C", "B03D",
                "B04", "B04B", "B04C",
                "B05", "B05B", "B05C", "B05D",
                "B06", "B06B",
                "B07", "B07B", "B07C",
                "B08", "B08B",
                "B09", "B09B", "B09C",
                "B21", "B21B", "B21C", "B21D", "B21F", "B21G", "B21H", "B21J", "B21K", "B21L",
                "B22", "B22C", "B22D", "B22F",
                "B23", "B23B", "B23C", "B23D", "B23F", "B23G", "B23H", "B23K", "B23P", "B23Q",
                "B24", "B24B", "B24C", "B24D",
                "B25", "B25B", "B25C", "B25D", "B25F", "B25G", "B25H", "B25J",
                "B26", "B26C", "B26D", "B26F",
                "B27", "B27B", "B27C", "B27D", "B27F", "B27G", "B27H", "B27J", "B27K", "B27L", "B27M", "B27N",
                "B28", "B28B", "B28C", "B28D",
                "B29", "B29B", "B29C", "B29D", "B29K", "B29L",
                "B30", "B30B",
                "B31", "B31B", "B31C", "B31D", "B31F",
                "B32", "B32B",
                "B33", "B33Y",
                "B41", "B41B", "B41C", "B41D", "B41F", "B41G", "B41J", "B41K", "B41L", "B41M", "B41N", "B41P",
                "B42", "B42B", "B42C", "B42D", "B42F", "B42P",
                "B43", "B43K", "B43L", "B43M",
                "B44", "B44B", "B44C", "B44D", "B44F",
                "B60", "B60B", "B60C", "B60D", "B60F", "B60G", "B60H", "B60J", "B60K", "B60L", "B60M", "B60N", "B60P", "B60Q", "B60R", "B60S", "B60T", "B60V", "B60W", "B60Y",
                "B61", "B61B", "B61C", "B61D", "B61F", "B61G", "B61H", "B61J", "B61K", "B61L",
                "B62", "B62B", "B62C", "B62D", "B62H", "B62J", "B62K", "B62L", "B62M",
                "B63", "B63B", "B63C", "B63G", "B63H", "B63J",
                "B64", "B64B", "B64C", "B64D", "B64F", "B64G",
                "B65", "B65B", "B65C", "B65D", "B65F", "B65G", "B65H",
                "B66", "B66B", "B66C", "B66D", "B66F",
                "B67", "B67B", "B67C", "B67D",
                "B68", "B68B", "B68C", "B68F", "B68G",
                "B81", "B81B", "B81C",
                "B82", "B82B", "B82Y",
                "B99", "B99Z"]
#B_CPC_labels = list(itertools.chain(B1_CPC_labels, B2_CPC_labels))

# CPC section C codes: each row starts with a class (letter + two digits)
# followed by the subclasses listed for it. Used by CPC_func for membership tests.
C_CPC_labels = ["C01", "C01B", "C01C", "C01D", "C01F", "C01G", "C01P", 
                "C02", "C02F",
                "C03", "C03B", "C03C",
                "C04", "C04B",
                "C05", "C05B", "C05C", "C05D", "C05F", "C05G",
                "C06", "C06B", "C06C", "C06D", "C06F",
                "C07", "C07B", "C07C", "C07D", "C07F", "C07G", "C07H", "C07J", "C07K",
                "C08", "C08B", "C08C", "C08F", "C08G", "C08H", "C08J", "C08K", "C08L",
                "C09", "C09B", "C09C", "C09D", "C09F", "C09G", "C09H", "C09J", "C09K",
                "C10", "C10B", "C10C", "C10F", "C10G", "C10H", "C10J", "C10K", "C10L", "C10M", "C10N",
                "C11", "C11B", "C11C", "C11D",
                "C12", "C12C", "C12F", "C12G", "C12H", "C12J", "C12L", "C12M", "C12N", "C12P", "C12Q", "C12R", "C12Y",
                "C13", "C13B", "C13K",
                "C14", "C14B", "C14C",
                "C21", "C21B", "C21C", "C21D",
                "C22", "C22B", "C22C", "C22F",
                "C23", "C23C", "C23D", "C23F", "C23G",
                "C25", "C25B", "C25C", "C25D", "C25F",
                "C30", "C30B",
                "C40", "C40B",
                "C99", "C99Z"]

# CPC section D codes: each class (letter + two digits) followed by its subclasses.
# Fix: the D06 row listed "D06F" twice; the second occurrence sat in the slot of
# "D06J" (alphabetically between D06H and D06L), which was missing.
D_CPC_labels = ["D01", "D01B", "D01C", "D01D", "D01F", "D01G", "D01H",
                "D02", "D02G", "D02H", "D02J",
                "D03", "D03C", "D03D", "D03J",
                "D04", "D04B", "D04C", "D04D", "D04G", "D04H",
                "D05", "D05B", "D05C", "D05D",
                "D06", "D06B", "D06C", "D06F", "D06G", "D06H", "D06J", "D06L", "D06M", "D06N", "D06P", "D06Q",
                "D07", "D07B",
                "D10", "D10B",
                "D21", "D21B", "D21C", "D21D", "D21F", "D21G", "D21H", "D21J",
                "D99", "D99Z"]


# CPC section E codes: each row starts with a class (letter + two digits)
# followed by the subclasses listed for it. Used by CPC_func for membership tests.
E_CPC_labels = ["E01", "E01B", "E01C", "E01D", "E01F", "E01H",
                "E02", "E02B", "E02C", "E02D", "E02F",
                "E03", "E03B", "E03C", "E03D", "E03F",
                "E04", "E04B", "E04C", "E04D", "E04F", "E04G", "E04H",
                "E05", "E05B", "E05C", "E05D", "E05F", "E05G", "E05Y",
                "E06", "E06B", "E06C",
                "E21", "E21B", "E21C", "E21D", "E21F",
                "E99", "E99Z"]


# CPC section F codes (same row layout: class first, then its subclasses).
F_CPC_labels = ["F01", "F01B", "F01C", "F01D", "F01K", "F01L", "F01M", "F01N", "F01P", 
                "F02", "F02B", "F02C", "F02D", "F02F", "F02G", "F02K", "F02M", "F02N", "F02P",
                "F03", "F03B", "F03C", "F03D", "F03G", "F03H",
                "F04", "F04B", "F04C", "F04D", "F04F", 
                "F05", "F05B", "F05C", "F05D",
                "F15", "F15B", "F15C", "F15D",
                "F16", "F16B", "F16C", "F16D", "F16F", "F16G", "F16H", "F16J", "F16K", "F16L", "F16M", "F16N", "F16P", "F16S", "F16T", 
                "F17", "F17B", "F17C", "F17D",
                "F21", "F21H", "F21K", "F21L", "F21S", "F21V", "F21W", "F21Y", 
                "F22", "F22B", "F22D", "F22G",
                "F23", "F23B", "F23C", "F23D", "F23G", "F23H", "F23J", "F23K", "F23L", "F23M", "F23N", "F23Q", "F23R",
                "F24", "F24B", "F24C", "F24D", "F24F", "F24H", "F24S", "F24T", "F24V", 
                "F25", "F25B", "F25C", "F25D", "F25J", 
                "F26", "F26B",
                "F27", "F27B", "F27D", "F27M",
                "F28", "F28B", "F28C", "F28D", "F28F", "F28G",
                "F41", "F41A", "F41B", "F41C", "F41F", "F41G", "F41H", "F41J", 
                "F42", "F42B", "F42C", "F42D", 
                "F99", "F99Z"]

# CPC section G codes (same row layout: class first, then its subclasses).
G_CPC_labels = ["G01", "G01B", "G01C", "G01D", "G01F", "G01G", "G01H", "G01J", "G01K", "G01L", "G01M", "G01N", "G01P", "G01Q", "G01R", "G01S", "G01T", "G01V", "G01W",  
                "G02", "G02B", "G02C", "G02F", 
                "G03", "G03B", "G03C", "G03D", "G03F", "G03G", "G03H", 
                "G04", "G04B", "G04C", "G04D", "G04F", "G04G", "G04R", 
                "G05", "G05B", "G05D", "G05F", "G05G", 
                "G06", "G06C", "G06D", "G06E", "G06F", "G06G", "G06J", "G06K", "G06M", "G06N", "G06Q", "G06T", 
                "G07", "G07B", "G07C", "G07D", "G07F", "G07G",
                "G08", "G08B", "G08C", "G08G", 
                "G09", "G09B", "G09C", "G09D", "G09F", "G09G", 
                "G10", "G10B", "G10C", "G10D", "G10F", "G10G", "G10H", "G10K", "G10L", 
                "G11", "G11B", "G11C", 
                "G12", "G12B",
                "G16", "G16B", "G16C", "G16H", "G16Y", 
                "G21", "G21B", "G21C", "G21D", "G21F", "G21G", "G21H", "G21J", "G21K",   
                "G99", "G99Z"]

# CPC section H codes (same row layout: class first, then its subclasses).
H_CPC_labels = ["H01", "H01B", "H01C", "H01F", "H01G", "H01H", "H01J", "H01K", "H01L", "H01M", "H01P", "H01Q", "H01R", "H01S", "H01T", 
                "H02", "H02B", "H02G", "H02H", "H02J", "H02K", "H02M", "H02N", "H02P", "H02S", 
                "H03", "H03B", "H03C", "H03D", "H03F", "H03G", "H03H", "H03J", "H03K", "H03L", "H03M", 
                "H04", "H04B", "H04H", "H04J", "H04K", "H04L", "H04M", "H04N", "H04Q", "H04R", "H04S", "H04T", "H04W", 
                "H05", "H05B", "H05C", "H05F", "H05G", "H05H", "H05K", 
                "H99", "H99Z"]

# CPC section Y codes (same row layout: class first, then its subclasses).
Y_CPC_labels = ["Y02", "Y02A", "Y02B", "Y02C", "Y02D", "Y02E", "Y02P", "Y02T", "Y02W", 
                "Y04", "Y04S", 
                "Y10", "Y10S", "Y10T"]

In [None]:
#create a CPC function used in One-hot encode training, validation, and test labels 


def CPC_func(cpc):
    """Map a CPC context code to its 1-based section index.

    The section is the first letter of the code: A=1, B=2, C=3, D=4, E=5,
    F=6, G=7, H=8, Y=9.  For every code contained in the hand-written
    ``*_CPC_labels`` lists this returns the same value as a lookup in those
    lists would, but it also handles valid codes the lists omit instead of
    silently returning ``None`` (which would turn into a NaN label and break
    the one-hot encoding that subtracts 1 from this value).

    Args:
        cpc: CPC context code such as ``"A47"`` or ``"H04L"``.  Leading and
            trailing whitespace and letter case are ignored.

    Returns:
        int: Section index in the range 1..9.

    Raises:
        ValueError: If ``cpc`` does not start with a known section letter
            followed by a two-digit class number (this includes ``None``,
            NaN and the empty string).
    """
    section_index = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5,
                     "F": 6, "G": 7, "H": 8, "Y": 9}

    code = str(cpc).strip().upper()
    section = code[:1]
    class_digits = code[1:3]

    # A CPC code is "<section letter><two-digit class>[...]"; reject anything else
    # loudly so that bad rows are caught here rather than inside to_categorical.
    if section not in section_index or len(class_digits) != 2 or not class_digits.isdigit():
        raise ValueError(f"Unrecognised CPC context code: {cpc!r}")

    return section_index[section]
    
#train_df = shuffle(train_df)
#train_df.head(-1)

In [None]:
# Class names in output order: position i is the name shown for the class whose
# predicted probability sits at index i of the model's 9-way softmax.
labels = [f"cpc_{section}" for section in "ABCDEFGHY"]

In [None]:
# Split the training frame into 90% train / 10% validation.
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split (and therefore every downstream metric)
# reproducible across kernel restarts.
# NOTE: this cell overwrites `train_df` with its 90% part, so re-running it without
# reloading the data shrinks the training set again.
train_df, val_df = train_test_split(
    train_df, train_size=0.9, test_size=0.1, shuffle=True, random_state=42
)

# Take independent copies so that adding the "label" column later modifies these
# frames directly (avoids pandas' SettingWithCopyWarning on the split results).
train_df, val_df = train_df.copy(), val_df.copy()

print("Shape of new dataframes - {} , {}".format(train_df.shape, val_df.shape))

In [None]:
# One-hot encode training and validation labels.
# "label" is the 1-based CPC section index returned by CPC_func; the one-hot
# arrays use the 0-based index (label - 1) over 9 classes.
train_df["label"] = train_df["context"].apply(CPC_func)
val_df["label"] = val_df["context"].apply(CPC_func)

# Fail early, with the offending codes, if any context could not be mapped:
# a missing label would otherwise reach to_categorical as NaN.
for split_name, split_df in (("train", train_df), ("validation", val_df)):
    unmapped = split_df.loc[split_df["label"].isnull(), "context"].unique()
    if len(unmapped):
        raise ValueError(
            f"{split_name}: no CPC section found for context code(s) {list(unmapped)}"
        )

y_train = to_categorical(train_df["label"].astype("int64") - 1, num_classes=9, dtype="uint8")
y_val = to_categorical(val_df["label"].astype("int64") - 1, num_classes=9, dtype="uint8")

print(train_df)
print('\n')
print(y_train)
#print(y_val)
#print('\n')

In [None]:
# using tf.keras.utils.Sequence and BertTokenizer+"bert-base-uncased" as data genrator
# for encoding

#from transformers import BertTokenizer

class BertDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves BERT-encoded batches of phrase pairs.

    Each batch is tokenized on demand with the ``BertTokenizer`` loaded from the
    module-level ``PATH`` and padded/truncated to the module-level ``max_length``.

    Args:
        phrase_pairs: NumPy array with one ``[first, second]`` pair of strings
            per sample; it is indexed with an integer index array.
        labels: Array of per-sample targets aligned with ``phrase_pairs``.
            May be ``None`` when ``include_targets`` is False.
        batch_size: Number of samples per batch.
        shuffle: When True, the sample order is reshuffled at construction
            time and again after every epoch.
        include_targets: When True, ``__getitem__`` returns
            ``(inputs, labels)``; otherwise only ``inputs``.
    """

    def __init__(
        self,
        phrase_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        super().__init__()
        self.phrase_pairs = phrase_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets

        # Tokenizer loaded from the local files under PATH (no download).
        self.tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True, do_lower_case=True)
        self.indexes = np.arange(len(self.phrase_pairs))
        # One seeded generator kept for the object's lifetime: the order is
        # reproducible, yet each epoch draws a new permutation. (Creating a new
        # RandomState(42) per epoch would re-apply the same permutation every time.)
        self._rng = np.random.RandomState(42)
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        # Ceiling division so the trailing samples that do not fill a whole
        # batch are still served instead of being silently dropped.
        return int(np.ceil(len(self.phrase_pairs) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``[input_ids, attention_masks, token_type_ids]``.

        When ``include_targets`` is True the return value is the tuple
        ``(inputs, labels)`` with ``labels`` cast to int32.
        """
        # Sample positions that belong to this batch.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        phrase_pairs = self.phrase_pairs[indexes]

        # Both phrases of a pair are encoded together, separated by [SEP].
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`;
        # `truncation=True` keeps every row at exactly `max_length` tokens.
        encoded = self.tokenizer.batch_encode_plus(
            phrase_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert the encoded features to int32 NumPy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        """Reshuffle the sample order when ``shuffle`` is enabled."""
        if self.shuffle:
            self._rng.shuffle(self.indexes)

Building the model

In [None]:
# Build the model.
#
# A pretrained BERT encoder (loaded from PATH) feeds a bidirectional LSTM whose
# outputs are average- and max-pooled, concatenated and passed to a small dense
# head ending in a 9-way softmax. Everything is created inside a
# tf.distribute.MirroredStrategy scope.

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Token ids of the encoded phrase pair.
    input_ids = Input(shape=(max_length,), dtype=tf.int32, name="input_ids")

    # Attention masks.
    input_masks = Input(shape=(max_length,), dtype=tf.int32, name="attention_masks")

    # Token type (segment) ids.
    input_token_ids = Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")

    # BERT loaded from the local files under PATH; left trainable for fine-tuning.
    bert_model = TFBertModel.from_pretrained(PATH, local_files_only=True)
    bert_model.trainable = True

    # Run BERT once and index both outputs from that single call, rather than
    # calling the encoder a second time on the same inputs.
    bert_outputs = bert_model(input_ids, attention_mask=input_masks, token_type_ids=input_token_ids)

    # Last hidden state (per-token embeddings).
    embeddings = bert_outputs[0]

    # Pooler output; kept for reference, not used by the head below.
    pooled_output = bert_outputs[1]

    bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.1, recurrent_dropout=0.2, return_sequences=True))(embeddings)

    # Average and max pooling over the Bi-LSTM sequence output, concatenated.
    avg_outp = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_outp = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat_outp = tf.keras.layers.concatenate([avg_outp, max_outp])
    output = tf.keras.layers.Dropout(0.3)(concat_outp)
    output = Dense(64, activation = 'relu')(output) # changed from 128 to 64
    output = tf.keras.layers.Dropout(0.1)(output) # added in
    outp = Dense(9, activation="softmax")(output)

    model = tf.keras.Model(
        inputs=[input_ids, input_masks, input_token_ids], outputs=outp)

    model.compile(
        optimizer= Adam(learning_rate=2e-5, epsilon=1e-08), # changed from 1e-5 to 2e-5, deleted clipnorm=1.0
        loss="categorical_crossentropy",
        metrics=["accuracy"])

print(f"Strategy: {strategy}")
model.summary()

In [None]:
# Create train and validation data generators.
# Each generator receives the (anchor, target) string pairs of its frame and the
# matching one-hot label array.
train_phrase_pairs = train_df[["anchor", "target"]].values.astype("str")
# shuffle=False: batches are served in the same order every epoch.
train_data = BertDataGenerator(
    train_phrase_pairs,
    y_train,
    batch_size=batch_size,
    shuffle=False,)

val_phrase_pairs = val_df[["anchor", "target"]].values.astype("str")
valid_data = BertDataGenerator(
    val_phrase_pairs,
    y_val,
    batch_size=batch_size,
    shuffle=False,)

In [None]:
# Train the model.
#
# `workers=-1` / `use_multiprocessing=True` were removed: -1 is not a documented
# value for `workers` (it is not an "all cores" switch), and Keras 3's `Model.fit`
# no longer accepts either keyword. The Keras defaults are used instead.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
)

In [None]:
# Visualize loss and accuracy on the training and validation data.

def _plot_history_metric(history, metric, short_name, title_name):
    """Plot one metric's per-epoch training and validation values.

    Args:
        history: Object returned by ``model.fit``; ``history.history`` must hold
            the lists ``metric`` and ``'val_' + metric``.
        metric: Key of the metric in ``history.history`` (e.g. ``'loss'``).
        short_name: Text used in the legend entries and on the y axis.
        title_name: Text appended to the plot title.
    """
    train_values = history.history[metric]
    val_values = history.history['val_' + metric]
    # Use the number of recorded epochs so the x axis always matches the data.
    epoch_numbers = range(1, len(train_values) + 1)

    fig, ax = plt.subplots()
    ax.plot(epoch_numbers, train_values, 'bo', label='Train ' + short_name)
    ax.plot(epoch_numbers, val_values, 'ro', label='Validation ' + short_name)
    ax.set_xlabel('Epoch number')
    ax.set_ylabel(short_name)
    ax.set_title('Training and Validation ' + title_name)
    ax.legend()
    plt.show()


_plot_history_metric(history, 'loss', 'loss', 'loss')
_plot_history_metric(history, 'accuracy', 'acc', 'accuracy')

In [None]:
# Read the test CSV, attach the 1-based CPC section index of each row's context
# as "label", and one-hot encode it (0-based, 9 classes) into y_test.
test_df = pd.read_csv(
    '/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv'
)
test_df["label"] = test_df["context"].apply(CPC_func)

y_test = to_categorical(test_df["label"] - 1, num_classes=9, dtype="uint8")
#print(y_test)

In [None]:
# Evaluate the trained model on the test frame: wrap its (anchor, target) pairs
# and one-hot labels in a generator, then display the evaluation result.
test_data = BertDataGenerator(
    phrase_pairs=test_df[["anchor", "target"]].values.astype("str"),
    labels=y_test,
    batch_size=batch_size,
    shuffle=False,
)
model.evaluate(test_data, verbose=1)

In [None]:
#Inference for checking the similarities on two CPC phrases according to CPC classes of A, B, ..., H, Y
#


def similarity_eval(test_ds):
    """Predict the CPC section class for every phrase pair in ``test_ds``.

    Relies on the module-level ``model``, ``labels``, ``y_test`` and
    ``BertDataGenerator``.

    Args:
        test_ds: DataFrame with string columns ``"anchor"`` and ``"target"``.
            Its rows are expected to line up with the rows of ``y_test``.

    Returns:
        pandas.DataFrame with one row per input row and the columns
        ``pred_label`` (name from ``labels`` of the most probable class),
        ``proba_pred`` (that class's probability formatted with ``" .2f"``,
        i.e. a string) and ``y_test_max`` (argmax of the matching ``y_test`` row).
        An empty frame with these columns is returned for empty input.
    """
    test_phrase_pairs = test_ds[["anchor", "target"]].values.astype("str")
    n_rows = len(test_phrase_pairs)
    if n_rows == 0:
        return pd.DataFrame({'pred_label': [], 'proba_pred': [], 'y_test_max': []})

    # batch_size=1 keeps one generator batch per input row.
    test_data = BertDataGenerator(
        test_phrase_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,)

    # A single predict call over the whole generator replaces one predict call
    # per row; the result has one probability vector per row.
    probabilities = np.asarray(model.predict(test_data))
    best_idx = probabilities.argmax(axis=1)
    best_proba = probabilities.max(axis=1)

    # Build the frame once, after all predictions are available.
    results = pd.DataFrame({
        'pred_label': [labels[i] for i in best_idx],
        'proba_pred': [f"{p: .2f}" for p in best_proba],
        # True class index per row; used later for the correlation-based score.
        'y_test_max': np.argmax(y_test[:n_rows], axis=1),
    })
    return results

# Run inference on the test frame and print, per row, the predicted class name
# immediately followed by its formatted probability.
result = similarity_eval(test_df)
print(result["pred_label"] + result["proba_pred"])

In [None]:
# function for calculating Pearson's Correlation coefficient
import math

def mean(x):
    """Return the arithmetic mean of the values in ``x``.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    return sum(x, 0.0) / len(x)


def StandardDeviation(x):
    """Return the sample standard deviation of ``x`` (divisor ``len(x) - 1``).

    Raises:
        ZeroDivisionError: If ``x`` has fewer than two values.
    """
    # Compute the mean once instead of once per element.
    centre = mean(x)
    squared_deviations = sum(((xi - centre) ** 2 for xi in x), 0.0)
    return math.sqrt(squared_deviations / (len(x) - 1))


def pearson(x, y):
    """Return the element-wise products of the standard scores of ``x`` and ``y``.

    Note that this is NOT the Pearson correlation coefficient itself: the
    products are neither summed nor divided by ``n - 1``.  Entry ``i`` of the
    result is ``((x_i - mean(x)) / sd(x)) * ((y_i - mean(y)) / sd(y))``.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers paired with ``x`` (pairs are formed with ``zip``).

    Returns:
        numpy.ndarray with one product per ``(x_i, y_i)`` pair; an empty array
        when either input is empty.
    """
    if len(x) == 0 or len(y) == 0:
        return np.array([])

    # Mean and standard deviation do not change inside the loop; compute them once.
    mean_x, std_x = mean(x), StandardDeviation(x)
    mean_y, std_y = mean(y), StandardDeviation(y)

    scorex = []
    scorey = []
    for xi, yj in zip(x, y):
        scorex.append((xi - mean_x) / std_x)
        scorey.append((yj - mean_y) / std_y)

    return np.array(scorex) * np.array(scorey)

In [None]:
# Turn each row's predicted probability and true class index into a bucketed score.
# NOTE(review): `pred_score` is the absolute element-wise standard-score product
# returned by pearson(), and the bucket thresholds below are heuristic -- confirm
# that this is the intended scoring rule.
predictions = result['proba_pred'].astype('float32')
print('\n')
y_true = result['y_test_max'].astype('float32')

pearson_corr = abs(pearson(y_true, predictions))
pearson_cc = np.reshape(pearson_corr, -1)
score_pd = pd.DataFrame(pearson_cc, columns = ['pred_score'])

# Bucket pred_score into score steps with np.select.
conditions = [(score_pd['pred_score']  > 0) & (score_pd['pred_score'] <=0.10),
              (score_pd['pred_score'] > 0.10 ) & (score_pd['pred_score'] <= 0.25 ),
              (score_pd['pred_score'] > 0.25 ) & (score_pd['pred_score'] <= 0.50 ),
              (score_pd['pred_score'] > 0.50 ) & (score_pd['pred_score'] <= 0.75 ),
              (score_pd['pred_score'] > 0.75 ) & (score_pd['pred_score'] <= 1.00 ),
              (score_pd['pred_score'] > 1 )
             ]
# Numeric choices and an explicit numeric default (used when no condition holds,
# e.g. pred_score == 0 or NaN) so the column is float32 rather than strings.
score_est = [0.0, 0.25, 0.5, 0.75, 1.00, 1.25]
score_pd['score'] = np.select(conditions, score_est, default=0.0).astype('float32')
score_pd.score

In [None]:
# Write the submission file: one row per test id with its score.
# The header is lower-case "id", matching the id column name of the competition data.
submission = pd.DataFrame({'id': test_df['id'], 'score':score_pd.score})

submission.to_csv('submission.csv', index=False)

This was fun to work on and a good way to improve my ML skills.