In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
# Distribution strategy currently in effect; later cells use it for
# strategy.scope() when building the model and for num_replicas_in_sync when
# sizing the batch.
strategy = tf.distribute.get_strategy()
# tqdm._tqdm_notebook is a deprecated private alias; tqdm.notebook is the public
# module that exposes the same tqdm_notebook class.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()  # adds .progress_apply to pandas objects (used by applyaugtodf)

In [26]:
# Colab-only API: turn on the custom widget manager for cell outputs
# (presumably needed for the tqdm notebook progress bars — TODO confirm).
# Note: the name `output` is rebound to a Keras tensor in the model-definition cell.
from google.colab import output
output.enable_custom_widget_manager()

In [27]:
# Mount Google Drive at /content/drive; every dataset and prediction path used
# in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
# Load the training pairs and the evaluation pairs from Drive
# (both frames are later accessed through their text / reason / label columns).
train, test = (
    pd.read_excel(path)
    for path in (
        "/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/train.xlsx",
        "/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/evaluation.xlsx",
    )
)

# Data Augmentation

## Using a BERT-base sentence encoder to find, for each text, the most semantically similar reasons and pairing them with that text as new samples.

In [29]:
#%%capture
!pip install sentence_transformers

!pip install transformers

!pip install sentencepiece

!pip install nlpaug

#!pip install sacremoses #for backtranslate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder used only for the similarity-based augmentation.
# Note: the name `model` is rebound to the Keras classifier in the
# model-definition cell, so the embedding cells must run before that one.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [31]:
# Embed every text and every reason with the sentence encoder. The augmentation
# loop indexes these by row position (text_embeddings[i]), so they must be
# computed on the same `train` frame the loop iterates over.
text_embeddings = model.encode(train['text'])
reason_embeddings = model.encode(train['reason'])

In [32]:
# Empty frame with the text / reason / label schema; the similarity loop in the
# next cell fills it with the augmented positive pairs.
train2 = pd.DataFrame(columns=["text", "reason", "label"])
train2

Unnamed: 0,text,reason,label


In [None]:
# Build extra positive pairs: each text is paired with the reasons ranked 3rd and
# 2nd by cosine similarity between its embedding and all reason embeddings. The
# top-ranked reason is skipped by the [-3:-1] slice (presumably because it is
# usually the text's own reason — TODO confirm).
if len(text_embeddings) != len(train) or len(reason_embeddings) != len(train):
    # The embeddings are addressed by row position, so a length mismatch (e.g.
    # re-running this cell after `train` was already extended) would pair texts
    # with the wrong similarities. Fail loudly instead of swallowing the
    # IndexError and silently reusing the previous iteration's values.
    raise ValueError(
        "text/reason embeddings are out of sync with `train`; "
        "recompute the embeddings on the un-augmented training frame first"
    )

augmented_rows = []
for i in range(len(train)):
    text_sim_with_all_reasons = cosine_similarity([text_embeddings[i]],
                                                  reason_embeddings)[0]
    # argsort is ascending, so [-3:-1] yields (3rd best, 2nd best) in that order.
    highest_sim_indices = np.argsort(text_sim_with_all_reasons)[-3:-1]
    for reason_pos in highest_sim_indices:
        # .iloc: embeddings are positional, so rows are looked up by position too.
        augmented_rows.append({
            "text": train['text'].iloc[i],
            "reason": train['reason'].iloc[reason_pos],
            "label": 1.0,
        })

# Build the frame once. DataFrame.append (used per row before) was removed in
# pandas 2.0 and copied the whole frame on every call.
train2 = pd.DataFrame(augmented_rows, columns=["text", "reason", "label"])

In [None]:
# Add the similarity-based positives to the training set and discard any row
# that has a missing value in any column.
train = pd.concat([train, train2], axis=0).dropna()

## Randomly assigning a reason to each text and setting the label to 0


In [None]:
import random

# Fixed seed so the negative pairs (and anything trained on them) are reproducible.
NEGATIVE_SAMPLING_SEED = 42

# Negative examples: keep every text but give it a reason taken from a random
# permutation of the existing reasons, labelled 0.0.
shuffled_reasons = list(train['reason'])
random.Random(NEGATIVE_SAMPLING_SEED).shuffle(shuffled_reasons)

train2 = pd.DataFrame()
train2['text'] = train['text']
train2['reason'] = shuffled_reasons
train2['label'] = [0.0]*len(train2)

# A random permutation can hand a text one of its own label-1 reasons; such a
# row would contradict an existing positive example, so it is dropped.
positive_rows = train[train['label'] == 1]
positive_pairs = set(zip(positive_rows['text'], positive_rows['reason']))
is_negative = [pair not in positive_pairs
               for pair in zip(train2['text'], train2['reason'])]
train2 = train2[is_negative]
train2

In [None]:
# Append the label-0 negatives to the training set. The index is not reset here,
# so index labels repeat until the later reset_index(drop=True).
train = pd.concat([train, train2], axis=0)

## Other data augmentation tools: replacing words with synonyms and antonyms from WordNet, and using DistilBERT to insert contextually predicted words into texts and reasons.

In [None]:
import nlpaug.augmenter.word as naw

In [None]:
# WordNet-backed synonym replacement (aug_src='wordnet').
# NOTE(review): aug_min / aug_max / aug_p presumably bound how many words are
# changed per input — confirm against the nlpaug documentation.
synaug = naw.SynonymAug(aug_src='wordnet', model_path=None, name='Synonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng', 
                        stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, force_reload=False, 
                        verbose=0)

# Antonym replacement.
# NOTE(review): swapping in antonyms can invert a sentence's meaning while
# applyaugtodf copies the original label unchanged — confirm this is intended.
ant_aug = naw.AntonymAug(name='Antonym_Aug', aug_min=1, aug_max=10, aug_p=0.3, lang='eng', stopwords=None, tokenizer=None, 
                        reverse_tokenizer=None, stopwords_regex=None, verbose=0)

# Contextual augmentation with DistilBERT; action='insert' adds words rather
# than substituting existing ones.
aug_bert = naw.ContextualWordEmbsAug(
        model_path='distilbert-base-uncased', 
        #device='cuda',
        action='insert', top_k=20)

# Back-translation is disabled: backtranslate() references back_translation_aug
# and raises NameError for as long as this line stays commented out.
#back_translation_aug = naw.BackTranslationAug() 

In [None]:
def synonymaug(text):
    """Return the first result of running ``text`` through the ``synaug`` augmenter."""
    augmented = synaug.augment(text)
    return augmented[0]

def antonymaug(text):
    """Return the first result of running ``text`` through the ``ant_aug`` augmenter."""
    augmented = ant_aug.augment(text)
    return augmented[0]

def flowaug(text):
    """Return the first result of running ``text`` through the ``aug_bert`` augmenter."""
    augmented = aug_bert.augment(text)
    return augmented[0]

def backtranslate(tex):
    """Return the first result of running ``tex`` through ``back_translation_aug``.

    The body previously read an undefined name ``text`` instead of the ``tex``
    parameter, so every call raised NameError. The parameter name is kept
    unchanged for existing callers.

    Requires the module-level ``back_translation_aug`` augmenter to be defined
    (its construction is commented out in the augmenter cell).
    """
    return back_translation_aug.augment(tex)[0]

'''def word2vecaug(text):
    aug_w2v = naw.WordEmbsAug(
        model_type='glove', model_path='/content/glove.6B.300d.txt',
        action="substitute")
    return aug_w2v.augment(text)'''

In [None]:
def applyaugtodf(df, augfunction):
    """Return a new frame holding an augmented version of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``text``, ``reason`` and ``label`` columns.
    augfunction : callable
        Function applied to every value of the ``text`` and ``reason`` columns.

    Returns
    -------
    pandas.DataFrame
        Frame with the augmented ``text`` and ``reason`` columns and the
        ``label`` column copied unchanged from ``df``. ``df`` is not modified.
    """
    def _apply(column):
        # progress_apply only exists once tqdm's pandas integration has been
        # registered; fall back to plain apply so the helper also works without it.
        return getattr(column, "progress_apply", column.apply)(augfunction)

    # Read from the `df` argument: the previous version ignored it and always
    # used the global `train` frame.
    augmented = pd.DataFrame()
    augmented['text'] = _apply(df['text'])
    augmented['reason'] = _apply(df['reason'])
    augmented['label'] = df['label']
    return augmented

In [None]:
# Seed for the row shuffle below (fixed for reproducibility).
SHUFFLE_SEED = 42

# Optionally append word-level augmentations of the current training set
# (uncomment the lines to enable them).
train = pd.concat([train,
                   #applyaugtodf(train, synonymaug),
                   #applyaugtodf(train, antonymaug),
                   #applyaugtodf(train, flowaug),
                   #applyaugtodf(train, backtranslate), #takes too long
], axis=0)

# The frame is still in construction order, with the label-0 negatives appended
# after the positives. model.fit(validation_split=...) takes its validation
# rows from the END of the data, so without shuffling it would validate on
# negatives only. Shuffle once, deterministically, then rebuild a clean index.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Row count before and after removing rows that are identical in every column.
print(len(train))
train = train.drop_duplicates()
print(len(train))

#train.to_csv("/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/train_augmented.csv", index=False)
train

In [None]:
#train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/train_augmented.csv")

# Tokenisation

In [None]:
from transformers import TFAutoModel,AutoTokenizer, AutoModel
import tensorflow as tf

In [None]:
# Tokenizer for the 'joeddav/xlm-roberta-large-xnli' checkpoint — the same
# checkpoint id the encoder is loaded from in the model-definition cell.
tokenizer=AutoTokenizer.from_pretrained('joeddav/xlm-roberta-large-xnli')

In [None]:
def _encode_pairs(frame):
    """Tokenize the (text, reason) pairs of ``frame`` into the model's two int32 inputs.

    Every pair is padded/truncated to 100 tokens. The returned dict is keyed by
    the names of the Keras input layers ('input_word_ids', 'input_mask').
    """
    encoded = tokenizer.batch_encode_plus(
        frame[['text','reason']].values.tolist(),
        padding='max_length',
        max_length=100,
        truncation=True,
        return_attention_mask=True,
    )
    return {
        'input_word_ids': tf.convert_to_tensor(encoded['input_ids'], dtype=tf.int32),
        'input_mask': tf.convert_to_tensor(encoded['attention_mask'], dtype=tf.int32),
    }


train_input = _encode_pairs(train)
test_input = _encode_pairs(test)

# Model Definition

In [None]:
with strategy.scope():
    # Two fixed-length (100-token) int32 inputs, named to match the keys of the
    # train_input / test_input dictionaries.
    input_ids = tf.keras.Input(shape=(100,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.Input(shape=(100,), dtype=tf.int32, name='input_mask')

    encoder = TFAutoModel.from_pretrained('joeddav/xlm-roberta-large-xnli')
    # [0] selects the encoder's first output (the per-token hidden states).
    sequence_output = encoder([input_ids, input_mask])[0]

    # NOTE(review): no mask is passed to the pooling layer, so padded positions
    # appear to be averaged in together with real tokens — confirm intended.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)

    # NOTE(review): 3 output units, while the augmentation cells only create
    # labels 0.0 and 1.0 — confirm against the label set of the training data.
    class_probs = tf.keras.layers.Dense(3, activation='softmax')(pooled)

    model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=class_probs)
    # `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

Downloading tf_model.h5:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some layers from the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing TFXLMRobertaModel: ['classifier']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at joeddav/xlm-roberta-large-xnli.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 100)]        0           []                               
                                                                                                  
 tfxlm_roberta_model (TFXLMRobe  TFBaseModelOutputWi  559890432  ['input_word_ids[0][0]',         
 rtaModel)                      thPoolingAndCrossAt               'input_mask[0][0]']             
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                           

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# NOTE(review): with epochs=2, a patience of 2 looks like it can never trigger
# an early stop — confirm the intended epoch budget.
early_stop = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)

# The batch size scales with the number of replicas of the active strategy.
model.fit(
    train_input,
    train['label'],
    validation_split=0.2,
    epochs=2,
    batch_size=16*strategy.num_replicas_in_sync,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
#x = np.asarray(test).astype('float32')

In [None]:
#test_input=bert_encode(test.hypothesis.values,test.premise.values,tokenizer)
#predictions = [np.argmax(i) for i in model.predict(test_input)]

# Inference

In [None]:
# Run the classifier on the tokenized evaluation pairs; the next cell takes the
# index of the largest value in each row of `pred` as the predicted class.
pred = model.predict(test_input)

In [None]:
# Predicted class per row = index of the largest score (first index on ties),
# returned as a plain list of Python ints.
predx = np.argmax(pred, axis=1).tolist()
predx

In [None]:
# Attach the predicted class ids to the evaluation frame.
test['pred'] = predx

In [None]:
# Persist the evaluation frame, including the `pred` column, without the index.
test.to_csv("/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/pred_evalantsyn.csv", index=False)

In [None]:
# Reload the saved predictions (presumably so the evaluation cells can be run
# without repeating inference — TODO confirm).
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Sentence Correlation/pred_evalantsyn.csv")

In [None]:
# Display the evaluation frame together with its `pred` column.
test

In [None]:
from sklearn.metrics import accuracy_score

# Arguments in the documented (y_true, y_pred) order; accuracy is symmetric in
# its two arguments, so the value is the same as with the order swapped.
accuracy_score(test['label'], test['pred'])

In [None]:
# Inspect the predicted class ids.
test['pred']

In [None]:
# Inspect the reference labels of the evaluation set.
test['label']

In [None]:
# Share of evaluation rows predicted as class 1 and as class 0, as a
# (class-1 share, class-0 share) tuple.
len(test[test['pred'] == 1])/len(test), len(test[test['pred'] == 0])/len(test)