In [None]:
!pip install fasttext
!pip install tensorflow_addons
!pip install transformers
!pip install sentence_transformers

In [None]:
# Mount Google Drive inside the Colab VM so files under /content/drive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!7za x '/content/drive/MyDrive/Copy of cc.en.100.bin.7z'

In [None]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from numpy import dot
from numpy.linalg import norm
import string
from sklearn.metrics import accuracy_score
from itertools import combinations
import ast
import tensorflow as tf
import fasttext
from sklearn.metrics import accuracy_score
import tensorflow_addons as tfa
from sentence_transformers import SentenceTransformer
import re
from sklearn.metrics import confusion_matrix
from keras import backend as K

In [None]:
def preprocess(text):
  """Normalise a raw category string into a compact lowercase token.

  Leading/trailing whitespace is trimmed, every space character is deleted,
  the text is lowercased, runs of digits are removed and every character
  listed in ``string.punctuation`` is dropped.

  Args:
    text: the raw string to normalise.

  Returns:
    The cleaned string.
  """
  squashed = text.strip().replace(' ', '').lower()
  digit_free = re.sub(r'\d+', '', squashed)
  # Third maketrans argument = characters that translate() deletes.
  drop_punctuation = str.maketrans('', '', string.punctuation)
  return digit_free.translate(drop_punctuation)

In [None]:
def get_metrics(y_test_final,y_test_predicted_clean):
  """Compute binary-classification metrics from true and predicted labels.

  Args:
    y_test_final: ground-truth labels, encoded as 0 (negative) / 1 (positive).
    y_test_predicted_clean: predicted labels with the same length and encoding.

  Returns:
    Tuple ``(specificity, precision, recall, accuracy, f1)``. Any ratio whose
    denominator is zero (e.g. precision when nothing was predicted positive)
    is reported as 0.0.
  """
  def ratio(numerator, denominator):
    # Guard against empty denominators instead of emitting NaN / raising.
    return numerator / denominator if denominator else 0.0

  # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
  # inputs; without it the matrix collapses to 1x1 and the unpacking fails.
  tn, fp, fn, tp = confusion_matrix(
      y_test_final, y_test_predicted_clean, labels=[0, 1]).ravel()
  specificity = ratio(tn, tn + fp)
  precision = ratio(tp, tp + fp)
  recall = ratio(tp, tp + fn)
  accuracy = ratio(tn + tp, tn + fp + fn + tp)
  f1 = ratio(2 * (precision * recall), precision + recall)
  return specificity,precision,recall,accuracy,f1

In [None]:
def split(df):
  """Partition ``df`` into train/test features and labels via its ``split`` column.

  Args:
    df: DataFrame with the columns ``split``, ``label``, ``embedding0``,
      ``embedding1``, ``category_0`` and ``category_1``.

  Returns:
    ``(x_train, x_test, y_train, y_test)``: the four feature columns and the
    ``label`` column of the rows whose ``split`` value is ``'train'`` and
    ``'test'`` respectively.
  """
  feature_columns = ["embedding0","embedding1","category_0","category_1"]
  partitions = {name: df[df['split'].isin([name])] for name in ('train', 'test')}
  train_rows, test_rows = partitions['train'], partitions['test']
  return (
      train_rows[feature_columns],
      test_rows[feature_columns],
      train_rows["label"],
      test_rows["label"],
  )

In [None]:
def split_and_train_mlp(df,shape):
  """Train a two-input MLP on paired embeddings and print test-set metrics.

  The two embeddings of each pair are concatenated and passed through a
  128-unit ReLU layer followed by a single sigmoid unit. Training runs for at
  most 20 epochs with early stopping on validation accuracy.

  Args:
    df: DataFrame accepted by ``split``; every ``embedding0``/``embedding1``
      cell must hold a sequence of ``shape`` numbers.
    shape: length of each embedding vector.

  Returns:
    The trained ``tf.keras.Model`` (best weights restored by early stopping).

  Raises:
    ValueError: if a stored embedding column is not made of vectors of
      length ``shape`` (e.g. it still contains scalar placeholders).
  """
  x_train, x_test, y_train, y_test = split(df)
  np.random.seed(42)
  tf.random.set_seed(42)

  def to_matrix(column):
    # Each cell holds one embedding; stack them into one float32 array and
    # fail early with a readable message if the width is not what the model expects.
    matrix = np.array(column.values.tolist()).astype('float32')
    if matrix.ndim != 2 or matrix.shape[1] != shape:
      raise ValueError(
          "column '{}' must hold vectors of length {}, got an array of shape {}".format(
              column.name, shape, matrix.shape))
    return matrix

  x_train_embedding0 = to_matrix(x_train['embedding0'])
  x_train_embedding1 = to_matrix(x_train['embedding1'])
  x_test_embedding0 = to_matrix(x_test['embedding0'])
  x_test_embedding1 = to_matrix(x_test['embedding1'])
  y_train_final = np.array(y_train.tolist()).astype(int)
  y_test_final = np.array(y_test.tolist()).astype(int)

  input1 = tf.keras.layers.Input(shape=(shape,))
  input2 = tf.keras.layers.Input(shape=(shape,))
  layer = tf.keras.layers.concatenate([input1,input2],axis=1)
  layer = tf.keras.layers.Dense(128,activation='relu',kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(layer)
  layer = tf.keras.layers.Dense(1,activation='sigmoid',kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(layer)
  model = tf.keras.Model([input1,input2],layer)
  # metrics is passed as a list (as in split_and_train_lstm); a bare string is
  # not accepted by every Keras version.
  model.compile(optimizer=tf.keras.optimizers.Adam(),loss=tf.keras.losses.BinaryCrossentropy(),metrics=['accuracy'])

  callback_1 = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
  # NOTE(review): validation_split holds out the LAST 20% of the training rows
  # (taken before shuffling), so the validation set depends on row order.
  model.fit(x=[x_train_embedding0,x_train_embedding1],y=y_train_final,epochs=20,validation_split=0.2,callbacks=[callback_1],shuffle=True)

  y_test_predicted = model.predict(x=[x_test_embedding0,x_test_embedding1])
  # Strict '>' keeps a probability of exactly 0.5 in class 0, as round() did.
  y_test_predicted_clean = (y_test_predicted.ravel() > 0.5).astype(int)
  specificity,precision,recall,accuracy,f1 = get_metrics(y_test_final,y_test_predicted_clean)
  print("Specificity: {}, Precision: {}, Recall {}, Accuracy {},F1 Score {}".format(specificity,precision,recall,accuracy,f1))
  return model

In [None]:
def _build_char_vocab(*text_columns):
  """Assign an id, starting at 1, to every character found in the given texts.

  Id 0 stays free for padding. Characters are sorted first so the mapping does
  not depend on set iteration order and is identical on every run.
  """
  alphabet = sorted({ch for texts in text_columns for text in texts for ch in text})
  return {ch: idx for idx, ch in enumerate(alphabet, start=1)}


def _encode_and_pad(texts, char_to_id, oov_id, max_len):
  """Convert strings into int32 id sequences of length ``max_len``.

  Characters missing from ``char_to_id`` map to ``oov_id``. Shorter texts are
  left-padded with 0; longer texts keep their last ``max_len`` characters.
  """
  sequences = [[char_to_id.get(ch, oov_id) for ch in text] for text in texts]
  return tf.keras.utils.pad_sequences(
      sequences,
      maxlen=max_len,
      dtype='int32',
      padding='pre',
      truncating='pre',
      value=0)


def split_and_train_lstm(df):
  """Train a character-level siamese BiLSTM on category pairs and print test metrics.

  Both strings of a pair go through the same embedding and bidirectional LSTM;
  the two encodings are combined element-wise as ``1 - |a - b|`` and fed to a
  single sigmoid unit.

  Args:
    df: DataFrame accepted by ``split``; ``category_0``/``category_1`` hold
      the strings of each pair and ``label`` the 0/1 target.

  Returns:
    The trained ``tf.keras.Model`` (best weights restored by early stopping).
  """
  MAX_LEN = 100        # characters kept per string
  EMBEDDING_DIM = 100  # width of the learned character embedding

  x_train, x_test, y_train, y_test = split(df)
  np.random.seed(42)
  tf.random.set_seed(42)

  # The vocabulary is built from the training strings only.
  char_to_id = _build_char_vocab(
      x_train['category_0'].tolist(), x_train['category_1'].tolist())
  # Characters that occur only in the test strings share one extra id
  # instead of raising KeyError.
  oov_id = len(char_to_id) + 1

  x_train_emb1 = _encode_and_pad(x_train['category_0'].tolist(), char_to_id, oov_id, MAX_LEN)
  x_train_emb2 = _encode_and_pad(x_train['category_1'].tolist(), char_to_id, oov_id, MAX_LEN)
  x_test_emb1 = _encode_and_pad(x_test['category_0'].tolist(), char_to_id, oov_id, MAX_LEN)
  x_test_emb2 = _encode_and_pad(x_test['category_1'].tolist(), char_to_id, oov_id, MAX_LEN)

  callback_1 = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
  print("Training LSTM......")
  first_sent_in = tf.keras.layers.Input(shape=(MAX_LEN,))
  second_sent_in = tf.keras.layers.Input(shape=(MAX_LEN,))
  # Rows of the table: 0 = padding, 1..len(char_to_id) = known characters, last = unknown.
  embedding_layer = tf.keras.layers.Embedding(len(char_to_id) + 2, EMBEDDING_DIM)
  first_sent_embedding = embedding_layer(first_sent_in)
  second_sent_embedding = embedding_layer(second_sent_in)
  # One shared encoder instance, so both inputs use the same weights.
  lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=256, return_sequences=False,kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42)))
  first_sent_encoded = lstm(first_sent_embedding)
  second_sent_encoded = lstm(second_sent_embedding)
  # Element-wise 1 - |a - b|; tf.abs keeps the whole graph on tf / tf.keras ops.
  l1_norm = lambda x: 1 - tf.abs(x[0] - x[1])
  merged = tf.keras.layers.Lambda(function=l1_norm, output_shape=lambda x: x[0], name='L1_distance')([first_sent_encoded, second_sent_encoded])
  predictions = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer',kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(merged)
  model = tf.keras.Model([first_sent_in, second_sent_in], predictions)
  model.compile(loss = 'binary_crossentropy', optimizer = "adam", metrics=["accuracy"])
  # summary() prints by itself; wrapping it in print() only appended "None".
  model.summary()
  # NOTE(review): validation_split holds out the LAST 10% of the training rows
  # (taken before shuffling), so the validation set depends on row order.
  model.fit([x_train_emb1, x_train_emb2], y_train.to_numpy().astype(int), validation_split=0.1, epochs = 20,shuffle=True, batch_size = 512,callbacks=[callback_1])
  y_test_predicted = model.predict(x=[x_test_emb1,x_test_emb2])
  # Strict '>' keeps a probability of exactly 0.5 in class 0, as round() did.
  y_test_predicted_clean = (y_test_predicted.ravel() > 0.5).astype(int)
  specificity,precision,recall,accuracy,f1 = get_metrics(y_test.to_numpy().astype(int),y_test_predicted_clean)
  print("Specificity: {}, Precision: {}, Recall {}, Accuracy {},F1 Score {}".format(specificity,precision,recall,accuracy,f1))
  return model

In [None]:
# fastText model used by the embedding cells further down (they call
# ft.get_word_vector); with this line commented out those cells raise NameError
# on a fresh kernel.
ft = fasttext.load_model('/content/cc.en.100.bin')

# Load the OOD pairs, keep only the columns used below, normalise both category
# strings and shuffle the rows reproducibly.
df = (
    pd.read_csv("/content/ood_pairs_simple3_86.csv")[["category_0","category_1","label","split"]]
    # preprocess() calls str methods, so rows with a missing category would crash it.
    .dropna(subset=["category_0", "category_1"])
    .assign(
        category_0=lambda frame: frame["category_0"].map(preprocess),
        category_1=lambda frame: frame["category_1"].map(preprocess),
    )
    .sample(frac=1,random_state=42)  # shuffle
    # drop=True: do not keep the pre-shuffle positions as an extra 'index' column.
    .reset_index(drop=True)
)

In [None]:
# Load the synthetic pairs, align their column names with `df`, drop incomplete
# rows, mark every row as training data and normalise both category strings.
xf = (
    pd.read_csv("synthetic_data_final.csv")
    .rename(columns={'cat0':'category_0','cat1':'category_1'})
    .dropna()
    .assign(
        split="train",
        category_0=lambda frame: frame["category_0"].map(preprocess),
        category_1=lambda frame: frame["category_1"].map(preprocess),
    )
)
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; otherwise the
# row labels of `xf` and `df` overlap and label-based access becomes ambiguous.
combined_dataset = pd.concat([xf,df], ignore_index=True)
combined_dataset

# Experiments with out-of-distribution (OOD) data

In [None]:
# Look up the fastText vector of each (already preprocessed) category string
# and store it as a plain Python list, one column per side of the pair.
for text_column, vector_column in (('category_0', 'embedding0'), ('category_1', 'embedding1')):
  df[vector_column] = df[text_column].map(lambda token: ft.get_word_vector(token).tolist())

In [None]:
# Scalar placeholder columns. split() selects 'embedding0'/'embedding1', so the
# columns have to exist -- presumably this cell is for runs where no real
# embeddings were computed (TODO confirm). They are created only when absent:
# assigning unconditionally replaced real vectors with scalars.
for column_name, placeholder in (('embedding0', 0), ('embedding1', 1)):
  if column_name not in df.columns:
    df[column_name] = placeholder

In [None]:
# Train/evaluate the MLP on `df`; shape must equal the length of the vectors
# currently stored in its embedding0/embedding1 columns.
split_and_train_mlp(df, shape=100)

In [None]:
# Encode both category columns with the all-MiniLM-L6-v2 sentence encoder and
# store one vector (as a plain list) per row, replacing any earlier embeddings.
llm = SentenceTransformer('all-MiniLM-L6-v2')
bert_embeddings0, bert_embeddings1 = (
    llm.encode(df[text_column].tolist()) for text_column in ('category_0', 'category_1')
)
df['embedding0'] = bert_embeddings0.tolist()
df['embedding1'] = bert_embeddings1.tolist()

In [None]:
# Train/evaluate the MLP on `df`; shape must equal the length of the vectors
# currently stored in its embedding0/embedding1 columns.
split_and_train_mlp(df, shape=384)

In [None]:
# Train/evaluate the character-level LSTM on the raw category strings of `df`.
split_and_train_lstm(df=df)

# Experiments with Synthetic Data


In [None]:
# fastText vector (as a plain list) for each preprocessed category string of
# the combined synthetic + OOD frame.
def _fasttext_vector(token):
  return ft.get_word_vector(token).tolist()

combined_dataset['embedding0'] = combined_dataset['category_0'].map(_fasttext_vector)
combined_dataset['embedding1'] = combined_dataset['category_1'].map(_fasttext_vector)

In [None]:
# Scalar placeholder columns. split() selects 'embedding0'/'embedding1', so the
# columns have to exist -- presumably this cell is for runs where no real
# embeddings were computed (TODO confirm). They are created only when absent:
# assigning unconditionally replaced real vectors with scalars.
for column_name, placeholder in (('embedding0', 0), ('embedding1', 1)):
  if column_name not in combined_dataset.columns:
    combined_dataset[column_name] = placeholder

In [None]:
# Train/evaluate the MLP on the combined frame; shape must equal the length of
# the vectors currently stored in its embedding0/embedding1 columns.
split_and_train_mlp(combined_dataset, shape=100)

In [None]:
# Encode both category columns of the combined frame with all-MiniLM-L6-v2 and
# store the vectors as plain lists, replacing any earlier embeddings.
llm = SentenceTransformer('all-MiniLM-L6-v2')
first_texts = combined_dataset['category_0'].tolist()
second_texts = combined_dataset['category_1'].tolist()
bert_embeddings0 = llm.encode(first_texts)
bert_embeddings1 = llm.encode(second_texts)
combined_dataset['embedding0'], combined_dataset['embedding1'] = (
    bert_embeddings0.tolist(),
    bert_embeddings1.tolist(),
)

In [None]:
# Train/evaluate the MLP on the combined frame; shape must equal the length of
# the vectors currently stored in its embedding0/embedding1 columns.
split_and_train_mlp(combined_dataset, shape=384)

In [None]:
# Train/evaluate the character-level LSTM on the combined frame and keep the
# trained model for later use.
model = split_and_train_lstm(df=combined_dataset)