In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/291_Project/

In [None]:
!pip install tensorflow_addons
! pip install transformers

In [None]:
import pandas as pd
from itertools import combinations
import ast
import numpy as np
import sys
import tqdm
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from torch import nn
from torchsummary import summary
import tensorflow_addons as tfa
import tensorflow as tf


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Read the pairs CSV, shuffle the rows with a fixed seed, renumber the index,
# and keep only the four columns used downstream.
ood_df = (
    pd.read_csv("ood_pairs_simple3_86.csv")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .loc[:, ["category_0", "category_1", "label", "split"]]
)
ood_df.head()

Unnamed: 0,category_0,category_1,label,split
0,atorney at law,attoney at law,1,test
1,new york 10038,san francisco 94102,0,test
2,san francisco 94121,san francisco ca 94123,1,train
3,new york 10013,chicago 60611,0,test
4,manhattan new york city,san franciscosequoia national pa,0,test


In [None]:
# Synthetic pairs: drop rows with missing values, rename the text columns to
# match ood_df, tag every row as "train", and keep the same four columns.
synth_df = (
    pd.read_csv("synthetic_data_final.csv")
    .dropna()
    .rename(columns={"cat0": "category_0", "cat1": "category_1"})
    .assign(split="train")
    .loc[:, ["category_0", "category_1", "label", "split"]]
)

# Shapes of the label == 1 and label == 0 subsets, in that order.
label_is_one = synth_df["label"] == 1
label_is_zero = synth_df["label"] == 0
print(synth_df[label_is_one].shape, synth_df[label_is_zero].shape)

(64198, 4) (129458, 4)


In [None]:
# for ood_pairs_simple.csv
# Evaluation rows are the ones marked split == "test". The .copy() makes
# test_df an independent frame, so the embedding columns assigned to it later
# in the notebook do not trigger pandas' SettingWithCopyWarning.
test_df = ood_df[ood_df['split']=="test"].copy()
# Training rows are the ones marked split == "train" plus all synthetic pairs.
train_df = pd.concat((ood_df[ood_df['split']=="train"], synth_df))

print(test_df.shape)
print(train_df.shape)

(7318, 4)
(203756, 4)


In [None]:
# Approach 1: Use SimCSE pretrained embeddings + MLP
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer and encoder are loaded from the same Hugging Face checkpoint.
SIMCSE_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(SIMCSE_CHECKPOINT)
simcse_pretrained_model = AutoModel.from_pretrained(SIMCSE_CHECKPOINT).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Turn each training column into an (n, 1) NumPy column vector and report its
# shape, in the order text1, text2, labels.
text1, text2, labels = (
    np.array(train_df[column_name]).reshape(-1, 1)
    for column_name in ("category_0", "category_1", "label")
)
for column_vector in (text1, text2, labels):
    print(column_vector.shape)


(203756, 1)
(203756, 1)
(203756, 1)


In [None]:
# Plain Python lists of the training texts and labels; these are what the
# batch generator slices.
text1, text2, labels = (
    train_df[column_name].values.tolist()
    for column_name in ("category_0", "category_1", "label")
)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """
    Keras ``Sequence`` that yields batches of SimCSE embeddings for text pairs.

    For each batch, both text columns are tokenized with the module-level
    ``tokenizer`` and encoded by the module-level ``simcse_pretrained_model``
    on ``device`` with gradients disabled. The encoder's ``pooler_output`` is
    returned as NumPy arrays together with the matching labels.

    Args:
        cat0: sliceable collection of strings, first text of each pair.
        cat1: sliceable collection of strings, second text of each pair.
        y: labels aligned with ``cat0`` / ``cat1``; stored as a NumPy array.
        batch_size: number of pairs per batch (the last batch may be smaller).

    Raises:
        ValueError: if ``batch_size`` is not positive or the three inputs do
            not have the same length.
    """
    def __init__(self, cat0, cat1, y, batch_size: int=64):
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if not (len(cat0) == len(cat1) == len(y)):
            raise ValueError("cat0, cat1 and y must have the same length")
        self.cat0 = cat0
        self.y = np.array(y)
        self.cat1 = cat1
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch, counting a partial last batch."""
        # Integer ceiling division; avoids np.math, which NumPy 2.0 removed.
        return -(-len(self.cat0) // self.batch_size)

    @staticmethod
    def _embed(texts):
        """Tokenize ``texts`` and return the encoder's pooled output as a NumPy array."""
        inputs = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt").to(device)
        # Only pooler_output is used, so the per-layer hidden states are not requested.
        with torch.no_grad():
            outputs = simcse_pretrained_model(**inputs, return_dict=True)
        return outputs.pooler_output.detach().cpu().numpy()

    def __getitem__(self, index):
        """
        Returns a batch of data

        Args:
            index: zero-based batch number.

        Returns:
            ``([embed1, embed2], batch_y)`` where ``embed1`` / ``embed2`` are
            the embeddings of the ``cat0`` / ``cat1`` texts in the batch and
            ``batch_y`` is the matching slice of labels.
        """
        start = index * self.batch_size
        stop = start + self.batch_size

        embed1 = self._embed(self.cat0[start:stop])
        embed2 = self._embed(self.cat1[start:stop])
        batch_y = self.y[start:stop]

        return [embed1, embed2], batch_y

In [None]:
# Fix the NumPy and TensorFlow seeds so weight initialisation is repeatable.
np.random.seed(42)
tf.random.set_seed(42)
init = tf.keras.initializers.GlorotUniform(seed=42)

# Width of each embedding input vector fed to the classifier.
EMBEDDING_DIM = 768

# Two embedding inputs, concatenated along the feature axis.
input1 = tf.keras.layers.Input(shape=(EMBEDDING_DIM,))
input2 = tf.keras.layers.Input(shape=(EMBEDDING_DIM,))
layer = tf.keras.layers.concatenate([input1,input2],axis=1)
# Extra Dense(768) and Dense(256) hidden layers are currently disabled; the
# classifier is a single 128-unit hidden layer followed by a sigmoid output.
# Each layer gets its own seeded initializer instance.
layer = tf.keras.layers.Dense(128,activation='relu', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(layer)
layer = tf.keras.layers.Dense(1,activation='sigmoid', kernel_initializer=tf.keras.initializers.GlorotUniform(seed=42))(layer)
model = tf.keras.Model([input1,input2],layer)
# metrics is passed as a list, the form Keras documents for Model.compile.
model.compile(optimizer=tf.keras.optimizers.Adam(),loss=tf.keras.losses.BinaryCrossentropy(),metrics=['accuracy'])

In [None]:
# Stream the training pairs in batches of 512; the generator computes the
# embeddings for each batch when Keras requests it.
generator = DataGenerator(cat0=text1, cat1=text2, y=labels, batch_size=512)

# Early stopping is left disabled:
# callback_1 = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)

model.fit(generator, epochs=5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f3a3a380d00>

In [None]:
# Persist the trained classifier under the relative path 'my_model'.
model.save('my_model')

In [None]:
# Reload the classifier saved at 'my_model' and train it for 2 more epochs on
# the same training generator.
model = tf.keras.models.load_model('my_model')
model.fit(generator,epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f39fc6b8970>

In [None]:
# NOTE: this rebinds text1 / text2 / labels (previously the training lists) to
# the test split. Re-create them from train_df before building a new training
# generator after this cell has run.
text1 = list(test_df['category_0'])
print(len(text1))
text2 = list(test_df['category_1'])
print(len(text2))
labels = list(test_df['label'])
print(len(labels))

batch_size = 1024


def _embed_in_batches(texts, batch_size):
    """Embed ``texts`` with the SimCSE encoder, ``batch_size`` strings at a time.

    Uses the module-level ``tokenizer``, ``simcse_pretrained_model`` and
    ``device``. Returns a list with one embedding (a list of floats, taken
    from the encoder's ``pooler_output``) per input string, in input order.
    """
    embeddings = []
    # Slicing past the end of a list is safe, so the last chunk needs no clamp.
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors="pt").to(device)
        # Only pooler_output is used, so the per-layer hidden states are not requested.
        with torch.no_grad():
            pooled = simcse_pretrained_model(**inputs, return_dict=True).pooler_output
        # extend() appends in place; repeated list concatenation copies the
        # whole accumulated list on every batch.
        embeddings.extend(pooled.tolist())
    return embeddings


embeddings1 = _embed_in_batches(text1, batch_size)
embeddings2 = _embed_in_batches(text2, batch_size)

# Work on an independent copy so adding columns does not raise
# SettingWithCopyWarning when test_df is a slice of another frame.
test_df = test_df.copy()
test_df["embedding_0"] = embeddings1
test_df["embedding_1"] = embeddings2
print(test_df.shape)

# X_test carries the embeddings (plus the raw texts); y_test carries the raw
# texts and the label.
X_test = test_df[["embedding_0","embedding_1","category_0","category_1"]]
y_test = test_df[["category_0","category_1", "label"]]
# One 2-D array per side of the pair, and integer labels, as model inputs.
x_test_embedding0 = np.array(X_test['embedding_0'].values.tolist())
x_test_embedding1 = np.array(X_test['embedding_1'].values.tolist())
y_test_final = np.array(y_test['label'].values).astype(int)

7318
7318
7318


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["embedding_0"] = embeddings1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["embedding_1"] = embeddings2


(7318, 6)


In [None]:
# Evaluate on the test embeddings; the result is the loss followed by the
# accuracy metric the model was compiled with.
model.evaluate(x=[x_test_embedding0,x_test_embedding1],y=y_test_final)



[0.43326711654663086, 0.8603443503379822]

In [None]:
# Model scores for every test pair, one row per pair with a single column.
y_test_predicted = model.predict(x=[x_test_embedding0, x_test_embedding1])
# Round each score to the nearest integer to get the predicted class.
y_test_predicted_clean = list(map(round, y_test_predicted[:, 0]))



In [None]:
from sklearn.metrics import confusion_matrix


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# labels=[0, 1] forces a 2x2 matrix even when one class is missing from the
# targets and predictions, so the four-way unpacking below always succeeds.
tn, fp, fn, tp = confusion_matrix(y_test_final, y_test_predicted_clean, labels=[0, 1]).ravel()
specificity = _safe_ratio(tn, tn + fp)
accuracy = _safe_ratio(tn + tp, tn + fp + fn + tp)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
# F1 is the harmonic mean of precision and recall; 0.0 when both are zero.
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
print("Specificity: {}, Precision: {}, Recall {}, Accuracy {},F1 Score {}".format(specificity,precision,recall,accuracy,f1_score))

Specificity: 0.9329107237189646, Precision: 0.7237128353879623, Recall 0.6089078706528371, Accuracy 0.860344356381525,F1 Score 0.6613651424784626


In [None]:
#Code to analyze the misclassified examples
# One list per output column; row i of every list describes the same pair.
misclassified_examples = {"category_0": [], "category_1":[], "target": [], "predicted":[], "filename":[]}
# y_test rows and prediction rows are in the same order, so walk them together.
for (_, row), prediction in zip(y_test.iterrows(), y_test_predicted):
  target = round(row['label'])
  predicted = round(prediction[0])
  if target!=predicted:
    misclassified_examples["category_0"].append(row["category_0"])
    misclassified_examples["category_1"].append(row["category_1"])
    misclassified_examples["target"].append(target)
    misclassified_examples["predicted"].append(predicted)
    # y_test may not carry a "filename" column; record None instead of
    # raising KeyError when it is absent.
    misclassified_examples["filename"].append(row.get("filename"))

In [None]:
# Write the misclassified pairs to CSV, one row per example.
misclassfied_simcse=pd.DataFrame.from_dict(misclassified_examples,orient='index').transpose()
misclassfied_simcse.to_csv('misclassified_ood.csv')

# Number of misclassified pairs per source file. The tally is only possible
# when y_test has a "filename" column; otherwise the dict stays empty instead
# of raising KeyError.
misclassfied_files = {}
if "filename" in y_test.columns:
  misclassfied_files = {filename: 0 for filename in y_test["filename"].unique()}
  # y_test rows and prediction rows are in the same order, so walk them together.
  for (_, row), prediction in zip(y_test.iterrows(), y_test_predicted):
    if round(row['label']) != round(prediction[0]):
      misclassfied_files[row["filename"]] += 1
