# Key Notes

1. The goal is to find outliers in a corpus of addresses.
2. This notebook contains two supervised approaches: corrupting real data and generating fake data.
3. In both cases a pre-trained BERT model (DistilBERT) is fine-tuned as the classifier.

## Utilities

In [None]:
# !pip install nltk
# !pip install transformers
# !pip install spacy
# !python3 -m spacy download en_core_web_sm
# !pip install torch
#!pip install faker

In [None]:
from transformers import *
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import codecs
from nltk.corpus import stopwords
import string
from scipy import sparse
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from math import log
import operator

import torch 
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import math
import spacy
# Seed torch, the stdlib `random` module and NumPy with one shared value.
SEED = 123
torch.manual_seed(SEED)

import random
random.seed(SEED)

np.random.seed(SEED)

# spaCy English pipeline.
nlp = spacy.load("en_core_web_sm")

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import sys
import os

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import codecs
import string
from scipy import sparse
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.autograd as autograd
import math
import torch.utils.data as data_utils
import transformers

import faker
from faker import Faker

import random
from random import shuffle

from sklearn.metrics import roc_auc_score, f1_score, classification_report

In [None]:
# Show up to 60 columns when a DataFrame is displayed.
pd.options.display.max_columns = 60

## Load Model

In [None]:
# DistilBERT classes and the checkpoint name used by the loading cell.
model_class = DistilBertModel
tokenizer_class = DistilBertTokenizer
config_class = DistilBertConfig
pretrained_weights = 'distilbert-base-uncased'

In [None]:
# Checkpoint shared by the config, the tokenizer and the encoder.
pretrained_weights = 'distilbert-base-uncased'

# Configuration requested with two labels.
config = config_class.from_pretrained(pretrained_weights, num_labels=2)

# Lower-casing tokenizer, matching the uncased checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

# Pre-trained encoder that the classifier wraps.
bert_model = model_class.from_pretrained(pretrained_weights, config=config)

## Data

In [None]:
df_data = pd.read_csv('offices_data.csv')

# Series.unique() keeps NaN as one of the values. Drop missing addresses first so
# every entry is a string: the corruption step calls .split() on each address.
addresses = df_data['adr_ln_1_txt'].dropna().unique()

## Corruption Approach

In [None]:
# Corrupt the data by dropping one randomly chosen word from a randomly chosen 20 percent of the corpus.
# The corrupted data points are labelled 0 and the untouched data points are labelled 1.
# A neural network built on pre-trained BERT embeddings is trained to predict 0s (the outliers) and 1s.
# Recall for the outlier class can then be optimized with additional rules if needed.

In [None]:
def corrupt_data(corpus, corrupt_sample_size):
    """Split ``corpus`` into untouched addresses and word-dropped (corrupted) ones.

    ``corrupt_sample_size`` distinct positions are drawn with ``random.sample``.
    Each address at one of those positions loses one randomly chosen
    space-separated token; every other address is returned unchanged.

    Args:
        corpus: Indexable sequence of address strings.
        corrupt_sample_size: Number of addresses to corrupt.

    Returns:
        A ``(real_corpus, corrupt_corpus_tokenized)`` pair of lists, both in
        corpus order. An address made of a single token becomes an empty
        string once that token is dropped.

    Raises:
        ValueError: Raised by ``random.sample`` when ``corrupt_sample_size`` is
            negative or larger than ``len(corpus)``.
    """
    # A set gives O(1) membership tests; a list here makes the split quadratic.
    corrupt_indices = set(random.sample(range(len(corpus)), corrupt_sample_size))

    real_corpus = []
    corrupt_corpus_tokenized = []
    for position in range(len(corpus)):
        address = corpus[position]
        if position not in corrupt_indices:
            real_corpus.append(address)
            continue
        tokens = address.split(" ")
        # One uniformly chosen token position is removed.
        dropped = int(np.random.randint(0, len(tokens), size=1)[0])
        corrupt_corpus_tokenized.append(" ".join(tokens[:dropped] + tokens[dropped + 1:]))
    return real_corpus, corrupt_corpus_tokenized

In [None]:
def make_data_for_model(real_corpus, corrupt_corpus, val_sample_size, test_sample_size, corpus):
    """Label, shuffle and split real/corrupted addresses into train, validation and test sets.

    Real addresses get label 1 and corrupted ones label 0. The combined rows are
    shuffled and lower-cased, ``test_sample_size`` randomly chosen rows are held
    out as the test set, and the rest is divided by scikit-learn's
    ``train_test_split`` with ``test_size=val_sample_size``.

    Args:
        real_corpus: Untouched address strings.
        corrupt_corpus: Corrupted address strings.
        val_sample_size: Passed to ``train_test_split`` as ``test_size``.
        test_sample_size: Number of rows held out for the test set.
        corpus: Original corpus. Kept for backward compatibility; the split is
            sized by the number of labelled rows actually built.

    Returns:
        ``(train_x, val_x, test_x, train_y, val_y, test_y)``.
    """
    from sklearn.model_selection import train_test_split

    df_real = pd.DataFrame()
    df_real['corpus'] = real_corpus
    df_real['label'] = 1

    df_corrupt = pd.DataFrame()
    df_corrupt['corpus'] = corrupt_corpus
    df_corrupt['label'] = 0

    df_for_model = pd.concat([df_real, df_corrupt], axis = 0)
    df_for_model = df_for_model.sample(frac = 1)
    df_for_model['corpus'] = df_for_model['corpus'].str.lower()
    x = df_for_model['corpus'].values
    y = df_for_model['label'].values

    # Index the rows that exist rather than len(corpus): a mismatch used to raise
    # IndexError or silently drop trailing rows.
    n_rows = len(x)
    # Set membership keeps the partition linear instead of quadratic.
    test_indices = set(random.sample(range(n_rows), test_sample_size))

    test_x, test_y, rest_x, rest_y = [], [], [], []
    for row in range(n_rows):
        if row in test_indices:
            test_x.append(x[row])
            test_y.append(y[row])
        else:
            rest_x.append(x[row])
            rest_y.append(y[row])

    train_x, val_x, train_y, val_y = train_test_split(rest_x, rest_y, test_size = val_sample_size)
    return train_x, val_x, test_x, train_y, val_y, test_y

In [None]:
# Corrupt a fifth of the addresses; the validation and test sample sizes are a
# fifth and a tenth of the corpus length.
real_corpus, corrupt_corpus = corrupt_data(
    corpus=addresses,
    corrupt_sample_size=len(addresses) // 5,
)
train_x, val_x, test_x, train_y, val_y, test_y = make_data_for_model(
    real_corpus=real_corpus,
    corrupt_corpus=corrupt_corpus,
    val_sample_size=len(addresses) // 5,
    test_sample_size=len(addresses) // 10,
    corpus=addresses,
)

In [None]:
# Names used by the rest of the notebook for the train / dev / test splits.
# test_y already carries its final name.
corpus, y = train_x, train_y
dev_corpus, dev_y = val_x, val_y
test_corpus = test_x

In [None]:
# Build fresh token lists instead of overwriting the raw text in place. The old
# version aliased `corpus` and mutated it, so re-running the cell fed token lists
# back into the tokenizer, and a bare `except: pass` hid that along with any
# genuine tokenizer error. Errors now surface where they happen.
corpus_tokenized = [tokenizer.tokenize(text) for text in corpus]
dev_corpus_tokenized = [tokenizer.tokenize(text) for text in dev_corpus]
test_corpus_tokenized = [tokenizer.tokenize(text) for text in test_corpus]

In [None]:
def vectorize(corpus, max_seq_length=64):
    """Turn tokenized texts into fixed-length id, segment and mask lists.

    Each token list is wrapped in ``[CLS]`` / ``[SEP]``, converted to ids with the
    notebook-level ``tokenizer`` and right-padded with zeros to ``max_seq_length``.
    Token lists that would not fit are truncated so the special tokens are kept;
    previously such inputs tripped the length assertion.

    Args:
        corpus: Sequence of token lists.
        max_seq_length: Length of every returned row, including the two special
            tokens. Defaults to 64.

    Returns:
        ``(input_ids_list, segment_ids_list, input_mask_list)``: three lists of
        equal-length integer lists. Segment ids are all zero; the mask is 1 for
        real tokens and 0 for padding.

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2.
    """
    if max_seq_length < 2:
        raise ValueError("max_seq_length must leave room for [CLS] and [SEP]")

    input_ids_list = []
    segment_ids_list = []
    input_mask_list = []
    max_token_id = 0
    max_tokens = max_seq_length - 2  # two slots are reserved for [CLS] and [SEP]

    for tokens in corpus:
        wrapped = ["[CLS]"] + list(tokens[:max_tokens]) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(wrapped)
        max_token_id = max(max_token_id, max(input_ids))

        padding = [0] * (max_seq_length - len(input_ids))
        input_mask_list.append([1] * len(input_ids) + padding)
        segment_ids_list.append([0] * max_seq_length)
        input_ids_list.append(input_ids + padding)

    # Largest token id seen (0 for an empty corpus).
    print (max_token_id)

    return input_ids_list, segment_ids_list, input_mask_list

In [None]:
# Encode the train, dev and test token lists into padded ids, segment ids and masks.
(input_ids_list,
 segment_ids_list,
 input_mask_list) = vectorize(corpus_tokenized)

(input_ids_list2,
 segment_ids_list2,
 input_mask_list2) = vectorize(dev_corpus_tokenized)

(input_ids_list_test,
 segment_ids_list_test,
 input_mask_list_test) = vectorize(test_corpus_tokenized)

In [None]:
# Convert each label list to an array and append a trailing axis of length 1.
y = np.expand_dims(np.array(y), axis=-1)
dev_y = np.expand_dims(np.array(dev_y), axis=-1)
test_y = np.expand_dims(np.array(test_y), axis=-1)

In [None]:
def _to_device_tensor(array):
    """Wrap a NumPy array as a tensor placed on the notebook-level `device`."""
    return torch.from_numpy(array).to(device)


# Turn the nested Python lists into NumPy arrays, keeping the same names.
input_ids_list = np.array(input_ids_list)
segment_ids_list = np.array(segment_ids_list)
input_mask_list = np.array(input_mask_list)

input_ids_list2 = np.array(input_ids_list2)
segment_ids_list2 = np.array(segment_ids_list2)
input_mask_list2 = np.array(input_mask_list2)

input_ids_list_test = np.array(input_ids_list_test)
segment_ids_list_test = np.array(segment_ids_list_test)
input_mask_list_test = np.array(input_mask_list_test)

# Only the training dataset carries labels; dev and test hold inputs alone.
train_dset = data_utils.TensorDataset(
    _to_device_tensor(input_ids_list),
    _to_device_tensor(segment_ids_list),
    _to_device_tensor(input_mask_list),
    _to_device_tensor(y),
)
train_loader = data_utils.DataLoader(train_dset, batch_size=32)

val_dset = data_utils.TensorDataset(
    _to_device_tensor(input_ids_list2),
    _to_device_tensor(segment_ids_list2),
    _to_device_tensor(input_mask_list2),
)
val_loader = data_utils.DataLoader(val_dset, batch_size=32)

test_dset = data_utils.TensorDataset(
    _to_device_tensor(input_ids_list_test),
    _to_device_tensor(segment_ids_list_test),
    _to_device_tensor(input_mask_list_test),
)
test_loader = data_utils.DataLoader(test_dset, batch_size=32)

## Model

In [None]:
class ClassifierDBert(nn.Module):
    """Two-class classification head on top of a supplied encoder.

    The encoder output at sequence position 0 goes through a 768->768 linear
    layer with ReLU, dropout, and a 768->2 linear layer; the result is returned
    as log-probabilities over the two classes.
    """

    def __init__(self, bert_model, dropout_p):
        """Store the encoder and create the dropout and linear layers.

        Args:
            bert_model: Encoder module called as ``bert_model(ids, attention_mask=...)``.
            dropout_p: Dropout probability applied after the ReLU.
        """
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_p)
        self.pre_classifier = nn.Linear(768, 768)
        self.hidden2label = nn.Linear(768, 2)

    def forward(self, sentence1, segment_ids1, input_mask1):
        """Return per-class log-probabilities for a batch.

        Args:
            sentence1: Token ids passed to the encoder.
            segment_ids1: Accepted for interface compatibility; not used.
            input_mask1: Attention mask passed to the encoder.
        """
        # First element of the encoder output.
        # assumes shape (batch, seq, 768) - TODO confirm
        hidden_states = self.bert(sentence1, attention_mask=input_mask1)[0]
        # Vector at sequence position 0.
        first_token = hidden_states[:, 0]
        activated = F.relu(self.pre_classifier(first_token))
        logits = self.hidden2label(self.dropout(activated))
        return F.log_softmax(logits, dim=1)

In [None]:
# Values defined alongside the model; the training cell uses its own EPOCHS.
no_up = 0
EPOCH = 10

# Classifier head over the pre-trained encoder, with dropout probability 0.6.
model = ClassifierDBert(bert_model, 0.6)

# The classifier returns log-probabilities, so NLLLoss is the matching loss.
loss_function = nn.NLLLoss()

# Adam over every parameter of the model, encoder included.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


## Training

In [None]:
def train_epoch(model, dataloader, loss_function, optimizer, epoch_num):
    """Run one training pass over ``dataloader`` and print the mean loss and macro F1.

    The model and every batch are moved to the notebook-level ``device``.

    Args:
        model: Module called as ``model(input_ids, segment_ids, input_mask)`` and
            returning per-class scores of shape (batch, classes).
        dataloader: Yields ``(input_ids, segment_ids, input_mask, label)`` batches.
        loss_function: Callable applied to ``(pred, label.view(-1))``.
        optimizer: Optimizer stepped once per batch.
        epoch_num: Epoch index, used only in the printed messages.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    from sklearn.metrics import f1_score

    # Moving the model once is enough; it used to be repeated for every batch.
    model.to(device)
    model.train()

    total_loss = 0.0
    count = 0
    truth_res = []
    pred_res = []
    for input_ids, segment_ids, input_mask, label in dataloader:
        input_ids = input_ids.to(device)
        segment_ids = segment_ids.to(device)
        input_mask = input_mask.to(device)
        label = label.to(device)

        pred = model(input_ids, segment_ids, input_mask)
        model.zero_grad()
        loss = loss_function(pred, label.view(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        pred_res.append(pred.detach().max(dim=1)[1].cpu())
        truth_res.append(label.detach().view(-1).cpu())
        count += 1
        if count % 5000 == 0:
            print('[TRAIN] epoch: %d iterations: %d loss :%g' % (epoch_num, count, batch_loss))

    if count == 0:
        raise ValueError("dataloader yielded no batches")

    # Each batch contributes one loss value, so average over batches. The old
    # code divided by the global number of training samples.
    avg_loss = total_loss / count
    print('[TRAIN] epoch: %d done! \n train avg_loss:%g , f1:%g'%(epoch_num, avg_loss, f1_score(torch.cat(truth_res),torch.cat(pred_res), average = 'macro')))
    


In [None]:
def eval_epoch(model, dataloader, loss_function, optimizer, epoch_num):
    """Run the model over ``dataloader`` in eval mode and collect its predictions.

    The model and every batch are moved to the notebook-level ``device``.

    Args:
        model: Module called as ``model(input_ids, segment_ids, input_mask)`` and
            returning log-probabilities of shape (batch, classes).
        dataloader: Yields ``(input_ids, segment_ids, input_mask)`` batches.
        loss_function: Unused; kept so the signature matches ``train_epoch``.
        optimizer: Unused; kept so the signature matches ``train_epoch``.
        epoch_num: Epoch index, used only in the printed message.

    Returns:
        ``(pred_probs_list, pred_res)``: per-batch NumPy arrays holding
        ``exp`` of the class-1 score, and per-batch CPU tensors holding the
        argmax class.
    """
    model.to(device)
    model.eval()

    pred_probs_list = []
    pred_res = []
    # No gradients are needed for inference; skipping the autograd graph saves memory.
    with torch.no_grad():
        for input_ids, segment_ids, input_mask in dataloader:
            input_ids = input_ids.to(device)
            segment_ids = segment_ids.to(device)
            input_mask = input_mask.to(device)

            pred = model(input_ids, segment_ids, input_mask)
            # The model returns log-probabilities; exp() gives the class-1 probability.
            pred_probs_list.append(np.exp(pred[:, 1].cpu().numpy()))
            pred_res.append(pred.max(dim=1)[1].cpu())

    print('[EVAL] epoch: %d done!'%(epoch_num))
    return pred_probs_list, pred_res

In [None]:
EPOCHS = 5
epoch = 0
for epoch in range(EPOCHS):
    train_epoch(model, train_loader, loss_function, optimizer, epoch)

    # Dev split: flatten the per-batch outputs, then print ROC AUC and F1.
    pred_probs_list, pred_res = eval_epoch(model, val_loader, loss_function, optimizer, epoch)
    pred_probs_list = np.ravel(np.concatenate(pred_probs_list))
    pred_res = np.ravel(np.concatenate(pred_res))
    print(roc_auc_score(dev_y, pred_probs_list), f1_score(dev_y, pred_res))

    # Test split: same metrics, followed by the per-class report.
    pred_probs_list_test, pred_res_test = eval_epoch(model, test_loader, loss_function, optimizer, epoch)
    pred_probs_list_test = np.ravel(np.concatenate(pred_probs_list_test))
    pred_res_test = np.ravel(np.concatenate(pred_res_test))
    print(roc_auc_score(test_y, pred_probs_list_test), f1_score(test_y, pred_res_test))
    print(classification_report(test_y, pred_res_test))

# Fake Data Approach

In [None]:
#Generate fake addresses and corrupt them by dropping some words and characters at random.
#After that step, follow the same approach as in the data-corruption section.

In [None]:
# Faker uses its own random generator, which the random / NumPy / torch seeds do
# not cover. Seed it too so the generated addresses are reproducible.
Faker.seed(123)
fake = Faker()

In [None]:
# NOTE(review): `train_addresses` was never defined in this notebook, so the cell
# failed on a fresh kernel. It is assumed to be the unique real addresses from
# `df_data` (the next cell calls .tolist() on it) - confirm against the original run.
train_addresses = df_data['adr_ln_1_txt'].dropna().unique()

val_addresses = []
test_addresses = []
for _ in range(len(train_addresses)):
    # Flatten each multi-line fake address onto one line and strip commas. The old
    # code indexed the lists with an undefined `idx`, which raised NameError.
    val_addresses.append(fake.address().replace('\n', " ").replace(",", ""))
    test_addresses.append(fake.address().replace('\n', " ").replace(",", ""))

In [None]:
# Build the combined corpus as a new list. `train_addresses` is no longer rebound
# or extended in place, so this cell can be re-run and the previous cell keeps
# seeing the same length.
addresses = np.asarray(train_addresses).tolist() + val_addresses + test_addresses

In [None]:
shuffle(addresses)

# Split the shuffled corpus into three consecutive parts. The size is derived from
# the data instead of the hard-coded 65757, so the split stays balanced for any
# corpus; for a corpus of 3 * 65757 addresses the slices are unchanged.
split_size = len(addresses) // 3
real_corpus = addresses[:split_size]
corrupt_corpus_word = addresses[split_size:2 * split_size]
corrupt_corpus_char = addresses[2 * split_size:]

In [None]:
# Word level: remove one randomly chosen space-separated token from each address.
corrupt_corpus_word_level = []
for address in corrupt_corpus_word:
    words = address.split(" ")
    dropped_positions = np.random.randint(0, len(words), size = 1)
    kept_words = [word for position, word in enumerate(words) if position not in dropped_positions]
    corrupt_corpus_word_level.append(" ".join(kept_words))

# Character level: draw between 1 and 4 positions (repeats possible) and remove
# the characters at those positions.
corrupt_corpus_char_level = []
for address in corrupt_corpus_char:
    n_draws = np.random.randint(1, 5, size = 1)
    dropped_positions = np.random.randint(0, len(address), size = n_draws)
    kept_chars = [char for position, char in enumerate(address) if position not in dropped_positions]
    corrupt_corpus_char_level.append("".join(kept_chars))

In [None]:
#Follow the same vectorizing process followed by running the model to complete this approach.