In [17]:
import torch
import torch.nn as nn

import torch.optim as optim

import torchtext
import torchtext.experimental
import torchtext.experimental.vectors
from torchtext.experimental.datasets.raw.text_classification import RawTextIterableDataset
from torchtext.experimental.datasets.text_classification import TextClassificationDataset
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor

import collections
import random
import time

import pandas as pd

from pathlib import Path

import spacy
import re

from functools import partial
from tqdm.notebook import tqdm

import gensim

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
seed = 1234

torch.manual_seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
def transform_dataframe_to_dict(data_frame, spacy_model="en_core_web_sm", max_length=None):
    final_data = []
    for i in data_frame.iterrows():
        temp = {
            'original_text': i[1]['comment'],
            'lable': i[1]['is_toxic'],
            'toxicity': i[1]['toxicity'],
            'text': clean_text(i[1]['comment'])
        }
        final_data.append(temp)
    texts = [f['text'] for f in final_data]
    tokenized_text = batch_tokenize(texts=texts, spacy_model=spacy_model, max_length=max_length)
    for index, f in enumerate(final_data):
        f['tokenized_text'] = tokenized_text[index]
        final_data[index] = f
    
    return final_data

def clean_text(text:str):
    """
    cleans text casing puntations and special characters. Removes extra space
    """
    text = re.sub('[^ a-zA-Z0-9]|unk', '', text)
    text = text.strip()
    return text

class Tokenizer:
    """Cleans the data and tokenizes it"""

    def __init__(self, spacy_model:str="en_core_web_sm", clean_text=clean_text, max_length=None):
        self.tokenizer_model = spacy.load("en_core_web_sm")
        self.clean_text = clean_text
        self.max_length = max_length
    
    def tokenize(self, s):
        if self.clean_text:
            s = clean_text(s)
        doc = self.tokenizer_model(s)
        tokens = [token.text for token in doc]
        
        if self.max_length:
            tokens = tokens[:self.max_length]
        
        return tokens

def batch_tokenize(texts:list, spacy_model="en_core_web_sm", max_length=None):
    """tokenizes a list via nlp pipeline space"""
    nlp = spacy.load(spacy_model)
    
    tokenized_list = []
    
    if max_length:
        for doc in tqdm(nlp.pipe(texts, disable=["ner", "tok2vec"])):
            tokenized_list.append([t.text for t in doc][:max_length])
    else:
        for doc in tqdm(nlp.pipe(texts, disable=["ner", "tok2vec"])):
            tokenized_list.append([t.text for t in doc])
    
    return tokenized_list
    
tokenizer = Tokenizer(spacy_model="en_core_web_sm", clean_text=clean_text, max_length=-1)
print(tokenizer.tokenize("This is a test sentence ?? "))

['This', 'is', 'a', 'test']


In [5]:
# read the csv files and do a process over it 

debias_train = Path('../data/wiki_debias_train.csv')
debias_dev = Path('../data/wiki_debias_dev.csv')
debias_test = Path('../data/wiki_debias_test.csv')

# Optimize this later. We don't need pandas dataframe
debias_train_raw = transform_dataframe_to_dict(pd.read_csv(debias_train))
debias_dev_raw = transform_dataframe_to_dict(pd.read_csv(debias_dev))
debias_test_raw = transform_dataframe_to_dict(pd.read_csv(debias_test))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [6]:
def build_vocab_from_data(raw_train_data, raw_dev_data):
    """This has been made customly for the given dataset. Need to write your own for any other use case"""
    
    token_freqs = collections.Counter()
    for data_point in raw_train_data:
        token_freqs.update(data_point['tokenized_text'])
    for data_point in raw_dev_data:
        token_freqs.update(data_point['tokenized_text'])
#     token_freqs.update(data_point['tokenized_text'] for data_point in raw_train_data)
#     token_freqs.update(data_point['tokenized_text'] for data_point in raw_dev_data)    
    vocab = torchtext.vocab.Vocab(token_freqs)
    return vocab

In [7]:
def process_data(raw_data, vocab):
    """raw data is assumed to be tokenized"""
    final_data = [(data_point['lable'], data_point['tokenized_text']) for data_point in raw_data]
    text_transformation = sequential_transforms(vocab_func(vocab),
                                               totensor(dtype=torch.long))
    label_transform = sequential_transforms(totensor(dtype=torch.long))
    
    transforms = (label_transform, text_transformation)
    
    return TextClassificationDataset(final_data,vocab,transforms)
    

In [8]:
vocab = build_vocab_from_data(raw_train_data=debias_train_raw,raw_dev_data=debias_dev_raw )

In [9]:
train_data = process_data(raw_data=debias_train_raw, vocab=vocab)
dev_data = process_data(raw_data=debias_dev_raw, vocab=vocab)
test_data = process_data(raw_data=debias_test_raw, vocab=vocab)

In [10]:
class Collator:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def collate(self, batch):
        labels, text = zip(*batch)
        labels = torch.LongTensor(labels)
        lengths = torch.LongTensor([len(x) for x in text])
        text = nn.utils.rnn.pad_sequence(text, padding_value= self.pad_idx)
        
        return labels, text, lengths

In [11]:
pad_token = '<pad>'
pad_idx = vocab[pad_token]

collator = Collator(pad_idx)

In [12]:
batch_size = 256

train_iterator = torch.utils.data.DataLoader(train_data, 
                                             batch_size, 
                                             shuffle = True, 
                                             collate_fn = collator.collate)

dev_iterator = torch.utils.data.DataLoader(dev_data, 
                                             batch_size, 
                                             shuffle = False, 
                                             collate_fn = collator.collate)

test_iterator = torch.utils.data.DataLoader(test_data, 
                                            batch_size, 
                                            shuffle = False, 
                                            collate_fn = collator.collate)

In [13]:
class BiLSTM(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, n_layers, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx = pad_idx)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers = n_layers, bidirectional = True, dropout = dropout)
        self.fc = nn.Linear(2 * hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, lengths):

        # text = [seq len, batch size]
        # lengths = [batch size]

        embedded = self.dropout(self.embedding(text))

        # embedded = [seq len, batch size, emb dim]

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted = False)

        packed_output, (hidden, cell) = self.lstm(packed_embedded)

        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output)

        # outputs = [seq_len, batch size, n directions * hid dim]
        # hidden = [n layers * n directions, batch size, hid dim]

        hidden_fwd = hidden[-2]
        hidden_bck = hidden[-1]

        # hidden_fwd/bck = [batch size, hid dim]

        hidden = torch.cat((hidden_fwd, hidden_bck), dim = 1)

        # hidden = [batch size, hid dim * 2]

        prediction = self.fc(self.dropout(hidden))

        # prediction = [batch size, output dim]

        return prediction
    
def initialize_parameters(m):
    if isinstance(m, nn.Embedding):
        nn.init.uniform_(m.weight, -0.05, 0.05)
    elif isinstance(m, nn.LSTM):
        for n, p in m.named_parameters():
            if 'weight_ih' in n:
                i, f, g, o = p.chunk(4)
                nn.init.xavier_uniform_(i)
                nn.init.xavier_uniform_(f)
                nn.init.xavier_uniform_(g)
                nn.init.xavier_uniform_(o)
            elif 'weight_hh' in n:
                i, f, g, o = p.chunk(4)
                nn.init.orthogonal_(i)
                nn.init.orthogonal_(f)
                nn.init.orthogonal_(g)
                nn.init.orthogonal_(o)
            elif 'bias' in n:
                i, f, g, o = p.chunk(4)
                nn.init.zeros_(i)
                nn.init.ones_(f)
                nn.init.zeros_(g)
                nn.init.zeros_(o)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

In [14]:


input_dim = len(vocab)
emb_dim = 300
hid_dim = 256
output_dim = 2
n_layers = 2
dropout = 0.5

model = BiLSTM(input_dim, emb_dim, hid_dim, output_dim, n_layers, dropout, pad_idx)
model.apply(initialize_parameters)


BiLSTM(
  (embedding): Embedding(252723, 300, padding_idx=1)
  (lstm): LSTM(300, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [15]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
# device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
device = torch.device('cpu')

In [21]:
# set embeddings to your liking
# def get_pretrained_embedding(initial_embedding, pretrained_vocab, pretrained_vectors, vocab, unk_token):
    
#     pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).detach()    
    
#     unk_tokens = []
    
#     for idx, token in tqdm(enumerate(vocab.itos)):
#         if token in pretrained_vocab:
#             pretrained_vector = torch.tensor(pretrained_vectors[token],device=device)
#             pretrained_embedding[idx] = pretrained_vector
#         else:
#             unk_tokens.append(token)
        
#     return pretrained_embedding, unk_tokens


# set embeddings to your liking


print("reading new vector file")
pretrained_embedding = gensim.models.KeyedVectors.load_word2vec_format("../../bias-in-nlp/src/testvec1")
pretrained_vocab = [key for key in pretrained_embedding.vocab.keys()]



reading new vector file


In [22]:
def get_pretrained_embedding(initial_embedding, pretrained_vocab, pretrained_vectors, vocab, unk_token,device):
    
    pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).cpu().detach().numpy()  
    
    unk_tokens = []
    
    for idx, token in tqdm(enumerate(vocab.itos)):
        try:
            pretrained_embedding[idx] = pretrained_vectors[token]
        except KeyError:
            unk_tokens.append(token)
    
    pretrained_embedding = torch.from_numpy(pretrained_embedding).to(device)
    return pretrained_embedding, unk_tokens

In [23]:
print("updating embeddings")
unk_token = '<unk>'
pretrained_embedding, unk_tokens = get_pretrained_embedding(initial_embedding=model.embedding, 
                                                            pretrained_vocab=pretrained_vocab,
                                                            pretrained_vectors=pretrained_embedding,
                                                            vocab=vocab, 
                                                            unk_token=unk_token,
                                                           device=device)

model.embedding.weight.data.copy_(pretrained_embedding)

updating embeddings


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




tensor([[-0.0005,  0.0132, -0.0177,  ..., -0.0187, -0.0095,  0.0047],
        [-0.0227,  0.0025,  0.0059,  ..., -0.0050, -0.0287,  0.0113],
        [ 0.0543,  0.0044,  0.0116,  ...,  0.0142, -0.0106, -0.0004],
        ...,
        [-0.0045, -0.0004, -0.0314,  ...,  0.0352,  0.0335, -0.0246],
        [-0.0066,  0.0040, -0.0320,  ..., -0.0102, -0.0273,  0.0151],
        [-0.0180,  0.0105, -0.0027,  ...,  0.0232,  0.0074,  0.0356]])

In [24]:
def calculate_accuracy(predictions, labels):
    top_predictions = predictions.argmax(1, keepdim = True)
    correct = top_predictions.eq(labels.view_as(top_predictions)).sum()
    accuracy = correct.float() / labels.shape[0]
    return accuracy

In [25]:
def train(model, iterator, optimizer, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for labels, text, lengths in iterator:
        
        labels = labels.to(device)
        text = text.to(device)

        optimizer.zero_grad()
        
        predictions = model(text, lengths)
        
        loss = criterion(predictions, labels)
        
        acc = calculate_accuracy(predictions, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [26]:
def evaluate(model, iterator, criterion, device):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for labels, text, lengths in iterator:

            labels = labels.to(device)
            text = text.to(device)
            
            predictions = model(text, lengths)
            
            loss = criterion(predictions, labels)
            
            acc = calculate_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [None]:
n_epochs = 10

best_valid_loss = float('inf')

for epoch in range(n_epochs):

    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bilstm-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# from gensim.models import KeyedVectors
import gensim

In [None]:
# word2vec =  KeyedVectors.load_word2vec_format("../../bias-in-nlp/src/testvec1", binary=True)

# model = gensim.models.KeyedVectors.load_word2vec_format(os.path.join(os.path.dirname(__file__), 'GoogleNews-vectors-negative300.bin'), binary=True)

pretrained_embedding = gensim.models.KeyedVectors.load_word2vec_format("../../bias-in-nlp/src/testvec1")

In [None]:
pretrained_vocab = [key for key in pretrained_embedding.vocab.keys()]
pretrained_vocab[0]

In [None]:


def get_pretrained_embedding(initial_embedding, pretrained_vocab, pretrained_vectors, vocab, unk_token):
    
    pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).detach()    
    
    unk_tokens = []
    
    for idx, token in tqdm(enumerate(vocab.itos)):
        if token in pretrained_vocab:
            pretrained_vector = torch.tensor(pretrained_vectors[token],device=device)
            pretrained_embedding[idx] = pretrained_vector
        else:
            unk_tokens.append(token)
        
    return pretrained_embedding, unk_tokens


In [None]:
unk_token = '<unk>'
pretrained_embedding, unk_tokens = get_pretrained_embedding(initial_embedding=model.embedding, 
                                                            pretrained_vocab=pretrained_vocab,
                                                            pretrained_vectors=pretrained_embedding,
                                                            vocab=vocab, 
                                                            unk_token=unk_token)

In [None]:
torch.from_numpy(pretrained_embedding.wv.syn0).to(device)

In [None]:
# set embeddings to your liking
def get_pretrained_embedding(initial_embedding, pretrained_vocab, pretrained_vectors, vocab, unk_token):
    
    pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).detach()    
    
    unk_tokens = []
    
    for idx, token in tqdm(enumerate(vocab.itos)):
        if token in pretrained_vocab:
            pretrained_vector = torch.tensor(pretrained_vectors[token],device=device)
            pretrained_embedding[idx] = pretrained_vector
        else:
            unk_tokens.append(token)
        
    return pretrained_embedding, unk_tokens


# set embeddings to your liking
def get_pretrained_embedding(initial_embedding, pretrained_vocab, pretrained_vectors, vocab, unk_token,device):
    
    pretrained_embedding = torch.FloatTensor(initial_embedding.weight.clone()).cpu().detach().numpy()  
    
    unk_tokens = []
    
    for idx, token in tqdm(enumerate(vocab.itos)):
        try:
            pretrained_embedding[idx] = pretrained_vector
        except KeyError:
            unk_tokens.append(token)
    
    pretrained_embedding = torch.from_numpy(pretrained_embedding).to(device)
    return pretrained_embedding, unk_tokens