In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
#import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch.nn as nn

In [2]:
# Read the complete dataset from disk. Later cells rely on its
# 'split', 'toxicity' and 'comment_text' columns.
df = pd.read_csv(filepath_or_buffer="domain_data.csv")

In [3]:
# Fixed seed for the downsampling below, so that the same rows are kept on
# every run and downstream results can be reproduced.
DOWNSAMPLE_SEED = 42

# Training split: randomly drop 85% of the rows whose toxicity is 0.
training_data = df[df['split'] == 'train']
training_data = training_data.drop(
    training_data.query('toxicity==0').sample(frac=.85, random_state=DOWNSAMPLE_SEED).index
)

# Test split: used as-is, no downsampling.
test_data = df[df['split'] == 'test']

# Validation split: same 85% downsampling of toxicity == 0 rows as for training.
validation_data = df[df['split'] == 'val']
validation_data = validation_data.drop(
    validation_data.query('toxicity==0').sample(frac=.85, random_state=DOWNSAMPLE_SEED).index
)

In [4]:
# Pretrained BERT tokenizer and encoder used to turn comments into features.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# .eval() switches the network to inference mode and returns the module itself.
model = AutoModel.from_pretrained("bert-base-uncased").eval()
# Everything runs on CPU here; model.to('cuda') would move it to a GPU.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Pull the comment texts (X_*) and toxicity labels (Y_*) of every split out of
# the data frames and into plain numpy arrays.
X_train, Y_train = (
    np.array(training_data[column].values.tolist())
    for column in ('comment_text', 'toxicity')
)

X_test, Y_test = (
    np.array(test_data[column].values.tolist())
    for column in ('comment_text', 'toxicity')
)

X_val, Y_val = (
    np.array(validation_data[column].values.tolist())
    for column in ('comment_text', 'toxicity')
)

In [6]:
# Number of samples per array, one value per line.
# NOTE(review): X_train is reported twice -- possibly one of the two was meant
# to be a different array (e.g. Y_train); confirm.
print(*(len(samples) for samples in (X_train, X_test, X_train, X_val)), sep="\n")

24520
47875
24520
4061


In [None]:
# Word-piece tokens of every training comment, in the same order as X_train.
tokenized_training = [tokenizer.tokenize(comment) for comment in X_train]

In [7]:
# Number of tokens in each tokenized training comment.
lengths = np.array([len(tokens) for tokens in tokenized_training])
# Report the mean and the median token count.
print(lengths.mean())
print(np.median(lengths))
# Leave the counts in ascending order.
lengths.sort()

90.534176182708
72.0


In [7]:
def BuildSentenceMatrix(dataset, embedding_dimension, features, text_tokenizer=None, encoder=None):
    """Encode every text as a fixed-size matrix of per-token encoder outputs.

    Each text is tokenized and passed through the encoder on its own, with
    gradients disabled. The first element of the encoder output is then
    zero-padded at the end, or cut, so that it has exactly
    `embedding_dimension` rows.

    Args:
        dataset: iterable of strings to encode.
        embedding_dimension: number of token rows kept per text. Despite the
            name, this is the sequence length, not the width of a token vector.
        features: width of one token vector; it must equal the encoder's output
            width, otherwise padding cannot be concatenated.
        text_tokenizer: tokenizer to use; defaults to the notebook-level
            `tokenizer`.
        encoder: model to use; defaults to the notebook-level `model`.

    Returns:
        A list with one tensor of shape (embedding_dimension, features) per
        input text, in input order.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    enc = model if encoder is None else encoder

    converted_dataset = []

    with torch.no_grad():

        for text in dataset:

            # truncation=True cuts the input at the tokenizer's configured maximum
            # length, so an over-long comment no longer overflows the encoder's
            # position embeddings and crashes the forward pass.
            inputs = tok(text, return_tensors="pt", truncation=True)

            # Drop only the batch axis; a bare squeeze() would also collapse any
            # other axis that happens to have size 1.
            token_vectors = enc(**inputs)[0].squeeze(0)

            missing_rows = embedding_dimension - len(token_vectors)
            if missing_rows > 0:
                # new_zeros() creates the padding with the dtype and on the device
                # of the encoder output, so this also works with the model on a GPU.
                padding = token_vectors.new_zeros((missing_rows, features))
                token_vectors = torch.cat((token_vectors, padding))
            elif missing_rows < 0:
                # clone() copies the kept rows; a plain slice is a view that would
                # keep the whole, longer encoder output alive in memory.
                token_vectors = token_vectors[:embedding_dimension].clone()

            converted_dataset.append(token_vectors)

    return converted_dataset
    


In [1]:
# Encode the comments as matrices of 20 token rows x 768 features each.
# NOTE(review): the result is named "...TestSet" but it is built from X_train --
# confirm which split was intended.
transformedTestSet = BuildSentenceMatrix(
    dataset=X_train,
    embedding_dimension=20,
    features=768,
)

In [2]:
# Training hyper-parameters.
epochs = 1000
lr = 1e-4

# Request the GPU here; the request is honoured only if CUDA is actually available.
cuda = True
cuda = torch.cuda.is_available() if cuda else cuda
device = torch.device("cuda") if cuda else torch.device("cpu")

NameError: name 'torch' is not defined

In [None]:
class LSTM_net(nn.Module):
    """LSTM sequence classifier: (embedding ->) LSTM -> fc1 -> fc2.

    The final hidden state of the last LSTM layer (both directions concatenated
    when the LSTM is bidirectional) is passed through two linear layers to
    produce `output_dim` values per sequence.

    Args:
        vocab_size: number of rows in the embedding table. May be None or 0 when
            the network is only ever fed precomputed embeddings; no table is
            created in that case.
        embedding_dim: width of one token vector (the LSTM input size).
        hidden_dim: LSTM hidden size, also the width of the layer between fc1
            and fc2.
        output_dim: number of values produced per sequence.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability, used between LSTM layers and before each
            linear layer.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        # forward() looks token ids up in this table. It was never created
        # before, so every forward pass failed with an AttributeError.
        self.embedding = (nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
                          if vocab_size else None)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # dropout acts between stacked layers only; passing it
                           # for a single layer just triggers a warning
                           dropout=dropout if n_layers > 1 else 0.0)

        # fc1 receives one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)

        # honour output_dim instead of a hard-coded single output
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Compute one output vector per sequence in the batch.

        Args:
            text: integer token ids of shape [sent len, batch size], or
                floating-point precomputed embeddings of shape
                [sent len, batch size, emb dim], which skip the embedding table.
            text_lengths: unpadded length of every sequence in the batch
                (tensor or list), in any order.

        Returns:
            Tensor of shape [batch size, output_dim].

        Raises:
            ValueError: if token ids are given but the model was built without
                an embedding table.
        """
        if torch.is_floating_point(text):
            # already embedded = [sent len, batch size, emb dim]
            embedded = text
        elif self.embedding is None:
            raise ValueError("token ids were given but the model has no embedding table "
                             "(vocab_size was empty)")
        else:
            # text = [sent len, batch size] -> embedded = [sent len, batch size, emb dim]
            embedded = self.embedding(text)

        # pack_padded_sequence wants the lengths on the CPU; enforce_sorted=False
        # removes the requirement that the batch is sorted by decreasing length.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                                            enforce_sorted=False)

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # last layer: forward state is hidden[-2], backward state is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # a single direction: only hidden[-1] belongs to the last layer
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        hidden = self.dropout(hidden)

        # Dropout is applied to the inputs of the linear layers, never to the
        # final outputs, where it would randomly zero the predictions in training.
        # NOTE(review): there is no non-linearity between fc1 and fc2 -- confirm
        # whether one was intended.
        output = self.dropout(self.fc1(hidden))

        return self.fc2(output)