In [157]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import torch.nn as nn
import re
from torch.utils.data import DataLoader

In [158]:
# Normal dataset
df = pd.read_csv("domain_data.csv")

In [159]:
# Updating values for training_data
training_data = df[df['split'] == 'train']
training_data = training_data.drop(training_data.query('toxicity==0').sample(frac=.85).index)

# Getting test_data
test_data = df[df['split'] == 'test']

# Getting validation_data
validation_data = df[df['split'] == 'val']
validation_data = validation_data.drop(validation_data.query('toxicity==0').sample(frac=.85).index)

In [160]:
df

Unnamed: 0,comment_text,split,toxicity,male,female,LGBTQ,christian,muslim,other_religion,black,white
0,even up here.......BLACKS!,train,1.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
1,Blame men. There's always an excuse to blame ...,train,1.0,1.0,1.0,0,0.0,0.0,0.0,0.0,0.0
2,"""Let's get the black folks and the white folks...",train,1.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0
3,Are you a Pilgrim?\nWhy arn't you growing your...,train,1.0,1.0,0.0,0,0.0,0.0,0.0,0.0,1.0
4,"No, he was accused of being a racist white man.",train,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
163405,That's official insubordination. We don't have...,val,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
163406,"Yes, lynching means hanging - violent death. ...",val,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0
163407,"Oh my G-d, a group of men in their 60's and 70...",val,0.0,1.0,1.0,0,0.0,0.0,0.0,0.0,0.0
163408,Under JPII we moved away from all that love st...,val,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Creating data loaders
X_train = np.array(training_data['comment_text'].values.tolist())
Y_train = np.array(training_data['toxicity'].values.tolist())

X_test = np.array(test_data['comment_text'].values.tolist())
Y_test = np.array(test_data['toxicity'].values.tolist())

X_val = np.array(validation_data['comment_text'].values.tolist())
Y_val = np.array(validation_data['toxicity'].values.tolist())

In [7]:
print(len(X_train))
print(len(X_test))
print(len(X_train))
print(len(X_val))

24520
47875
24520
4061


In [None]:
tokenized_training = []
for i in range(len(X_train)):
    tokenized_training.append(tokenizer.tokenize(X_train[i]))

In [7]:
lengths = []
for i in range(len(tokenized_training)):
    lengths.append(len(tokenized_training[i]))
lengths = np.array(lengths)    
print(np.mean(lengths))
print(np.median(lengths))
lengths = np.sort(lengths)

90.534176182708
72.0


In [7]:
def CleanText(text):
    text = text.lower() #Turn all text entries into lower-case
    text = re.sub(r'''(https?:\/\/www\.|https?:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,3}[-a-zA-Z0-9()@:%_\+.~#?&\//=<>]*''', "<URL>", text)
    #Replace URL with tag
    text = re.sub(r'''[0-9]+[/\-.]+[0-9]+[/\-.]+[0-9]+''', "<DATE>", text) #Replace dates with tag
    text = re.sub(r'''[a-z0-9._%+-]+\@[a-z0-9.-]+[a-z0-9]\.[a-z]{1,}''', "<EMAIL>", text)
    text = re.sub(r'''[0-9]+''', "<NUM>", text) #Replace numbers with tag
    text = re.sub(r'''[.|,|!|?|\'|\''|\"|\n|\t|\-|\(|\)]''', '', text)
    text = re.sub(r'''^\s+|\s+$''', '', text) #Remove whitespaces at the end and start of string
    text = re.sub(r'''[ ][ ]+|_''', " ", text) #Remove multiple whitespace
    return text


def BuildSentenceMatrix(dataset, embedding_dimension, features):
    
    converted_dataset = []
    
    with torch.no_grad():
        for i, text in enumerate(dataset):
            text = CleanText(text)
            inputs = tokenizer(text, return_tensors="pt")
            inputs = inputs.to('cuda')           
            
            outputs = model(**inputs)
            outputs = outputs[0].squeeze()
        
            if len(outputs) < embedding_dimension:
                zeroVectors = torch.zeros((embedding_dimension - len(outputs), features))
                zeroVectors = zeroVectors.to('cuda')
                outputs = torch.cat((outputs, zeroVectors))
            elif len(outputs) > embedding_dimension:
                outputs = outputs[:embedding_dimension]
            
            if i%20 == 0:
                print(i)
            converted_dataset.append(outputs)
    
    return converted_dataset
    


In [128]:
#tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#model = AutoModel.from_pretrained("bert-base-uncased")
#model.eval()
#model.to('cuda')

from transformers import BertTokenizer, BertModel

#def ResizeEmbeds(tokenized_text, embedding_dimension):
#    if len(tokenized_text[0]) > embed_len

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()
model.to('cuda')

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute hello hello", add_special_tokens=True, max_length=20, truncation=True, padding="max_length")).unsqueeze(0)  # Batch size 1
print(input_ids.shape)

input_ids


torch.Size([1, 20])


tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,  7592,  7592,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [143]:
prep_training = []
prep_test = []
prep_val = []

for i, text in enumerate(X_train):
    prep_training.append([torch.tensor(tokenizer.encode(text, add_special_tokens=True, max_length=100, truncation=True, padding="max_length")), Y_train[i]])

for i, text in enumerate(X_test):
    prep_test.append([torch.tensor(tokenizer.encode(text, add_special_tokens=True, max_length=20, truncation=True, padding="max_length")), Y_test[i]])

for i, text in enumerate(X_val):
    prep_val.append([torch.tensor(tokenizer.encode(text, add_special_tokens=True, max_length=20, truncation=True, padding="max_length")), Y_test[i]])

In [145]:
train_loader = DataLoader(prep_training, batch_size=16, shuffle=True)
valid_loader = DataLoader(prep_val, batch_size=16, shuffle=False)
test_loader = DataLoader(prep_test, batch_size=16, shuffle=False)

In [146]:

epochs = 50
lr = 0.0001
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

cuda = True # Set this if training on GPU
cuda = cuda and torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

In [153]:

# [batch_size, seq_len, input_size]
class LSTM_Net(nn.Module):
    
    def __init__(self):
        super(LSTM_Net, self).__init__()
        
        self.lstm = nn.LSTM(input_size=768, hidden_size=768, num_layers=1, batch_first=True) #bidirectional=True
        
        self.fc = nn.Linear(762, 1)
        
        self.sigmoid = nn.Sigmoid()
    

    def forward(self, x):
        
        x = model(x)
        lstm_out, (ht, ct) = self.lstm(x)
        
        output = self.fc(ht[-1, :, :])
        
        output = self.sigmoid(output)
        return output


In [154]:
# Setting up model parameters
lstm_model = LSTM_Net().to(device)

for epoch in range(1, epochs + 1): 
    
    train_loss = 0.0
    
    for batch_idx, data in enumerate(train_loader):
        
        # get the input
        inputs, labels = data
        
        #inputs = inputs.unsqueeze(1)
        
        inputs = inputs.to(device).float()
        labels = labels.to(device).float()
        
        print(inputs.shape)
        print(labels.shape)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward + backward + optimize            
        outputs = lstm_model(inputs)
        
        loss = loss_function(outputs, labels)
        
        loss.backward()
        
        optimizer.step()
        
        train_loss += loss.item()
    
    train_loss /= len(train_loader.dataset)

torch.Size([16, 100])
torch.Size([16])
tensor([[  101.,  1000.,  2054.,  ...,     0.,     0.,     0.],
        [  101.,  1015.,  1012.,  ...,     0.,     0.,     0.],
        [  101.,  2296.,  2137.,  ...,  2068.,  2000.,   102.],
        ...,
        [  101.,  4302.,  2003.,  ...,  1012.,  2054.,   102.],
        [  101., 22894., 22894.,  ...,     0.,     0.,     0.],
        [  101.,  2304., 10558.,  ...,     0.,     0.,     0.]],
       device='cuda:0')


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)

In [98]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute hello hello")).unsqueeze(0)  # Batch size 1
print(input_ids)

outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,  7592,  7592,   102]])


In [95]:
last_hidden_states.shape

torch.Size([1, 8, 768])

In [155]:
input_ids

tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,  7592,  7592,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])