In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from sklearn.model_selection import train_test_split
import pickle
import random
import sys
import math
from scipy import stats
from scipy.spatial import distance
import os
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F
from torchsummary import summary

print("Num GPUs Available: ", torch.cuda.device_count())

  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


In [2]:
base_folder = "../../datasets/"
input_file = "hate_int_prof_SVO.tsv"

output_folder = "output_weights/"
output_file = "output_weights"

roberta_model = "roberta-base"
max_length = 256
TEST_SIZE = 0.2
seed = 42

use_attention = True

roberta_dropout = 0.2
lstm_units = 256
dense_units = 50
lstm_dropout = 0.1
dense_dropout = 0.2
epochs = 10  # (Default 10)
batch_size = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

random_seed(seed)

dataframe = pd.read_table(base_folder + input_file)
dataframe.head()


Unnamed: 0,Sentence,Intensity,Profanity,Subject,Verb,Object
0,Islam looks like a cult more than like a relig...,9.0,1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,We must send Islamics back to their native cou...,7.0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"Europe is civilised, Muslims should not stay t...",7.0,0,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"If they love Sharia law so much, why do not th...",8.0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Islam=evil. Islam is invading us and trying to...,7.0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."


In [3]:
# Define base Roberta configs
config = RobertaConfig.from_pretrained(
    roberta_model,
    output_hidden_states=False,
    attention_probs_dropout_prob=roberta_dropout,
    output_attentions=True
)
config.output_hidden_states = False

# Load the Roberta model with the specified config
transformer_model = RobertaModel.from_pretrained(roberta_model, config=config)

# Freeze the first 3 layers
for param in transformer_model.encoder.layer[:3]:
    param.requires_grad = False  # Freeze first 3 layers

# Define tokenizer
tokenizer = RobertaTokenizer.from_pretrained(
    roberta_model,
    do_lower_case=True,
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length'
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class RobertaLSTMModel(nn.Module):
    def __init__(self, transformer_model, lstm_units, dense_units, roberta_dropout, lstm_dropout, dense_dropout, use_attention):
        super(RobertaLSTMModel, self).__init__()
        self.transformer = transformer_model
        self.lstm = nn.LSTM(input_size=transformer_model.config.hidden_size, hidden_size=lstm_units, num_layers=1, batch_first=True, bidirectional=True, dropout=lstm_dropout)
        self.attention = nn.MultiheadAttention(embed_dim=2*lstm_units, num_heads=1) if use_attention else None
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dense = nn.Linear(2*lstm_units, dense_units)
        self.dropout = nn.Dropout(dense_dropout)
        self.output = nn.Linear(dense_units, 1)

    def forward(self, input_ids, attention_mask):
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        x = transformer_output.last_hidden_state

        x, _ = self.lstm(x)

        if self.attention:
            x = x.permute(1, 0, 2)  # Change to (seq_len, batch_size, hidden_size)
            x, _ = self.attention(x, x, x)
            x = x.permute(1, 0, 2)  # Change back to (batch_size, seq_len, hidden_size)
        
        x = x.permute(0, 2, 1)  # Change to (batch_size, hidden_size, seq_len)
        x = self.pool(x).squeeze(2)
        x = torch.relu(self.dense(x))
        x = self.dropout(x)
        x = self.output(x)
        return x

In [5]:
model = RobertaLSTMModel(transformer_model, lstm_units, dense_units, roberta_dropout, lstm_dropout, dense_dropout, use_attention).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.MSELoss()



In [11]:
class RobertaLSTMModelWrapper(nn.Module):
    def __init__(self, model):
        super(RobertaLSTMModelWrapper, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

model = RobertaLSTMModel(transformer_model, lstm_units, dense_units, roberta_dropout, lstm_dropout, dense_dropout, use_attention).to(device)
wrapped_model = RobertaLSTMModelWrapper(model).to(device)

# Define the input size (batch_size, sequence_length)
input_size = (max_length,)  # Example for sequence length of 256

# Use the torchsummary to print the model summary
summary(wrapped_model, [(input_size,), (input_size,)], device=str(device))

TypeError: rand(): argument 'size' failed to unpack the object at pos 2 with error "type must be tuple of ints,but got tuple"

In [13]:
def tokenize(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings, tokenizer, sentence_length):
    input_ids, input_masks, input_segments = [], [], []
    
    for sentence, subj_emb, verb_emb, obj_emb, svo_emb in tqdm(list(zip(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings))):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=sentence_length,
                                       padding='max_length', truncation=True, return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

        input_ids[-1].extend(svo_emb.tolist())
        input_masks[-1].extend([1] * len(svo_emb))
        input_segments[-1].extend([1] * len(svo_emb))
    
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


In [14]:
def pad_sequences(sequences, maxlen, padding='post'):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            if padding == 'post':
                padded_seq = F.pad(torch.tensor(seq), (0, maxlen - len(seq)), "constant", 0)
            else:
                padded_seq = F.pad(torch.tensor(seq), (maxlen - len(seq), 0), "constant", 0)
        else:
            padded_seq = torch.tensor(seq[:maxlen])
        padded_sequences.append(padded_seq)
    return torch.stack(padded_sequences)


In [15]:
input_data = pd.read_table(base_folder + input_file)

sentences = input_data['Sentence'].tolist()
intensity_value = input_data['Intensity'].astype(int).tolist()

SVO_length = 128
subject_embeddings = pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Subject"].tolist()], maxlen=SVO_length, padding='post')
verb_embeddings = pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Verb"].tolist()], maxlen=SVO_length, padding='post')
object_embeddings = pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Object"].tolist()], maxlen=SVO_length, padding='post')

# Adding all the embeddings together to truncate them when putting them in a model (instead of 128 * 3, it will be 128)
svo_embeddings = subject_embeddings + verb_embeddings + object_embeddings

# Shuffle and split the data
c = list(zip(intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings))
random.shuffle(c)
intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(sentences, intensity_value, test_size=TEST_SIZE, random_state=1)

sentence_length = 128
train_input_ids, train_input_masks, train_input_segment = tokenize(X_tr, subject_embeddings[:len(X_tr)], verb_embeddings[:len(X_tr)], object_embeddings[:len(X_tr)], svo_embeddings[:len(X_tr)], tokenizer, sentence_length)
test_input_ids, test_input_masks, test_input_segment = tokenize(X_te, subject_embeddings[len(X_tr):], verb_embeddings[len(X_tr):], object_embeddings[len(X_tr):], svo_embeddings[len(X_tr):], tokenizer, sentence_length)

y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)


100%|████████████████████████████████████████████████████████████████████████████| 4843/4843 [00:01<00:00, 3923.15it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1211/1211 [00:00<00:00, 5003.66it/s]


In [18]:
print(len(train_input_ids[100]))

256


In [9]:
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask, token_type_ids, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_mask[idx], dtype=torch.long)
        token_type_ids = torch.tensor(self.token_type_ids[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return input_ids, attention_mask, token_type_ids, label


In [10]:
train_dataset = CustomDataset(train_input_ids, train_input_masks, train_input_segment, y_tr)
test_dataset = CustomDataset(test_input_ids, test_input_masks, test_input_segment, y_te)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for input_ids, attention_mask, token_type_ids, targets in loader:
        input_ids, attention_mask, token_type_ids, targets = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, targets in loader:
            input_ids, attention_mask, token_type_ids, targets = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), targets.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    return total_loss / len(loader)




In [12]:
# Print PyTorch model parameters
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

transformer.embeddings.word_embeddings.weight: torch.Size([50265, 768])
transformer.embeddings.position_embeddings.weight: torch.Size([514, 768])
transformer.embeddings.token_type_embeddings.weight: torch.Size([1, 768])
transformer.embeddings.LayerNorm.weight: torch.Size([768])
transformer.embeddings.LayerNorm.bias: torch.Size([768])
transformer.encoder.layer.0.attention.self.query.weight: torch.Size([768, 768])
transformer.encoder.layer.0.attention.self.query.bias: torch.Size([768])
transformer.encoder.layer.0.attention.self.key.weight: torch.Size([768, 768])
transformer.encoder.layer.0.attention.self.key.bias: torch.Size([768])
transformer.encoder.layer.0.attention.self.value.weight: torch.Size([768, 768])
transformer.encoder.layer.0.attention.self.value.bias: torch.Size([768])
transformer.encoder.layer.0.attention.output.dense.weight: torch.Size([768, 768])
transformer.encoder.layer.0.attention.output.dense.bias: torch.Size([768])
transformer.encoder.layer.0.attention.output.LayerNo

In [24]:


mock_input_ids = torch.randint(low=0, high=len(tokenizer), size=(batch_size, max_length), dtype=torch.long).to(device)
mock_attention_mask = torch.ones_like(mock_input_ids).to(device)

summary(model.to(device), input_size=(batch_size, max_length))

TypeError: forward() missing 1 required positional argument: 'attention_mask'

In [13]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

In [15]:
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    
    # Train
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc='Training', leave=False):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        targets = batch[2].to(device).float()  # Convert targets to float type
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Assuming outputs is of shape [batch_size, 1], reshape if necessary
        outputs = outputs.view(-1, 1)  # Reshape to [batch_size, 1]
        
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item() * input_ids.size(0)
        
    train_loss = train_loss / len(train_loader.dataset)
    train_rmse = math.sqrt(train_loss)
    
    # Evaluate
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating', leave=False):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            targets = batch[2].to(device).float()  # Convert targets to float type
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            # Assuming outputs is of shape [batch_size, 1], reshape if necessary
            outputs = outputs.view(-1, 1)  # Reshape to [batch_size, 1]
            
            loss = criterion(outputs, targets)
            val_loss += loss.item() * input_ids.size(0)
            
    val_loss = val_loss / len(test_loader.dataset)
    val_rmse = math.sqrt(val_loss)
    
    print(f'Train Loss: {train_loss:.4f}, Train RMSE: {train_rmse:.4f}, Val Loss: {val_loss:.4f}, Val RMSE: {val_rmse:.4f}')


Epoch 1/10


                                                                                                                       

KeyboardInterrupt: 

In [None]:
results = evaluate(model, test_loader, criterion, device)
print("Test Loss:", results)

# Prediction
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, token_type_ids, labels in test_loader:
        input_ids, attention_mask, token_type_ids, labels = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

predictions = np.array(predictions).flatten()
true_labels = np.array(true_labels)

rmse = np.sqrt(np.mean((predictions - true_labels) ** 2))
pearson = stats.pearsonr(predictions, true_labels)[0]
cosine_sim = 1 - distance.cosine(predictions, true_labels)

print("RMSE:", rmse)
print("Pearson:", pearson)
print("Cosine Similarity:", cosine_sim)