In [None]:
from transformers import AutoTokenizer, RobertaModel, AutoModel, AdamW
from torch.utils.data import Dataset, DataLoader
from string import digits
from html import unescape
from sklearn.model_selection import train_test_split
from torchmetrics.functional.classification import auroc, accuracy

import torch
import torch.nn as nn
import pandas as pd
import re
import os
import pytorch_lightning as pl
import math
import matplotlib.pyplot as plt
import json

torch.set_float32_matmul_precision('medium')

In [None]:
# Project root. Defaults to the original author's directory but can be
# overridden with the POL_PROJECT_DIR environment variable so the notebook
# also runs on other machines. If the directory does not exist we stay in the
# current working directory (relative data paths then resolve against it).
project_dir = os.environ.get('POL_PROJECT_DIR', r'C:\Users\rabby\CS 7643 - Deep Learning\Project')
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print(f'Project directory {project_dir!r} not found; staying in {os.getcwd()!r}')

# Run configuration shared by the data loading, model and trainer cells.
config = {
    'model_name': 'roberta-base',   # Hugging Face checkpoint used as the encoder
    'n_labels': 2,                  # one output per score (Toxicity, Inflammatory)
    'batch_size': 64,
    'dropout': 0.2,
    'lr': 1e-5,
    'n_epochs': 20,
    'device': 'cuda',
    'n_threads': 1000,              # number of rows read from the ndjson file
    # Read by the training cell as config['thresholds']; previously missing,
    # which raised KeyError. Values mirror the Load_Pol default.
    'thresholds': {'tox': 0.2, 'inf': 0.2},
}

In [None]:
# Relative path (from the project directory) to the labeled ndjson dump.
# Built with os.path.join so the separators are correct on every platform;
# on Windows this yields the same backslash-separated string as before.
pol_path = os.path.join('Data', 'pol_0616-1119_labeled', 'pol_062016-112019_labeled.ndjson')

In [None]:
def Clean_Post(post):
    """Turn a raw HTML post body into plain text.

    Steps, in order:
      1. ``<br>`` tags become a space (previously they were deleted outright,
         which glued the words on either side together).
      2. All remaining HTML tags are removed.
      3. HTML entities are unescaped (``&gt;`` -> ``>``).
      4. ``>`` characters are replaced by spaces.
      5. Runs of whitespace (including line breaks) collapse to one space.
      6. Leading digits are stripped, then surrounding whitespace.

    Args:
        post: Raw post text (str).

    Returns:
        The cleaned post as a str.
    """
    # Line-break tags separate words, so keep a space in their place
    cleaned_post = re.sub(r'<br\s*/?>', ' ', post, flags = re.IGNORECASE)

    # Remove the remaining HTML tags
    cleaned_post = re.sub('<.*?>', '', cleaned_post)

    # Unescape HTML entities
    cleaned_post = unescape(cleaned_post)

    # Drop '>' markers, then collapse line breaks and repeated spaces
    cleaned_post = cleaned_post.replace('>', ' ')
    cleaned_post = ' '.join(cleaned_post.split())

    # Strip leading digits; strip again because removing them can expose
    # a leading space
    cleaned_post = cleaned_post.lstrip(digits).strip()

    return cleaned_post

def Load_Pol(pol_path, n_threads = 10**5, thresholds = {'tox': 0.2, 'inf': 0.2}, test_size = 0.2, random_state = 24):
    """Load posts and their scores from an ndjson file and split train/val.

    Args:
        pol_path: Path to a line-delimited JSON file. The first column of each
            row is expected to hold a list of post dicts.
        n_threads: Maximum number of rows to read from the file.
        thresholds: Currently unused by this function (no flag column is
            derived); kept so existing callers keep working.
        test_size: Fraction of posts placed in the validation split.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(pol_train, pol_val)`` DataFrames with columns ``Posts`` (cleaned
        text), ``Toxicity`` and ``Inflammatory``.
    """
    #Load at most n_threads rows from the data file
    pol_raw = pd.read_json(pol_path, lines = True, nrows = n_threads)

    #Collect one (post, toxicity, inflammatory) record per usable post
    records = []
    #Iterate over the rows actually loaded: the file may hold fewer than
    #n_threads rows, and positional column access avoids label lookups
    for thread in pol_raw.iloc[:, 0]:
        if not isinstance(thread, (list, tuple)):
            #Row without a list of posts (e.g. missing value): nothing to extract
            continue
        for post in thread:
            try:
                #Read all three fields before storing anything so a post
                #missing one field cannot leave the columns misaligned
                record = (post['com'],
                          post['perspectives']['SEVERE_TOXICITY'],
                          post['perspectives']['INFLAMMATORY'])
            except (KeyError, TypeError):
                #Best effort: skip posts lacking text or scores
                continue
            records.append(record)

    #Create pol_data df
    pol_data = pd.DataFrame(records, columns = ['Posts', 'Toxicity', 'Inflammatory'])
    #Clean comments
    pol_data['Posts'] = pol_data['Posts'].apply(Clean_Post)

    #Split Train and Val
    pol_train, pol_val = train_test_split(pol_data, test_size = test_size, random_state = random_state)

    return pol_train, pol_val

In [None]:
class Pol_Dataset(Dataset):
    """Map-style dataset yielding tokenized posts with their two scores.

    Args:
        data: DataFrame with columns ``Posts``, ``Toxicity`` and
            ``Inflammatory``.
        tokenizer: Callable tokenizer; it is invoked with the post text and
            must return an object exposing ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every post is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length = 128):
        #Declare variables
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of posts in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one post.

        ``labels`` is a float32 tensor of shape (1, 2) holding
        ``[Toxicity, Inflammatory]``; ``input_ids`` and ``attention_mask``
        are flattened to 1-D.
        """
        #Select row by position
        item = self.data.iloc[index]
        #Extract post text
        post = str(item['Posts'])
        #Build the label from explicit floats instead of handing torch a
        #pandas Series (which relied on implicit object-dtype conversion)
        label = torch.tensor([[float(item['Toxicity']), float(item['Inflammatory'])]],
                             dtype = torch.float32)
        #Convert to tokens (calling the tokenizer directly replaces encode_plus)
        tokens = self.tokenizer(post, add_special_tokens = True, return_tensors = 'pt', truncation = True,
                                max_length = self.max_length, padding = 'max_length', return_attention_mask = True)

        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': label}
    
class Pol_Data_Module(pl.LightningDataModule):
    """Lightning data module wrapping the train / val / test post frames.

    Args:
        train: Training DataFrame.
        val: Validation DataFrame.
        test: Optional test DataFrame; the validation frame is reused when
            omitted.
        batch_size: Batch size for all three dataloaders.
        max_length: Token length passed to each ``Pol_Dataset``.
        model: Checkpoint name used to load the tokenizer.
    """

    def __init__(self, train, val, test = None, batch_size = 16, max_length = 128,  model = 'roberta-base'):
        super().__init__()
        self.train = train
        self.val = val
        #Identity check: `test == None` on a DataFrame compares elementwise
        #and its truth value is ambiguous, so passing a frame used to raise
        self.test = val if test is None else test
        self.batch_size = batch_size
        self.max_length = max_length
        self.model = model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model)

    def setup(self, stage = None):
        """Build the three datasets; ``stage`` is accepted but not used."""
        self.train_ds = Pol_Dataset(self.train, self.tokenizer, max_length = self.max_length)
        self.val_ds = Pol_Dataset(self.val, self.tokenizer, max_length = self.max_length)
        self.test_ds = Pol_Dataset(self.test, self.tokenizer, max_length = self.max_length)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(self.train_ds, batch_size = self.batch_size, shuffle = True)

    def val_dataloader(self):
        """Unshuffled loader over the validation dataset."""
        return DataLoader(self.val_ds, batch_size = self.batch_size, shuffle = False)

    def test_dataloader(self):
        """Unshuffled loader over the test dataset."""
        return DataLoader(self.test_ds, batch_size = self.batch_size, shuffle = False)

In [None]:
class pol_Classifier(pl.LightningModule):
    """Pretrained encoder with a two-layer head trained with MSE loss.

    Args:
        config: Dict providing ``model_name``, ``dropout``, ``n_labels``
            and ``lr``.

    Per-epoch mean losses are collected in ``tloss`` / ``vloss`` for
    plotting. ``tauroc`` / ``vauroc`` / ``tacc`` / ``vacc`` exist so the
    AUROC / accuracy plot methods do not fail, but nothing in this class
    appends to them.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = AutoModel.from_pretrained(config['model_name'], return_dict = True)
        hidden_size = self.model.config.hidden_size
        self.linear = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                    nn.ReLU(),
                                    nn.Dropout(config['dropout']),
                                    nn.Linear(hidden_size, self.config['n_labels']))
        torch.nn.init.xavier_uniform_(self.linear[0].weight)
        torch.nn.init.xavier_uniform_(self.linear[3].weight)
        # Not applied in forward(); kept so the attribute remains available
        self.softmax = nn.Softmax(dim = 1)
        self.loss = nn.MSELoss()
        # Per-step losses of the running epoch (cleared at each epoch end)
        self.training_step_outputs = []
        self.validation_step_outputs = []
        # Per-epoch histories read by the plot_* methods
        self.tloss = []
        self.vloss = []
        self.tauroc = []
        self.vauroc = []
        self.tacc = []
        self.vacc = []

    def forward(self, input_ids, attention_mask, labels = None):
        """Return head outputs of shape (batch, n_labels).

        Token states are mean-pooled over the positions where
        ``attention_mask`` is 1, so padding does not dilute the
        representation. ``labels`` is accepted but unused.
        """
        out = self.model(input_ids = input_ids, attention_mask = attention_mask)
        hidden = out.last_hidden_state
        # Zero out padded positions, then divide by the real token count
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim = 1).clamp(min = 1.0)  # guard fully-masked rows
        pooled = (hidden * mask).sum(dim = 1) / token_count
        # final outputs
        return self.linear(pooled)

    def training_step(self, batch, batch_index):
        """Compute and log the training loss for one batch."""
        loss, out, y = self._common_step(batch, batch_index)
        # Store a detached copy so the autograd graph is not kept alive
        # for the whole epoch
        self.training_step_outputs.append(loss.detach())
        self.log("Training Loss", loss, prog_bar = True, logger = True)
        return loss

    def on_train_epoch_end(self):
        """Append the epoch's mean training loss to ``tloss``."""
        if self.training_step_outputs:
            epoch_mean = torch.stack(self.training_step_outputs).mean()
            self.tloss.append(epoch_mean.item())
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_index):
        """Compute and log the validation loss for one batch."""
        loss, out, y = self._common_step(batch, batch_index)
        self.validation_step_outputs.append(loss.detach())
        self.log("Validation Loss", loss, prog_bar = True, logger = True)
        return loss

    def on_validation_epoch_end(self):
        """Append the epoch's mean validation loss to ``vloss``."""
        if self.validation_step_outputs:
            epoch_mean = torch.stack(self.validation_step_outputs).mean()
            self.vloss.append(epoch_mean.item())
        self.validation_step_outputs.clear()

    def test_step(self, batch, batch_index):
        """Compute and log the test loss for one batch."""
        loss, out, y = self._common_step(batch, batch_index)
        self.log("Test Loss", loss, prog_bar = True, logger = True)
        return loss

    def predict_step(self, batch, batch_index):
        """Return the batch loss.

        NOTE(review): this returns the loss rather than model outputs and
        therefore needs ``labels`` in the batch - confirm this is intended.
        """
        loss, out, y = self._common_step(batch, batch_index)
        return loss

    def _common_step(self, batch, batch_index):
        """Run the model on a batch and return ``(loss, outputs, targets)``."""
        x = batch['input_ids']
        attn_mask = batch['attention_mask']
        out = self.forward(x, attn_mask)
        # Labels arrive as (batch, 1, 2); drop the middle axis. Follow the
        # device of the outputs instead of reading a global config.
        y = batch['labels'].squeeze(1).to(out.device)
        loss = self.loss(out, y)
        return loss, out, y

    def configure_optimizers(self):
        """AdamW over all parameters with the configured learning rate."""
        optimizer = torch.optim.AdamW(self.parameters(), lr = self.config['lr'])
        return [optimizer]

    def _plot_history(self, train_hist, val_hist, title, ylabel):
        """Plot training vs validation curves without mutating the histories."""
        if not train_hist and not val_hist:
            print(f'No {title} history recorded; nothing to plot.')
            return
        # A validation run after fitting appends one extra entry; align the
        # validation curve to the training epochs by slicing rather than
        # pop(), so plotting twice gives the same figure.
        val_aligned = val_hist[:len(train_hist)] if train_hist else val_hist
        plt.plot(train_hist, label = 'Training')
        plt.plot(val_aligned, label = 'Validation')
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.legend()
        plt.show()

    def plot_loss(self):
        """Plot per-epoch training and validation loss."""
        self._plot_history(self.tloss, self.vloss, 'Loss', 'Loss')

    def plot_auroc(self):
        """Plot per-epoch AUROC, if any has been recorded."""
        self._plot_history(self.tauroc, self.vauroc, 'AUROC', 'AUROC')

    def plot_accuracy(self):
        """Plot per-epoch accuracy, if any has been recorded."""
        self._plot_history(self.tacc, self.vacc, 'Accuracy', 'Accuracy')

In [None]:
# datamodule
# config.get: the thresholds entry may be absent from config; fall back to the
# Load_Pol default instead of raising KeyError
pol_train, pol_val = Load_Pol(pol_path, n_threads = config['n_threads'],
                              thresholds = config.get('thresholds', {'tox': 0.2, 'inf': 0.2}))
pol_data_module = Pol_Data_Module(pol_train, pol_val, batch_size = config['batch_size'])

# model
model = pol_Classifier(config)

# trainer and fit
trainer = pl.Trainer(max_epochs = config['n_epochs'], devices = 1, accelerator = "gpu", num_sanity_val_steps = 0)
trainer.fit(model, pol_data_module)
trainer.validate(model, pol_data_module)
trainer.test(model, pol_data_module)

In [None]:
# Plot the per-epoch training and validation loss collected by the model
model.plot_loss()

In [None]:
# Plot the AUROC history.
# NOTE(review): no code in this notebook appends to the AUROC history lists
# (tauroc / vauroc) - verify they are populated before relying on this figure.
model.plot_auroc()

In [None]:
# Plot the accuracy history.
# NOTE(review): no code in this notebook appends to the accuracy history lists
# (tacc / vacc) - verify they are populated before relying on this figure.
model.plot_accuracy()