In [1]:
'''
these are all of my imports (I may have imported some unecessary stuff as well)
'''
import torch
import torch.nn as nn
import transformers
from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
                            ,DistilBertTokenizer, DistilBertModel, DistilBertConfig
                            ,DistilBertForSequenceClassification, PreTrainedModel
                            ,AdamW)
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm
import numpy as np

In [3]:
'''
use the gpu
'''
torch.cuda.is_available()

device = torch.device('cuda')

In [4]:
'''
set some values that we will use later
'''
LEARNING_RATE = 1e-07
MAX_SIZE = 512
BATCH_SIZE = 32
EPOCHS = 1

In [4]:
'''
create the tokenizer. I use distilbert-base-uncased and then also truncated if it was over max size, and also used lower case in the tokenizer
'''
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', \
                                                truncation=True, do_lower_case=True)

In [5]:
'''
read in the train data
'''
df = pd.read_csv("train.csv")
len(df)

159571

In [6]:
df.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [7]:
'''
edit the dataframe to move anything that was unecessary and create the one hot encoding.
'''
classes = list(df.columns)[2:]
df['labels'] = df.iloc[:, 2:].values.tolist()
df['text'] = df['comment_text']
df.drop(['id', 'comment_text'], inplace=True, axis=1)
df.drop(classes, inplace=True, axis=1)
df.head()

Unnamed: 0,labels,text
0,"[0, 0, 0, 0, 0, 0]",Explanation\nWhy the edits made under my usern...
1,"[0, 0, 0, 0, 0, 0]",D'aww! He matches this background colour I'm s...
2,"[0, 0, 0, 0, 0, 0]","Hey man, I'm really not trying to edit war. It..."
3,"[0, 0, 0, 0, 0, 0]","""\nMore\nI can't make any real suggestions on ..."
4,"[0, 0, 0, 0, 0, 0]","You, sir, are my hero. Any chance you remember..."


In [8]:
'''
this class is used to take a dataset, encode it and the return the parts of the dataset that are important

it returns - ids, attention mask, token_type_ids, the targets (or labels), and the text itself
I also use tokenizer encode plus in order to get the outputs that I needed
this also creates an iterator to return the data
'''
class ParsedDF(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.texts = df.text
        self.targets = self.df.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])
        token_type_ids = torch.LongTensor(inputs['token_type_ids'])
        targets = torch.FloatTensor(self.targets[index])

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': targets,
            'text': text
        }

In [9]:
'''
I used this to split my training and testing datasets. I then took that datasets, and then passed it into my Data class, and then created a dataloader
'''
split = round(len(df.index)*0.7)

train_dataset = ParsedDF(df.iloc[:split], tokenizer, MAX_SIZE)
df_train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

valid_dataset = ParsedDF(df.iloc[split:].reset_index(drop=True), tokenizer, MAX_SIZE)
df_valid = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [10]:
print(f'train size: {len(df_train)}')
print(f'valid size: {len(df_valid)}')

train size: 3491
valid size: 1496


In [14]:
'''
here is the actual creation of the model. In this I am use distilbert and then also specifying the number of labels, 
and that it is a multi_label_classification problem (so that it does not softmax the outputs in the huggingface transformers library)
'''
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6, problem_type="multi_label_classification")
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
'''
here I create the optimizer and define a loss function. I use the learning rate that I specified earlier
I also use a loss function that is specified for Binary Cross Entropy
'''
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
loss = torch.nn.BCEWithLogitsLoss()

In [16]:
'''
here is where I do the actual training
I only train for one epoch, but if I had more time/power I would train for more
I get the batch of data, then I get the necessary things to train with, then I input it into the model. 
out[0] represents the loss function from the outputs of the model
I also step the optimizer every iteration.
'''
def train(epoch):
    model.train()
    for i,data in tqdm(enumerate(df_train, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        optimizer.zero_grad()
        out = model(input_ids=ids, attention_mask=mask, labels=targets)
        
        if i%100==0: print(f'epoch: {epoch} | loss: {out[0].item()}')
        
        out[0].backward()
        optimizer.step()

In [17]:
for epoch in range(EPOCHS):
    train(epoch)



epoch: 0 | loss: <built-in method item of Tensor object at 0x15040254aa70>


100it [00:59,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed64fc40>


200it [01:56,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed66b240>


300it [02:53,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1504003005e0>


400it [03:51,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1504e42af6a0>


500it [04:49,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed651ee0>


600it [05:47,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed64cc70>


700it [06:44,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed67cea0>


800it [07:42,  1.71it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed67f380>


900it [08:40,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed653a10>


1000it [09:38,  1.71it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6b0b30>


1100it [10:36,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed654860>


1200it [11:34,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed646890>


1300it [12:32,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed67d7b0>


1400it [13:29,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6a8e00>


1500it [14:27,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed69d4e0>


1600it [15:25,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed67d7b0>


1700it [16:23,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6d9170>


1800it [17:21,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6b0b30>


1900it [18:19,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6bf380>


2000it [19:16,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6be840>


2100it [20:14,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6d9120>


2200it [21:12,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6bf2e0>


2300it [22:10,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6b2890>


2400it [23:08,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x15040267c590>


2500it [24:05,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6a8ef0>


2600it [25:03,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6b31a0>


2700it [26:01,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6bd940>


2800it [26:59,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6be890>


2900it [27:56,  1.72it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6bfa10>


3000it [28:54,  1.73it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6b2890>


3100it [29:52,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed64fb50>


3200it [30:50,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed64f290>


3300it [31:48,  1.75it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed69c860>


3400it [32:46,  1.74it/s]

epoch: 0 | loss: <built-in method item of Tensor object at 0x1503ed6bed90>


3491it [33:38,  1.73it/s]


In [None]:
'''
here I do the actual testing of the model with the first 20 samples of the training set
this is meant more as a sanity check more than anything cause the validation stage can take quite a while
I create a function that encodes the data as well as a function that just inputs the encoded data into the model and gets predictions.
I then take the logits and put it into a sigmoid function in order to get the actual percentages out
'''
df_test_text = pd.read_csv("test.csv")
df_test_labels = pd.read_csv("test_labels.csv")[['id', 'toxic']]
df_test = pd.merge(df_test_text, df_test_labels, on='id')
df_test.head(10)

df_test['text'] = df_test['comment_text']
df_test.drop(['id', 'comment_text'], inplace=True, axis=1)
df_test.head(20)

df_test['output'] = df.apply(lambda row: np.array([]), axis=1)

f'test size: {len(df_test)}'

def encode_data(text):
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_SIZE,
        pad_to_max_length=True,
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
    )
    ids = torch.LongTensor(inputs['input_ids'])
    mask = torch.LongTensor(inputs['attention_mask'])
    token_type_ids = torch.LongTensor(inputs['token_type_ids'])

    return {
        'ids': ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'text': text
    }

def get_preds(text):
    data = encode_data(text)
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

    return model(ids, mask)

[print(df_test.text[i] + "\n" + 
       str(df_test.toxic[i]) + "\n" + 
       str(get_preds(df.text[i])["logits"].sigmoid().cpu().detach().numpy())) for i in range(0, 20)]

In [51]:
'''
This is my validation function where I check the model against the train dataset. This is very similar to the actual training, except I figure out how much it is getting correct
'''
def validate(model, data):
    low = 0
    low_ac = 0
    equal = 0
    equal_ac = 0
    count = 0
    for i,data in tqdm(enumerate(data, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        
        output = model(ids, mask)
        
        for i, out in enumerate(output):
            count += 1
            if torch.all(out < 0.7):
                low += 1
                if torch.all(targets[i] == 0):
                    low_ac += 1
            else:
                equal += 1
                if targets[i][out.argmax()] == 1:
                    equal_ac += 1
    full_ac = low_ac + equal_ac
    return full_ac, low_ac, low, equal_ac, equal, count

In [52]:
'''
here is where I print the results of the validation
'''
model = model.cuda()

full_ac, low_ac, low, equal_ac, equal, count = validate(model, df_valid)
print(f'Accuracy: {full_ac / count}, Low: {low_ac/low}, Equal: {equal_ac/equal}, Count: {count}')

1496it [01:58, 12.59it/s]

Accuracy: 0.9090681205740427, Low: 0.9464816650148662, Equal: 0.5470561998215878, Count: 47871





In [None]:
'''
Here I test the model with the test dataset. It does not have specific labels however, so I can only test for toxicity in general.
'''
low = 0
low_ac = 0
equal = 0
equal_ac = 0
count = 0
for i, data_raw in tqdm(enumerate(df_test.text, 0)):
    output = get_preds(data_raw)
    df_test.iloc[i]['output'] = output.cpu().detach().numpy()
    for i, out in enumerate(output):
        count += 1
        if torch.all(out < 0.7):
            low += 1
            if df_test.toxic[i] == 0:
                low_ac += 1
        else:
            equal += 1
            if df_test.toxic[i] == 1:
                equal_ac += 1

In [None]:
full_ac = low_ac + equal_ac
print(f'Accuracy: {full_ac / count}, Low: {low_ac/low}, Equal: {equal_ac/equal}, Count: {count}')

In [73]:
'''
here I save the tokenizer
'''
tokenizer.save_pretrained("model")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json')

In [None]:
'''
Here I save the model
'''
model.save_pretrained("model")