In [1]:
!pip install transformers



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("HateSpeechDetection.csv")
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [3]:
train_df['target'].value_counts()

0    1906
1     494
Name: target, dtype: int64

In [4]:
test_df['target'].value_counts()

0    494
1    106
Name: target, dtype: int64

In [5]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_df['text'], train_df['target'], test_size=0.2, random_state=42)

In [6]:
test_texts, test_labels=(test_df['text'], test_df['target'])

In [7]:
# text processing function

import re
import string
def clean_text(text):
    # to lower case
    text = text.lower()
    # remove links
    text = re.sub('https:\/\/\S+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove next line
    text = re.sub(r'[^ \w\.]', '', text)
    # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)

In [8]:
train_df.head()

Unnamed: 0,Platform,text,target
642,Reddit,Dont value the opinions of those not qualified...,0
700,Reddit,you just aren't grinding hard enough,0
226,Reddit,Seems like a been there done that moment,0
1697,Twitter,Stay real or stay away from me.,0
1010,Twitter,Start your morning off in prayer!!! Have a gre...,0


In [9]:
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset

In [10]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def tokenize_texts(texts, labels):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 64,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.to_numpy())
    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset

train_dataset = tokenize_texts(train_texts, train_labels)
val_dataset = tokenize_texts(val_texts, val_labels)
test_dataset = tokenize_texts(test_texts, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [13]:
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

accuracy = total_correct / total_samples
print("Test Accuracy:", accuracy)

Test Accuracy: 0.945
