# Text Classification Using Transformer Networks (BERT)

Some initialization:

In [1]:
!pip3 install datasets
!pip3 install transformers
!pip install -U accelerate
!pip install -U transformers



In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


Read the train/dev/test datasets and create a HuggingFace `Dataset` object:

In [3]:
def read_dota_data(filename):
    # read csv file
    df = pd.read_csv(filename, header=0)
    # Get only the text and label columns
    return df[["text","target"]]

In [4]:
# labels = open('data/ag_news_csv/classes.txt').read().splitlines()
data = read_dota_data('1/tagged-data.csv')
data

Unnamed: 0,text,target
0,COMMEND ME TY,0
1,sorry nex,0
2,what is the best soup?,0
3,man that silence on axe,0
4,not coming into play,0
...,...,...
3262,"wt?f?asfU JGOFIDLK,YH",1
3263,you must really suck,2
3264,YOU HAVE IDIOT PLAYER,2
3265,SUPER IDIOT,2


In [20]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(data, train_size=0.9)
train_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(f'train rows: {len(train_df.index):,}')
print(f'test rows: {len(eval_df.index):,}')
# print(f'test rows: {len(test_df.index):,}')

train rows: 2,940
test rows: 327


In [22]:
test_df

Unnamed: 0,text,target
0,noobs why focusing on terror,1
1,u wasted to much money on wards,0
2,im covered by noobs here,2
3,please >>> END FAST <<< (hopeless) > push FAS...,2
4,REPORT MIRANA,0
...,...,...
322,fuck that shit,2
323,trash can`t buy ward,2
324,win against me,0
325,how to fin with brainless fuckheads,2


In [23]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
# ds['validation'] = Dataset.from_pandas(eval_df)
ds['test'] = Dataset.from_pandas(test_df)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 2940
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 327
    })
})

Tokenize the texts:

In [7]:
from transformers import AutoTokenizer

transformer_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

In [25]:
def tokenize(examples):
    return tokenizer(examples['text'], truncation=True)

train_ds = ds['train'].map(
    tokenize, 
    batched=True,
    # remove_columns=['title', 'description', 'text'],
)
test_ds = ds['test'].map(
    tokenize,
    batched=True,
    # remove_columns=['title', 'description', 'text'],
)
train_ds.to_pandas()

Map:   0%|          | 0/2940 [00:00<?, ? examples/s]

Map:   0%|          | 0/327 [00:00<?, ? examples/s]

Unnamed: 0,text,target,input_ids,token_type_ids,attention_mask
0,timber is motherfucker,2,"[101, 7148, 1110, 1534, 14703, 8638, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
1,Why isnt necro mid also?,0,"[101, 2009, 2762, 1204, 24928, 1665, 2180, 228...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,JUST LIKE TT,0,"[101, 147, 13329, 1942, 149, 2240, 22441, 157,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,y completly retarded,1,"[101, 194, 3254, 7136, 8671, 1231, 6817, 4902,...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,i think this trash fag is muted too,2,"[101, 178, 1341, 1142, 13151, 175, 8517, 1110,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...
2935,that is yo,0,"[101, 1115, 1110, 26063, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
2936,fuck team because u lost?,1,"[101, 9367, 1264, 1272, 190, 1575, 136, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"
2937,20mins scythe on od,0,"[101, 1406, 19296, 188, 3457, 10681, 1113, 184...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2938,fuck my mouse,1,"[101, 9367, 1139, 10322, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"


Create the transformer model:

In [26]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        cls_outputs = self.dropout(cls_outputs)
        logits = self.classifier(cls_outputs)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [27]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    transformer_name,
    # num_labels=len(labels),
    num_labels=2
)

model = (
    BertForSequenceClassification
    .from_pretrained(transformer_name, config=config)
)

Create the trainer object and train:

In [28]:
from transformers import TrainingArguments

num_epochs = 2
batch_size = 24
weight_decay = 0.01
model_name = f'{transformer_name}-sequence-classification'

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    weight_decay=weight_decay,
)



In [29]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

In [30]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [31]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

Evaluate on the test partition:

In [33]:
test_ds = ds['test'].map(
    tokenize,
    batched=True,
    # remove_columns=['title', 'description', 'text'],
)
test_ds.to_pandas()

Map:   0%|          | 0/327 [00:00<?, ? examples/s]

Unnamed: 0,text,target,input_ids,token_type_ids,attention_mask
0,noobs why focusing on terror,1,"[101, 1185, 12809, 1116, 1725, 7781, 1113, 893...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,u wasted to much money on wards,0,"[101, 190, 15445, 1106, 1277, 1948, 1113, 1255...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,im covered by noobs here,2,"[101, 13280, 2262, 1118, 1185, 12809, 1116, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,please >>> END FAST <<< (hopeless) > push FAS...,2,"[101, 4268, 135, 135, 135, 142, 16769, 6820, 9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,REPORT MIRANA,0,"[101, 155, 16668, 9565, 1942, 26574, 9664, 111...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...
322,fuck that shit,2,"[101, 9367, 1115, 4170, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
323,trash can`t buy ward,2,"[101, 13151, 1169, 169, 189, 4417, 7910, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"
324,win against me,0,"[101, 1782, 1222, 1143, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
325,how to fin with brainless fuckheads,2,"[101, 1293, 1106, 15301, 1114, 3575, 2008, 936...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [34]:
output = trainer.predict(test_ds)
output

PredictionOutput(predictions=array([[ 0.3138793 , -0.32866675],
       [ 0.3327461 , -0.37894437],
       [ 0.36434406, -0.36156732],
       [ 0.16857997, -0.27593577],
       [ 0.41438413, -0.3475526 ],
       [ 0.27373788, -0.27770332],
       [ 0.42434388, -0.31356007],
       [ 0.31029508, -0.40805578],
       [ 0.5272815 , -0.14190805],
       [ 0.3875918 , -0.26470685],
       [ 0.4049119 , -0.1946367 ],
       [ 0.32851243, -0.2711021 ],
       [ 0.41177768, -0.32651752],
       [ 0.4178154 , -0.0994579 ],
       [ 0.5046528 , -0.09417546],
       [ 0.20028943, -0.34079787],
       [ 0.3246587 , -0.24282347],
       [ 0.21627653, -0.537625  ],
       [ 0.32227162, -0.20043196],
       [ 0.2652443 , -0.21615222],
       [ 0.30912533, -0.3388834 ],
       [ 0.48802492, -0.27609083],
       [ 0.34360337, -0.2033731 ],
       [ 0.28224587, -0.27648428],
       [ 0.2845721 , -0.3174574 ],
       [ 0.23902014, -0.25361168],
       [ 0.3129741 , -0.26840526],
       [ 0.24782716, -0.33

In [36]:
from sklearn.metrics import classification_report

y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = ["not toxic","toxic"]
print(classification_report(y_true, y_pred, target_names=target_names))

InvalidParameterError: The 'y_true' parameter of classification_report must be an array-like or a sparse matrix. Got None instead.