# Text Classification Using Transformer Networks (BERT)

Some initialization:

In [1]:
!pip3 install datasets
!pip3 install transformers
!pip install -U accelerate
!pip install -U transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub>=0.23.0 (from datasets)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


Read the train/dev/test datasets and create a HuggingFace `Dataset` object:

In [3]:
def read_league_data(filename):
    # read csv file
    df = pd.read_csv(filename, header=0)
    # Get only the text and label columns
    return df[["message","toxicity_label"]]

In [4]:
#labels = open('classes.txt').read().splitlines()
labels=["0","1"]
league_data = read_league_data('dataToxic.csv')
print(labels)
league_data = league_data.rename(columns={"toxicity_label": "label"})
league_data['message'] = league_data['message'].str.replace(',',' ',regex=False)
league_data['label'] = league_data['label'].apply(lambda x: 1 if x == 'toxic' else 0)
league_data

['0', '1']


Unnamed: 0,message,label
0,report for unskilled player is useless thx <3 ...,0
1,mimimi,0
2,im comming for you riven pfft focus Zed always...,1
3,thx top no flash for what ? he has 2 kill in l...,1
4,IIII ISI K udyr top dnt us see it? CAMP MORE P...,0
...,...,...
88083,gj &gt;&lt; xD i said ss gj stop go alone plz ...,1
88084,i like the new un-do button GET BACK GET BACK ...,1
88085,thx now i can b its ok re top sry thought cait...,0
88086,take t brb swian i was ogin b i was going blue...,1


In [5]:
def read_kaggle_data(filename):
    # read csv file
    df = pd.read_csv(filename, header=0)
    # Get only the text and label columns
    return df[["Text","oh_label"]]

In [6]:
kaggle_data = read_kaggle_data("kaggle_parsed_dataset.csv")
kaggle_data = kaggle_data.rename(columns={"oh_label": "label", "Text": "message"})
kaggle_data

Unnamed: 0,message,label
0,"""You fuck your dad.""",1
1,"""i really don't understand your point.\xa0 It ...",0
2,"""A\\xc2\\xa0majority of Canadians can and has ...",0
3,"""listen if you dont wanna get married to a man...",0
4,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",0
...,...,...
8794,"""Never really gave it much thought. I just fig...",0
8795,"""Nadie se salva de la regla 34 xd""",0
8796,"""Question: Are you a boy or a girl?""",0
8797,"""Leave your email or phone number and maybe yo...",1


In [8]:
data = pd.concat([kaggle_data,league_data])
data

Unnamed: 0,message,label
0,"""You fuck your dad.""",1
1,"""i really don't understand your point.\xa0 It ...",0
2,"""A\\xc2\\xa0majority of Canadians can and has ...",0
3,"""listen if you dont wanna get married to a man...",0
4,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",0
...,...,...
88083,gj &gt;&lt; xD i said ss gj stop go alone plz ...,1
88084,i like the new un-do button GET BACK GET BACK ...,1
88085,thx now i can b its ok re top sry thought cait...,0
88086,take t brb swian i was ogin b i was going blue...,1


In [9]:
from sklearn.model_selection import train_test_split

train_df, eval_and_test_df = train_test_split(data, train_size=0.8, random_state
= 4)
eval_df, test_df = train_test_split(eval_and_test_df, train_size=0.5, random_state = 4)
train_df.reset_index(inplace=True, drop=True)
eval_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(f'train rows: {len(train_df.index):,}')
print(f'eval rows: {len(eval_df.index):,}')
print(f'test rows: {len(test_df.index):,}')

train rows: 77,509
eval rows: 9,689
test rows: 9,689


In [10]:
eval_df

Unnamed: 0,message,label
0,gg wp,0
1,don't wait not gnking double riven HAHA u suck...,1
2,it's my promotion to gold pls play well and do...,1
3,2 cool for dat ward4 FUCK re j4 blue) gg they ...,0
4,this patch :D wtf bush! lol udyr :D ward y chill!,0
...,...,...
9684,ss gj ty gj gj ss puta rammus ss mid b gj gj t...,0
9685,come bot ty invade? oh sorry yes we can he is ...,1
9686,cover red just going to ward wtf this pull shy...,1
9687,gg,0


In [13]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
ds['validation'] = Dataset.from_pandas(eval_df)
ds['test'] = Dataset.from_pandas(test_df)
ds

DatasetDict({
    train: Dataset({
        features: ['message', 'label'],
        num_rows: 77509
    })
    validation: Dataset({
        features: ['message', 'label'],
        num_rows: 9689
    })
    test: Dataset({
        features: ['message', 'label'],
        num_rows: 9689
    })
})

Tokenize the texts:

In [11]:
from transformers import AutoTokenizer

transformer_name = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
def tokenize(examples):
    return tokenizer(examples['message'], truncation=True)

train_ds = ds['train'].map(
    tokenize, 
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
eval_ds = ds['validation'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
train_ds.to_pandas()

Map:   0%|          | 0/77509 [00:00<?, ? examples/s]

Map:   0%|          | 0/9689 [00:00<?, ? examples/s]

Unnamed: 0,label,input_ids,attention_mask
0,1,"[0, 710, 28836, 33976, 860, 213, 19634, 3983, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,0,"[0, 298, 5471, 116, 11380, 8635, 897, 748, 324...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,0,"[0, 330, 567, 27339, 4420, 98, 1099, 5378, 428...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0,"[0, 975, 13184, 11467, 102, 4832, 73, 208, 150...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[0, 39088, 127, 2888, 784, 3036, 4356, 98, 666...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...
77504,1,"[0, 34033, 21086, 8, 323, 1964, 734, 295, 3848...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
77505,1,"[0, 1343, 32768, 4832, 947, 7984, 131, 23184, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
77506,1,"[0, 571, 267, 29784, 821, 267, 244, 10390, 158...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
77507,1,"[0, 2362, 1303, 939, 218, 75, 77, 1690, 1120, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


Create the transformer model:

In [15]:
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        cls_outputs = self.dropout(cls_outputs)
        logits = self.classifier(cls_outputs)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [16]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    transformer_name,
    #num_labels=len(labels),
    num_labels=2
)

model = (
    BertForSequenceClassification
    .from_pretrained(transformer_name, config=config).to(device)
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerN

Create the trainer object and train:

In [17]:
from transformers import TrainingArguments

num_epochs = 2
batch_size = 48
weight_decay = 0.01
model_name = f'{transformer_name}-sequence-classification'

training_args = TrainingArguments(
    output_dir=model_name,
    log_level='error',
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
    weight_decay=weight_decay,
    fp16=True
)

In [18]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

In [19]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2632,0.23746,0.889256
2,0.1889,0.202028,0.91093


TrainOutput(global_step=3230, training_loss=0.27278684964490013, metrics={'train_runtime': 4695.8381, 'train_samples_per_second': 33.012, 'train_steps_per_second': 0.688, 'total_flos': 2.929855479187248e+16, 'train_loss': 0.27278684964490013, 'epoch': 2.0})

In [21]:
trainer.save_model("league_kaggle-ROBERTA")

Evaluate on the test partition:

In [22]:
test_ds = ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
test_ds.to_pandas()

Map:   0%|          | 0/9689 [00:00<?, ? examples/s]

Unnamed: 0,label,input_ids,attention_mask
0,0,"[0, 6149, 2]","[1, 1, 1]"
1,1,"[0, 4321, 6620, 2968, 29, 55, 6620, 828, 5559,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"[0, 113, 37457, 43409, 288, 26644, 47, 581, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"[0, 10010, 74, 34425, 939, 206, 51, 8052, 1275...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[0, 113, 30250, 324, 47, 32, 10, 35488, 22, 2]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...
9684,1,"[0, 261, 4173, 116, 384, 4, 139, 1021, 22984, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9685,1,"[0, 2527, 475, 17202, 1717, 236, 1084, 939, 64...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9686,0,"[0, 113, 1106, 209, 27726, 856, 15291, 1368, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9687,1,"[0, 36146, 748, 34090, 266, 17487, 14223, 117,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [23]:
output = trainer.predict(test_ds)
output

PredictionOutput(predictions=array([[ 3.421875  , -3.234375  ],
       [-1.3349609 ,  0.9951172 ],
       [-0.39624023,  0.2421875 ],
       ...,
       [ 0.9892578 , -0.9477539 ],
       [-4.2460938 ,  3.2753906 ],
       [-4.4453125 ,  3.4160156 ]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 1, 1]), metrics={'test_loss': 0.20473513007164001, 'test_accuracy': 0.9100010320982558, 'test_runtime': 80.2246, 'test_samples_per_second': 120.773, 'test_steps_per_second': 2.518})

In [24]:
from sklearn.metrics import classification_report

y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = ["not toxic", "toxic"]
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   not toxic       0.89      0.91      0.90      4424
       toxic       0.93      0.91      0.92      5265

    accuracy                           0.91      9689
   macro avg       0.91      0.91      0.91      9689
weighted avg       0.91      0.91      0.91      9689



In [51]:
# Sample DataFrame (make sure your real DataFrame has the correct structure)
test_df = {"message": ["hi"], "label": [0]}

cols = ["message", "label"]
data = [["soup",0],["this is a soup", 0],["in the library", 0],["Motherfucking noob",1]]
test = pd.DataFrame(data, columns = cols)

# Convert the DataFrame to a Dataset
test_dataset = Dataset.from_pandas(test)
#test_dataset.reset_index(inplace=True, drop=True)

dota_ds=DatasetDict()
dota_ds['test'] = Dataset.from_pandas(test)
#dota_ds.reset_index(inplace=True, drop=True)


# Now, map the tokenization function to the dataset
# Since your key for the dataset is 'test', make sure you access 'test' from dota_ds
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],  # Removing 'message' column after tokenization
)

# Convert to pandas DataFrame (for inspection)
test_ds_dota_df = test_ds_dota.to_pandas()

# Output the tokenized dataset
print(test_ds_dota_df)


# Make predictions
output_dota = trainer.predict(test_ds_dota)

# Print or inspect the output
print(output_dota)

output_dota.predictions



Map:   0%|          | 0/4 [00:00<?, ? examples/s]

   label                                    input_ids         token_type_ids  \
0      0                            [101, 11350, 102]              [0, 0, 0]   
1      0          [101, 2023, 2003, 1037, 11350, 102]     [0, 0, 0, 0, 0, 0]   
2      0                 [101, 1999, 1996, 3075, 102]        [0, 0, 0, 0, 0]   
3      1  [101, 2388, 11263, 23177, 2053, 16429, 102]  [0, 0, 0, 0, 0, 0, 0]   

          attention_mask  
0              [1, 1, 1]  
1     [1, 1, 1, 1, 1, 1]  
2        [1, 1, 1, 1, 1]  
3  [1, 1, 1, 1, 1, 1, 1]  


PredictionOutput(predictions=array([[ 3.9316406, -4.1953125],
       [-3.9746094,  4.765625 ],
       [-2.7597656,  3.1152344],
       [-3.6992188,  4.2890625]], dtype=float32), label_ids=array([0, 0, 0, 1]), metrics={'test_loss': 3.6554322242736816, 'test_accuracy': 0.5, 'test_runtime': 0.0222, 'test_samples_per_second': 179.776, 'test_steps_per_second': 44.944})


array([[ 3.9316406, -4.1953125],
       [-3.9746094,  4.765625 ],
       [-2.7597656,  3.1152344],
       [-3.6992188,  4.2890625]], dtype=float32)

In [52]:
np.argmax(output_dota.predictions, axis=-1)

array([0, 1, 1, 1])

In [69]:
labels = open('classes.txt').read().splitlines()
df = pd.read_csv("tagged-data.csv", header=0)
# Get only the text and label columns
df = df[["text","target"]]
print(labels)
df = df.rename(columns={"text": "message"})
df = df.rename(columns={"target": "label"})

df['label'] = df['label'].replace(2,1)
df

['0', '1', '2']


Unnamed: 0,message,label
0,COMMEND ME TY,0
1,sorry nex,0
2,what is the best soup?,0
3,man that silence on axe,0
4,not coming into play,0
...,...,...
3262,"wt?f?asfU JGOFIDLK,YH",1
3263,you must really suck,1
3264,YOU HAVE IDIOT PLAYER,1
3265,SUPER IDIOT,1


In [70]:
dota_ds = DatasetDict()
dota_ds['test'] = Dataset.from_pandas(df)
dota_ds

DatasetDict({
    test: Dataset({
        features: ['message', 'label'],
        num_rows: 3267
    })
})

In [71]:
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
test_ds_dota.to_pandas()

Map:   0%|          | 0/3267 [00:00<?, ? examples/s]

Unnamed: 0,label,input_ids,token_type_ids,attention_mask
0,0,"[101, 4012, 3549, 2094, 2033, 5939, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
1,0,"[101, 3374, 11265, 2595, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
2,0,"[101, 2054, 2003, 1996, 2190, 11350, 1029, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"
3,0,"[101, 2158, 2008, 4223, 2006, 12946, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
4,0,"[101, 2025, 2746, 2046, 2377, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"
...,...,...,...,...
3262,1,"[101, 1059, 2102, 1029, 1042, 1029, 2004, 1126...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3263,1,"[101, 2017, 2442, 2428, 11891, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"
3264,1,"[101, 2017, 2031, 10041, 2447, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"
3265,1,"[101, 3565, 10041, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"


In [72]:
output_dota = trainer.predict(test_ds_dota)
output_dota

PredictionOutput(predictions=array([[-1.2646484,  1.6171875],
       [ 4.3359375, -4.8046875],
       [-4.1445312,  4.9257812],
       ...,
       [-4.1601562,  5.0390625],
       [ 2.6445312, -2.53125  ],
       [-3.5527344,  4.3085938]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 2.7994821071624756, 'test_accuracy': 0.5677992041628406, 'test_runtime': 1.7236, 'test_samples_per_second': 1895.454, 'test_steps_per_second': 40.033})

In [73]:
output_dota.label_ids

array([0, 0, 0, ..., 1, 1, 1])

In [74]:
y_pred = np.argmax(output_dota.predictions, axis=-1)
print(y_pred)

[1 0 1 ... 1 0 1]


In [75]:
y_true = output_dota.label_ids
y_pred = np.argmax(output_dota.predictions, axis=-1)
target_names = ["not toxic", "toxic"]
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

   not toxic       0.63      0.53      0.57      1800
       toxic       0.52      0.62      0.56      1467

    accuracy                           0.57      3267
   macro avg       0.57      0.57      0.57      3267
weighted avg       0.58      0.57      0.57      3267



## Testing for fun

In [32]:
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import Trainer
from transformers import AutoConfig
from torch import nn
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel

# https://github.com/huggingface/transformers/blob/65659a29cf5a079842e61a63d57fa24474288998/src/transformers/models/bert/modeling_bert.py#L1486

from transformers import AutoTokenizer

transformer_name = "league_model"
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

def tokenize(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')


class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        cls_outputs = self.dropout(cls_outputs)
        logits = self.classifier(cls_outputs)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

config = AutoConfig.from_pretrained(
    "league_model",
    #num_labels=len(labels),
    num_labels=2
)

model = (
    BertForSequenceClassification
    .from_pretrained("league_model", config=config)
)

trainer = Trainer(
    model=model,
)

In [40]:
cols = ["text", "label"]
data = [["soup",0],
        ["this is a soup", 0],
        ["in the library", 0],
        ["Motherfucking noob",1],
        ["AHAHHHAHHASDHADHU",0],
        ["bread and butter",0],
        ["soup is bad",0],
        ["I hope you beat your children",1],
        ["die soup",0], 
        ["I hope you die",1],
        ["I hope your kids die",1],
        ["I hope your kids", 0],
        ["I hope your kids soup", 0],
        ["die udyr", 1],
        ["die kids", 1],
        ["I hope your lose",0],
        ["I hope your succeed",0],
       ]
test = pd.DataFrame(data, columns = cols)

# Convert the DataFrame to a Dataset
test_dataset = Dataset.from_pandas(test)
#test_dataset.reset_index(inplace=True, drop=True)

dota_ds=DatasetDict()
dota_ds['test'] = Dataset.from_pandas(test)
#dota_ds.reset_index(inplace=True, drop=True)


# Now, map the tokenization function to the dataset
# Since your key for the dataset is 'test', make sure you access 'test' from dota_ds
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    # remove_columns=['message'],  # Removing 'message' column after tokenization
)

# Convert to pandas DataFrame (for inspection)
test_ds_dota_df = test_ds_dota.to_pandas()

# Output the tokenized dataset
# print(test_ds_dota_df)


# Make predictions
output_dota = trainer.predict(test_ds_dota)

# Print or inspect the output
# print(output_dota)

output_dota.predictions

chats = test_dataset["text"]
labels = np.argmax(output_dota.predictions, axis=-1)
pd.DataFrame(labels, chats)

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Unnamed: 0,0
soup,0
this is a soup,1
in the library,1
Motherfucking noob,1
AHAHHHAHHASDHADHU,0
bread and butter,1
soup is bad,1
I hope you beat your children,1
die soup,0
I hope you die,0
