In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


In [3]:
def read_league_data(filename):
    # read csv file
    df = pd.read_csv(filename, header=0)
    # Get only the text and label columns
    return df[["message","toxicity_label"]]

In [26]:
labels = open('classes.txt').read().splitlines()
data = read_league_data('dataToxic.csv')
print(labels)
data = data.rename(columns={"toxicity_label": "label"})
data['message'] = data['message'].str.replace(',',' ',regex=False)
data['label'] = data['label'].apply(lambda x: 1 if x == 'toxic' else 0)
data

['0', '1', '2']


Unnamed: 0,message,label
0,report for unskilled player is useless thx <3 ...,0
1,mimimi,0
2,im comming for you riven pfft focus Zed always...,1
3,thx top no flash for what ? he has 2 kill in l...,1
4,IIII ISI K udyr top dnt us see it? CAMP MORE P...,0
...,...,...
88083,gj &gt;&lt; xD i said ss gj stop go alone plz ...,1
88084,i like the new un-do button GET BACK GET BACK ...,1
88085,thx now i can b its ok re top sry thought cait...,0
88086,take t brb swian i was ogin b i was going blue...,1


In [5]:
from sklearn.model_selection import train_test_split

train_df, eval_and_test_df = train_test_split(data, train_size=0.8, random_state
= 4)
eval_df, test_df = train_test_split(eval_and_test_df, train_size=0.5, random_state = 4)
train_df.reset_index(inplace=True, drop=True)
eval_df.reset_index(inplace=True, drop=True)
test_df.reset_index(inplace=True, drop=True)

print(f'train rows: {len(train_df.index):,}')
print(f'eval rows: {len(eval_df.index):,}')
print(f'test rows: {len(test_df.index):,}')

train rows: 70,470
eval rows: 8,809
test rows: 8,809


In [6]:
eval_df

Unnamed: 0,message,label
0,i looked at the game too late thoughzed no its...,1
1,can i have next blue? ty amumu uselesss 3 time...,0
2,"-.-"" blizt too report",1
3,help bluw blue heimer noob fizz you dont help ...,1
4,yes lee is in bot no mana,0
...,...,...
8804,... so bad tf tf are u fucking retarded? go mi...,1
8805,nvm can u pls ward or do smth u arnt doing any...,1
8806,ss re i tank GG,0
8807,yes nice ward wow jinx att speed wat ? gj send...,1


In [7]:
from datasets import Dataset, DatasetDict

ds = DatasetDict()
ds['train'] = Dataset.from_pandas(train_df)
ds['validation'] = Dataset.from_pandas(eval_df)
ds['test'] = Dataset.from_pandas(test_df)
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['message', 'label'],
        num_rows: 70470
    })
    validation: Dataset({
        features: ['message', 'label'],
        num_rows: 8809
    })
    test: Dataset({
        features: ['message', 'label'],
        num_rows: 8809
    })
})

In [8]:
from transformers import AutoTokenizer

transformer_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)

In [9]:
def tokenize(examples):
    return tokenizer(examples['message'], truncation=True)

train_ds = ds['train'].map(tokenize, batched=True, remove_columns=['message'])
eval_ds = ds['validation'].map(tokenize, batched=True, remove_columns=['message'])
test_ds = ds['test'].map(tokenize, batched=True, remove_columns=['message'])


Map: 100%|██████████| 70470/70470 [00:04<00:00, 16706.98 examples/s]
Map: 100%|██████████| 8809/8809 [00:00<00:00, 13621.64 examples/s]
Map: 100%|██████████| 8809/8809 [00:00<00:00, 18818.79 examples/s]


In [10]:
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BertLSTMForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # BERT Model
        self.bert = BertModel(config)
        # LSTM Layer (works with BERT hidden size)
        self.lstm = nn.LSTM(input_size=config.hidden_size, hidden_size=128, num_layers=1, batch_first=True, dropout=0.1)
        self.dropout = nn.Dropout(0.1)
        # Fully connected layer to output logits
        self.classifier = nn.Linear(128, config.num_labels)
        
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        # Pass through BERT
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Get sequence-level output from BERT
        sequence_output = outputs.last_hidden_state
        # Pass through LSTM layer
        lstm_output, (hn, cn) = self.lstm(sequence_output)
        # Get the last hidden state from LSTM
        # pooled_output = hn[-1]  # Using the last hidden state (it could be any strategy: mean, max, etc.)
        pooled_output=sequence_output.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        
        return SequenceClassifierOutput(loss=loss, logits=logits)


In [11]:
from transformers import AutoConfig

# Load the configuration
config = AutoConfig.from_pretrained(
    transformer_name,
    num_labels=2  # assuming binary classification (toxic / not toxic)
)

# Instantiate the hybrid model
model = BertLSTMForSequenceClassification.from_pretrained(transformer_name, config=config)


Some weights of BertLSTMForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'lstm.bias_hh_l0', 'lstm.bias_ih_l0', 'lstm.weight_hh_l0', 'lstm.weight_ih_l0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

# Training arguments
training_args = TrainingArguments(
    output_dir=f'{transformer_name}-sequence-classification',
    log_level='error',
    num_train_epochs=2,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    eval_strategy='epoch',
    weight_decay=0.01,
    fp16=True,
    disable_tqdm=False
)

# Compute metrics function
def compute_metrics(eval_pred):
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(y_true, y_pred)}

# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # add this
    processing_class=tokenizer
)


In [13]:
trainer.train()


RuntimeError: mat1 and mat2 shapes cannot be multiplied (48x768 and 128x2)

In [None]:
output = trainer.predict(test_ds)

y_true = output.label_ids
y_pred = np.argmax(output.predictions, axis=-1)
target_names = ["not toxic", "toxic"]
print(classification_report(y_true, y_pred, target_names=target_names))


In [None]:
# test
# Sample DataFrame (make sure your real DataFrame has the correct structure)
test_df = {"message": ["hi"], "label": [0]}

cols = ["message", "label"]
data = [["soup",0],["this is a soup", 0],["in the library", 0],["Motherfucking noob",1], ["you guys were winning early game", 0], ["Support my ass senpai", 1], ["ur fucking ass at this game go kys",1]]
test = pd.DataFrame(data, columns = cols)

# Convert the DataFrame to a Dataset
test_dataset = Dataset.from_pandas(test)
#test_dataset.reset_index(inplace=True, drop=True)

dota_ds=DatasetDict()
dota_ds['test'] = Dataset.from_pandas(test)
#dota_ds.reset_index(inplace=True, drop=True)


# Now, map the tokenization function to the dataset
# Since your key for the dataset is 'test', make sure you access 'test' from dota_ds
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],  # Removing 'message' column after tokenization
)

# Convert to pandas DataFrame (for inspection)
test_ds_dota_df = test_ds_dota.to_pandas()

# Output the tokenized dataset
print(test_ds_dota_df)


# Make predictions
output_dota = trainer.predict(test_ds_dota)

# Print or inspect the output
print(output_dota)

output_dota.predictions

np.argmax(output_dota.predictions, axis=-1)

In [None]:
# trainer.save_model("second trial")

In [None]:
test_df={"message": ["hi"], "label":[0]}

ds['test2'] = Dataset.from_pandas(test_df)
test_df.reset_index(inplace=True, drop=True)

ds = DatasetDict()
ds['test2'] = Dataset.from_pandas(test_df)


dota_ds = DatasetDict()
dota_ds['test'] = Dataset.from_pandas(df)
dota_ds

test_ds_dota = dota_ds['test2'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
test_ds_dota.to_pandas()


output_dota = trainer.predict(test_ds_dota)
output_dota



In [None]:
from datasets import Dataset, DatasetDict
from transformers import Trainer

# Sample DataFrame (make sure your real DataFrame has the correct structure)
test_df = {"message": ["hi"], "label": [0]}

# Convert the DataFrame to a Dataset
test_dataset = Dataset.from_pandas(test_df)
test_dataset.reset_index(inplace=True, drop=True)

# Initialize DatasetDict
ds = DatasetDict()
ds['test2'] = test_dataset

# If you're working with a second dataset for Dota:
# Create another example dataset for "dota"
dota_df = {"message": ["hello", "good game"], "label": [0, 1]}  # Example
dota_ds = DatasetDict()
dota_ds['test'] = Dataset.from_pandas(dota_df)


# Now, map the tokenization function to the dataset
# Since your key for the dataset is 'test', make sure you access 'test' from dota_ds
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],  # Removing 'message' column after tokenization
)

# Convert to pandas DataFrame (for inspection)
test_ds_dota_df = test_ds_dota.to_pandas()

# Output the tokenized dataset
print(test_ds_dota_df)


# Make predictions
output_dota = trainer.predict(test_ds_dota)

# Print or inspect the output
# print(output_dota)


In [27]:
labels = open('classes.txt').read().splitlines()
df = pd.read_csv("tagged-data.csv", header=0)
# Get only the text and label columns
df = df[["text","target"]]
print(labels)
df = df.rename(columns={"text": "message"})
df['target'] = df['target'].replace(2,1)
df

['0', '1', '2']


Unnamed: 0,message,target
0,COMMEND ME TY,0
1,sorry nex,0
2,what is the best soup?,0
3,man that silence on axe,0
4,not coming into play,0
...,...,...
3262,"wt?f?asfU JGOFIDLK,YH",1
3263,you must really suck,1
3264,YOU HAVE IDIOT PLAYER,1
3265,SUPER IDIOT,1


In [15]:
dota_ds = DatasetDict()
dota_ds['test'] = Dataset.from_pandas(df)
dota_ds

DatasetDict({
    test: Dataset({
        features: ['message', 'target'],
        num_rows: 3267
    })
})

In [16]:
test_ds_dota = dota_ds['test'].map(
    tokenize,
    batched=True,
    remove_columns=['message'],
    # remove_columns=['title', 'description', 'text'],
)
test_ds_dota.to_pandas()

Map: 100%|██████████| 3267/3267 [00:00<00:00, 44944.13 examples/s]


Unnamed: 0,target,input_ids,token_type_ids,attention_mask
0,0,"[101, 18732, 25290, 11680, 2137, 22157, 157, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,0,"[101, 2959, 24928, 1775, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
2,0,"[101, 1184, 1110, 1103, 1436, 13128, 136, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"
3,0,"[101, 1299, 1115, 3747, 1113, 16301, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"
4,0,"[101, 1136, 1909, 1154, 1505, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"
...,...,...,...,...
3262,1,"[101, 192, 1204, 136, 175, 136, 1112, 2087, 25...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3263,1,"[101, 1128, 1538, 1541, 13054, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"
3264,1,"[101, 19141, 145, 26390, 2036, 10999, 19368, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3265,1,"[101, 156, 18124, 9637, 10999, 19368, 1942, 102]","[0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1]"


In [17]:
output_dota = trainer.predict(test_ds_dota)
output_dota

RuntimeError: mat1 and mat2 shapes cannot be multiplied (48x768 and 128x2)

In [None]:
np.argmax(output_dota.predictions, axis=-1)

In [None]:
from sklearn.metrics import classification_report
y_true = output_dota.label_ids
y_pred = np.argmax(output_dota.predictions, axis=-1)
target_names = ["not toxic", "toxic"]
# print(classification_report(y_true, y_pred, target_names=target_names))

In [18]:
model = BertModel.from_pretrained("./first trial/", torch_dtype=torch.float16, attn_implementation="sdpa")


In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    # add this
    processing_class=tokenizer
)

In [30]:
output_dota = trainer.predict(test_ds_dota)
output_dota

PredictionOutput(predictions=(array([[[ 3.5660338e-01,  1.1932389e-01,  3.5519615e-01, ...,
         -1.9240448e-01,  3.7812015e-01, -3.9147049e-01],
        [ 2.2477226e-01,  6.9438174e-02, -2.5595695e-01, ...,
          3.9837021e-01,  6.2837720e-01,  1.2285419e-01],
        [-2.3931769e-01,  1.0512624e-01, -2.1768065e-02, ...,
          7.7340508e-01, -1.7562312e-01, -2.0802525e-01],
        ...,
        [-1.0000000e+02, -1.0000000e+02, -1.0000000e+02, ...,
         -1.0000000e+02, -1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02, -1.0000000e+02, ...,
         -1.0000000e+02, -1.0000000e+02, -1.0000000e+02],
        [-1.0000000e+02, -1.0000000e+02, -1.0000000e+02, ...,
         -1.0000000e+02, -1.0000000e+02, -1.0000000e+02]],

       [[ 5.5914807e-01,  2.9495448e-02,  1.6484490e-01, ...,
          1.8535736e-01,  2.5782201e-01, -1.4750110e-01],
        [ 6.7941672e-01, -4.5113319e-01,  1.9211547e-01, ...,
          4.4155937e-01,  5.7889801e-01,  3.9043745e-

In [28]:
output_dota.label_ids