### Experiment 5 - **NOT STARTED**

- Test the ability of models trained on Dataset 3 to predict on Dataset 2
- Negative Class (0) Non-Cyberbullying
- Positive Class (1) Cyberbullying

In [66]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

from pathlib import Path
import sys

from torch.utils.data import Dataset

sys.path.append('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\src')
from utils.results import create_results_file, append_results_to_json

In [23]:
# CONSTANTS
RANDOM_SEED = 115
DATA_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\data\\en_only')
EXPERIMENTS_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments')
RESULT_PATH = Path('C:\\Users\\rooty\\UWEC\\Research\\CyberBullyingML\\venv\\cyberbullying-ml\\experiments\\results\\exp0')

np.random.seed(RANDOM_SEED)

In [26]:
df = pd.read_csv( DATA_PATH / '48000_cyberbullying_tweets_basic_clean.csv')
df.dropna(axis=0, inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df['label'].value_counts()

label
religion     7942
age          7910
gender       7396
ethnicity    7319
other        6731
notcb        6377
Name: count, dtype: int64

In [29]:
df['label'] = df['label'].map({'age':0, 'gender':1, 'other':2, 'religion':3, 'ethnicity':4, 'notcb':5})

In [2]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
def preprocess_text(text):
    return tokenizer(text, truncation=True)


In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
accuracy = evaluate.load("accuracy")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.82MB/s]


In [21]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
id2label = {0: "age", 1: "gender", 2: "other", 3: "religion", 4: "ethnicity", 5: "notcb"}
label2id = {"age": 0, "gender": 1, "other": 2, "religion": 3, "ethnicity": 4, "notcb": 5}

In [50]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=6, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
x_train, x_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2, random_state=RANDOM_SEED)

In [67]:
class CyberbullyingDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.texts = self.data['tweet'].tolist()
        self.labels = self.data['label'].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx])
        return {'text': text, 'label': label}


In [53]:
from datasets import load_dataset
dataset = load_dataset('imdb')

In [68]:
df = df.sample(frac=1, random_state=RANDOM_SEED)
count = int(len(df) * 0.8)
train_df = df[:count]
test_df = df[count:]

train_df = CyberbullyingDataset(train_df)
test_df = CyberbullyingDataset(test_df)

In [69]:
train_df.texts = [preprocess_text(text) for text in train_df.texts]
test_df.texts = [preprocess_text(text) for text in test_df.texts]


In [70]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/4368 [04:30<?, ?it/s]
  0%|          | 0/4368 [00:00<?, ?it/s]

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']