In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
#from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
df = pd.read_csv("train.csv")
df['list'] = df[df.columns[2:]].values.tolist()
df.tail(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,list
159561,ffd2e85b07b3c7e4,"""\nNo he did not, read it again (I would have ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159562,ffd72e9766c09c97,"""\n Auto guides and the motoring press are not...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159563,ffe029a7c79dc7fe,"""\nplease identify what part of BLP applies be...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159564,ffe897e7f7182c90,Catalan independentism is the social movement ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159565,ffe8b9316245be30,The numbers in parentheses are the additional ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0]"


In [4]:
new_df=df[['comment_text','list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [5]:
new_df = new_df.rename(columns={"list": "label"})

In [6]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


In [7]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        print(self.data.label)
        print(type(self.data))
        self.targets = self.data.label.values.tolist()
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [8]:
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)
0         [0, 0, 0, 0, 0, 0]
1         [0, 0, 0, 0, 0, 0]
2         [0, 0, 0, 0, 0, 0]
3         [0, 0, 0, 0, 0, 0]
4         [0, 0, 0, 0, 0, 0]
                 ...        
127652    [0, 0, 0, 0, 0, 0]
127653    [0, 0, 0, 0, 0, 0]
127654    [0, 0, 0, 0, 0, 0]
127655    [0, 0, 0, 0, 0, 0]
127656    [0, 0, 0, 0, 0, 0]
Name: label, Length: 127657, dtype: object
<class 'pandas.core.frame.DataFrame'>
0        [0, 0, 0, 0, 0, 0]
1        [0, 0, 0, 0, 0, 0]
2        [0, 0, 0, 0, 0, 0]
3        [0, 0, 0, 0, 0, 0]
4        [0, 0, 0, 0, 0, 0]
                ...        
31909    [0, 0, 0, 0, 0, 0]
31910    [0, 0, 0, 0, 0, 0]
31911    [0, 0, 0, 0, 0, 0]
31912    [0, 0, 0, 0, 0, 0]
31913    [0, 0, 0, 0, 0, 0]
Name: label, Length: 31914, dtype: object
<class 'pandas.core.frame.DataFrame'>


In [15]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [9]:
# Convert pyhton dataframe to Hugging Face arrow dataset
from datasets import Dataset
hg_train_data = Dataset.from_pandas(train_dataset)
hg_test_data = Dataset.from_pandas(test_dataset)

In [10]:
# Length of the Dataset
print(f'The length of hg_train_data is {len(hg_train_data)}.\n')

# Check one review
hg_train_data[0]

The length of hg_train_data is 127657.



{'comment_text': "Goal scored for Portugal \n\nThis could be miles off the mark but did Ricardo actually score a goal while playing for Portugal? I sincerely hope this was no referring to the penalty he scored against England which, of course, was in a penalty shootout. The page about 'goalscoring goalkeepers' only claims he scored a goal for one of his club sides and a quick internet search turned up nothing.",
 'label': [0, 0, 0, 0, 0, 0]}

In [11]:
# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Take a look at the tokenizer
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
# Funtion to tokenize data
def tokenize_dataset(data):
    return tokenizer(data["comment_text"],
                     max_length=32,
                     truncation=True,
                     padding="max_length")

# Tokenize the dataset
dataset_train = hg_train_data.map(tokenize_dataset)
dataset_test = hg_test_data.map(tokenize_dataset)

Map:   0%|          | 0/127657 [00:00<?, ? examples/s]

Map:   0%|          | 0/31914 [00:00<?, ? examples/s]

In [13]:
print(dataset_train)
print(dataset_test)

Dataset({
    features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 127657
})
Dataset({
    features: ['comment_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 31914
})


In [15]:
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=6)
# Set up training arguments
training_args = TrainingArguments(
    output_dir="data/toxic_comment/",
    logging_dir='data/toxic_comment/logs',
    logging_strategy='epoch',
    logging_steps=100,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-6,
    seed=42,
    save_strategy='epoch',
    save_steps=100,
    evaluation_strategy='epoch',
    eval_steps=100,
    load_best_model_at_end=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [20]:
# Function to compute the metric
import evaluate
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    # probabilities = tf.nn.softmax(logits)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [34]:
#for short training
new_db_train=dataset_train.train_test_split(test_size=0.95)
new_db_test=dataset_test.train_test_split(test_size=0.95)

In [35]:
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
   # train_dataset=dataset_train,
    train_dataset=new_db_train['train'], #to shorten the training time
    #eval_dataset=dataset_test,
    eval_dataset=new_db_test['train'],
    #compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0567,0.057069


TrainOutput(global_step=1596, training_loss=0.05669555090423813, metrics={'train_runtime': 150.1062, 'train_samples_per_second': 42.517, 'train_steps_per_second': 10.632, 'total_flos': 104952191364864.0, 'train_loss': 0.05669555090423813, 'epoch': 1.0})

In [38]:
# Predictions
y_test_predict = trainer.predict(dataset_test)

# Take a look at the predictions
y_test_predict[:2][0]

In [40]:
len(y_test_predict)

3

In [43]:
y_test_predict

PredictionOutput(predictions=array([[-6.1609645, -8.316399 , -6.8206506, -8.581967 , -7.0541916,
        -8.038598 ],
       [-6.424978 , -8.137386 , -6.9484277, -8.424839 , -7.1769915,
        -7.954104 ],
       [-6.7277884, -7.7755775, -6.9929156, -8.085841 , -7.1592007,
        -7.7696924],
       ...,
       [-5.9740834, -8.337973 , -6.716994 , -8.596476 , -7.0210285,
        -8.002385 ],
       [-4.9953237, -8.608166 , -6.102049 , -8.63173  , -6.4129167,
        -7.8467326],
       [-4.6124005, -8.321336 , -5.897851 , -8.23405  , -6.1099434,
        -7.4720573]], dtype=float32), label_ids=array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32), metrics={'test_loss': 0.052186448127031326, 'test_runtime': 134.0881, 'test_samples_per_second': 238.008, 'test_steps_per_second': 59.506})

In [46]:
len(y_test_predict.label_ids)

31914

In [47]:
len(dataset_test)

31914

In [51]:
len(dataset_test['label'])

31914

In [84]:
from collections import defaultdict
result_list=[]
comment_list="toxic,severe_toxic,obscene,threat,insult,identity_hate".split(',')
for idx,(x,y) in enumerate(zip(dataset_test['label'],y_test_predict.label_ids.tolist())):
  empt_list=[]
  count=0
  for comment_id,vv in enumerate(range(len(x))):
      aa=int(x[vv])
      bb=int(y[vv])
      if aa==0 and bb==0 and aa==bb:
        count+=1
      if aa==bb and (aa==1 or bb==1):
        empt_list.append(comment_list[comment_id])

  if count==len(comment_list):
    empt_list=[]
    empt_list.append('Neutral')
  result_list.append(empt_list)

In [102]:
text=[str(id)+x[:5] for id,x in enumerate(dataset_test['comment_text'][90:120])]

for x,y in zip(text,result_list[90:120]):
  print(f"{x.strip()}------{y}")


0"
  Y------['Neutral']
1You m------['Neutral']
2"

 D------['Neutral']
3June------['Neutral']
4. Gho------['Neutral']
5Thank------['Neutral']
6No re------['Neutral']
7"

 P------['Neutral']
8page------['Neutral']
9Their------['Neutral']
10|list------['Neutral']
11Oi!!------['toxic']
12Thank------['Neutral']
13I don------['Neutral']
14Muggi------['Neutral']
15"
I w------['Neutral']
16Say o------['toxic']
17Turne------['Neutral']
18Order------['Neutral']
19"

 S------['Neutral']
20Categ------['Neutral']
21WP Ge------['Neutral']
22Warni------['Neutral']
23"

wh------['Neutral']
24genet------['Neutral']
25"

Ma------['Neutral']
26"
LOL------['Neutral']
27"

Yo------['toxic']
28"

""------['Neutral']
29"

 M------['Neutral']


In [103]:
#save model
#save model
import pickle
# Save the model to a pickle file
with open("toxic_pkl.pkl", "wb") as f:
    pickle.dump(model, f)

# Load the model from the pickle file
with open("toxic_pkl.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [128]:
from transformers import pipeline
model_checkpoint = "toxic_pkl.pkl"
tokenizer.save_pretrained('mytokenizer')
classifier = pipeline('text-classification',model=loaded_model,tokenizer=tokenizer, device="cuda")

In [131]:
classifier(dataset_test['comment_text'][0])

[{'label': 'LABEL_0', 'score': 0.011184656992554665}]