In [24]:
import pandas as pd
import re
import emoji
import nltk
# Make sure the NLTK "words" corpus is available locally before it is read below.
nltk.download('words')
# Materialise the corpus word list as a set.
# NOTE(review): `words` is not referenced by any other cell visible in this notebook — confirm it is still needed.
words = set(nltk.corpus.words.words())

[nltk_data] Downloading package words to
[nltk_data]     /Users/rishushrivastava/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [25]:
# Load the labelled tweet dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/labeled_data.csv')
# Preview the first rows to check which columns were read.
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [26]:
# Summary statistics of the numeric columns, transposed so that each column is shown as a row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,24783.0,12681.192027,7299.553863,0.0,6372.5,12703.0,18995.5,25296.0
count,24783.0,3.243473,0.88306,3.0,3.0,3.0,3.0,9.0
hate_speech,24783.0,0.280515,0.631851,0.0,0.0,0.0,0.0,7.0
offensive_language,24783.0,2.413711,1.399459,0.0,2.0,3.0,3.0,9.0
neither,24783.0,0.549247,1.113299,0.0,0.0,0.0,0.0,9.0
class,24783.0,1.110277,0.462089,0.0,1.0,1.0,1.0,2.0


In [27]:
# Number of tweets per class id (reveals how imbalanced the classes are).
data.groupby('class')['tweet'].count()

class
0     1430
1    19190
2     4163
Name: tweet, dtype: int64

In [28]:
# Mapping from the numeric class id to its human-readable name.
class_label = dict(enumerate(['hate_speech', 'offensive_language', 'neither']))

In [29]:
# Build a slimmed-down frame: add the readable label, drop the per-annotator
# vote columns and the leftover index column, and expose `class` as `category`.
data_refactor = (
    data
    .assign(label=data['class'].map(class_label))
    .drop(columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'])
    .rename(columns={'class': 'category'})
)

# From here on `data` refers to the refactored frame.
data = data_refactor.copy()
data.head()

Unnamed: 0,category,tweet,label
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,neither
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,offensive_language
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,offensive_language
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,offensive_language
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,offensive_language


In [30]:
# Peek at the first three tweets whose category id is 0.
data.loc[data['category'] == 0, 'tweet'].head(3)

85     "@Blackman38Tide: @WhaleLookyHere @HowdyDowdy1...
89     "@CB_Baby24: @white_thunduh alsarabsss" hes a ...
110    "@DevilGrimz: @VigxRArts you're fucking gay, b...
Name: tweet, dtype: object

In [31]:
import numpy as np
from sklearn.model_selection import train_test_split

In [32]:
# Stratified 85/15 hold-out split. The split is done on row index values (not on
# the tweets themselves) so the assignment can be written back onto `data` later.
row_index = data.index.values
class_ids = data.category.values

X_train, X_val, y_train, y_val = train_test_split(
    row_index,
    class_ids,
    stratify=class_ids,
    test_size=0.15,
    random_state=17,
)

y_train

array([0, 1, 0, ..., 1, 2, 1])

In [33]:
# Tag every row with the split it belongs to; rows in neither index array keep 'not_set'.
data['data_type'] = 'not_set'
for split_name, split_rows in (('train', X_train), ('val', X_val)):
    data.loc[split_rows, 'data_type'] = split_name

data.head()

Unnamed: 0,category,tweet,label,data_type
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,neither,train
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,offensive_language,train
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,offensive_language,train
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,offensive_language,val
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,offensive_language,train


In [34]:
# Row counts per (category, label, data_type) group, to compare the class mix of the two splits.
data.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweet
category,label,data_type,Unnamed: 3_level_1
0,hate_speech,train,1216
0,hate_speech,val,214
1,offensive_language,train,16311
1,offensive_language,val,2879
2,neither,train,3538
2,neither,val,625


In [35]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
import torch

In [36]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
# Print the tokenizer's configuration summary.
print(tokenizer)

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [37]:
# Tokenise all training tweets in one call and return PyTorch tensors.
# `truncation=True` is required for `max_length` to take effect: without it the
# limit is ignored and an unusually long tweet would be passed on untruncated.
# `padding=True` pads every sequence to the longest one in this call.
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type=='train'].tweet.values.tolist(),  # the tokenizer API expects a list of str
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [38]:
# Tokenise all validation tweets in one call and return PyTorch tensors.
# `truncation=True` is required for `max_length` to take effect: without it the
# limit is ignored and an unusually long tweet would be passed on untruncated.
# `padding=True` pads every sequence to the longest one in this call.
encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type=='val'].tweet.values.tolist(),  # the tokenizer API expects a list of str
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

In [39]:
# Inspect the first encoded training example: its token ids, then the matching attention mask.
print(encoded_data_train['input_ids'][0])
print(encoded_data_train['attention_mask'][0])

tensor([  101,   999,   999,   999, 19387,  1030,  9815, 19454, 21818,  2135,
         1024,  2004,  1037,  2450,  2017,  5807,  1005,  1056, 17612,  2055,
         9344,  2039,  2115,  2160,  1012,  1004, 23713,  1025,  2004,  1037,
         2158,  2017,  2323,  2467,  2202,  1996, 11669,  2041,  1012,  1012,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [42]:
# Pull the model inputs out of the tokenizer output and build the label tensors.
# Labels are selected with the same `data_type` filter that was used when
# encoding, so they line up row-for-row with the encoded tweets.
is_train_row = data.data_type == 'train'
is_val_row = data.data_type == 'val'

input_ids_train, attention_masks_train = (
    encoded_data_train['input_ids'],
    encoded_data_train['attention_mask'],
)
labels_train = torch.tensor(data.loc[is_train_row, 'category'].values)

input_ids_val, attention_masks_val = (
    encoded_data_val['input_ids'],
    encoded_data_val['attention_mask'],
)
labels_val = torch.tensor(data.loc[is_val_row, 'category'].values)

In [45]:
# Bundle (input ids, attention mask, label) into one dataset per split so a
# DataLoader can serve them as aligned triples.
train_tensors = (
    torch.LongTensor(input_ids_train),
    torch.LongTensor(attention_masks_train),
    labels_train,
)
dataset_train = TensorDataset(*train_tensors)

val_tensors = (
    torch.LongTensor(input_ids_val),
    torch.LongTensor(attention_masks_val),
    labels_val,
)
dataset_val = TensorDataset(*val_tensors)

In [46]:
from transformers import BertForSequenceClassification

In [48]:
# Instantiate the 'bert-base-uncased' checkpoint with a sequence-classification head
# that has one output per entry in `class_label`; attention maps and hidden
# states are not requested in the model outputs.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(class_label),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [49]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [51]:
batch_size = 32

# Training batches are drawn in a fresh random order every epoch.
dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train),batch_size=batch_size)
# Validation gains nothing from shuffling: a sequential pass keeps the evaluation
# order stable between epochs and does not consume random numbers.
dataloader_val = DataLoader(dataset_val,sampler=SequentialSampler(dataset_val),batch_size=batch_size)

In [52]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [53]:
# Training length and optimisation set-up.
epochs = 5
# One scheduler step is taken per training batch, hence batches-per-epoch * epochs.
total_training_steps = len(dataloader_train) * epochs

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
# Linear learning-rate schedule over the whole run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [54]:
# Use CUDA when it is available, otherwise fall back to the CPU, and move the model there.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

device

device(type='cpu')

In [58]:
import random
from tqdm.notebook import tqdm

# Seed every random number generator used by the training code with the same value.
# NOTE(review): the model is instantiated in an earlier cell, i.e. before this
# seeding runs — confirm whether its initialisation needs to be reproducible too.
seed_val = 17
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

In [59]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the predictions.

    Args:
        preds: 2-D array of per-class scores; the highest score along axis 1
            is taken as the predicted class.
        labels: array of true class ids.

    Returns:
        The F1 score computed with ``average='weighted'``.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()
    return f1_score(true_ids, predicted_ids, average='weighted')

def accuracy_per_class(preds, labels, label_names=None):
    """Print, for every class present in ``labels``, how many samples were classified correctly.

    Args:
        preds: 2-D array of per-class scores; the highest score along axis 1
            is taken as the predicted class.
        labels: array of true class ids.
        label_names: optional mapping from class id to display name. Defaults
            to the notebook-level ``class_label`` mapping. Ids missing from
            the mapping are printed as-is.

    Returns:
        None. The report is written to stdout.
    """
    if label_names is None:
        # `class_label` already maps id -> name, so no inversion is needed.
        label_names = class_label

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        in_class = labels_flat == label
        n_correct = int(np.sum(preds_flat[in_class] == label))
        n_total = int(np.sum(in_class))
        # int() turns the numpy scalar into a plain key for the dict lookup.
        print(f'Class: {label_names.get(int(label), label)}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [60]:
def evaluate(dataloader_val):
    """Run the notebook-level ``model`` over a dataloader without updating weights.

    Args:
        dataloader_val: iterable of batches; each batch holds input ids,
            attention mask and labels at positions 0, 1 and 2.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the loss averaged
        over the number of batches, the concatenated logits, and the
        concatenated labels (both as numpy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        batch = tuple(tensor.to(device) for tensor in batch)

        # Gradients are not needed for evaluation.
        with torch.no_grad():
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                labels=batch[2],
            )

        # With labels supplied, the outputs start with the loss followed by the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(batch[2].cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
import os

# File the fine-tuned weights are written to; it is overwritten after every epoch.
checkpoint_path = 'model/finetuned_BERT.model'
# torch.save does not create missing directories, so make sure the target folder
# exists up front instead of failing after the first full epoch.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

for epoch in tqdm(range(epochs)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Report the batch loss as-is. `batch` is the 3-tuple (ids, mask, labels),
        # so dividing by len(batch) would divide by 3, not by the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    torch.save(model.state_dict(), checkpoint_path)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/659 [00:00<?, ?it/s]