<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Natural%20Language%20Processing%20with%20PyTorch/Sarcastic_analysis_with_Hugging_Face_using_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd

import transformers 
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import numpy as np
import seaborn as sns

In [None]:
# Read the labelled headlines from '/content/Data.csv' and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/Data.csv')
df.head(n=5)

Unnamed: 0,headlines,target
0,CNN Triumphs (At Least in Most Demographic Cat...,Non Sarcastic
1,"‘You Did The Best You Could,’ Says Iron Man Ac...",Sarcastic
2,New Emails Reveal Warm Relationship Between Ka...,Non Sarcastic
3,Donald Trump Jr. Gets Slammed Over Racist Birt...,Non Sarcastic
4,God Urges Rick Perry Not To Run For President,Sarcastic


In [None]:
# Number of headlines per label.
df['target'].value_counts()

Non Sarcastic    6531
Sarcastic        5975
Name: target, dtype: int64

In [None]:
# Keep only the rows whose label is one of the two expected classes.
# The explicit .copy() makes df an independent frame, so assigning columns to it
# in later cells does not raise pandas' SettingWithCopyWarning.
df = df[df.target.isin(['Non Sarcastic', 'Sarcastic'])].copy()

In [None]:
# Distinct label strings, in order of first appearance in the frame.
possible_labels = df['target'].unique()

In [None]:
# Map each label string to its position in possible_labels.
label_dict = {label_name: position for position, label_name in enumerate(possible_labels)}

In [None]:
# Display the label-string -> integer-id mapping.
label_dict

{'Non Sarcastic': 0, 'Sarcastic': 1}

In [None]:
# Replace the label strings with their integer ids from label_dict.
df['target'] = df.target.map(label_dict)
df.head(n=5)

Unnamed: 0,headlines,target
0,CNN Triumphs (At Least in Most Demographic Cat...,0
1,"‘You Did The Best You Could,’ Says Iron Man Ac...",1
2,New Emails Reveal Warm Relationship Between Ka...,0
3,Donald Trump Jr. Gets Slammed Over Racist Birt...,0
4,God Urges Rick Perry Not To Run For President,1


Training and validation split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 85/15 split over the row index, so both subsets keep the class balance.
# random_state pins the shuffle: the notebook's global seeds are only set in a later
# cell, so without it every run would produce a different train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.target.values,
    test_size = 0.15,
    stratify = df.target.values,
    random_state = 17
)

In [None]:
# Placeholder value for every row until the split assignment is written in.
df['data_type'] = 'not_set'

In [None]:
# Preview the frame with the new data_type column.
df.head()

Unnamed: 0,headlines,target,data_type
0,CNN Triumphs (At Least in Most Demographic Cat...,0,not_set
1,"‘You Did The Best You Could,’ Says Iron Man Ac...",1,not_set
2,New Emails Reveal Warm Relationship Between Ka...,0,not_set
3,Donald Trump Jr. Gets Slammed Over Racist Birt...,0,not_set
4,God Urges Rick Perry Not To Run For President,1,not_set


In [None]:
# Tag every row with the split it belongs to, using the index values returned by the split.
for split_name, split_index in (('train', X_train), ('val', X_val)):
    df.loc[split_index, 'data_type'] = split_name

In [None]:
# Count rows per (target, data_type) pair to see how each class is divided between train and val.
df.groupby(['target', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,headlines
target,data_type,Unnamed: 2_level_1
0,train,5551
0,val,980
1,train,5079
1,val,896


Loading Tokenizer and Encoding

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the training headlines into padded PyTorch tensors, truncated to at most 256 tokens.
train_headlines = df.loc[df.data_type == 'train', 'headlines'].values
encoded_data_train = tokenizer.batch_encode_plus(
    train_headlines,
    add_special_tokens = True,
    padding = True,
    truncation = True,
    max_length = 256,
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [None]:
# Tokenize the validation headlines with the same settings as the training set.
val_headlines = df.loc[df.data_type == 'val', 'headlines'].values
encoded_data_val = tokenizer.batch_encode_plus(
    val_headlines,
    add_special_tokens = True,
    padding = True,
    truncation = True,
    max_length = 256,
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [None]:
# Pull the model inputs out of the encoding and build the matching label tensor.
is_train_row = df.data_type == 'train'
labels_train = torch.tensor(df.loc[is_train_row, 'target'].values)
attention_masks_train = encoded_data_train['attention_mask']
input_ids_train = encoded_data_train['input_ids']

In [None]:
# Pull the model inputs out of the encoding and build the matching label tensor.
is_val_row = df.data_type == 'val'
labels_val = torch.tensor(df.loc[is_val_row, 'target'].values)
attention_masks_val = encoded_data_val['attention_mask']
input_ids_val = encoded_data_val['input_ids']

In [None]:
# Bundle (input ids, attention masks, labels) so each dataset item is one aligned triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Number of training examples.
len(dataset_train)


10630

In [None]:
# Inspect the tensors held by the validation dataset (input ids, attention masks, labels).
dataset_val.tensors

(tensor([[  101,  6221,  8398,  ...,     0,     0,     0],
         [  101,  2976, 15996,  ...,     0,     0,     0],
         [  101, 10223,  2079,  ...,     0,     0,     0],
         ...,
         [  101,  4533,  2284,  ...,     0,     0,     0],
         [  101, 14791,  8040,  ...,     0,     0,     0],
         [  101, 13229,  1005,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 1, 0,  ..., 1, 0, 0]))

Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Load 'bert-base-uncased' with a classification head sized to the number of labels;
# attention maps and hidden states are not returned.
num_classes = len(label_dict)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = num_classes,
    output_hidden_states = False,
    output_attentions = False
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
batch_size = 4

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation is iterated in dataset order: shuffling adds nothing to evaluation,
# and a fixed order keeps the collected predictions aligned with the dataset rows
# and identical from run to run.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

Setting Up Optimizer and Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# AdamW over every model parameter, with a small learning rate for fine-tuning.
optimizer = AdamW(params = model.parameters(), lr = 1e-5, eps = 1e-8)

In [None]:
epochs = 10

# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_training_steps = epochs * len(dataloader_train)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_training_steps
)

Defining our Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the predictions against the labels.

    preds:  array of per-class scores; the predicted class is the argmax over axis 1.
    labels: array of true class ids, flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average = 'weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, how many of its samples were predicted correctly.

    preds:  array of per-class scores; the predicted class is the argmax over axis 1.
    labels: array of true class ids (the integer values of the module-level label_dict).
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for label in np.unique(actual):
        # Restrict both arrays to the samples whose true class is `label`.
        of_this_class = actual == label
        class_predictions = predicted[of_this_class]
        correct = len(class_predictions[class_predictions == label])
        total = len(actual[of_this_class])
        print(f'Class: {id_to_name[label]}')
        print(f'Accuracy:{correct}/{total}\n')

Creating our Training Loop

In [None]:
import random

seed_val = 17
# Apply the same seed to Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_val):
  """Run the module-level `model` over `dataloader_val` without gradient tracking.

  Each batch is expected to hold (input ids, attention mask, labels) in that order.

  Returns a tuple of:
    - the loss averaged over the number of batches,
    - the logits of all batches concatenated along axis 0 (NumPy array),
    - the labels of all batches concatenated along axis 0 (NumPy array).
  """
  model.eval()

  running_loss = 0
  logit_chunks, label_chunks = [], []

  for batch in tqdm(dataloader_val):
    # Move every tensor of the batch to the device the model lives on.
    batch = tuple(tensor.to(device) for tensor in batch)
    input_ids, attention_mask, labels = batch[0], batch[1], batch[2]

    with torch.no_grad():
      outputs = model(input_ids=input_ids,
                      attention_mask=attention_mask,
                      labels=labels)

    # With labels supplied, the first output is the loss and the second the logits.
    running_loss += outputs[0].item()
    logit_chunks.append(outputs[1].detach().cpu().numpy())
    label_chunks.append(labels.cpu().numpy())

  loss_val_avg = running_loss/len(dataloader_val)
  predictions = np.concatenate(logit_chunks, axis = 0)
  true_vals = np.concatenate(label_chunks, axis= 0)
  return loss_val_avg, predictions, true_vals


In [None]:
from tqdm.notebook import trange, tqdm

In [None]:
# Fine-tune the model for `epochs` passes over the training data, then report the
# training loss, validation loss and weighted F1 score after each epoch.
for epoch in tqdm(range(1, epochs+1)):

  model.train()
  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,
                      desc = 'Epoch {:1d}'.format(epoch), leave = False, disable = False)

  for batch in progress_bar:
    # Clear gradients left over from the previous step.
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

    outputs = model(**inputs)
    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    # Clip the gradient norm to 1.0 before the parameter update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

    # Show the raw batch loss. len(batch) is the number of tensors in the batch
    # tuple, not the number of samples, so it is not a meaningful divisor here.
    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})


  # f-string so the actual epoch number is printed instead of the literal '{epoch}'.
  tqdm.write(f'\nEpoch {epoch}')

  loss_train_avg = loss_train_total /len(dataloader_train)
  tqdm.write(f'Training loss : {loss_train_avg}')

  val_loss, predictions, true_vals = evaluate(dataloader_val)
  val_f1 = f1_score_func(predictions, true_vals)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (weighted): {val_f1}')

       

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.31165715754386025


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.25809176024286296
F1 Score (weighted): 0.9467203184774967


Epoch 2:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.13550234039293527


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.3015881099124534
F1 Score (weighted): 0.9472317954977683


Epoch 3:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.058003982298725054


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.3201594497846692
F1 Score (weighted): 0.9520068327438745


Epoch 4:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.03603958962601451


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.3714266975823995
F1 Score (weighted): 0.9509403179159607


Epoch 5:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.014218971878020082


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.42058199814540387
F1 Score (weighted): 0.9487347065251251


Epoch 6:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.013533540357472852


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.6101323539628538
F1 Score (weighted): 0.9271226323796563


Epoch 7:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.005714833031212439


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.5574252013890161
F1 Score (weighted): 0.9451189636628061


Epoch 8:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.0066316652079630315


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.5266620717514292
F1 Score (weighted): 0.9487894783853322


Epoch 9:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.00433910556161521


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.5370181508891363
F1 Score (weighted): 0.9482617459815507


Epoch 10:   0%|          | 0/2658 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.0017011900021941743


  0%|          | 0/59 [00:00<?, ?it/s]

Validation loss: 0.557865588446083
F1 Score (weighted): 0.9487932755793796


Evaluating our Model

In [None]:
# Per-class hit counts for the predictions returned by the last evaluate() call in the training loop.
accuracy_per_class(predictions, true_vals)

Class: Non Sarcastic
Accuracy:943/980

Class: Sarcastic
Accuracy:837/896

