In [6]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
!wget 'https://raw.githubusercontent.com/Kausthub8/Emotion-Analysis-Using-BERT-With-Pytorch/master/smile-annotations-final.csv'

--2020-12-31 04:46:48--  https://raw.githubusercontent.com/Kausthub8/Emotion-Analysis-Using-BERT-With-Pytorch/master/smile-annotations-final.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 429669 (420K) [text/plain]
Saving to: ‘smile-annotations-final.csv’


2020-12-31 04:46:48 (16.2 MB/s) - ‘smile-annotations-final.csv’ saved [429669/429669]



In [18]:
df = pd.read_csv('smile-annotations-final.csv', names=['id','text','category'])
df.set_index('id', inplace=True)

In [19]:
df.sample(10)

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
612328414201147392,The Great Court of the @britishmuseum #london....,nocode
615256552669360128,@MrStuchbery @britishmuseum ok. Thank you,happy
612651359448363008,@britishmuseum That's cool!,happy
613764239782727680,@NationalGallery I'm afraid quite a lot is los...,not-relevant
613267675205214209,Working meeting at @britishmuseum &gt; what a ...,happy
615452195102851072,Archaeology magazine featured @britishmuseum a...,not-relevant
611535568732696576,What lies beneath: drapery and the suggested f...,nocode
612686916006322176,@NationalGallery Make a great BBC4 TV document...,happy
608297265770340352,Save the date for Artist Talk: Svend Bayer @Pl...,nocode
615176084854820864,"Cool: @britishmuseum putting 30,000 years, 25,...",happy


In [20]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|angry               2
sad|disgust             2
sad|disgust|angry       1
Name: category, dtype: int64

In [21]:
df = df[~df.category.str.contains('\|')]
df = df[df.category!='nocode']
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [17]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [23]:
possible_labels = df.category.unique()
label_dict = {}
for index, label in enumerate(possible_labels):
  label_dict[label] = index
label_dict

{'angry': 2,
 'disgust': 3,
 'happy': 0,
 'not-relevant': 1,
 'sad': 4,
 'surprise': 5}

In [24]:
df['label'] = df.category.replace(label_dict)
df.sample(5)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
612713827470299136,#DefiningBeauty @britishmuseum stunning. See i...,happy,0
610882059494539264,From 12pm on @BBCCambs we preview #CastleHillO...,not-relevant,1
610487926229704704,@NationalGallery #AskTheGallery As a director ...,happy,0
613650879674085376,If you haven't checked out #TARA the TRUST FOR...,happy,0
610261969484935168,@appendixjournal @MartinPribble @FitzMuseum_UK...,surprise,5


In [29]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.2,
                                                  random_state=40,
                                                  stratify=df.label.values)

In [30]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.sample(5)

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
610576888575205379,Solidarity @greenpeace #kayaktivists We r figh...,angry,2,train
611556819962589184,@NConcostrina @britishmuseum: .@Charles_XII Th...,surprise,5,train
612533745099345920,Happy Father's Day &amp; Solstice! Guided tour...,happy,0,train
614414044477440001,@britishmuseum #DefiningBeauty excellent a hig...,happy,0,val
615429077042917376,Just because it's #Monday! An Old Woman (The #...,happy,0,train


In [32]:
df.groupby(['category','label','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,45
angry,2,val,12
disgust,3,train,5
disgust,3,val,1
happy,0,train,909
happy,0,val,228
not-relevant,1,train,171
not-relevant,1,val,43
sad,4,train,26
sad,4,val,6


In [None]:
!pip install -q transformers

In [35]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [43]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [45]:
encoded_data_train = tokenizer.batch_encode_plus(df[df.data_type=='train'].text.values, 
                                                     add_special_tokens = True, 
                                                     return_attention_mask = True, 
                                                     padding = True, 
                                                     max_length = 256, 
                                                     return_tensors = 'pt')

encoded_data_val = tokenizer.batch_encode_plus(df[df.data_type=='val'].text.values,
                                                     add_special_tokens = True,
                                                     return_attention_mask = True,
                                                     padding = True,
                                                     max_length = 256,
                                                     return_tensors = 'pt')

In [48]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = encoded_data_train['token_type_ids']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = encoded_data_val['token_type_ids']

In [49]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [50]:
len(dataset_train)

1184

In [54]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [55]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4 #32

dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_val = DataLoader(dataset_val, sampler=RandomSampler(dataset_val), batch_size=batch_size)

In [57]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 10
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*epochs)

In [58]:
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average='weighted')

In [60]:
def accuracy_per_class(preds, labels):
  label_dict_inverse = {v: k for k, v in label_dict.items()}

  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  for label in np.unique(labels_flat):
    y_preds = preds_flat[labels_flat==label]
    y_true = labels_flat[labels_flat==label]
    print(f'Class: {label_dict_inverse[label]}')
    print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')

In [68]:
import random

seed = 40
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cpu


In [69]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [71]:
for epoch in tqdm(range(1, epochs+1)):

  model.train()
  loss_train_total = 0
  progress_bar = tqdm(dataloader_train, 
                      desc='Epoch {:1d}'.format(epoch),
                      leave=False,
                      disable=False)
  
  for batch in progress_bar:
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'labels':         batch[2],
              }
    outputs = model(**inputs)
    loss = outputs[0]
    loss_train_total+=loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    progress_bar.set_postfix('training loss: {:.3f}'.format(loss.item()/len(batch)))

  torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')

  tqdm.write(f'\nEpoch {epoch}')

  loss_train_avg = loss_train_total/len(dataloader_train)
  tqdm.write(f'Training loss: {loss_train_avg}')

  val_loss, predictions, true_vals = evaluate(dataloader_val)
  val_f1 = f1_score_func(predictions, true_vals)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 score (weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=296.0, style=ProgressStyle(description_widt…

ValueError: ignored

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('BERT_ft_epoch1.model'))