## Import Libraries

In [1]:
# !pip install transformers
# !pip install sentencepiece
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, roc_auc_score
import pickle
from transformers import BertTokenizer, XLNetTokenizer, RobertaTokenizer, BertForSequenceClassification, XLNetForSequenceClassification, RobertaForSequenceClassification, AdamW
from tqdm import tqdm, trange
from ast import literal_eval

Using TensorFlow backend.


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX 1060'

## Load and Preprocess Training Data

In [5]:
!pwd

/home/shwetkm/EmoRegCom/nbs


In [6]:
with open('../data/public_train/train_np_img_norm','rb') as f: X_img_train = pickle.load(f)
X_img_train.shape

(4365, 224, 224, 3)

In [7]:
sample_size = 4365

In [8]:
with open('../data/public_train/test_np_img_norm', 'rb') as f: X_img_test = pickle.load(f)
X_img_test.shape

(1213, 224, 224, 3)

In [9]:
with open('../data/public_train/val_np_img_norm', 'rb') as f: X_img_val = pickle.load(f)
X_img_val.shape


(486, 224, 224, 3)

In [10]:
X_img_train = X_img_train[:sample_size]
X_img_val = X_img_val[:sample_size]
X_img_test = X_img_test[:sample_size]

In [11]:
X_img_val = np.reshape(X_img_val, (X_img_val.shape[0], 3, 224, 224))
X_img_val.shape

(486, 3, 224, 224)

In [12]:
X_img_test = np.reshape(X_img_test, (X_img_test.shape[0], 3, 224, 224))

In [13]:
X_img_train = np.reshape(X_img_train, (X_img_train.shape[0], 3, 224, 224))

In [14]:
df = pd.read_csv('../data/public_train/train_data.csv')

In [15]:
print('Unique ocr_texts: ', df.text_clean.nunique() == df.shape[0])
print('Null values: ', df.isnull().values.any())

Unique ocr_texts:  False
Null values:  False


In [16]:
print('average sentence length: ', df.text_clean.str.split().str.len().mean())
print('stdev sentence length: ', df.text_clean.str.split().str.len().std())

average sentence length:  25.596334478808707
stdev sentence length:  24.795639258733278


In [17]:
cols = df.columns
label_cols = list(cols[2:10])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Others']


In [18]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list,one_hot_labels
0,575.0,1308_48_2,1,0,0,1,0,0,0,0,['wait a minute im not going to hurt you !'],[],['wait a minute im not going to hurt you !'],['wait a minute i am not going to hurt you !'],"['Angry', 'Happy']","[1, 0, 0, 1, 0, 0, 0, 0]"
1,5395.0,3766_29_2,0,1,0,1,0,0,1,0,[' hear that trody ? they meed a nsw carew maa...,[],[' hear that trody ? they meed a nsw carew maa...,"['he thought they need a new careman , looks l...","['Disgust', 'Happy', 'Neutral']","[0, 1, 0, 1, 0, 0, 1, 0]"
2,2004.0,2112_17_7,1,1,0,0,0,0,0,0,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,"['the comet leaps into action , his bouyancy a...","['Angry', 'Disgust']","[1, 1, 0, 0, 0, 0, 0, 0]"
3,4863.0,3458_16_7,0,0,0,0,0,0,1,0,"['its in there . isnt mate ?', ""yeah - t ' s i...",[],"['its in there . isnt mate ?', ""yeah - t ' s i...","['is it there .is not mate?', 'yeah t is in ...",['Neutral'],"[0, 0, 0, 0, 0, 0, 1, 0]"
4,5146.0,2338_19_3,0,0,1,0,0,1,1,0,"['listen und pass der yord along . bzzzz21', '...",[],"['listen und pass der yord along . bzzzz21', '...","['listen and pass your way . bzzzz21 .', '...","['Fear', 'Surprise', 'Neutral']","[0, 0, 1, 0, 0, 1, 1, 0]"


In [19]:
train_labels = list(df.one_hot_labels.values)
ocr_texts = list(df.text_clean.values)

In [20]:
df.text_clean.apply(lambda x: len(x.split())).describe()

count    4365.000000
mean       25.596334
std        24.795639
min         1.000000
25%        12.000000
50%        21.000000
75%        33.000000
max       616.000000
Name: text_clean, dtype: float64

In [21]:
max_length = 35
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(ocr_texts,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [22]:
train_inputs = encodings['input_ids'] # tokenized and encoded sentences
train_token_types = encodings['token_type_ids'] # token type ids
train_masks = encodings['attention_mask'] # attention masks

Be sure to handle all classes during validation using "stratify" during train/validation split:

In [23]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

In [24]:
validation_df = pd.read_csv('../data/public_train/val_data.csv')
validation_label_cols = list(validation_df.columns[2:10])
print('Null values: ', validation_df.isnull().values.any()) #should not be any null sentences or labels
print('Same columns between train and validation: ', label_cols == validation_label_cols) #columns should be the same

Null values:  False
Same columns between train and validation:  True


In [25]:
validation_df['one_hot_labels'] = list(validation_df[validation_label_cols].values)
validation_df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list,one_hot_labels
0,340.0,1179_7_2,0,0,0,0,1,1,0,0,['oops ! cheap twine or must have gained weigh...,"['plummets down', 'he vaul7 front']",['oops ! cheap twine or must have gained weigh...,['ops ! cheap twine or must have gained weig...,"['Sad', 'Surprise']","[0, 0, 0, 0, 1, 1, 0, 0]"
1,3831.0,2258_29_2,1,1,1,1,0,0,1,0,"[' just a minute , you ...']",[],"[' just a minute , you ...']","['just a minute , you . . . .']","['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral']","[1, 1, 1, 1, 0, 0, 1, 0]"
2,2465.0,3832_23_3,1,1,0,0,1,0,1,0,"[""they think they bungled and are going to rep...",[],"[""they think they bungled and are going to rep...",['they think they bungled and are going to rep...,"['Angry', 'Disgust', 'Sad', 'Neutral']","[1, 1, 0, 0, 1, 0, 1, 0]"
3,672.0,1377_39_7,1,0,0,0,0,1,1,0,['owuch'],[],['owuch'],['wow .'],"['Angry', 'Surprise', 'Neutral']","[1, 0, 0, 0, 0, 1, 1, 0]"
4,5145.0,777_20_0,1,0,0,0,0,0,1,0,"[""look nightmare ! o he ' s gone through w the...",[],"[""look nightmare ! o he ' s gone through w the...",['look nightmare ! he is gone through the we...,"['Angry', 'Neutral']","[1, 0, 0, 0, 0, 0, 1, 0]"


In [26]:
validation_labels = list(validation_df.one_hot_labels.values)
validation_ocr_texts = list(validation_df.text_clean.values)

In [27]:
# Encoding input data
validation_encodings = tokenizer.batch_encode_plus(validation_ocr_texts,max_length=max_length,pad_to_max_length=True)
validation_input_ids = validation_encodings['input_ids']
validation_token_type_ids = validation_encodings['token_type_ids']
validation_attention_masks = validation_encodings['attention_mask']

In [28]:
# Make tensors out of data
validation_inputs = torch.tensor(validation_input_ids)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_attention_masks)
validation_token_types = torch.tensor(validation_token_type_ids)

In [29]:
test_df = pd.read_csv('../data/public_train/test_data.csv')
test_label_cols = list(test_df.columns[2:10])
print('Null values: ', test_df.isnull().values.any()) #should not be any null sentences or labels
print('Same columns between train and test: ', label_cols == test_label_cols) #columns should be the same

Null values:  False
Same columns between train and test:  True


In [30]:
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)
test_df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list,one_hot_labels
0,4184.0,3812_3_3,1,1,1,0,0,0,1,0,['or this what kind of reputation are these pi...,"[""tha day prog ahranean the recovery ou thats ...",['or this what kind of reputation are these pi...,['or this what kind of reputation are these pi...,"['Angry', 'Disgust', 'Fear', 'Neutral']","[1, 1, 1, 0, 0, 0, 1, 0]"
1,132.0,1088_28_3,1,0,1,0,0,0,1,0,"[""he ' s not telling all he knows do you think...",[],"[""he ' s not telling all he knows do you think...","['he is not telling all he knows , do you thin...","['Angry', 'Fear', 'Neutral']","[1, 0, 1, 0, 0, 0, 1, 0]"
2,3543.0,479_14_0,1,0,0,0,0,0,1,0,"[""you big stupid why don ' t you watch where y...",[],"[""you big stupid why don ' t you watch where y...",['you big stupid why do not you watch where yo...,"['Angry', 'Neutral']","[1, 0, 0, 0, 0, 0, 1, 0]"
3,4692.0,859_24_1,1,1,1,0,0,1,1,0,"[""fight it out with him it ' s the gallows if ...",[],"[""fight it out with him it ' s the gallows if ...",['fight it out with him it is the gallows if h...,"['Angry', 'Disgust', 'Fear', 'Surprise', 'Neut...","[1, 1, 1, 0, 0, 1, 1, 0]"
4,4762.0,2260_47_8,0,0,1,1,0,0,1,0,['this way to the roof'],['the fleeing boxer and photographer'],"['this way to the roof', 'the fleeing boxer an...","['this way to the roof .', 'the fleeing boxe...","['Fear', 'Happy', 'Neutral']","[0, 0, 1, 1, 0, 0, 1, 0]"


In [31]:
# Gathering input data
test_labels = list(test_df.one_hot_labels.values)
test_ocr_texts = list(test_df.text_clean.values)

In [32]:
# Encoding input data
test_encodings = tokenizer.batch_encode_plus(test_ocr_texts,max_length=max_length,pad_to_max_length=True)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']

In [33]:
# Make tensors out of data
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)

In [34]:
text_train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
img_train_data = TensorDataset(torch.from_numpy(X_img_train), train_labels)

text_val_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
img_val_data = TensorDataset(torch.from_numpy(X_img_val), validation_labels)

text_test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
img_test_data = TensorDataset(torch.from_numpy(X_img_test), test_labels)

batch_size = 16

text_train_loader = DataLoader(text_train_data, batch_size=batch_size)
img_train_loader = DataLoader(img_train_data, batch_size=batch_size)

text_val_loader = DataLoader(text_val_data, batch_size=batch_size)
img_val_loader = DataLoader(img_val_data, batch_size=batch_size)

text_test_loader = DataLoader(text_test_data, batch_size=batch_size)
img_test_loader = DataLoader(img_test_data, batch_size=batch_size)

print(len(text_train_loader), len(img_train_loader))
print(len(text_val_loader), len(img_val_loader))
print(len(text_test_loader), len(img_test_loader))

273 273
31 31
76 76


## Load Model & Set Params

In [35]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [36]:
# setting custom optimization parameters.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [37]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [38]:
# Store our loss and accuracy for plotting
train_loss_set = []
val_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  val_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
  nb_val_steps = 0
  
  # Train the data for one epoch
  for step, batch in enumerate(text_train_loader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss_func = BCEWithLogitsLoss() 
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(text_val_loader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      v_loss = loss_func(b_logit_pred.view(-1,num_labels),b_labels.type_as(b_logit_pred).view(-1,num_labels)) #convert labels to float for calculation
      val_loss_set.append(v_loss.item())  
      val_loss += v_loss.item()
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)
    nb_val_steps += 1

  print("Val loss: {}".format(val_loss/nb_val_steps))
  # Flatten outputs
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='macro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100
  val_roc_score = roc_auc_score(true_bools, pred_bools,average='macro')*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)
  print('ROC AUC Score: ', val_roc_score)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.5749105777277614


Epoch:  20%|██        | 1/5 [02:49<11:19, 169.82s/it]

Val loss: 0.5705188455120209
F1 Validation Accuracy:  25.65019451588873
Flat Validation Accuracy:  7.20164609053498
ROC AUC Score:  53.793826353090864
Train loss: 0.5439539429468986


Epoch:  40%|████      | 2/5 [05:43<08:36, 172.25s/it]

Val loss: 0.565084324729058
F1 Validation Accuracy:  32.53643399117978
Flat Validation Accuracy:  7.20164609053498
ROC AUC Score:  55.682933678723415
Train loss: 0.5034820013867193


Epoch:  60%|██████    | 3/5 [08:32<05:41, 170.86s/it]

Val loss: 0.5782314712001432
F1 Validation Accuracy:  33.74888845903668
Flat Validation Accuracy:  6.995884773662551
ROC AUC Score:  55.963382106910586
Train loss: 0.45082390788710597


Epoch:  80%|████████  | 4/5 [11:23<02:50, 170.66s/it]

Val loss: 0.6166491220074315
F1 Validation Accuracy:  34.70450395297884
Flat Validation Accuracy:  6.790123456790123
ROC AUC Score:  56.09149635463038
Train loss: 0.39367308077358065


Epoch: 100%|██████████| 5/5 [14:21<00:00, 172.39s/it]

Val loss: 0.6620262188296164
F1 Validation Accuracy:  37.060902068724936
Flat Validation Accuracy:  3.909465020576132
ROC AUC Score:  55.897397638649025





In [39]:
torch.save(model.state_dict(), 'bert_model_1')

## Prediction and Metics

In [40]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(text_test_loader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

We need to threshold our sigmoid function outputs which range from [0, 1]. Below I use 0.50 as a threshold.

In [41]:
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding

# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test ROC AUC Score: ', roc_auc_score(true_bools, pred_bools,average='macro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

Test F1 Accuracy:  0.5038585209003217
Test ROC AUC Score:  0.5575571407677811
Test Flat Accuracy:  0.05276174773289365 

              precision    recall  f1-score   support

       Angry       0.46      0.66      0.55       474
     Disgust       0.49      0.29      0.36       450
        Fear       0.52      0.33      0.40       444
       Happy       0.55      0.67      0.60       516
         Sad       0.16      0.01      0.03       206
    Surprise       0.46      0.25      0.33       412
     Neutral       0.70      0.66      0.68       797
      Others       0.00      0.00      0.00        75

   micro avg       0.55      0.46      0.50      3374
   macro avg       0.42      0.36      0.37      3374
weighted avg       0.51      0.46      0.47      3374
 samples avg       0.57      0.49      0.50      3374



  _warn_prf(average, modifier, msg_start, len(result))


## Output Dataframe

In [42]:
idx2label = dict(zip(range(8),label_cols))
print(idx2label)

{0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'Sad', 5: 'Surprise', 6: 'Neutral', 7: 'Others'}


In [43]:
# Getting indices of where boolean one hot vector true_bools is True so we can use idx2label to gather label names
true_label_idxs, pred_label_idxs=[],[]
for vals in true_bools:
  true_label_idxs.append(np.where(vals)[0].flatten().tolist())
for vals in pred_bools:
  pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

In [44]:
# Gathering vectors of label names using idx2label
true_label_texts, pred_label_texts = [], []
for vals in true_label_idxs:
  if vals:
    true_label_texts.append([idx2label[val] for val in vals])
  else:
    true_label_texts.append(vals)

for vals in pred_label_idxs:
  if vals:
    pred_label_texts.append([idx2label[val] for val in vals])
  else:
    pred_label_texts.append(vals)

In [57]:
# Decoding input ids to comment text
ocr_texts = [tokenizer.decode(text,skip_special_tokens=True,clean_up_tokenization_spaces=False) for text in tokenized_texts]

In [58]:
# Converting lists to df
comparisons_df = pd.DataFrame({'text_clean': ocr_texts, 'true_labels': true_label_texts, 'pred_labels':pred_label_texts})
comparisons_df.head()

Unnamed: 0,text_clean,true_labels,pred_labels
0,[ ' or this what kind of reputation are these ...,"[Angry, Disgust, Fear, Neutral]","[Angry, Neutral]"
1,"[ ' he is not telling all he knows , do you th...","[Angry, Fear, Neutral]",[Neutral]
2,[ ' you big stupid why do not you watch where ...,"[Angry, Neutral]","[Angry, Fear]"
3,[ ' fight it out with him it is the gallows if...,"[Angry, Disgust, Fear, Surprise, Neutral]","[Angry, Neutral]"
4,"[ ' this way to the roof . ' , ' the fleeing b...","[Fear, Happy, Neutral]","[Angry, Fear, Surprise, Neutral]"


## Bonus - Optimizing threshold value for macro ROC score

Doing this may result in a trade offs between precision, flat accuracy and micro F1 accuracy. You may tune the threshold however you want.

In [48]:
# Calculate Accuracy - maximize roc_auc score by tuning threshold values. First with 'macro_thresholds' on the order of e^-1 then with 'micro_thresholds' on the order of e^-2

macro_thresholds = np.array(range(1,10))/10

roc_auc_results, flat_acc_results = [], []
for th in macro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_roc_auc_accuracy = roc_auc_score(true_bools,pred_bools,average='macro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  roc_auc_results.append(test_roc_auc_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_macro_th = macro_thresholds[np.argmax(roc_auc_results)] #best macro threshold value

micro_thresholds = (np.array(range(10))/100)+best_macro_th #calculating micro threshold values

roc_auc_results, flat_acc_results = [], []
for th in micro_thresholds:
  pred_bools = [pl>th for pl in pred_labels]
  test_roc_auc_accuracy = roc_auc_score(true_bools,pred_bools,average='macro')
  test_flat_accuracy = accuracy_score(true_bools, pred_bools)
  roc_auc_results.append(test_roc_auc_accuracy)
  flat_acc_results.append(test_flat_accuracy)

best_roc_auc_idx = np.argmax(roc_auc_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_roc_auc_idx])
print('Test roc_auc Accuracy: ', roc_auc_results[best_roc_auc_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_roc_auc_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_roc_auc_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
pickle.dump(clf_report_optimized, open('classification_report_optimized.txt','wb'))
print(clf_report_optimized)

Best Threshold:  0.62
Test roc_auc Accuracy:  0.5593396045820264
Test Flat Accuracy:  0.05358615004122012 

              precision    recall  f1-score   support

       Angry       0.50      0.58      0.53       474
     Disgust       0.57      0.16      0.24       450
        Fear       0.59      0.25      0.36       444
       Happy       0.57      0.59      0.58       516
         Sad       0.50      0.01      0.02       206
    Surprise       0.48      0.14      0.22       412
     Neutral       0.74      0.55      0.63       797
      Others       0.00      0.00      0.00        75

   micro avg       0.59      0.37      0.46      3374
   macro avg       0.49      0.29      0.32      3374
weighted avg       0.57      0.37      0.42      3374
 samples avg       0.62      0.40      0.45      3374



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
best_pred_label_idxs = []
for vals in best_pred_bools:
    best_pred_label_idxs.append(np.where(vals)[0].flatten().tolist())

In [54]:
best_pred_label_texts = []
for vals in best_pred_label_idxs:
  if vals:
    best_pred_label_texts.append([idx2label[val] for val in vals])
  else:
    best_pred_label_texts.append(vals)

In [64]:
test_df['pred_bert_text_cls'] = best_pred_label_texts

In [67]:
test_df[['image_id','pred_bert_text_cls']].to_csv('../data/public_train/predictions/bert_text_cls_56.csv',index=None)

In [49]:
roc_auc_score(true_bools, best_pred_bools,average='macro')

0.5593396045820264