## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from efficientnet_pytorch import EfficientNet
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, roc_auc_score
import pickle
from transformers import RobertaTokenizer,RobertaModel, XLNetTokenizer, RobertaTokenizer, BertForSequenceClassification, XLNetForSequenceClassification, RobertaModel, AdamW
from tqdm import tqdm, trange
from ast import literal_eval

Using TensorFlow backend.


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX 1060'

In order to avoid memory issues with Google Colab, I enforce a max_length of 100 tokens. Note that some sentences may not adequately represent each label because of this.

## Load Model & Set Params

## Load and Preprocess Training Data

Dataset will be tokenized then split into training and validation sets. The validation set will be used to monitor training. For testing a separate test set will be loaded for analysis.

In [5]:
df = pd.read_csv('../data/public_train/train_data.csv')
df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list
0,575.0,1308_48_2,1,0,0,1,0,0,0,0,['wait a minute im not going to hurt you !'],[],['wait a minute im not going to hurt you !'],['wait a minute i am not going to hurt you !'],"['Angry', 'Happy']"
1,5395.0,3766_29_2,0,1,0,1,0,0,1,0,[' hear that trody ? they meed a nsw carew maa...,[],[' hear that trody ? they meed a nsw carew maa...,"['he thought they need a new careman , looks l...","['Disgust', 'Happy', 'Neutral']"
2,2004.0,2112_17_7,1,1,0,0,0,0,0,0,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,"['the comet leaps into action , his bouyancy a...","['Angry', 'Disgust']"
3,4863.0,3458_16_7,0,0,0,0,0,0,1,0,"['its in there . isnt mate ?', ""yeah - t ' s i...",[],"['its in there . isnt mate ?', ""yeah - t ' s i...","['is it there .is not mate?', 'yeah t is in ...",['Neutral']
4,5146.0,2338_19_3,0,0,1,0,0,1,1,0,"['listen und pass der yord along . bzzzz21', '...",[],"['listen und pass der yord along . bzzzz21', '...","['listen and pass your way . bzzzz21 .', '...","['Fear', 'Surprise', 'Neutral']"


In [6]:
len(df)

4365

In [12]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list,one_hot_labels
0,575.0,1308_48_2,1,0,0,1,0,0,0,0,['wait a minute im not going to hurt you !'],[],['wait a minute im not going to hurt you !'],['wait a minute i am not going to hurt you !'],"['Angry', 'Happy']","[1, 0, 0, 1, 0, 0, 0, 0]"
1,5395.0,3766_29_2,0,1,0,1,0,0,1,0,[' hear that trody ? they meed a nsw carew maa...,[],[' hear that trody ? they meed a nsw carew maa...,"['he thought they need a new careman , looks l...","['Disgust', 'Happy', 'Neutral']","[0, 1, 0, 1, 0, 0, 1, 0]"
2,2004.0,2112_17_7,1,1,0,0,0,0,0,0,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,['the comet leaps into action his bouyancy all...,"['the comet leaps into action , his bouyancy a...","['Angry', 'Disgust']","[1, 1, 0, 0, 0, 0, 0, 0]"
3,4863.0,3458_16_7,0,0,0,0,0,0,1,0,"['its in there . isnt mate ?', ""yeah - t ' s i...",[],"['its in there . isnt mate ?', ""yeah - t ' s i...","['is it there .is not mate?', 'yeah t is in ...",['Neutral'],"[0, 0, 0, 0, 0, 0, 1, 0]"
4,5146.0,2338_19_3,0,0,1,0,0,1,1,0,"['listen und pass der yord along . bzzzz21', '...",[],"['listen und pass der yord along . bzzzz21', '...","['listen and pass your way . bzzzz21 .', '...","['Fear', 'Surprise', 'Neutral']","[0, 0, 1, 0, 0, 1, 1, 0]"


In [13]:
train_labels = list(df.one_hot_labels.values)
ocr_texts = list(df.text_clean.values)

In [14]:
max_length = 35
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(ocr_texts,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [15]:
encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [25]:
train_inputs = encodings['input_ids'] # tokenized and encoded sentences
# train_token_types = encodings['token_type_ids'] # token type ids
train_masks = encodings['attention_mask'] # attention masks

In [26]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = df.one_hot_labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [4304, 4123, 3612, 3314, 3255, 3240, 3227, 3167, 2530, 2506, 2379, 2369, 2253, 2109, 2015, 1819, 1756, 1720, 1626, 1503, 1489, 1432, 1205, 1128, 1062, 689, 441, 286, 190, 176]


In [16]:
class CNN_BERT(nn.Module):
  def __init__(self):
    super(CNN_BERT, self).__init__()

    # BERT for the text overview
    self.text_model = RobertaModel.from_pretrained('roberta-base')
    self.dropout = nn.Dropout(0.3)
    self.text_fc = nn.Linear(768,32)

    # CNN for the posters
    self.effnet = EfficientNet.from_pretrained('efficientnet-b2')
    self.effnet_fc = nn.Linear(1000, 32)
    self.n_out = 8
#     self.concat_dropout = nn.Dropout(0.1)
    self.output_fc = nn.Linear(64, self.n_out)


  def forward(self, input_ids, attention_mask, cnn_inp):
    text_outputs = self.text_model(input_ids, attention_mask)
#     text_outputs = text_outputs['last_hidden_state']
    text_outputs = text_outputs[0][:, 0, :]
    text_outputs = self.dropout(text_outputs)
    text_outputs = self.text_fc(text_outputs)
    
    x = self.effnet(cnn_inp)
    x = self.dropout(x)
    cnn_out = F.relu(self.effnet_fc(x))
    combined_inp = torch.cat((cnn_out, text_outputs), 1)
#     out = torch.sigmoid(self.output_fc(self.concat_dropout(combined_inp)))
    out = torch.sigmoid(self.output_fc(combined_inp))

    return out

In [17]:
model = CNN_BERT()
model.load_state_dict(torch.load('roberta_effnet_model_3'))
model.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded pretrained weights for efficientnet-b2


CNN_BERT(
  (text_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

## Load and Preprocess Test Data

## Public Test Data

In [None]:
with open('../data/public_train/public_test_np_img_norm', 'rb') as f: X_img_test_public = pickle.load(f)
X_img_test_public.shape

In [None]:
X_img_test_public = np.reshape(X_img_test_public, (X_img_test_public.shape[0], 3, 224, 224))
X_img_test_public.shape

In [None]:
public_test_df = pd.read_csv('../data/public_train/public_test_dataset.csv')

In [None]:
public_test_ocr_texts = list(public_test_df.text_clean.values)

In [None]:
# Encoding input data
public_test_encodings = tokenizer.batch_encode_plus(public_test_ocr_texts,max_length=max_length,pad_to_max_length=True)
public_test_input_ids = public_test_encodings['input_ids']
# public_test_token_type_ids = public_test_encodings['token_type_ids']
public_test_attention_masks = public_test_encodings['attention_mask']

In [None]:
public_test_labels = [0*2046]

In [None]:
# Make tensors out of data
public_test_inputs = torch.tensor(public_test_input_ids)
public_test_labels = torch.tensor(public_test_labels)
public_test_masks = torch.tensor(public_test_attention_masks)

In [None]:
text_public_test_data = TensorDataset(public_test_inputs, public_test_masks)
img_public_test_data = TensorDataset(torch.from_numpy(X_img_test_public),)

batch_size = 16

text_public_test_loader = DataLoader(text_public_test_data, batch_size=batch_size)
img_public_test_loader = DataLoader(img_public_test_data, batch_size=batch_size)

print(len(text_public_test_loader), len(img_public_test_loader))

In [None]:
# Test

# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for text_batch, img_batch in zip(text_public_test_loader,img_public_test_loader):
  text_batch = tuple(t.to(device) for t in text_batch)
  img_batch = tuple(t.to(device) for t in img_batch)
#   print(text_batch)
  b_input_ids, b_input_mask = text_batch
#   print(b_input_ids,b_input_mask)
  cnn_inp = img_batch
#   print(cnn_inp[0])
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, b_input_mask,cnn_inp[0])
    pred_label = outs.squeeze()
    pred_label = pred_label.to('cpu').numpy()

  tokenized_texts.append(b_input_ids)
#   logit_preds.append(b_logit_pred)
#   true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
# true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
# true_bools = [tl==1 for tl in true_labels]

In [None]:
public_test_df = public_test_df.reset_index()

In [None]:
emotion_pred = pd.DataFrame({'id':public_test_df['index'],'image_id':public_test_df['image_id']})

In [None]:
emotion_pred

In [None]:
emotion_pred = pd.concat([emotion_pred,pd.DataFrame(pred_labels)],axis=1)

In [None]:
len(emotion_pred)

In [58]:
emotion_pred.to_csv('../data/public_train/submissions/roberta_efficientnet_overall/results.csv',index=None,header=None)

In [59]:
tfidf_pred = pd.read_csv('../data/public_train/submissions/tfidf_baseline/results.csv',header=None)

In [60]:
tfidf_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1000_4_6,0.523208,0.447437,0.290447,0.184556,0.109613,0.371651,0.885208,0.066091
1,1,1000_5_1,0.634561,0.519546,0.257014,0.148125,0.164887,0.452509,0.594083,0.022357
2,2,1003_33_2,0.425433,0.437973,0.307283,0.365318,0.253982,0.247112,0.787755,0.034961
3,3,1004_29_6,0.269718,0.545222,0.486814,0.282292,0.092399,0.477461,0.768713,0.063704
4,4,1009_14_1,0.291866,0.222720,0.234067,0.396719,0.185177,0.191591,0.651912,0.072899
...,...,...,...,...,...,...,...,...,...,...
2041,2041,632_53_5,0.485799,0.249188,0.407760,0.463594,0.121224,0.428982,0.742981,0.055823
2042,2042,2302_42_3,0.312781,0.246049,0.311383,0.458165,0.173260,0.340935,0.846120,0.086444
2043,2043,2958_21_7,0.535239,0.429088,0.291175,0.365889,0.221553,0.464719,0.580777,0.037202
2044,2044,3182_52_2,0.479866,0.543695,0.417242,0.462658,0.087884,0.237607,0.392872,0.047740


In [61]:
emotion_pred

Unnamed: 0,id,image_id,0,1,2,3,4,5,6,7
0,0,1000_4_6,0.005870,0.625102,0.178353,0.366286,0.031274,0.411699,0.854394,0.040575
1,1,1000_5_1,0.737346,0.034282,0.028685,0.425013,0.053227,0.839688,0.939678,0.008839
2,2,1003_33_2,0.924333,0.238250,0.191768,0.976309,0.441097,0.246111,0.086263,0.111632
3,3,1004_29_6,0.080295,0.066833,0.905695,0.105006,0.132021,0.726795,0.935116,0.112855
4,4,1009_14_1,0.404627,0.217213,0.980932,0.378449,0.121499,0.830419,0.197641,0.142702
...,...,...,...,...,...,...,...,...,...,...
2041,2041,632_53_5,0.394669,0.440101,0.578236,0.219792,0.005442,0.046492,0.830281,0.002579
2042,2042,2302_42_3,0.030127,0.121836,0.907854,0.092480,0.750700,0.213992,0.977283,0.445743
2043,2043,2958_21_7,0.276641,0.018013,0.470685,0.980341,0.060037,0.127084,0.937156,0.008428
2044,2044,3182_52_2,0.048815,0.572432,0.183183,0.932413,0.045753,0.038721,0.661347,0.027385


In [62]:
ensemble_pred_list = []

In [63]:
for col in range(8):
    ensemble_pred_list.append((tfidf_pred[col+2]+emotion_pred[col])/2)

In [64]:
ensemble_df = emotion_pred[['id','image_id']]

In [65]:
ensemble_df = pd.concat([ensemble_df,pd.DataFrame(ensemble_pred_list).T],axis=1)

In [66]:
ensemble_df.to_csv('../data/public_train/submissions/tfidf_roberta_efficientnet_overall/results.csv',index=None,header=None)