# Train on the overall public train data

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from efficientnet_pytorch import EfficientNet
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, roc_auc_score
import pickle
from transformers import RobertaTokenizer,RobertaModel, XLNetTokenizer, RobertaTokenizer, BertForSequenceClassification, XLNetForSequenceClassification, RobertaModel, AdamW
from tqdm import tqdm, trange
from ast import literal_eval

Using TensorFlow backend.


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX 1060'

## Load and Preprocess Training Data

In [5]:
sample_size = 6064

In [6]:
with open('../data/public_train/public_train_np_img_norm','rb') as f: X_img_train = pickle.load(f)
X_img_train.shape

(6064, 224, 224, 3)

In [7]:
X_img_train = X_img_train[:sample_size]

In [8]:
X_img_train = np.reshape(X_img_train, (X_img_train.shape[0], 3, 224, 224))

In [9]:
df = pd.read_csv('../data/public_train/dataset.csv') #jigsaw-toxic-comment-classification-challenge
df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list
0,0.0,0_19_3,0,0,1,0,0,1,1,0,[' the chief mentioned that the place was dese...,[],[' the chief mentioned that the place was dese...,['the chief mentioned that the place was deser...,"['Fear', 'Surprise', 'Neutral']"
1,1.0,0_21_4,1,0,1,1,0,0,1,0,['disappoint you but id probably give you'],[],['disappoint you but id probably give you'],"['excuse you , but i would probably give you ...","['Angry', 'Fear', 'Happy', 'Neutral']"
2,2.0,0_3_4,0,0,0,1,0,0,1,0,['ah i feel better already now for a brisk wal...,[],['ah i feel better already now for a brisk wal...,['ah i feel better already now for a brisk wal...,"['Happy', 'Neutral']"
3,3.0,0_35_3,0,0,0,1,1,0,1,0,['poor girl ! no wonder shes shy and retiring !'],[],['poor girl ! no wonder shes shy and retiring !'],['poor girl ! no wonder she is shy and retir...,"['Happy', 'Sad', 'Neutral']"
4,4.0,1000_16_4,0,1,0,0,0,1,0,1,['gosh guess i tried to civilize these peo a l...,[],['gosh guess i tried to civilize these peo a l...,['gosh guess i tried to civilize these people ...,"['Disgust', 'Surprise', 'Others']"


In [10]:
len(df)

6064

In [11]:
df = df[:sample_size]

In [12]:
print('Unique comments: ', df.text_clean.nunique() == df.shape[0])
print('Null values: ', df.isnull().values.any())
# df[df.isna().any(axis=1)]

Unique comments:  False
Null values:  False


In [13]:
print('average sentence length: ', df.text_clean.str.split().str.len().mean())
print('stdev sentence length: ', df.text_clean.str.split().str.len().std())

average sentence length:  25.5745382585752
stdev sentence length:  26.251980309701473


In [14]:
cols = df.columns
label_cols = list(cols[2:10])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral', 'Others']


In [15]:
print('Count of 1 per label: \n', df[label_cols].sum(), '\n') # Label counts, may need to downsample or upsample
print('Count of 0 per label: \n', df[label_cols].eq(0).sum())

Count of 1 per label: 
 Angry       2415
Disgust     2208
Fear        2104
Happy       2510
Sad          928
Surprise    2085
Neutral     4088
Others       415
dtype: int64 

Count of 0 per label: 
 Angry       3649
Disgust     3856
Fear        3960
Happy       3554
Sad         5136
Surprise    3979
Neutral     1976
Others      5649
dtype: int64


In [16]:
# df = df.sample(frac=1).reset_index(drop=True) #shuffle rows

In [17]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,id,image_id,Angry,Disgust,Fear,Happy,Sad,Surprise,Neutral,Others,dialog,narration,text,text_clean,emotion_list,one_hot_labels
0,0.0,0_19_3,0,0,1,0,0,1,1,0,[' the chief mentioned that the place was dese...,[],[' the chief mentioned that the place was dese...,['the chief mentioned that the place was deser...,"['Fear', 'Surprise', 'Neutral']","[0, 0, 1, 0, 0, 1, 1, 0]"
1,1.0,0_21_4,1,0,1,1,0,0,1,0,['disappoint you but id probably give you'],[],['disappoint you but id probably give you'],"['excuse you , but i would probably give you ...","['Angry', 'Fear', 'Happy', 'Neutral']","[1, 0, 1, 1, 0, 0, 1, 0]"
2,2.0,0_3_4,0,0,0,1,0,0,1,0,['ah i feel better already now for a brisk wal...,[],['ah i feel better already now for a brisk wal...,['ah i feel better already now for a brisk wal...,"['Happy', 'Neutral']","[0, 0, 0, 1, 0, 0, 1, 0]"
3,3.0,0_35_3,0,0,0,1,1,0,1,0,['poor girl ! no wonder shes shy and retiring !'],[],['poor girl ! no wonder shes shy and retiring !'],['poor girl ! no wonder she is shy and retir...,"['Happy', 'Sad', 'Neutral']","[0, 0, 0, 1, 1, 0, 1, 0]"
4,4.0,1000_16_4,0,1,0,0,0,1,0,1,['gosh guess i tried to civilize these peo a l...,[],['gosh guess i tried to civilize these peo a l...,['gosh guess i tried to civilize these people ...,"['Disgust', 'Surprise', 'Others']","[0, 1, 0, 0, 0, 1, 0, 1]"


In [18]:
train_labels = list(df.one_hot_labels.values)
comments = list(df.text_clean.values)

In order to avoid memory issues with Google Colab, I enforce a max_length of 100 tokens. Note that some sentences may not adequately represent each label because of this.

In [19]:
max_length = 35
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(comments,max_length=max_length,pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [20]:
encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [21]:
train_inputs = encodings['input_ids'] # tokenized and encoded sentences
# train_token_types = encodings['token_type_ids'] # token type ids
train_masks = encodings['attention_mask'] # attention masks

In [22]:
# Use train_test_split to split our data into train and validation sets

# train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids,attention_masks,
#                                                             random_state=2020, test_size=0.10, stratify = labels)

# Add one frequency data to train data
# train_inputs.extend(one_freq_input_ids)
# train_labels.extend(one_freq_labels)
# train_masks.extend(one_freq_attention_masks)
# train_token_types.extend(one_freq_token_types)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
# train_token_types = torch.tensor(train_token_types)

# validation_inputs = torch.tensor(validation_inputs)
# validation_labels = torch.tensor(validation_labels)
# validation_masks = torch.tensor(validation_masks)
# validation_token_types = torch.tensor(validation_token_types)

In [23]:
text_train_data = TensorDataset(train_inputs, train_masks, train_labels)
img_train_data = TensorDataset(torch.from_numpy(X_img_train), train_labels)

batch_size = 16

text_train_loader = DataLoader(text_train_data, batch_size=batch_size)
img_train_loader = DataLoader(img_train_data, batch_size=batch_size)

print(len(text_train_loader), len(img_train_loader))

379 379


## Load Model & Set Params

In [24]:
class CNN_BERT(nn.Module):
  def __init__(self):
    super(CNN_BERT, self).__init__()

    # BERT for the text overview
    self.text_model = RobertaModel.from_pretrained('roberta-base')
    self.dropout = nn.Dropout(0.3)
    self.text_fc = nn.Linear(768,32)

    # CNN for the posters
    self.effnet = EfficientNet.from_pretrained('efficientnet-b2')
    self.effnet_fc = nn.Linear(1000, 32)
    self.n_out = 8
#     self.concat_dropout = nn.Dropout(0.1)
    self.output_fc = nn.Linear(64, self.n_out)


  def forward(self, input_ids, attention_mask, cnn_inp):
    text_outputs = self.text_model(input_ids, attention_mask)
#     text_outputs = text_outputs['last_hidden_state']
    text_outputs = text_outputs[0][:, 0, :]
    text_outputs = self.dropout(text_outputs)
    text_outputs = self.text_fc(text_outputs)
    
    x = self.effnet(cnn_inp)
    x = self.dropout(x)
    cnn_out = F.relu(self.effnet_fc(x))
    combined_inp = torch.cat((cnn_out, text_outputs), 1)
#     out = torch.sigmoid(self.output_fc(self.concat_dropout(combined_inp)))
    out = torch.sigmoid(self.output_fc(combined_inp))

    return out

In [25]:
model = CNN_BERT()
model.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded pretrained weights for efficientnet-b2


CNN_BERT(
  (text_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [26]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01}
]

In [27]:
len(param_optimizer) #366

506

In [28]:
# optimizer_grouped_parameters

In [29]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=1e-5,weight_decay=1e-2)  # Default optimization
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.1, steps_per_epoch=len(text_train_loader), epochs=5,anneal_strategy='linear')

## Train Model

In [30]:
# Store our loss and accuracy for plotting
train_loss_set = []
# val_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
#   val_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0
#   nb_val_steps = 0
  
  # Train the data for one epoch
  for text_batch, img_batch in zip(text_train_loader,img_train_loader):
    # Add batch to GPU
    text_batch = tuple(t.to(device) for t in text_batch)
    img_batch = tuple(t.to(device) for t in img_batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = text_batch
    cnn_inp, cnn_labels = img_batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()


    # Forward pass for multilabel classification
    outputs = model(b_input_ids, b_input_mask,cnn_inp)
#     loss_func = BCEWithLogitsLoss()
    loss_func = BCELoss()
#     loss = loss_func(outputs.view(-1,num_labels),b_labels.type_as(outputs).view(-1,num_labels)) #convert labels to float for calculation
    loss = loss_func(outputs.squeeze(), b_labels.float())
    # loss_func = BCELoss() 
    # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:  33%|███▎      | 1/3 [03:42<07:25, 222.58s/it]

Train loss: 0.5817406849842273


Epoch:  67%|██████▋   | 2/3 [08:11<04:09, 249.95s/it]

Train loss: 0.5564878586886112


Epoch: 100%|██████████| 3/3 [12:58<00:00, 259.48s/it]

Train loss: 0.5336733318884957





In [31]:
torch.save(model.state_dict(), 'roberta_effnet_model_4')

In [32]:
# model = CNN_BERT()
# model.load_state_dict(torch.load('roberta_effnet_model_1'))
# model.cuda()