In [None]:
# Mount Google Drive (Colab only) so the files referenced under
# '/content/drive/My Drive/...' in later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 24.3MB/s eta 0:00:01[K     |▍                               | 20kB 32.5MB/s eta 0:00:01[K     |▋                               | 30kB 22.6MB/s eta 0:00:01[K     |▊                               | 40kB 20.7MB/s eta 0:00:01[K     |█                               | 51kB 21.8MB/s eta 0:00:01[K     |█▏                              | 61kB 16.6MB/s eta 0:00:01[K     |█▎                              | 71kB 17.5MB/s eta 0:00:01[K     |█▌                              | 81kB 18.1MB/s eta 0:00:01[K     |█▊                              | 92kB 16.2MB/s eta 0:00:01[K     |█▉                              | 102kB 17.5MB/s eta 0:00:01[K     |██                              | 112kB 17.5MB/s eta 0:00:01[K     |██▎                             | 

In [None]:
import pandas as pd
import random
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import Parameter
from transformers import BertModel
from transformers import BertConfig
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
import torch.optim as optim
import matplotlib.pyplot as plt

In [None]:
import pickle

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def get_df(file):
    """Read a tab-separated text file into a pandas DataFrame.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file, delimiter='\t')
    return frame

In [None]:
# Load the MediaEval 2016 train and test post tables (tab-separated) from Drive.
train_df = get_df('/content/drive/My Drive/fake_news_dataset/mediaeval2016/train_posts.txt')
test_df = get_df('/content/drive/My Drive/fake_news_dataset/mediaeval2016/test_posts.txt')

## Image Embedding processing

In [None]:
# Binary targets: 1 where the 'label' column equals 'real', 0 for any other value.
y_train = train_df['label'].eq('real').astype(int)
y_test = test_df['label'].eq('real').astype(int)

In [None]:
# Image-id column of each split. Note the column name differs between the two
# tables ('image_id(s)' for train, 'image_id' for test). An entry may contain
# several comma-separated ids; the filtering cells split on ','.
train_img = train_df['image_id(s)']
test_img = test_df['image_id']

In [None]:
# Load the feature file: a dictionary with image_id as key and feature vector as value.
# The feature vectors were created using bottom-up attention.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("/content/drive/My Drive/multi-modal/mediaeval/airsplay/dict_feat.pkl", "rb") as feat_file:
    dict_feat = pickle.load(feat_file)

In [None]:
# Collect the ids of training images whose feature array does not have
# exactly 36 rows (fragments), and report how many there are.
shape_not_match = [key for key in dict_feat if dict_feat[key].shape[0] != 36]
c = len(shape_not_match)
print(c)
print(len(shape_not_match))

43
43


In [None]:
# Keep only the training posts whose image has a feature array with 36 fragments.
# fin_train_img[k] is the selected image id and ids_train[k] is the position of
# the corresponding post in train_img.
bad_shape_train = set(shape_not_match)  # set membership is O(1); the list was scanned per row
fin_train_img = []
ids_train = []
for i in range(len(train_img)) :
  # An entry may hold several comma-separated ids; a single id yields a one-element list.
  candidates = train_img[i].split(',')
  # Take the first id that has features. `in dict_feat` tests the keys directly
  # instead of rebuilding list(dict_feat.keys()) on every iteration.
  main_image = next((cand for cand in candidates if cand in dict_feat), None)

  if main_image is not None and main_image not in bad_shape_train :
    fin_train_img.append(main_image)
    ids_train.append(i)

c = len(fin_train_img)
print(c)
print(len(ids_train))
print(len(fin_train_img))

11789
11789
11789


In [None]:
# Load the test feature file: a dictionary with image_id as key and feature vector as value.
# The feature vectors were created using bottom-up attention.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("/content/drive/My Drive/multi-modal/mediaeval/airsplay/dict_feat_test.pkl", "rb") as feat_file:
    dict_feat_test = pickle.load(feat_file)

In [None]:
# Collect the ids of test images whose feature array does not have
# exactly 36 rows (fragments), and report how many there are.
shape_not_match_test = [key for key in dict_feat_test if dict_feat_test[key].shape[0] != 36]
c = len(shape_not_match_test)
print(c)
print(len(shape_not_match_test))

9
9


In [None]:
# Keep only the test posts whose image has a feature array with 36 fragments.
# fin_test_img[k] is the selected image id and ids_test[k] is the position of
# the corresponding post in test_img.
bad_shape_test = set(shape_not_match_test)  # set membership is O(1); the list was scanned per row
fin_test_img = []
ids_test = []
for i in range(len(test_img)) :
  # An entry may hold several comma-separated ids; a single id yields a one-element list.
  candidates = test_img[i].split(',')
  # Take the first id that has features. `in dict_feat_test` tests the keys directly
  # instead of rebuilding list(dict_feat_test.keys()) on every iteration.
  main_image = next((cand for cand in candidates if cand in dict_feat_test), None)

  if main_image is not None and main_image not in bad_shape_test :
    fin_test_img.append(main_image)
    ids_test.append(i)

c = len(fin_test_img)
print(c)
print(len(ids_test))
print(len(fin_test_img))

794
794
794


In [None]:
# Create the final training tensors: image feature vectors and one-hot labels.
# The sample count is taken from the filtered list rather than hard-coded, so
# the cell keeps working if the dataset or the filtering changes.
num_train = len(fin_train_img)
train_input_img = torch.zeros((num_train,36,2048))
train_output_img = torch.zeros((num_train,2))

for i, (main_image, index) in enumerate(zip(fin_train_img, ids_train)) :
  if (main_image not in dict_feat) :
    # Fallback: read the per-image feature file and cache it in dict_feat.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open("/content/drive/My Drive/multi-modal/mediaeval/airsplay/feat_train/"+main_image+".pkl","rb") as feat_file :
      dict_feat[main_image] = pickle.load(feat_file)
  train_input_img[i] = dict_feat[main_image]
  # One-hot label: column 0 when y_train is 0, column 1 otherwise.
  train_output_img[i, int(y_train[index] != 0)] = 1

In [None]:
# Create the final test tensors: image feature vectors and one-hot labels.
# The sample count is taken from the filtered list rather than hard-coded, so
# the cell keeps working if the dataset or the filtering changes.
num_test = len(fin_test_img)
test_input_img = torch.zeros((num_test,36,2048))
test_output_img = torch.zeros((num_test,2))

for i, (main_image, index) in enumerate(zip(fin_test_img, ids_test)) :
  test_input_img[i] = dict_feat_test[main_image]
  # One-hot label: column 0 when y_test is 0, column 1 otherwise.
  test_output_img[i, int(y_test[index] != 0)] = 1

In [None]:
# Sanity check: the image-feature tensor and the label tensor hold the same number of samples.
print(len(train_input_img))
print(len(train_output_img))

11789
11789


In [None]:
# Code for this cell is from https://github.com/yiling2018/saem/blob/master/bert.py
def gelu(x):
    """Exact (erf-based) GELU activation: ``x * Phi(x)``.

    OpenAI GPT uses a slightly different tanh approximation, which gives
    slightly different results:
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    """
    half_x = x * 0.5
    return half_x * (1.0 + torch.erf(x / math.sqrt(2.0)))

class BERTLayerNorm(nn.Module):
    """Layer normalisation over the last dimension, TF style (epsilon inside the square root)."""

    def __init__(self, config, variance_epsilon=1e-12):
        """Create the learnable scale (gamma) and shift (beta), each of size ``config.hidden_size``."""
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(config.hidden_size))
        self.beta = nn.Parameter(torch.zeros(config.hidden_size))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then apply gamma and beta."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.gamma * normalized + self.beta

class BERTSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, config):
        """Build the query/key/value projections and the attention dropout.

        Raises:
            ValueError: if ``config.hidden_size`` is not divisible by
                ``config.num_attention_heads``.
        """
        super().__init__()
        hidden, heads = config.hidden_size, config.num_attention_heads
        if hidden % heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden, heads))
        self.num_attention_heads = heads
        self.attention_head_size = int(hidden / heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden, self.all_head_size)
        self.key = nn.Linear(hidden, self.all_head_size)
        self.value = nn.Linear(hidden, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into (heads, head_size) and swap axes 1 and 2."""
        per_head = (self.num_attention_heads, self.attention_head_size)
        split = x.view(*(x.size()[:-1] + per_head))
        return split.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Return the attention context, with the heads merged back into the last dimension.

        ``attention_mask`` is added to the raw attention scores before the softmax.
        """
        queries = self.transpose_for_scores(self.query(hidden_states))
        keys = self.transpose_for_scores(self.key(hidden_states))
        values = self.transpose_for_scores(self.value(hidden_states))

        # Raw scores: query . key, scaled by sqrt(head size), plus the additive mask.
        scores = torch.matmul(queries, keys.transpose(-1, -2))
        scores = scores / math.sqrt(self.attention_head_size)
        scores = scores + attention_mask

        # Softmax to probabilities. The dropout removes whole attention links,
        # as in the original Transformer paper.
        probs = self.dropout(torch.softmax(scores, dim=-1))

        context = torch.matmul(probs, values).permute(0, 2, 1, 3).contiguous()
        merged_shape = context.size()[:-2] + (self.all_head_size,)
        return context.view(*merged_shape)


class BERTSelfOutput(nn.Module):
    """Projection, dropout and residual layer norm applied to the attention output."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and layer-normalise."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BERTAttention(nn.Module):
    """Self-attention followed by its residual output block."""

    def __init__(self, config):
        super().__init__()
        self.self = BERTSelfAttention(config)
        self.output = BERTSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        """Attend over ``input_tensor`` and merge the result back with it as the residual."""
        context = self.self(input_tensor, attention_mask)
        return self.output(context, input_tensor)


class BERTIntermediate(nn.Module):
    """Feed-forward expansion from hidden_size to intermediate_size, followed by GELU."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = gelu

    def forward(self, hidden_states):
        """Apply the dense layer, then the activation function."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class BERTOutput(nn.Module):
    """Feed-forward contraction back to hidden_size, with dropout and residual layer norm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BERTLayerNorm(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and layer-normalise."""
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)


class BERTLayer(nn.Module):
    """One transformer encoder layer: attention block followed by the feed-forward block."""

    def __init__(self, config):
        super().__init__()
        self.attention = BERTAttention(config)
        self.intermediate = BERTIntermediate(config)
        self.output = BERTOutput(config)

    def forward(self, hidden_states, attention_mask):
        """Run attention, then the feed-forward block with the attention output as residual."""
        attended = self.attention(hidden_states, attention_mask)
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)

In [None]:
class TransformerMapping(nn.Module):
    """Self-attention layer for the image branch.

    Each 2048-d image fragment is projected to 256-d, one BERTLayer is run over
    the fragments, the result is mean-pooled over the fragment axis and
    L2-normalised, and a single-output linear head is applied.
    """
    def __init__(self):
        super(TransformerMapping, self).__init__()
        # The layer configuration is read from a JSON file in the working directory.
        # NOTE(review): its hidden_size presumably has to be 256 to match self.mapping -- confirm.
        bert_config = BertConfig.from_json_file("t_cfg_mediaeval.json")
        self.layer = BERTLayer(bert_config)
        self.mapping = nn.Linear(2048, 256)
        self.cls_layer = nn.Linear(256,1)

    def forward(self, x):
        """Encode a batch of image fragment features.

        Args:
            x: Tensor whose last dimension is 2048; dim 0 is the batch and dim 1
               the fragment axis.

        Returns:
            Tuple ``(codes, feats)``: the single-logit head output and the
            L2-normalised pooled embedding it was computed from.
        """
        x = self.mapping(x)
        # Build the all-ones mask on the same device as the input. The previous
        # code moved it to CUDA whenever CUDA was available, which breaks when
        # the model/input live on the CPU of a CUDA-capable machine.
        attention_mask = torch.ones(x.size(0), x.size(1), device=x.device)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.float()
        # Additive mask: 0 where attention is allowed, -10000 where it is masked
        # (all positions are allowed here because the mask is all ones).
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        hidden_states = self.layer(x, extended_attention_mask)
        embed = torch.mean(hidden_states, 1)
        feats = F.normalize(embed, p=2, dim=1)
        codes = self.cls_layer(feats)
        return codes, feats

## Text embedding processing

In [None]:
# Raw post text of each split.
x_train = train_df['post_text']
x_test = test_df['post_text']

In [None]:
# Get text only for those ids for which we have valid image feature vectors
x_train_text = []
x_test_text = []

for ind in ids_train :
  x_train_text.append(x_train[ind])

for ind in ids_test :
  x_test_text.append(x_test[ind])

In [None]:
# Sanity check: number of selected train and test texts.
print(len(x_train_text))
print(len(x_test_text))

11789
794


In [None]:
# Tokenizer of the pretrained 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
def get_token_ids(x_train, x_test):
    """Encode train and test sentences into lists of BERT token ids.

    Uses the module-level ``tokenizer``. Special tokens are added and each
    sequence is truncated to at most 512 tokens.

    Args:
        x_train: Iterable of training sentences.
        x_test: Iterable of test sentences.

    Returns:
        Tuple ``(token_tr, token_tst)`` of lists of token-id lists, in input order.
    """
    count = 0  # running total over both splits, used only for progress output

    def encode_all(sentences):
        """Encode every sentence, printing the running total every 1000 items."""
        nonlocal count
        encoded = []
        for sent in sentences:
            # truncation=True makes the max_length cut-off explicit; without it the
            # tokenizer emits a warning and falls back to an implicit strategy.
            encoded.append(tokenizer.encode(sent, add_special_tokens=True,
                                            max_length=512, truncation=True))
            count += 1
            if count % 1000 == 0:
                print(count)
        return encoded

    token_tr = encode_all(x_train)
    token_tst = encode_all(x_test)
    return token_tr, token_tst

In [None]:
# Tokenize the selected train and test texts into lists of token ids.
xtr_token, xtst_token = get_token_ids(x_train_text, x_test_text)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [None]:
# Pad (with id 0) or truncate every token sequence at the end so that all have length 512.
xtr_token = pad_sequences(xtr_token, maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")
xtst_token = pad_sequences(xtst_token, maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")

In [None]:
# Attention masks: 1 for every token id greater than 0, 0 for the padding id 0.
attention_mask_tr = [[int(token_id > 0) for token_id in sent] for sent in xtr_token]
attention_mask_tst = [[int(token_id > 0) for token_id in sent] for sent in xtst_token]

In [None]:
# Convert the padded token ids and their attention masks to torch tensors.
train_input_text = torch.tensor(xtr_token)
test_input_text = torch.tensor(xtst_token)

train_mask = torch.tensor(attention_mask_tr)
test_mask = torch.tensor(attention_mask_tst)

In [None]:
def freeze_layers(model):
    """Disable gradients for every parameter held by the direct children of ``model``.

    Parameters registered directly on ``model`` itself (not inside a child
    module) are left untouched.
    """
    for submodule in model.children():
        submodule.requires_grad_(False)

In [None]:
class BertMapping(nn.Module):
    """Text branch: frozen pretrained BERT followed by a multi-width CNN classifier.

    BERT token embeddings are convolved with kernels spanning 1, 2 and 3 tokens,
    max-pooled over the sequence, mapped to 256-d, L2-normalised and classified
    into two classes with a softmax.
    """
    def __init__(self):
        super(BertMapping, self).__init__()
        # Load the pretrained weights. Constructing BertModel from the config alone
        # leaves BERT randomly initialised, and since it is frozen right after it
        # would never learn anything useful.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        freeze_layers(self.bert)
        bert_config = self.bert.config
        final_dims = 256
        Ks = [1, 2, 3]  # convolution widths, in tokens
        in_channel = 1
        out_channel = 512
        embedding_dim = bert_config.hidden_size
        self.convs1 = nn.ModuleList([nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in Ks])
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.mapping = nn.Linear(len(Ks)*out_channel, final_dims)
        self.cls_layer = nn.Linear(final_dims, 2)

    def forward(self, input_ids, attention_mask):
        """Return two-class probabilities for a batch of token-id sequences.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.

        Returns:
            Tensor of softmax probabilities over 2 classes (softmax along dim 1).
        """
        outputs = self.bert(input_ids,attention_mask=attention_mask, return_dict=True)
        x = outputs.last_hidden_state.unsqueeze(1)  # (batch_size, 1, token_num, embedding_dim)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(batch_size, out_channel, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        output = torch.cat(x, 1)
        output = self.dropout(output)
        code = self.mapping(output)
        feats = F.normalize(code, p=2, dim=1)
        code = self.cls_layer(feats)
        code = F.softmax(code, dim=1)
        return code

## Final model

In [None]:
class FinalModel(nn.Module) :
  """Joint text + image model that outputs a cross-modal weighted log-likelihood.

  For each class the output is
      (1 - p_text)^beta * log(p_img) + (1 - p_img)^beta * log(p_text)
  where the (1 - p)^beta factors are treated as constants (no gradient).
  """
  def __init__(self, beta):
    super(FinalModel, self).__init__()
    self.text_enc_model = BertMapping()
    self.img_enc_model = TransformerMapping()
    self.beta = beta

  def cal_coeff(self, img_prob, text_prob) :
    """Combine per-class probabilities of both modalities.

    Args:
        img_prob: Class probabilities from the image branch.
        text_prob: Class probabilities from the text branch (same shape).

    Returns:
        Tensor of the same shape with the weighted log-probabilities.
    """
    eps = 1e-12  # keeps log() finite when a probability saturates to exactly 0
    logp = torch.log(img_prob.clamp_min(eps))
    logp2 = torch.log(text_prob.clamp_min(eps))
    # detach() is not in-place: its result must be used, otherwise gradients
    # still flow through the weighting factors.
    one_p = (1 - img_prob).detach()
    one_p2 = (1 - text_prob).detach()
    coeff = torch.pow(one_p2,self.beta) * logp + torch.pow(one_p,self.beta) * logp2
    return coeff


  def forward(self, input_ids, attention_mask, img) :
    """Return the combined per-class score for a batch of (text, image) pairs."""
    # The image branch returns (logit, features) with one logit per sample.
    img_logit, _ = self.img_enc_model(img)
    # Turn the single logit into two-class probabilities so that it lines up
    # with the two-column text probabilities and one-hot labels.
    # NOTE(review): this treats the logit as the score of class 1 -- confirm intent.
    p_pos = torch.sigmoid(img_logit)
    img_prob = torch.cat([1.0 - p_pos, p_pos], dim=1)
    text_prob = self.text_enc_model(input_ids, attention_mask)
    return self.cal_coeff(img_prob, text_prob)

## Training code

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

# Create the DataLoader for our training set (batches are drawn in random order).
train_data = TensorDataset(train_input_text,train_mask, train_input_img, train_output_img)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Create the DataLoader for our test set, which is used for validation during
# training. Samples are read sequentially, one per batch.
test_data = TensorDataset(test_input_text,test_mask, test_input_img, test_output_img)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)

In [None]:
from sklearn.metrics import classification_report
def eval_model(model, val_loader) :
  """Evaluate ``model`` on ``val_loader`` without tracking gradients.

  Args:
      model: Module called as ``model(input_ids, mask, img)``.
      val_loader: Iterable of ``(input_ids, mask, img, one_hot_label)`` batches.

  Returns:
      Tuple ``(report, loss_val)``: the ``classification_report`` dict and the
      loss summed over all batches.
  """
  model.eval()
  # Run on whatever device the model lives on instead of assuming CUDA, so the
  # CPU fallback chosen at notebook start keeps working.
  model_device = next(model.parameters()).device
  final_out = []
  final_lab = []
  loss_val = 0
  with torch.no_grad() :
    for val_input, val_mask, val_img, val_label in val_loader:
        val_input = val_input.to(model_device)
        val_mask = val_mask.to(model_device)
        val_img = val_img.to(model_device)
        val_label = val_label.to(model_device)
        coeff = model(val_input, val_mask, val_img)
        loss = -1*torch.mean(torch.sum(coeff*val_label,dim=1))
        loss_val+= float(loss)
        # Predicted / true class = index of the largest entry along dim 1.
        output = torch.argmax(coeff, dim=1).cpu().tolist()
        val_label = torch.argmax(val_label, dim=1).cpu().tolist()
        final_out.extend(output)
        final_lab.extend(val_label)

        # Drop references before emptying the cache so the memory can be released.
        del val_input, val_mask, val_img, val_label, output, coeff, loss
        torch.cuda.empty_cache()

  return classification_report(final_lab, final_out, output_dict=True), loss_val

In [None]:
def train(net, opti, train_loader, num_epochs, val_loader, beta, best_f1_val=0.0):
  """Train ``net`` and evaluate it on ``val_loader`` after every epoch.

  Args:
      net: Module called as ``net(input_ids, mask, img)``.
      opti: Optimizer over the parameters of ``net``.
      train_loader: Iterable of ``(input_ids, mask, img, one_hot_label)`` batches.
      num_epochs: Number of passes over ``train_loader``.
      val_loader: Loader handed to ``eval_model``.
      beta: Only used in the progress output.
      best_f1_val: Macro-F1 a new epoch has to beat to count as the best one.
          Defaults to 0.0 so existing six-argument calls work.

  Returns:
      Tuple ``(loss_train, loss_test, best_epoch)``: per-epoch summed training
      loss, per-epoch evaluation loss, and the zero-based index of the best epoch.
  """
  loss_train = []
  loss_test = []
  best_f1 = best_f1_val
  best_epoch = 0
  # Run on the device the model lives on instead of assuming CUDA.
  run_device = next(net.parameters()).device
  for epoch in range(num_epochs):
    # eval_model() switches the network to eval mode; switch back so dropout is
    # active again in every epoch, not just the first one.
    net.train()
    loss_val = 0
    for it, (text, mask,img, labels) in enumerate(train_loader):
        torch.cuda.empty_cache()
        opti.zero_grad()
        text, mask, img, labels = (t.to(run_device) for t in (text, mask, img, labels))
        coeff = net(text, mask, img)
        # Select the score of the true class through the one-hot labels.
        mul_out = coeff*labels
        sum_out = torch.sum(mul_out, dim=1)
        loss = -1*torch.mean(sum_out)
        loss_val += float(loss.data)
        loss.backward()
        opti.step()

        if (it + 1) % 100 == 0:
            print("Iteration {} of epoch {} complete. Loss : {}".format(it+1, epoch+1, loss.item()))

        # Drop references before emptying the cache so the memory can be released.
        del text, mask, img, labels, loss, mul_out, sum_out, coeff
        torch.cuda.empty_cache()

    print('Epoch [{}/{}], Loss:{:.4f}'.format(epoch+1, num_epochs, loss_val))
    loss_train.append(loss_val)

    # Evaluate after every epoch.
    report, loss_t= eval_model(net, val_loader)
    f1 = report['macro avg']['f1-score']
    loss_test.append(loss_t)
    print("loss_test", loss_t, "beta", beta)
    print("classification_report")
    print(report)
    if (f1>best_f1) :
          print("best_f1_changed from " + str(best_f1) + " to " + str(f1))
          best_f1 = f1
          best_epoch = epoch
    print("--------------------------------------------------------------")
  return loss_train, loss_test, best_epoch

In [None]:
for beta in [0.5] : #Add more beta values in list to fine tune
  torch.cuda.empty_cache()
  net = FinalModel(beta).to(device)
  opti = optim.Adam(net.parameters(), lr = 1e-4)
  # train() requires a starting best-F1 value; the original call omitted it and
  # raised a TypeError. 0.0 lets the first evaluated epoch become the best one.
  loss_train, loss_test, be = train(net, opti, train_dataloader, 15, test_dataloader, beta, best_f1_val=0.0)
  plt.figure()
  plt.plot(loss_train, label="loss_train")
  plt.plot(loss_test, label="loss_test")
  # Vertical marker at the (zero-based) epoch with the best macro-F1.
  plt.plot([be,be], [0, max(loss_train)], label="best epoch")
  plt.xlabel("epoch")
  plt.ylabel("summed loss")
  plt.title("beta = {}".format(beta))
  plt.legend()
  plt.show()