In [None]:
!pip install rouge
!pip install sentence_transformers

In [2]:
import pandas as pd
import numpy as np
import math
from rouge import Rouge
from sentence_transformers import SentenceTransformer,util
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
import requests
from PIL import Image
from io import BytesIO
import json
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [3]:
# load train dataset
train_df= pd.read_csv('/content/drive/MyDrive/data/train_mod.csv',sep='\t')

In [4]:
# load train dataset
val_df= pd.read_csv('/content/drive/MyDrive/data/val_mod.csv',sep='\t')

In [7]:
# load test dataset
test_df = pd.read_csv('/content/drive/MyDrive/data/test_mod.csv', sep='\t')

In [None]:
train_df.head(1)

Unnamed: 0,claim,claim_image,document,document_image,Category,Claim OCR,Document OCR,claim_length,document_length,rouge,text_sim,text_clip_label,img_sim,text_sim_2
0,Delhi: Seventh round of meeting between Centra...,http://pbs.twimg.com/media/Eq4GUZmVoAAiSz4.jpg...,In their seventh attempt to end the logjam ove...,http://pbs.twimg.com/media/Eq4I0YaUcAcXQnO.jpg...,Support_Multimodal,ANI,संजय अग्रवाल\nपीयूष गोयल\nमे\nद्र\nd\nANI,149,7985,0.833333,0.465989,0,0.630884,0.710226


In [5]:
val_df.head(1)

Unnamed: 0,claim,claim_image,document,document_image,Category,Claim OCR,Document OCR,claim_length,document_length,rouge,text_sim,text_sim_2,text_clip_label,img_sim
0,Musician Kodak Black was shot outside of a nig...,https://www.digitalmusicnews.com/wp-content/up...,"On 26 December 2016, the web site Gummy Post p...",https://www.snopes.com/tachyon/2017/01/caution...,Refute,,CAUTION CAUTION CAUTION,81,527,0.923077,0.727481,0.733851,2,0.366395


In [None]:
test_df.head(1)

Unnamed: 0,claim,claim_image,document,document_image,Category,Claim OCR,Document OCR,claim_length,document_length,rouge,text_sim,text_clip_label,img_sim,text_sim_2
0,Delhi: Group of students protest outside offic...,http://pbs.twimg.com/media/EOJMHR4VUAALSeG.jpg...,Hundreds of Jamia Millia Islamia students prot...,http://pbs.twimg.com/media/EOJ4Q47UcAAq6wy.jpg...,Insufficient_Multimodal,ANI,,230,1778,0.615385,0.712563,0,0.619836,0.765613


In [None]:
""" Note here that the model originally used by INO is in the variable
 text_model_2, which has to do with the fact that it is never mentioned by INO,
 and we only found out later in the replication process which variant it was.
 It was easier to use this contraintuitive notation than to rewrite the
 whole code.
"""

text_model = SentenceTransformer('all-MiniLM-L6-v2')
text_model_2 = SentenceTransformer('paraphrase-MiniLM-L6-v2')
clip = SentenceTransformer('clip-ViT-B-32-multilingual-v1')
image_model = ResNet50(weights='imagenet', include_top=False)
rouge = Rouge()

Add text and image features to the datasets

In [None]:
# length feature
train_df['claim_length'] = train_df['claim'].str.len()
train_df['document_length'] = train_df['document'].str.len()

In [None]:
val_df['claim_length'] = val_df['claim'].str.len()
val_df['document_length'] = val_df['document'].str.len()

In [None]:
test_df['claim_length'] = test_df['claim'].str.len()
test_df['document_length'] = test_df['document'].str.len()

In [None]:
# ROUGE feature
def get_rouge(df):
  rouge1 = []
  for n, row in tqdm(df.iterrows(), total=len(df)):
    r = rouge.get_scores(row['document'], row['claim'])
    rouge1.append(r[0]['rouge-1']['r'])
  return rouge1

In [None]:
train_df['rouge'] = get_rouge(train_df)

In [None]:
val_df['rouge'] = get_rouge(val_df)

In [None]:
test_df['rouge'] = get_rouge(test_df)

In [None]:
# SBERT feature
def get_text_sim(df, model):
  chunk = 1024
  sim = []
  for i in tqdm(range(0, len(df), chunk), total = math.ceil(len(df)/chunk)):
    matrix = util.cos_sim(model.encode(df['claim'][i:i+chunk].to_numpy()),
                          model.encode(df['document'][i:i+chunk].to_numpy()))
    for row in [round(matrix[j, j].item(), 6) for j in range(len(matrix))]:
      sim.append(row)
  return sim

In [None]:
train_df['text_sim'] = get_text_sim(train_df, text_model)

In [None]:
train_df['text_sim_2'] = get_text_sim(train_df, text_model_2)

In [None]:
val_df['text_sim'] = get_text_sim(val_df, text_model)

In [None]:
val_df['text_sim_2'] = get_text_sim(val_df, text_model_2)

In [None]:
test_df['text_sim'] = get_text_sim(test_df, text_model)

In [None]:
test_df['text_sim_2'] = get_text_sim(test_df, text_model_2)

In [None]:
# CLIP Module
def get_embeddings(df):
  return np.hstack([clip.encode(df['claim']), clip.encode(df['document'])])

In [None]:
# X_train = get_embeddings(train_df)
X_train = pd.read_csv('/content/drive/MyDrive/data/clip_features.csv', header=None)
category = {
     'Support_Multimodal': 0,
     'Support_Text': 0,
     'Insufficient_Multimodal': 1,
     'Insufficient_Text': 1,
     'Refute': 2
 }
y = train_df['Category'].map(category)

In [None]:
mlp = MLPClassifier(hidden_layer_sizes=(100,), solver='adam', max_iter=20, verbose=True)
mlp.fit(X_train, y)

Iteration 1, loss = 0.60954107
Iteration 2, loss = 0.50490811
Iteration 3, loss = 0.48264657
Iteration 4, loss = 0.46822462
Iteration 5, loss = 0.45318851
Iteration 6, loss = 0.44719488
Iteration 7, loss = 0.43308367
Iteration 8, loss = 0.41810177
Iteration 9, loss = 0.41198572
Iteration 10, loss = 0.40209756
Iteration 11, loss = 0.38951919
Iteration 12, loss = 0.38016364
Iteration 13, loss = 0.36828233
Iteration 14, loss = 0.36069894
Iteration 15, loss = 0.35144411
Iteration 16, loss = 0.34180368
Iteration 17, loss = 0.33592973
Iteration 18, loss = 0.33082569
Iteration 19, loss = 0.32537413
Iteration 20, loss = 0.31372623




In [None]:
pred = mlp.predict(X_train)
train_df['text_clip_label'] = pred

In [None]:
X_val = get_embeddings(val_df)
pred = mlp.predict(X_val)
val_df['text_clip_label'] = pred

In [None]:
X_test = get_embeddings(test_df)
pred = mlp.predict(X_test)
test_df['text_clip_label'] = pred

In [None]:
# RESNET50
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

def download_images(df, path):
  for n, row in tqdm(df.iterrows(), total=len(df)):
    try:
      response = requests.get(row['claim_image'], headers=headers)
      response.raise_for_status()
      image = Image.open(BytesIO(response.content))
      image = image.convert('RGB')

      response = requests.get(row['document_image'], headers=headers)
      response.raise_for_status()
      image1 = Image.open(BytesIO(response.content))
      image1 = image1.convert('RGB')
    except:
      continue

    image1.save(path + 'document/' + str(int(n/1000)*1000) + '/document_img_' + str(n) + '.jpg')
    image.save(path + 'claim/' + str(int(n/1000)*1000) + '/claim_img_' + str(n) + '.jpg')


In [None]:
download_images(train_df, '/content/drive/MyDrive/data/images/train/')

In [None]:
download_images(val_df, '/content/drive/MyDrive/data/images/val/')

In [None]:
download_images(test_df, '/content/drive/MyDrive/data/images/test/')

In [None]:
def get_img_sim(size, path, dump_file_path):
  with open(dump_file_path, 'a') as sim:
    for n in tqdm(range(size), total=size, initial=0):
      try:
        p = path + 'claim/' + str(int(n/1000)*1000) + '/claim_img_' + str(n) + '.jpg'
        img = image.load_img(p, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        claim_pred = image_model.predict(img_array, verbose=0)

        p = path + 'document/' + str(int(n/1000)*1000) + '/document_img_' + str(n) + '.jpg'
        img = image.load_img(p, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        document_pred = image_model.predict(img_array, verbose=0)

        sim.write(str(n) + ', ' + str(cosine_similarity(np.mean(claim_pred, axis=(1,2)), np.mean(document_pred, axis=(1, 2)))[0][0]) + '\n')
        if n % 100 == 0:
          sim.flush()
      except:
        sim.write(str(n) + ', ' + str(0) + '\n')

In [None]:
get_img_sim(35000, '/content/drive/MyDrive/data/images/train/', '/content/drive/MyDrive/data/img_sim_train.csv')

In [None]:
img_sim_train_df = pd.read_csv('/content/drive/MyDrive/data/img_sim_train.csv', sep=',', header=None)
train_df['img_sim'] = img_sim_train_df[1]

In [None]:
get_img_sim(7500, '/content/drive/MyDrive/data/images/val/', '/content/drive/MyDrive/data/img_sim_val.csv')

In [21]:
img_sim_val_df = pd.read_csv('/content/drive/MyDrive/data/img_sim_val.csv', sep=',', header=None)
val_df['img_sim'] = img_sim_val_df[1]

In [None]:
get_img_sim(7500, '/content/drive/MyDrive/data/images/test/', '/content/drive/MyDrive/data/img_sim_test.csv')

In [None]:
img_sim_test_df = pd.read_csv('/content/drive/MyDrive/data/img_sim_test.csv', sep=',', header=None)
test_df['img_sim'] = img_sim_test_df[1]

Now that we have all the features, we save the datasets for future use and proceed with training the RandomForest classifier

In [None]:
train_df.to_csv('/content/drive/MyDrive/data/train_mod.csv',sep='\t',index=False,header=True)

In [29]:
val_df.to_csv('/content/drive/MyDrive/data/val_mod.csv',sep='\t',index=False,header=True)

In [None]:
test_df.to_csv('/content/drive/MyDrive/data/test_mod.csv',sep='\t',index=False,header=True)













...

In [8]:
# filter out samples that we couldn't download images for

no_image_train = train_df[train_df['img_sim'] == 0]
merged = train_df.merge(no_image_train, how='outer', indicator=True)
X_train = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')
print(X_train.shape)

no_image_val = val_df[val_df['img_sim'] == 0]
merged = val_df.merge(no_image_val, how='outer', indicator=True)
X_val = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')
print(X_val.shape)

no_image_test = test_df[test_df['img_sim'] == 0]
merged = test_df.merge(no_image_test, how='outer', indicator=True)
X_test = merged[merged['_merge'] == 'left_only'].drop(columns='_merge')
print(X_test.shape)

(34838, 14)
(7432, 14)
(7419, 14)


In [9]:
X_train_2 = X_train[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim']]
X_val_2 = X_val[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim']]
X_test_2 = X_test[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim']]

In [10]:
y_train = X_train['Category']
X_train = X_train[['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim']]
y_val = X_val['Category']
X_val = X_val[['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim']]
y_test = X_test['Category']
X_test = X_test[['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim']]

In [11]:
sc=StandardScaler()
sc.fit(X_train)
X_train = pd.DataFrame(sc.transform(X_train), columns=['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim'])
X_val = pd.DataFrame(sc.transform(X_val), columns=['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim'])
X_test = pd.DataFrame(sc.transform(X_test), columns=['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label', 'img_sim'])

In [12]:
sc.fit(X_train_2)
X_train_2 = pd.DataFrame(sc.transform(X_train_2), columns=['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim'])
X_val_2 = pd.DataFrame(sc.transform(X_val_2), columns=['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim'])
X_test_2 = pd.DataFrame(sc.transform(X_test_2), columns=['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label', 'img_sim'])

In [None]:
clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)
print(f1_score(y_test, predictions, average='weighted'))
clf.fit(X_train_2,y_train)
predictions = clf.predict(X_test_2)
print(f1_score(y_test, predictions, average='weighted'))


0.7726915738976211
0.7627738185365759


Ablation experiments:

| Model Name           | Validation F1 score |
|----------------------|---------------------|
| Without Sentence BERT| 0.7926              |
| Without CLIP         | 0.7911              |
| Without ROUGE+length | 0.7709              |
| Without ResNet50     | 0.6007              |
| Baseline             | 0.6664              |
| Final model          | 0.8078              |

In [14]:
# without sentence BERT
X_train_abl1 = X_train[['claim_length', 'document_length', 'rouge', 'text_clip_label', 'img_sim']].copy()
X_val_abl1 = X_val[['claim_length', 'document_length', 'rouge', 'text_clip_label', 'img_sim']].copy()

clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf = clf.fit(X_train_abl1,y_train)
predictions = clf.predict(X_val_abl1)
score = f1_score(y_val, predictions, average='weighted')
print(score)

0.7601965781896


In [15]:
# without CLIP
X_train_abl2 = X_train[['claim_length', 'document_length', 'rouge', 'text_sim', 'img_sim']]
X_val_abl2 = X_val[['claim_length', 'document_length', 'rouge', 'text_sim', 'img_sim']]

X_train_abl2_2 = X_train_2[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'img_sim']]
X_val_abl2_2 = X_val_2[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'img_sim']]

clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf.fit(X_train_abl2,y_train)
predictions = clf.predict(X_val_abl2)
score = f1_score(y_val, predictions, average='weighted')
print('all-MiniLM-L6-v2', score)

clf.fit(X_train_abl2_2,y_train)
predictions = clf.predict(X_val_abl2_2)
score = f1_score(y_val, predictions, average='weighted')
print('paraphrase-MiniLM-L6-v2', score)

all-MiniLM-L6-v2 0.7058051346686504
paraphrase-MiniLM-L6-v2 0.7166043399726676


In [16]:
# without rouge + length
X_train_abl3 = X_train[['text_sim', 'text_clip_label', 'img_sim']]
X_val_abl3 = X_val[['text_sim', 'text_clip_label', 'img_sim']]

X_train_abl3_2 = X_train_2[['text_sim_2', 'text_clip_label', 'img_sim']]
X_val_abl3_2 = X_val_2[['text_sim_2', 'text_clip_label', 'img_sim']]

clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf.fit(X_train_abl3,y_train)
predictions = clf.predict(X_val_abl3)
score = f1_score(y_val, predictions, average='weighted')
print('all-MiniLM-L6-v2', score)

clf.fit(X_train_abl3_2,y_train)
predictions = clf.predict(X_val_abl3_2)
score = f1_score(y_val, predictions, average='weighted')
print('paraphrase-MiniLM-L6-v2', score)

all-MiniLM-L6-v2 0.7445958489286475
paraphrase-MiniLM-L6-v2 0.7425484442789415


In [17]:
# without ResNet50
X_train_abl4 = X_train[['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label']]
X_val_abl4 = X_val[['claim_length', 'document_length', 'rouge', 'text_sim', 'text_clip_label']]

X_train_abl4_2 = X_train_2[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label']]
X_val_abl4_2 = X_val_2[['claim_length', 'document_length', 'rouge', 'text_sim_2', 'text_clip_label']]

clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf.fit(X_train_abl4,y_train)
predictions = clf.predict(X_val_abl4)
score = f1_score(y_val, predictions, average='weighted')
print('all-MiniLM-L6-v2', score)

clf.fit(X_train_abl4_2,y_train)
predictions = clf.predict(X_val_abl4_2)
score = f1_score(y_val, predictions, average='weighted')
print('paraphrase-MiniLM-L6-v2', score)

all-MiniLM-L6-v2 0.5961898044689335
paraphrase-MiniLM-L6-v2 0.5832171360343918


In [18]:
# baseline (just SBERT + ResNet50)
X_train_abl5 = X_train[['text_sim', 'img_sim']]
X_val_abl5 = X_val[['text_sim', 'img_sim']]

X_train_abl5_2 = X_train_2[['text_sim_2', 'img_sim']]
X_val_abl5_2 = X_val_2[['text_sim_2', 'img_sim']]

clf = RandomForestClassifier(n_estimators=500,max_depth=40, random_state=16)
clf.fit(X_train_abl5,y_train)
predictions = clf.predict(X_val_abl5)
score = f1_score(y_val, predictions, average='weighted')
print('all-MiniLM-L6-v2', score)

clf.fit(X_train_abl5_2,y_train)
predictions = clf.predict(X_val_abl5_2)
score = f1_score(y_val, predictions, average='weighted')
print('paraphrase-MiniLM-L6-v2', score)

0.48612791537887196
0.4742488309415902
