In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells read data from and
# write checkpoints to "./drive/My Drive/". Mounting asks for an authorization code.
from google.colab import drive

drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.6MB/s 
[?25hCollecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 12.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 24.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

In [None]:
import os
import sys
import codecs
import random
import numpy
import itertools
import torch
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import Dataset, IterableDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import *

In [None]:
# Pick the compute device once; later cells move tensors and the model to `device`.
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ready else "cpu")
if not _cuda_ready:
	print('No GPU available, using the CPU instead.')
else:
	print('There are %d GPU(s) available.' % torch.cuda.device_count())
	print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Experiment configuration. The trailing comments list the alternative value of each switch.
MODEL_NAME = 'BERT'
MODEL_SAVING_POLICY = "acc"  # 'loss'
LOSS_FN = 'nw'  # 'w'
OPTIM = 'adam'  # 'adamw'
L2_REGULARIZER = 'n'  # 'y'
USE_DROPOUT = 'n'  # 'y'
TREE_VERSION = "old"  # 'new'
NO_OF_EVENTS = 4
gpu_id = 0

# Echo the settings so they are recorded with the run's output.
for _label, _value in [
    ('MODEL_NAME', MODEL_NAME),
    ('MODEL_SAVING_POLICY', MODEL_SAVING_POLICY),
    ('LOSS_FN', LOSS_FN),
    ('OPTIM', OPTIM),
    ('L2_REGULARIZER', L2_REGULARIZER),
    ('USE_DROPOUT', USE_DROPOUT),
    ('TREE_VERSION', TREE_VERSION),
    ('NO_OF_EVENTS', NO_OF_EVENTS),
]:
    print(f'{_label} = {_value}')
print(f'GPU_ID = {gpu_id}\n\n')

MODEL_NAME = BERT
MODEL_SAVING_POLICY = acc
LOSS_FN = nw
OPTIM = adam
L2_REGULARIZER = n
USE_DROPOUT = n
TREE_VERSION = old
NO_OF_EVENTS = 4
GPU_ID = 0




In [None]:
class TreeDataset(Dataset):
	'''Dataset wrapper that exposes an indexable collection of trees unchanged.'''

	def __init__(self, data):
		# Keep a reference to the caller's collection; nothing is copied.
		self.data = data

	def __getitem__(self, idx):
		'''Return the entry stored at position idx.'''
		item = self.data[idx]
		return item

	def __len__(self):
		'''Return how many entries the wrapped collection holds.'''
		size = len(self.data)
		return size


def _label_node_index(node, n=0):
	node['index'] = n
	for child in node['c']:
		n += 1
		_label_node_index(child, n)


def _gather_node_attributes(node, key):
	features = [node[key]]
	for child in node['c']:
		features.extend(_gather_node_attributes(child, key))
	return features


def _gather_adjacency_list(node):
	adjacency_list = []
	for child in node['c']:
		adjacency_list.append([node['index'], child['index']])
		adjacency_list.extend(_gather_adjacency_list(child))

	return adjacency_list


def convert_tree_to_tensors(tree, summ_gt, cont_gt, tweet_id, device=device):
	'''Flatten one tree dict into the dict of tensors consumed by batch_tree_input / TreeLSTM.

	tree: nested dict; each node has 'f', 'a', 'k', 'l' entries and a 'c' list of children.
		NOTE(review): 'f'/'a' look like token ids and attention mask, 'k' like extra
		per-node features - confirm against the tree files.
	summ_gt: summary label; 1 becomes the one-hot [0, 1], anything else [1, 0].
	cont_gt: content class id, stored as a one-element long tensor.
	tweet_id: stored as an int64 scalar tensor.
	device: accepted but not used; every tensor is created on the default device.

	Side effect: writes an 'index' entry into every node of `tree`.
	'''
	# Node indexes must be assigned before the adjacency list can be read off the tree.
	_label_node_index(tree)

	def gather(key):
		return _gather_node_attributes(tree, key)

	features = gather('f')
	labels = gather('l')
	edges = _gather_adjacency_list(tree)
	node_order, edge_order = calculate_evaluation_orders(edges, len(features))

	summary_one_hot = [[0, 1]] if summ_gt == 1 else [[1, 0]]

	return {
		'f': torch.tensor(features, dtype=torch.long),
		'a': torch.tensor(gather('a'), dtype=torch.float32),
		'k': torch.tensor(gather('k'), dtype=torch.float32),
		's_gt': torch.tensor(summary_one_hot, dtype=torch.float32),
		'c_gt': torch.tensor([cont_gt], dtype=torch.long),
		'l': torch.tensor(labels, dtype=torch.float32),
		# The root is the first node of the pre-order walk, so its label is labels[0].
		'root_l': torch.tensor([labels[0]], dtype=torch.long),
		'root_n': torch.tensor([0], dtype=torch.int64),
		'node_order': torch.tensor(node_order, dtype=torch.int64),
		'adjacency_list': torch.tensor(edges, dtype=torch.int64),
		'edge_order': torch.tensor(edge_order, dtype=torch.int64),
		'tweet_id': torch.tensor(tweet_id, dtype=torch.int64),
	}


def calculate_evaluation_orders(adjacency_list, tree_size):
	'''Compute the evaluation step of every node and edge of a tree.

	A node is evaluated at step 0 if it has no children, otherwise one step after
	the last of its children. An edge inherits the step of its parent node.

	adjacency_list: sequence of [parent_index, child_index] pairs.
	tree_size: number of nodes in the tree.
	Returns (node_order, edge_order) as numpy int arrays, or ([0], []) when the
	tree has no edges.
	'''
	edges = numpy.array(adjacency_list)
	if len(edges) == 0:
		return [0], []

	parents, children = edges[:, 0], edges[:, 1]
	all_nodes = numpy.arange(tree_size, dtype=int)
	node_order = numpy.zeros(tree_size, dtype=int)
	pending = numpy.ones(tree_size, dtype=bool)

	step = 0
	while pending.any():
		# Parents that still have at least one unevaluated child must wait.
		blocked_parents = parents[pending[children]]
		ready = pending & ~numpy.isin(all_nodes, blocked_parents)

		node_order[ready] = step
		pending[ready] = False
		step += 1

	return node_order, node_order[parents]


def batch_tree_input(batch):
	'''Merge a list of per-tree tensor dicts into one batched dict for the TreeLSTM model.

	batch - list of dicts with keys 'f', 'a', 'k', 's_gt', 'c_gt', 'l', 'root_l',
	        'node_order', 'edge_order', 'adjacency_list'
	returns a dict with keys 'f', 'a', 'k', 's_gt', 'c_gt', 'node_order', 'edge_order',
	        'adjacency_list', 'tree_sizes', 'root_node', 'root_label', 'l'

	Per-node tensors are concatenated along dim 0; adjacency lists are shifted so that
	their indexes point into the concatenated node dimension.
	'''
	def _cat(key):
		return torch.cat([tree[key] for tree in batch])

	tree_sizes = [tree['f'].shape[0] for tree in batch]

	# Position of each tree's root (its first node) inside the concatenated batch.
	root_positions = []
	position = 0
	for tree in batch:
		root_positions.append(position)
		position += len(tree['node_order'])

	# Shift each tree's edges by the number of nodes that precede it in the batch.
	shifted_edges = []
	shift = 0
	for size, tree in zip(tree_sizes, batch):
		shifted_edges.append(tree['adjacency_list'] + shift)
		shift += size

	return {
		'f': _cat('f'),
		'a': _cat('a'),
		'k': _cat('k'),
		's_gt': _cat('s_gt'),
		'c_gt': _cat('c_gt'),
		'node_order': _cat('node_order'),
		'edge_order': _cat('edge_order'),
		'adjacency_list': torch.cat(shifted_edges),
		'tree_sizes': tree_sizes,
		'root_node': torch.tensor(root_positions, dtype=torch.int64),
		'root_label': _cat('root_l'),
		'l': _cat('l'),
	}


def unbatch_tree_tensor(tensor, tree_sizes):
	'''Convenience function that cuts a batched tree tensor back into one tensor per tree.

	tree_sizes holds the node count of each tree, in batch order; the counts must add
	up to the size of tensor's zeroth dimension.
	'''
	per_tree = torch.split(tensor, tree_sizes, dim=0)
	return per_tree

In [None]:
class TreeLSTM(torch.nn.Module):
	'''Pretrained transformer encoder followed by a batched TreeLSTM and three task heads.

	Every tree node is encoded by the transformer, concatenated with extra per-node
	features and propagated bottom-up through a TreeLSTM that sums the states of a
	node's children. The root's hidden state feeds the verification head; the root's
	input vector feeds the summarization and content-classification heads.
	'''
	def __init__(self, model_name, trainable_layers, in_features, out_features, classifier_dropout, mode='cls'):
		'''TreeLSTM class initializer

		model_name: 'BERT', 'ROBERTA', 'XLNET' or 'T5'; any other value falls back to BERT.
		trainable_layers: currently unused (layer freezing is disabled, see below).
		in_features: width of the per-node vector fed to the TreeLSTM and the heads, i.e.
			encoder output width plus the width of the extra features given to forward().
		out_features: hidden size of the TreeLSTM and of the heads' first layer.
		classifier_dropout: dropout probability for the verification head; it is only
			applied when the global USE_DROPOUT equals 'y'.
		mode: pooling of the encoder output, 'cls' (first token) or 'avg' (masked mean).
		'''
		super().__init__()
		print("model intialising...")
		self.in_features = in_features
		self.out_features = out_features
		self.mode = mode
		self.model_name = model_name

		# ENCODER
		if model_name == 'BERT':
			self.BERT_model = BertModel.from_pretrained("bert-base-cased")
		elif model_name == 'ROBERTA':
			self.BERT_model = RobertaModel.from_pretrained("roberta-base")
		elif model_name == 'XLNET':
			self.BERT_model = XLNetModel.from_pretrained("xlnet-base-cased")
		elif model_name == 'T5':
			self.BERT_model = T5Model.from_pretrained("t5-base")
		else:
			# Default BERT
			self.BERT_model = BertModel.from_pretrained("bert-base-cased")

		# Layer freezing is disabled, so `trainable_layers` is not consulted and no encoder
		# parameter is frozen here. The disabled logic kept the encoder layers listed in
		# trainable_layers (plus pooler/embedding parameters) trainable and froze the rest.

		# DECODER
		# VERIFICATION
		self.W_iou = torch.nn.Linear(self.in_features, 3 * self.out_features)
		self.U_iou = torch.nn.Linear(self.out_features, 3 * self.out_features, bias=False)
		# f terms are maintained seperate from the iou terms because they involve sums over child nodes
		# while the iou terms do not
		self.W_f = torch.nn.Linear(self.in_features, self.out_features)
		self.U_f = torch.nn.Linear(self.out_features, self.out_features, bias=False)
		self.fc = torch.nn.Linear(self.out_features, 1)
		# self.bert_dropout = torch.nn.Dropout(bert_dropout)
		self.classifier_dropout = torch.nn.Dropout(classifier_dropout)
		# self.init_weights()

		# SUMMARIZATION
		self.summ_fc1 = torch.nn.Linear(self.in_features, self.out_features)
		self.summ_fc2 = torch.nn.Linear(self.out_features, 2)

		# CONTENT-CLASSIFICATION
		self.cont_fc1 = torch.nn.Linear(self.in_features, self.out_features)
		self.cont_fc2 = torch.nn.Linear(self.out_features, 4)

	def init_weights(self):
		'''Xavier-initialise every parameter whose name contains neither "BERT" nor "bias".

		Not called by __init__ (the call there is commented out).
		'''
		for name, param in self.named_parameters():
			if "BERT" in name or "bias" in name:
				continue
			else:
				torch.nn.init.xavier_uniform_(param)


	def forward(self, features, attentions, old_features, node_order, adjacency_list, edge_order, root_node):
		'''Run the encoder, the TreeLSTM and the three heads on a batch of trees.

		features / attentions: encoder input ids and attention mask, one row per node
			(NOTE(review): presumably shaped (num_nodes, seq_len) - confirm).
		old_features: extra per-node features concatenated to the pooled encoder output.
		node_order, adjacency_list, edge_order: evaluation schedule and edges as produced
			by calculate_evaluation_orders / batch_tree_input.
		root_node: indexes of the root node of every tree in the batch.

		Returns (h, logits_out, c, s_x, c_x): hidden and memory state of every node, the
		verification logit of each root, the softmax summary probabilities and the raw
		content-classification logits of each root.
		Raises ValueError when self.mode is neither 'cls' nor 'avg'.
		'''
		# Fail before the expensive encoder pass instead of with an unbound local afterwards.
		if self.mode not in ('cls', 'avg'):
			raise ValueError(f"Unsupported pooling mode {self.mode!r}; expected 'cls' or 'avg'")

		# Total number of nodes in every tree in the batch
		batch_size = node_order.shape[0]

		# Retrive device the model is currently loaded on to generate h and c result buffers
		device = next(self.parameters()).device

		# h and c states for every node in the batch
		# h - hidden state
		# c - memory state
		h = torch.zeros(batch_size, self.out_features, device=device)
		c = torch.zeros(batch_size, self.out_features, device=device)

		# Index 0 is the sequence output for every supported encoder; indexing (rather than
		# unpacking a 2-tuple) also works when the encoder returns more or fewer extras.
		hidden_states = self.BERT_model(input_ids=features, attention_mask=attentions)[0]

		if self.mode == "cls":
			output_vectors = hidden_states[:, 0]
		else:
			# Masked mean over the sequence dimension.
			input_mask_expanded = attentions.unsqueeze(-1).expand(hidden_states.size()).float()
			sum_embeddings = torch.sum(hidden_states * input_mask_expanded, 1)
			# Clamp so a row with an all-zero mask does not divide by zero.
			sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
			output_vectors = sum_embeddings / sum_mask

		output_vectors = torch.cat([output_vectors, old_features], dim=1)
		# output_vectors = self.bert_dropout(output_vectors)

		# DECODER
		# VERIFICATION
		# Evaluate level by level: leaves first (order 0), roots last.
		for n in range(int(node_order.max()) + 1):
			self._run_lstm(n, h, c, output_vectors, node_order, adjacency_list, edge_order)
		h_root = h[root_node, :]
		if USE_DROPOUT == 'y':
			h_root = self.classifier_dropout(h_root)
		logits_out = self.fc(h_root)
		# pred_out = F.log_softmax(logits_out, dim = 1)
		# pred_out = F.softmax(logits_out, dim = 1)

		# SUMMARIZATION
		s_x = output_vectors[root_node, :]
		s_x = self.summ_fc1(s_x)
		s_x = self.summ_fc2(s_x)
		s_x = torch.nn.functional.softmax(s_x, dim=1)

		# CONTENT-CLASSIFICATION
		c_x = output_vectors[root_node, :]
		c_x = self.cont_fc1(c_x)
		c_x = self.cont_fc2(c_x)

		return h, logits_out, c, s_x, c_x


	def _run_lstm(self, iteration, h, c, features, node_order, adjacency_list, edge_order):
		'''Helper function to evaluate all tree nodes currently able to be evaluated.

		Updates, in place, the rows of h and c that belong to nodes whose node_order
		equals `iteration`. Iteration 0 handles nodes without children; later iterations
		combine the already computed states of each node's children.
		'''
		node_mask = node_order == iteration

		# edge_mask selects the edges whose parent is evaluated in this iteration
		edge_mask = edge_order == iteration

		x = features[node_mask, :]
		if iteration == 0:
			iou = self.W_iou(x)
		else:
			# adjacency_list is a tensor of size e x 2
			adjacency_list = adjacency_list[edge_mask, :]

			parent_indexes = adjacency_list[:, 0]
			child_indexes = adjacency_list[:, 1]

			# child_h and child_c hold one row per selected edge
			child_h = h[child_indexes, :]
			child_c = c[child_indexes, :]

			# Group the child rows by parent and sum each group.
			# NOTE(review): relies on the selected edges of one parent being adjacent and on
			# parents appearing in the same order as their rows in `x` - confirm for new data.
			_, child_counts = torch.unique_consecutive(parent_indexes, return_counts=True)
			# torch.split expects plain ints for the section sizes.
			child_counts = child_counts.tolist()
			parent_children = torch.split(child_h, child_counts)
			parent_list = [item.sum(0) for item in parent_children]

			h_sum = torch.stack(parent_list)
			iou = self.W_iou(x) + self.U_iou(h_sum)


		# i, o and u are tensors of size n x M
		i, o, u = torch.split(iou, iou.size(1) // 3, dim=1)
		i = torch.sigmoid(i)
		o = torch.sigmoid(o)
		u = torch.tanh(u)

		if iteration == 0:
			c[node_mask, :] = i * u
		else:
			# f is a tensor of size e x M: one forget gate per (parent, child) edge
			f = self.W_f(features[parent_indexes, :]) + self.U_f(child_h)
			f = torch.sigmoid(f)
			# fc is a tensor of size e x M
			fc = f * child_c
			# Add the gated child memory cells to the parent's memory cell state
			parent_children = torch.split(fc, child_counts)
			parent_list = [item.sum(0) for item in parent_children]

			c_sum = torch.stack(parent_list)
			c[node_mask, :] = i * u + c_sum

		h[node_mask, :] = o * torch.tanh(c[node_mask])


In [None]:
def save_model(model, name, val_acc=0, val_loss=1):
	'''Write a checkpoint with the model weights, optimizer state and validation metrics.

	model: module whose state_dict is stored under 'model'.
	name: destination handed to torch.save.
	val_acc / val_loss: metrics stored next to the weights.
	The optimizer state comes from the module-level `optimizer`, which must exist.
	'''
	checkpoint = dict(
		model=model.state_dict(),
		optimizer=optimizer.state_dict(),
		val_acc=val_acc,
		val_loss=val_loss,
	)
	torch.save(checkpoint, name)


def load_model(model, name):
	'''Restore a checkpoint written by save_model and report its validation metrics.

	Loads the stored weights into `model` and the stored optimizer state into the
	module-level `optimizer`. Returns the stored validation accuracy (None if absent).
	'''
	checkpoint = torch.load(name)
	model.load_state_dict(checkpoint['model'])
	optimizer.load_state_dict(checkpoint['optimizer'])
	saved_acc = checkpoint.get('val_acc')
	saved_loss = checkpoint.get('val_loss')
	print('Validation accuracy of the model is ', saved_acc)
	print('Validation loss of the model is ', saved_loss)
	return saved_acc

In [None]:
def split_data_verification(trees, frac):
	'''Stratified train/validation split on the verification (root) label.

	Trees whose 'root_l' equals [[0, 1]] form the positive group, all others the
	negative group. The first `frac` share of each group (rounded down) goes to
	validation, the rest to training; both result lists are shuffled.
	Returns (train_li, val_li).
	'''
	groups = {True: [], False: []}
	for tree in trees:
		groups[tree['root_l'].tolist() == [[0, 1]]].append(tree)
	pos_data, neg_data = groups[True], groups[False]

	pos_cut = int(frac * len(pos_data))
	neg_cut = int(frac * len(neg_data))

	val_li = pos_data[:pos_cut] + neg_data[:neg_cut]
	train_li = pos_data[pos_cut:] + neg_data[neg_cut:]
	# Validation is shuffled before training, so the random stream is consumed in a fixed order.
	random.shuffle(val_li)
	random.shuffle(train_li)
	return train_li, val_li

In [None]:
def split_data_summary(trees,frac):
    '''Stratified train/validation split on the summary label.

    Trees whose 's_gt' equals [[0, 1]] form the positive group, all others the
    negative group. The first `frac` share of each group (rounded down) goes to
    validation, the rest to training; both result lists are shuffled.
    Returns (train_li, val_li).
    '''
    groups = {True: [], False: []}
    for tree in trees:
        groups[tree['s_gt'].tolist() == [[0,1]]].append(tree)
    pos_data, neg_data = groups[True], groups[False]

    pos_cut = int(frac*len(pos_data))
    neg_cut = int(frac*len(neg_data))

    val_li = pos_data[:pos_cut] + neg_data[:neg_cut]
    train_li = pos_data[pos_cut:] + neg_data[neg_cut:]
    # Validation is shuffled before training, so the random stream is consumed in a fixed order.
    random.shuffle(val_li)
    random.shuffle(train_li)
    return train_li,val_li

In [None]:
def split_data_content(trees,frac,num_classes=4):
    '''Stratified train/validation split on the content-class label.

    trees: iterable of tensor dicts whose 'c_gt' is a one-element tensor holding the
        class id (0 .. num_classes - 1).
    frac: share of every class (rounded down) that goes to validation.
    num_classes: number of content classes; defaults to the 4 used by this notebook.
    Returns (train_li, val_li), both shuffled. Also prints the per-class validation counts.
    '''
    data = [[] for _ in range(num_classes)]
    for tree in trees:
        # .item() already yields a Python scalar; subscripting it raised a TypeError.
        data[int(tree['c_gt'].item())].append(tree)
    length = [int(frac*len(bucket)) for bucket in data]
    print([len(data[i][:length[i]]) for i in range(num_classes)])

    val_li = []
    train_li = []
    for bucket, cut in zip(data, length):
        val_li.extend(bucket[:cut])
        train_li.extend(bucket[cut:])
    # Validation is shuffled before training to keep the random stream order unchanged.
    random.shuffle(val_li)
    random.shuffle(train_li)

    return train_li,val_li

In [None]:
# Checkpoint locations on the mounted drive.
path = "./drive/My Drive/"
name = path + "mtl_ver+cont+summ.pt"
name2 = path + "mtl_ver+cont+summ_2.pt"

# Directory of pre-parsed trees for each encoder.
_PHEME5_TREE_DIRS = {
	'BERT': './PT_PHEME5_FeatBERT40_Depth5_maxR5/',
	'ROBERTA': './PT_PHEME5_FeatROBERTA40_Depth5_maxR5/',
	'XLNET': './PT_PHEME5_FeatXLNET40_Depth5_maxR5/',
	'T5': './PT_PHEME5_FeatT540_Depth5_maxR5/',
}
if MODEL_NAME == 'BERT' and TREE_VERSION != "new":
	# Only BERT has an "old" tree version, stored on the mounted drive.
	tree_path = './drive/My Drive/Parsed-Trees-Pad32_FeatBERT40_Depth5_maxR5/'
else:
	# Unknown model names fall back to the BERT trees.
	tree_path = _PHEME5_TREE_DIRS.get(MODEL_NAME, _PHEME5_TREE_DIRS['BERT'])

# One tree file per event; the five-event setup adds ferguson in second position.
files = ['charliehebdo.txt', 'germanwings-crash.txt', 'ottawashooting.txt','sydneysiege.txt']
if NO_OF_EVENTS != 4:
	files.insert(1, 'ferguson.txt')


In [None]:
import pandas as pd

# Load the per-event frames and build tweet id -> summary label ('newgt').
# NOTE(review): read_pickle executes whatever the pickle contains; only load trusted files.
f = ['charliehebdo', 'ottawashooting','germanwings','sydneysiege']
path2 = "./drive/My Drive/CIKM_dataset/0.7/"
dfc, dfo, dfg, dfs = [pd.read_pickle(path2+event+"_7.pkl") for event in f]

summ_dict = {}
for df in (dfc, dfo, dfg, dfs):
    tweet_ids = df['tweetid']
    summary_labels = df['newgt']
    for ind in df.index:
        summ_dict[tweet_ids[ind]] = summary_labels[ind]

In [None]:
# Number of tweets whose summary label is 1.
count = sum(1 for label in summ_dict.values() if label == 1)
print(count)

803


In [None]:
# files = ['charliehebdo.txt', 'ottawashooting.txt','germanwings-crash.txt','sydneysiege.txt'] 
# Build tweet id -> content class from the tab-separated annotation files:
# field 1 holds the tweet id, field 8 the class, shifted down by one here
# (presumably 1-based in the files - confirm against the annotation format).
cont_tweets = {}
for f in files:
    annotation_path = path+"situational_tweets/"+f[:-4]+"_FOUR_CLEAN_ANNOTATE_110520.txt"
    # `with` closes every annotation file; previously the handles were left open and
    # the loop variable was rebound to the handle.
    with codecs.open(annotation_path) as annotation_file:
        for line in annotation_file:
            line = line.split("\t")
            cont_tweets[int(line[1])] = int(line[8])-1

In [None]:
# Parse every event file into tensor dicts and split each event 80/20 into
# training and validation, stratified on the summary label.
tree_li = {}
val_li = {}
s_y = {}
c_y = {}
for filename in files:
    s_temp = []
    c_temp = []
    tree_li[filename] = []
    # `with` guarantees the file is closed even when a row fails to parse.
    with codecs.open(tree_path + filename, 'r', 'utf-8') as input_file:
        for row in input_file:
            # Each row is "<tweet id>\t<tree literal>".
            s = row.strip().split('\t')
            tweet_id = int(s[0])
            # NOTE(review): eval executes arbitrary code from the tree file. Only use
            # trusted files; consider ast.literal_eval if the trees are plain literals.
            curr_tree = eval(s[1])
            summ_label = summ_dict[tweet_id]
            cont_label = cont_tweets[tweet_id]
            curr_tensor = convert_tree_to_tensors(curr_tree,summ_label,cont_label,tweet_id)
            s_temp.append(summ_label)
            c_temp.append(cont_label)
            tree_li[filename].append(curr_tensor)
    s_y[filename] = s_temp
    c_y[filename] = c_temp
    # NOTE(review): no random seed has been set at this point, so the split differs between runs.
    random.shuffle(tree_li[filename])
    tree_li[filename], val_li[filename] = split_data_summary(tree_li[filename], 0.2)
    print(f'{filename} Training Set Size: {len(tree_li[filename])}, Validation Set Size: {len(val_li[filename])}, Total: {len(tree_li[filename]) + len(val_li[filename])}')

charliehebdo.txt Training Set Size: 1664, Validation Set Size: 415, Total: 2079
germanwings-crash.txt Training Set Size: 376, Validation Set Size: 93, Total: 469
ottawashooting.txt Training Set Size: 712, Validation Set Size: 178, Total: 890
sydneysiege.txt Training Set Size: 978, Validation Set Size: 243, Total: 1221


In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Verification class weights for every leave-one-event-out fold, computed from the
# training trees of all events except the held-out `test_file`.
weight_vec = {}
pos_weight_vec = {}
for test_file in files:
	y = []
	label_dist = [0, 0]
	for filename in files:
		if filename == test_file:
			continue
		for tree in tree_li[filename]:
			# Second entry of the root label pair: 1 is counted as rumor, 0 as non-rumor.
			label = int(tree['root_l'].tolist()[0][1])
			y.append(label)
			label_dist[label] += 1
	print(f'Total non-rumors: {label_dist[0]}, Total rumors: {label_dist[1]}')
	# classes/y are passed by keyword: newer scikit-learn releases reject them positionally.
	weight_vec[test_file] = torch.tensor(compute_class_weight('balanced', classes=numpy.unique(y), y=y)).to(device)
	# Ratio of negatives to positives, the form BCEWithLogitsLoss expects for pos_weight.
	pos_weight = label_dist[0] / label_dist[1]
	pos_weight_vec[test_file] = torch.tensor([pos_weight], dtype=torch.float32).to(device)
	print(f'Test File: {test_file}, Weight Vector: {weight_vec[test_file]}')
	print(f'Test File: {test_file}, Pos Weight Vector: {pos_weight_vec[test_file]}')

Total non-rumors: 1077, Total rumors: 989
Test File: charliehebdo.txt, Weight Vector: tensor([0.9591, 1.0445], device='cuda:0', dtype=torch.float64)
Test File: charliehebdo.txt, Pos Weight Vector: tensor([1.0890], device='cuda:0')
Total non-rumors: 2197, Total rumors: 1157
Test File: germanwings-crash.txt, Weight Vector: tensor([0.7633, 1.4494], device='cuda:0', dtype=torch.float64)
Test File: germanwings-crash.txt, Pos Weight Vector: tensor([1.8989], device='cuda:0')
Total non-rumors: 2044, Total rumors: 974
Test File: ottawashooting.txt, Weight Vector: tensor([0.7383, 1.5493], device='cuda:0', dtype=torch.float64)
Test File: ottawashooting.txt, Pos Weight Vector: tensor([2.0986], device='cuda:0')
Total non-rumors: 1819, Total rumors: 933
Test File: sydneysiege.txt, Weight Vector: tensor([0.7565, 1.4748], device='cuda:0', dtype=torch.float64)
Test File: sydneysiege.txt, Pos Weight Vector: tensor([1.9496], device='cuda:0')


In [None]:
# Content and summary class weights for every leave-one-event-out fold, computed from
# the training trees of all events except the held-out `test_file`.
# NOTE(review): s_y / c_y replace the per-file dicts of the same name built while loading the trees.
content_weight_vec = {}
summ_weight_vec = {}
for test_file in files:
    s_y = []
    c_y = []
    for f in files:
        if f != test_file:
            for t in tree_li[f]:
                s_y.append(t['s_gt'][0][1].item())
                c_y.append(t['c_gt'][0].item())
    # classes/y are passed by keyword: newer scikit-learn releases reject them positionally.
    content_weights = compute_class_weight('balanced', classes=numpy.unique(c_y), y=c_y)
    summ_weights = compute_class_weight('balanced', classes=numpy.unique(s_y), y=s_y)
    content_weight_vec[test_file] = torch.tensor(content_weights, device=device, dtype=torch.float32)
    summ_weight_vec[test_file] = torch.tensor(summ_weights, device=device, dtype=torch.float32)
print("*******Content Weights******")
print(content_weight_vec)
print("******Summary Weights******")
print(summ_weight_vec)

*******Content Weights******
{'charliehebdo.txt': tensor([1.6191, 1.0738, 1.2598, 0.6034], device='cuda:0'), 'germanwings-crash.txt': tensor([1.9637, 1.2724, 1.8032, 0.4651], device='cuda:0'), 'ottawashooting.txt': tensor([2.1870, 1.2111, 1.8225, 0.4612], device='cuda:0'), 'sydneysiege.txt': tensor([1.8595, 1.1924, 1.8445, 0.4804], device='cuda:0')}
******Summary Weights******
{'charliehebdo.txt': tensor([0.5826, 3.5256], device='cuda:0'), 'germanwings-crash.txt': tensor([0.5989, 3.0271], device='cuda:0'), 'ottawashooting.txt': tensor([0.6119, 2.7337], device='cuda:0'), 'sydneysiege.txt': tensor([0.6201, 2.5816], device='cuda:0')}


In [None]:
def train(tree_batch, test_file, mode="train"):
	'''Run one batch through the global `model`; take an optimizer step when mode == "train".

	tree_batch: batched dict produced by batch_tree_input.
	test_file: name of the held-out event; selects the class weights of this fold.
	mode: "train" to back-propagate and update, any other value to only compute the loss.

	Uses the module-level `model`, `optimizer`, `device`, `LOSS_FN` and the weight dicts.
	Returns (loss, pred_labels, g_labels, pred_summ_label, g_summ_label, cont_label, cont_gt):
	the mean of the three task losses followed by predicted / ground-truth labels for
	verification, summarization and content classification.
	'''
	input_keys = ('f', 'a', 'k', 'node_order', 'adjacency_list', 'edge_order', 'root_node')
	h, h_root, c, summ_out, cont_out = model(*[tree_batch[key].to(device) for key in input_keys])

	# Class weights of this fold (`weights` only serves the disabled weighted BCE variant).
	weights = weight_vec[test_file]
	pos_weights = pos_weight_vec[test_file]
	summ_weight = summ_weight_vec[test_file]
	cont_weight = content_weight_vec[test_file]

	# CASE 1: Verification - one logit per root, target is the second entry of the root label.
	root = tree_batch['root_label'].to('cpu')
	target_ver = torch.tensor([[t[1]] for t in root]).type_as(h_root).to(device)
	probs_ver = torch.nn.Sigmoid()(h_root.detach().cpu())
	pred_labels = torch.tensor([1 if probs_ver[i].item() >= 0.5 else 0 for i in range(probs_ver.size()[0])])
	if LOSS_FN != 'nw':
		ver_criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)
	else:
		ver_criterion = torch.nn.BCEWithLogitsLoss()
	loss_ver = ver_criterion(h_root, target_ver)
	g_labels = [t[1] for t in root]

	# CASE 2: Summarization - softmax outputs against one-hot targets.
	summ_labels = tree_batch['s_gt'].to(device)
	_, pred_summ_label = torch.max(summ_out.detach().cpu(), 1)
	g_summ_label = [pair[1] for pair in summ_labels.to('cpu').tolist()]
	loss_summ = torch.nn.BCELoss(weight=summ_weight)(summ_out, summ_labels)

	# CASE 3: Content classification - raw logits against class ids.
	cont_labels = tree_batch['c_gt'].to(device)
	_, cont_label = torch.max(cont_out.detach().cpu(), 1)
	cont_gt = tree_batch['c_gt']
	loss_cont = torch.nn.CrossEntropyLoss(weight=cont_weight)(cont_out, cont_labels)

	# The three tasks contribute equally to the joint loss.
	loss = (loss_ver + loss_summ + loss_cont)/3
	optimizer.zero_grad()
	if mode == "train":
		loss.backward()
		torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
		optimizer.step()

	return loss, pred_labels, g_labels, pred_summ_label, g_summ_label, cont_label, cont_gt


In [None]:
def testing(test_trees,model,epoch):
    '''Evaluate `model` tree by tree and print per-task reports.

    test_trees: iterable of per-tree tensor dicts as built by convert_tree_to_tensors.
    model: the TreeLSTM to evaluate; it is switched to eval mode.
    epoch: unused here (only referenced by the disabled result export at the end).

    Reads the module-level `test_file` (for the printed headers) and `device`.
    Prints accuracy, classification report and confusion matrix for verification,
    summarization and content classification. Returns None.
    '''
    print('Now Testing:', test_file)
    acc = 0
    total = 0
    predicted = []
    ground = []
    summ_pred = []
    summ_ground = []
    cont_pred = []
    cont_ground = []
    model.eval()
    # prob / pred / tweetid are only consumed by the disabled export below.
    prob = []
    pred =[]
    tweetid = []
    with torch.no_grad():
        for test in test_trees:
            h_test, h_test_root, c, summ_out, cont_out = model(
                    test['f'].to(device),
                    test['a'].to(device),
                    test['k'].to(device),
                    test['node_order'].to(device),
                    test['adjacency_list'].to(device),
                    test['edge_order'].to(device),
                    test['root_n'].to(device)
            )

            #VERIFICATION
            true_label_val = test['root_l'].to('cpu')
            true_label = true_label_val[0][1].item()
            pred_logit = h_test_root.detach().cpu()
            # torch.sigmoid avoids depending on a `sigmoid_fn` global that only exists
            # once the training cell has run.
            logit_after_sigmoid = torch.sigmoid(pred_logit)
            pred_label = 1 if logit_after_sigmoid[0].item() >= 0.5 else 0
            predicted.append(pred_label)
            ground.append(true_label)
            if pred_label == true_label:
                acc += 1

            #SUMMARIZATION
            summ_true_vals = test['s_gt']
            summ_pred_vals = summ_out.cpu()
            summ_v,summ_label = torch.max(summ_pred_vals, 1)
            prob.append(summ_v)
            pred.append(summ_label)
            tweetid.append(test['tweet_id'])
            summ_true_vals = summ_true_vals[0][1]
            summ_pred.append(summ_label)
            summ_ground.append(summ_true_vals)

            #CONTENT-CLASSIFICATION
            cont_true_val = test['c_gt']
            cont_pred_vals = cont_out.detach().cpu()
            cont_v,cont_label = torch.max(cont_pred_vals, 1)
            cont_pred.append(cont_label)
            cont_ground.append(cont_true_val)

            total += 1

    # Nothing to report (and acc / total would divide by zero) for an empty test set.
    if total == 0:
        print('No test trees supplied; nothing to evaluate.')
        return

    print("===================   TESTING   =====================")
    print(test_file, 'accuracy:', acc / total)
    print("*****VERIFICATION*****")
    print(classification_report(ground, predicted, digits=5))
    print('confusion matrix ')
    print(confusion_matrix(ground, predicted))    

    print("*****SUMMARIZATION*****")
    print(classification_report(summ_ground,summ_pred, digits=5))
    print('confusion matrix ')
    print(confusion_matrix(summ_ground,summ_pred))

    print("*****CONTENT-CLASSIFICATION*****")
    print(classification_report(cont_ground,cont_pred,digits=5))
    print('confusion matrix ')
    print(confusion_matrix(cont_ground,cont_pred))


    # dfsum = pd.DataFrame({"tweetid":tweetid,"summ_pred":pred,"summ_prob":prob, "content_label":cont_pred, "verification":predicted})
    # dfsum.to_pickle(path+model_name+str(learning_rate)[0]+'/'+test_file[:-4]+"_"+str(epoch)+".pkl")        

In [None]:
TRAINABLE_LAYERS = [0,1,2,3,4,5,6,7,8,9,10,11]
lr_list = [1e-5, 2e-5]
# lr_list = [5e-5]
for lr in lr_list:
	print("\n\n\nTraining with LR: ", lr)
	# train_accuracy = []
	# val_accuracy = []
	for test in files:
		seed_val = 40
		random.seed(seed_val)
		numpy.random.seed(seed_val)
		torch.manual_seed(seed_val)
		torch.backends.cudnn.deterministic = True
		torch.backends.cudnn.benchmark = False
		torch.autograd.set_detect_anomaly(True)

		# path = "./Models/"
		# path = "./drive/My Drive/IIT_Kgp/Research/Disaster/BTP_Chandana_Vishnu/verification/Models/"
		IN_FEATURES = 808
		OUT_FEATURES = 128
		NUM_ITERATIONS = 5
		BATCH_SIZE = 16
		CLASSIFIER_DROPOUT = 0.3
		# if MODEL_NAME == "BERT":
		# 	name = path + "stl_verification_featBERT.pt"
		# elif MODEL_NAME == "ROBERTA":
		# 	name = path + "stl_verification_featROBERTA.pt"
		# elif MODEL_NAME == "XLNET":
		# 	name = path + "stl_verification_featXLNET.pt"
		
		model = TreeLSTM(MODEL_NAME, TRAINABLE_LAYERS, IN_FEATURES, OUT_FEATURES, CLASSIFIER_DROPOUT, mode="cls")
		model.cuda()
		# model.cuda()
		# test_model = TreeLSTM(MODEL_NAME, TRAINABLE_LAYERS, IN_FEATURES, OUT_FEATURES, CLASSIFIER_DROPOUT, mode="cls")
		# test_model.cuda(gpu_id)
		# test_model.cuda()
		
		if OPTIM == 'adam':
			if L2_REGULARIZER == 'n':
				optimizer = torch.optim.Adam(model.parameters(), lr=lr)
			else:
				optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
		else:
			optimizer = torch.optim.AdamW(model.parameters(), lr=lr, amsgrad=True)

		sigmoid_fn = torch.nn.Sigmoid()
		
		# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, verbose=True)		

		test_file = test
		print('Training Set:', set(files) - {test_file})
		test_trees = []
		train_trees = []
		val_data = []
		for filename in files:
			if filename == test:
				test_trees.extend(tree_li[filename])
				test_trees.extend(val_li[filename])
			else:
				curr_tree_dataset = TreeDataset(tree_li[filename])
				train_trees.extend(curr_tree_dataset)
				val_data.extend(TreeDataset(val_li[filename]))
		
		print("Size of test data", len(test_trees))
		# print("size of training data", sum([len(i) for i in (train_trees)]))
		print("size of training data", len(train_trees))
		print("\ntraining started....")
		
		# Best validation scores seen so far; they decide when a checkpoint is written.
		# prev_loss starts at +inf so that, under the loss policy, the first epoch
		# always writes a checkpoint even if the initial validation loss is above 1
		# (otherwise the periodic load_model(model, name) below could find no file).
		prev_loss = float('inf')
		prev_acc = 0

		# The loaders do not change between epochs, so build them once. A DataLoader
		# with shuffle=True draws a fresh permutation each time it is iterated.
		data_gen = DataLoader(
			train_trees,
			collate_fn=batch_tree_input,
			batch_size=BATCH_SIZE,
			shuffle=True
		)
		# Validation is not shuffled: the loss is averaged per batch, so keeping the
		# batch composition fixed makes it comparable from one epoch to the next.
		val_gen = DataLoader(
			val_data,
			collate_fn=batch_tree_input,
			batch_size=BATCH_SIZE,
			shuffle=False
		)

		for i in range(NUM_ITERATIONS):

			model.train()

			# Gold labels (*_gl) and predicted labels (*_pl) for each of the three
			# tasks, accumulated over the whole epoch.
			ver_gl, ver_pl = [], []
			cont_gl, cont_pl = [], []
			summ_gl, summ_pl = [], []

			val_ver_gl, val_ver_pl = [], []
			val_cont_gl, val_cont_pl = [], []
			val_summ_gl, val_summ_pl = [], []

			# train() does not report per-batch errors, so this stays 0; it is kept
			# only so the "errors" line of the epoch report is unchanged.
			err_count = 0

			# ---- training pass ----
			j = 0  # number of training batches processed
			train_avg_loss = 0
			for tree_batch in data_gen:
				loss, v_pl, v_gl, s_pl, s_gl, c_pl, c_gl = train(tree_batch, test_file, "train")
				ver_gl.extend(v_gl)
				ver_pl.extend(v_pl)
				cont_gl.extend(c_gl)
				cont_pl.extend(c_pl)
				summ_gl.extend(s_gl)
				summ_pl.extend(s_pl)
				j += 1
				train_avg_loss += loss.item()
				# torch.cuda.empty_cache()
			cont_acc = accuracy_score(cont_gl, cont_pl)
			ver_acc = accuracy_score(ver_gl, ver_pl)
			summ_acc = accuracy_score(summ_gl, summ_pl)
			# Overall training accuracy is the unweighted mean of the three tasks.
			train_acc = (cont_acc + ver_acc + summ_acc) / 3
			train_avg_loss /= j  # mean of the per-batch losses

			# ---- validation pass ----
			print("validation started..", len(val_data))
			model.eval()
			val_j = 0  # number of validation batches processed
			val_avg_loss = 0
			with torch.no_grad():
				for batch in val_gen:
					loss, v_pl, v_gl, s_pl, s_gl, c_pl, c_gl = train(batch, test_file, "eval")
					val_ver_gl.extend(v_gl)
					val_ver_pl.extend(v_pl)
					val_cont_gl.extend(c_gl)
					val_cont_pl.extend(c_pl)
					val_summ_gl.extend(s_gl)
					val_summ_pl.extend(s_pl)
					val_j += 1
					val_avg_loss += loss.item()
					# torch.cuda.empty_cache()
			val_cont_acc = accuracy_score(val_cont_gl, val_cont_pl)
			val_ver_acc = accuracy_score(val_ver_gl, val_ver_pl)
			val_summ_acc = accuracy_score(val_summ_gl, val_summ_pl)
			val_acc = (val_cont_acc + val_ver_acc + val_summ_acc) / 3
			val_avg_loss /= val_j

			# ---- checkpointing ----
			# Save whenever the tracked validation metric is at least as good as the
			# best so far (ties overwrite the previous checkpoint).
			if MODEL_SAVING_POLICY == "acc":
				if prev_acc <= val_acc:
					save_model(model, name, val_acc, val_avg_loss)
					prev_acc = val_acc
			else:
				if prev_loss >= val_avg_loss:
					save_model(model, name, val_acc, val_avg_loss)
					prev_loss = val_avg_loss

			# ---- epoch report ----
			print('Iteration ', i)
			print("errors ", err_count)
			print('Training Loss: ', train_avg_loss)
			print('Training accuracy: ', train_acc)
			print('Validation loss: ', val_avg_loss)
			print('Validation accuracy: ', val_acc)
			# Each label is paired with the accuracy of the matching task (training set).
			print('Verification accuracy ', ver_acc)
			print('Summary accuracy ', summ_acc)
			print('content classification accuracy ', cont_acc)
			# print('Validation f1 score: ', val_f1)
			print('Training confusion matrix: ')
			print("*********VERIFICATION********")
			print(confusion_matrix(ver_gl, ver_pl))
			print("*********SUMMARIZATION********")
			print(confusion_matrix(summ_gl, summ_pl))
			print("*********CONTENT-CLASSIFICATION********")
			print(confusion_matrix(cont_gl, cont_pl))

			# train_accuracy.append(train_acc)
			# val_accuracy.append(val_acc)
			# scheduler.step(val_acc)

			# ---- periodic test on the held-out file (every 5th epoch) ----
			# Stash the current weights under name2, evaluate the checkpoint stored
			# under `name`, then reload name2 so training continues from where it was.
			# NOTE(review): assumes load_model() loads the weights into `model`
			# in place - confirm against its definition.
			if (i + 1) % 5 == 0:
				with torch.no_grad():
					save_model(model, name2, val_acc)
					output = load_model(model, name)
					testing(test_trees, model, i + 1)
				output = load_model(model, name2)

		# plt.plot(numpy.array(train_accuracy))
		# plt.plot(numpy.array(val_accuracy))
		# plt.legend(['train_acc','val_acc'])
		# plt.show()
		# print('Iteration ', i+1,' Loss: ', total_loss)
		print('Training and Testing Completed')




Training with LR:  1e-05
model intialising...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…


Training Set: {'ottawashooting.txt', 'sydneysiege.txt', 'germanwings-crash.txt'}
Size of test data 2079
size of training data 2066

training started....
validation started.. 514
Iteration  0
errors  0
Training Loss:  0.8732669637753413
Training accuracy:  0.6658599548241368
Validation loss:  0.7199753459655878
Validation accuracy:  0.7645914396887159
Verification accuracy  0.521297192642788
Summary accuracy  0.6360116166505324
content classification accuracy  0.8402710551790901
Training confusion matrix: 
*********VERIFICATION********
[[596 481]
 [271 718]]
*********SUMMARIZATION********
[[1728   45]
 [ 285    8]]
*********CONTENT-CLASSIFICATION********
[[172  94  40  13]
 [100 243 108  30]
 [ 77 126 150  57]
 [ 47 198  99 512]]
validation started.. 514
Iteration  1
errors  0
Training Loss:  0.638381478878168
Training accuracy:  0.7917070022587932
Validation loss:  0.618401129137386
Validation accuracy:  0.8073929961089495
Verification accuracy  0.7410454985479187
Summary accuracy  0.