### GNN Batching

To batch samples according to their graph neighbors, we should build a sampler that wraps PyTorch Geometric's `NeighborSampler` and, each time `__iter__` is called, stores the sampler's other outputs as attributes on itself.

In [None]:
class NeighborBatcher(Sampler[int]):
    r"""Index sampler built on top of Geometric's ``NeighborSampler`` (work in progress).

    The wrapped sampler is created once in ``__init__``. Iteration currently
    yields consecutive integer indexes only; forwarding the indexes produced by
    ``NeighborSampler`` and storing its other outputs on ``self`` is still a TODO.

    Args:
        sampler_args: passed unchanged, as a single positional argument, to
            ``NeighborSampler``.
    """
    # Optional sized object. ``__init__`` never assigns it; when a caller sets it
    # on an instance, its length decides how many indexes ``__iter__`` yields.
    data_source: Sized

    def __init__(self, sampler_args) -> None:
        self.NeighborSampler = NeighborSampler(sampler_args)

    def __iter__(self) -> Iterator[int]:
        """Return an iterator over ``0 .. n-1``.

        ``n`` is ``len(self.data_source)`` when that attribute has been set on the
        instance, otherwise ``len(self.NeighborSampler)`` (the same value that
        ``__len__`` reports).
        """
        # TODO: yield the indexes produced by NeighborSampler and save its other outputs in self.
        # `data_source` is not assigned anywhere in this class, so reading it
        # unconditionally raised AttributeError; fall back to the wrapped sampler.
        source = getattr(self, "data_source", self.NeighborSampler)
        return iter(range(len(source)))

    def __len__(self) -> int:
        """Return the length reported by the wrapped ``NeighborSampler``."""
        return len(self.NeighborSampler)

# Training Loop

In [2]:
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import numpy as np

# Helper Functions
def plot_grad_flow(named_parameters):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.

    Usage: Plug this function in Trainer class after loss.backward() as
    "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow.

    Args:
        named_parameters: iterable of (name, parameter) pairs, e.g. the result of
            model.named_parameters().

    A parameter is left out of the plot when it does not require gradients, when
    its name contains "bias", or when it has no gradient yet (p.grad is None).
    The bars are drawn with the pyplot state machine, i.e. on the current figure.
    '''
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        if not p.requires_grad or "bias" in n:
            continue
        # A trainable parameter that took no part in the last backward pass has
        # grad == None; skip it instead of failing on `None.abs()`.
        if p.grad is None:
            continue
        grad_abs = p.grad.abs()
        layers.append(n)
        # .item() moves the scalar to the host as a plain Python float.
        ave_grads.append(grad_abs.mean().item())
        max_grads.append(grad_abs.max().item())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])

Load Tokenizer, Validation and Training Set.

In [1]:
# Load BERT Tokenizer
from transformers import RobertaTokenizerFast, AdamW, get_linear_schedule_with_warmup
import torch, os
from tokenizers.processors import RobertaProcessing, BertProcessing

## Global Parameters
# MAX_SEQ_LEN is handed to the tokenizer as `max_len` below and to the dataset config as 'max_seq_len';
# MAX_TW_LEN is handed to the dataset config as 'max_tw_per_user'.
MAX_SEQ_LEN, MAX_TW_LEN = 128, 15
BATCH_SIZE = 64
SEED = 1911
# Interaction categories; the list starts with the two special entries '<cls>' and '<pad>'.
INTERACTION_TYPES = ['<cls>', '<pad>', 'Original', 'Quote', 'Reply', 'Retweet']
##

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokDir = r'\RoBETO_Model\roberta_es_tweet_tokenizer_bpe_50k' # Office directory

# Build the fast RoBERTa tokenizer from the vocab, merges and tokenizer files stored in tokDir.
tokenizer = RobertaTokenizerFast(os.path.join(tokDir, 'vocab.json'), os.path.join(tokDir, 'merges.txt'), 
                                tokenizer_file = os.path.join(tokDir, 'es_tweet_tokenizer_bpe_50k.json'), max_len = MAX_SEQ_LEN)
# BertProcessing is used instead of RobertaProcessing because it returns different IDs for the target and the reply
# (it does not follow the RoBERTa convention <s>...</s></s>...</s> and uses BERT's <s>...</s>...</s>).
# The first tuple carries the eos token/id, the second the bos token/id.
tokenizer._tokenizer.post_processor = BertProcessing( 
                                            (tokenizer.eos_token, tokenizer.eos_token_id),
                                            (tokenizer.bos_token, tokenizer.bos_token_id)
                                        )
# Keep the special-token ids at hand as module-level constants.
PAD_ID, CLS_ID, SEP_ID = tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id

In [2]:
import os, numpy as np
from UserModules.StanceDataset import *
from torch.utils.data import DataLoader

workDir = r'\Data' # Debugging Sample
country = 'Ecuador'

# Define Training Dataset
num_workers = 6 # Worker processes used by both DataLoaders below
# Keyword arguments for StanceDatasetConfig; the same dict is reused for the validation set further down.
stance_params = {
    'user_file_name': os.path.join(workDir, r"2a-train_{}_dataframe.csv".format(country)),
    'tokenizer': tokenizer,
    'interaction_categories': INTERACTION_TYPES,
    'max_tw_per_user': MAX_TW_LEN,
    'label_column': 'user_government_stance',
    'max_seq_len':MAX_SEQ_LEN
}
replacement = True # Forwarded to the balanced sampler of the training loader
trainConfig = StanceDatasetConfig(**stance_params)
train_data = StanceDataset(trainConfig)

# Define Validation Dataset
# Only the CSV path changes; every other entry of stance_params is shared with the training set.
stance_params['user_file_name'] = os.path.join(workDir, r"2b-validation_{}_dataframe.csv".format(country))
valConfig = StanceDatasetConfig(**stance_params)
val_data = StanceDataset(valConfig)

# Each loader collates with its own dataset's _Stance_datacollator.
train_loader =  DataLoader(train_data, batch_size = BATCH_SIZE, num_workers = num_workers, collate_fn = train_data._Stance_datacollator, 
                           sampler = train_data._Balanced_sampler(replacement = replacement)) # Balanced sampling because of the label asymmetry
val_loader = DataLoader(val_data, batch_size = BATCH_SIZE, num_workers = num_workers, collate_fn = val_data._Stance_datacollator, shuffle = True)


Define the model parameters.

In [3]:
from UserModules.ModelConfiguration import *
from UserModules.UserClassifier import User_Stance_Classifier

# Keyword arguments for the tweet-level RoBERTa encoder configuration.
tweet_enc_args = {
    'Model_Dir': r'\RoBETO_Model', 
    #'Model_Dir': r'/disk1/target_stance_classification/Data/RoBETO', 
    'dropout': 0.1,
    'activation': 'Tanh',
    'freeze_bert_embeddings': True
}

tweetConfig = RoBERTaEncoderConfig(**tweet_enc_args)

# Keyword arguments for the embeddings configuration; the cls/pad ids and the number of
# interaction categories are read from the validation dataset's config.
emb_params = {
    'cls_idx': val_data.config.tweet_cls_id,
    'pad_idx': val_data.config.tweet_pad_id,
    'max_tweet_number': MAX_TW_LEN,
    'dropout': 0.1,
    'layer_norm_eps': 1e-12,
    'tweet_type_number': len(val_data.config.interaction_categories),
    'mask_embeddings': True
}

embConfig = ModelEmbeddingsConfig(tweetConfig, **emb_params)

user_params = { # These are the default parameters in UserEncoderConfig
    'num_attention_heads': 6,
    'intermidiate_size': 2048,
    'num_encoder_layers': 3,
    'transformer_activation': 'gelu',
    'user_activation': 'Tanh',
    'dropout': 0.1, 
    'initializer_range': 0.02,
    'model_embedder_version': 'v3' # v3 leaves the CLS parameter for the type embeddings    
}
userConfig = UserEncoderConfig(embConfig, **user_params)

# Instantiate model
# Two output classes; the configs nest as tweet encoder -> embeddings -> user encoder.
model = User_Stance_Classifier(num_classes = 2, user_config = userConfig)    
model.to(device)
# NOTE(review): bare print() as the last statement; presumably it keeps the cell from
# displaying the return value of model.to(device) - confirm.
print()

Some weights of the model checkpoint at E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





Define the Trainer and training arguments

In [4]:
from UserModules.Trainer import Metric, TrainerConfig, Trainer

##
# Metrics collected by the Trainer: a metric name plus the keyword arguments for it.
METRICS = [Metric('accuracy_score', {'normalize': True}), Metric('f1_score', {'average': 'weighted'})]
EPOCHS = 4#20
results_path = r'\Checkpoints'
##

# NOTE(review): lr is 2e-4 here while the trailing comment mentions 2e-5 - confirm which one is intended.
optimizer = AdamW(model.parameters(),
                  lr = 2e-4, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Create the learning rate scheduler.
# total_steps = batches per epoch * epochs; warm-up lasts 1.5 epochs' worth of batches.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = int(1.5 * len(train_loader)),# len(train_loader), # Default value in run_glue.py
                                            num_training_steps = total_steps)

EVALS_PER_EPOCH = 4#10
config = TrainerConfig(
                        epochs_to_train = EPOCHS,
                        results_path = results_path,
                        optimizer = optimizer, 
                        batch_size = BATCH_SIZE, 
                        train_dataloader = train_loader,
                        val_dataloader = val_loader,
                        device = device, 
                        loss = torch.nn.CrossEntropyLoss(),
                        scheduler = scheduler,
                        clip_max_norm = 1,
                        # Integer division: evaluation interval in training batches.
                        steps_to_eval = len(train_loader) // EVALS_PER_EPOCH,
                        # Twice EVALS_PER_EPOCH, i.e. two epochs' worth of evaluations.
                        early_stop_eval_steps = EVALS_PER_EPOCH * 2,
                        max_checkpoints_num = 10,
                        seed = SEED,
                        use_notebook = True,
                        # Names of the batch entries handed to the model (x) and used as target (y).
                        x_names = ['input_ids', 'attention_mask', 'interaction_types', 'tweet_masks'],
                        y_name = 'labels',
                        # Hard-coded checkpoint to resume from; the trailing `None` is the alternative value.
                        resume_checkpoint_path = os.path.join(results_path, 'checkpoint_273'), #None,
                        metrics2collect = METRICS,
                        # Model selection follows the loss, where lower is better.
                        metric_to_focus = 'loss',
                        lowerIsbetter = True
)

trainer = Trainer(model, config)

Model loaded from <== E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_273


Output()

In [5]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/157 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_312


Iteration:   0%|          | 0/157 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_351
    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\Best_Models\checkpoint_351


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_390


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_429


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_468
    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\Best_Models\checkpoint_468


Iteration:   0%|          | 0/157 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_507


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_546


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_585


Evaluation:   0%|          | 0/16 [00:00<?, ?it/s]

    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\checkpoint_624
    Model saved to ==> E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Small_Test\Checkpoints\Best_Models\checkpoint_624


### Create Small sample for debugging

In [None]:
# Get Small example
import pandas as pd, os

workDir = r'' # Office
country = 'Bolivia'
outDir = r''

# Get small subset of data: keep every row of the first users met when grouping by user_id.
trainDF = pd.read_csv(os.path.join(workDir, r"2a-train_{}_dataframe.csv".format(country)), dtype = {'tweet_id': 'Int64', 'user_id': str})
uGroup = trainDF.groupby('user_id')
smallTrain = []
for i, (uID, uDF) in enumerate(uGroup):
    smallTrain.append(uDF)
    # The group is appended before the check, so indexes 0..1000 (1001 users) are kept.
    if i == 1000:    break
trainDF = pd.concat(smallTrain, ignore_index = True)
trainDF.to_csv(r'{}\2a-train_{}_dataframe.csv'.format(outDir, country), index = False)

valDF = pd.read_csv(os.path.join(workDir, r"2b-validation_{}_dataframe.csv".format(country)), dtype = {'tweet_id': 'Int64', 'user_id': str})
# Group the validation frame itself. Grouping trainDF here (as before) filled the
# validation sample with rows of the already-subsampled training set.
uGroup = valDF.groupby('user_id')
smallVal = []
for i, (uID, uDF) in enumerate(uGroup):
    smallVal.append(uDF)
    # Same pattern as above: indexes 0..100 (101 users) are kept.
    if i == 100:    break
valDF = pd.concat(smallVal, ignore_index = True)
valDF.to_csv(r'{}\2b-validation_{}_dataframe.csv'.format(outDir, country), index = False)



In [2]:
# Create a small sample of the unbalanced Ecuadorian Data

import pandas as pd, os

SEED = 1911 # Same value as the notebook's global SEED; makes the two shuffles below reproducible

# A raw string cannot end in a single backslash (r'.\' is an unterminated literal),
# so the current directory is written as '.'.
workDir = r'.' # Office
country = 'Ecuador'
outDir = r'.'
# NOTE(review): with workDir == outDir on Windows the files written below are the very
# files read as input - point outDir elsewhere to keep the full data.

# Get small subset of data: shuffle the users and keep every row of the first 10000.
trainDF = pd.read_csv(os.path.join(workDir, r"2a-train_{}_dataframe.csv".format(country)), dtype = {'tweet_id': 'Int64', 'user_id': str})
# One row per user; 'counts' only exists to collapse the frame and is dropped after the merge.
uDF = trainDF.groupby('user_id').tweet_id.count().reset_index().rename(columns = {'tweet_id': 'counts'})
uDF = uDF.sample(frac = 1, random_state = SEED)
uDF = uDF.head(10000)

# The default inner merge keeps only the rows of the selected users.
trainDF = pd.merge(trainDF, uDF, on = 'user_id').drop(columns = 'counts')
trainDF.to_csv(r'{}\2a-train_{}_dataframe.csv'.format(outDir, country), index = False)

# Same procedure for the validation split, keeping 1000 users.
valDF = pd.read_csv(os.path.join(workDir, r"2b-validation_{}_dataframe.csv".format(country)), dtype = {'tweet_id': 'Int64', 'user_id': str})
uDF = valDF.groupby('user_id').tweet_id.count().reset_index().rename(columns = {'tweet_id': 'counts'})
uDF = uDF.sample(frac = 1, random_state = SEED)
uDF = uDF.head(1000)

valDF = pd.merge(valDF, uDF, on = 'user_id').drop(columns = 'counts')
valDF.to_csv(r'{}\2b-validation_{}_dataframe.csv'.format(outDir, country), index = False)

In [13]:
import pandas as pd, os

workDir = r'' # Office
country = 'Ecuador'

# Read the training split; user ids stay strings and tweet ids use the nullable integer dtype.
trainDF = pd.read_csv(
    os.path.join(workDir, r"2a-train_{}_dataframe.csv".format(country)),
    dtype = {'tweet_id': 'Int64', 'user_id': str},
)

# One row per user, carrying the first stance value found for that user.
userDF = trainDF.groupby('user_id').user_government_stance.first().reset_index()

# Number of users per stance, then weight = total users / users with that stance.
stanceDF = (
    userDF
    .groupby('user_government_stance')
    .count()
    .reset_index()
    .rename(columns = {'user_id': 'counts'})
)
stanceDF['weights'] = len(userDF) / stanceDF['counts']

# Attach the stance weight to every user.
userDF = userDF.merge(stanceDF[['user_government_stance', 'weights']], on = 'user_government_stance')

In [1]:
# Load BERT Tokenizer
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizerFast, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
import torch, numpy as np, random, os
from tokenizers.processors import RobertaProcessing, BertProcessing

## Global Parameters
# MAX_SEQ_LEN is handed to the tokenizer as `max_len` below and to the dataset config as 'max_seq_len';
# MAX_TW_LEN is handed to the dataset config as 'max_tw_per_user'.
MAX_SEQ_LEN, MAX_TW_LEN = 128, 15
BATCH_SIZE = 32#64
SEED = 1911
# Interaction categories; the list starts with the two special entries '<cls>' and '<pad>'.
INTERACTION_TYPES = ['<cls>', '<pad>', 'Original', 'Quote', 'Reply', 'Retweet']
##

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#tokDir = r'/disk1/target_stance_classification/Data/RoBETO/roberta_es_tweet_tokenizer_bpe_50k' #Server Directory
tokDir = r'\RoBETO_Model\roberta_es_tweet_tokenizer_bpe_50k' # Office directory

# Build the fast RoBERTa tokenizer from the vocab, merges and tokenizer files stored in tokDir.
tokenizer = RobertaTokenizerFast(os.path.join(tokDir, 'vocab.json'), os.path.join(tokDir, 'merges.txt'), 
                                tokenizer_file = os.path.join(tokDir, 'es_tweet_tokenizer_bpe_50k.json'), max_len = MAX_SEQ_LEN)
# BertProcessing is used instead of RobertaProcessing because it returns different IDs for the target and the reply
# (it does not follow the RoBERTa convention <s>...</s></s>...</s> and uses BERT's <s>...</s>...</s>).
# The first tuple carries the eos token/id, the second the bos token/id.
tokenizer._tokenizer.post_processor = BertProcessing( 
                                            (tokenizer.eos_token, tokenizer.eos_token_id),
                                            (tokenizer.bos_token, tokenizer.bos_token_id)
                                        )
# Keep the special-token ids at hand as module-level constants.
PAD_ID, CLS_ID, SEP_ID = tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id


import pandas as pd
from UserModules.StanceDataset import *
from torch.utils.data import DataLoader

#workDir = r'/disk1/target_stance_classification/Data/Splits/Subsampled' # Server
#workDir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Splits\Subsampled' # Office
# A raw string cannot end in a single backslash (r'.\' is an unterminated literal),
# so the current directory is written as '.'; os.path.join adds the separator.
workDir = r'.' # Debugging Sample
country = 'Ecuador'

# Define Training Dataset
num_workers = 6
# Keyword arguments for StanceDatasetConfig; the same dict is reused for the validation set below.
stance_params = {
    'user_file_name': os.path.join(workDir, r"2a-train_{}_dataframe.csv".format(country)),
    'tokenizer': tokenizer,
    'interaction_categories': INTERACTION_TYPES,
    'max_tw_per_user': MAX_TW_LEN,
    'label_column': 'user_government_stance',
    'max_seq_len':MAX_SEQ_LEN
}
trainConfig = StanceDatasetConfig(**stance_params)
train_data = StanceDataset(trainConfig)

# Define Validation Dataset
# Only the CSV path changes; every other entry of stance_params is shared with the training set.
stance_params['user_file_name'] = os.path.join(workDir, r"2b-validation_{}_dataframe.csv".format(country))
valConfig = StanceDatasetConfig(**stance_params)
val_data = StanceDataset(valConfig)


In [38]:
import pandas as pd

# Draw one full pass (len(train_data) indexes) from the balanced sampler and record,
# for every drawn index, the user id together with that user's stance.
replacement = True
sampler = train_data._Balanced_sampler(replacement = replacement, num_samples = len(train_data))

fDF = []
for i in sampler:
    user_id = train_data.users[i]
    # Rows of the user table that belong to the drawn user.
    belongs_to_user = train_data.User_DF.user_id == user_id
    uDF = train_data.User_DF.loc[belongs_to_user]
    # Stance taken from the user's first row.
    target = uDF.user_stance.head(1).item()
    fDF.append(dict(user_id = user_id, user_stance = target))

sampledDF = pd.DataFrame(fDF)

In [44]:
# Per stance, count the draws whose user_id had already been drawn earlier in the sample.
sampledDF[sampledDF.duplicated(subset = 'user_id')].groupby('user_stance').count()

Unnamed: 0_level_0,user_id
user_stance,Unnamed: 1_level_1
0,1513
1,2440


In [29]:
# NOTE(review): sampledDF2 is not defined in any visible cell of this notebook, so this
# cell only works with leftover kernel state - confirm where it comes from.
sampledDF2.groupby('user_stance').count()

Unnamed: 0_level_0,user_id
user_stance,Unnamed: 1_level_1
0,4073
1,1879


In [39]:
# Number of draws per stance in the balanced sample.
sampledDF.groupby('user_stance').count()

Unnamed: 0_level_0,user_id
user_stance,Unnamed: 1_level_1
0,5066
1,4934


In [None]:

# Loaders for both splits with plain shuffling (no balanced sampler is passed here);
# each one collates with its own dataset's _Stance_datacollator.
train_loader =  DataLoader(train_data, batch_size = BATCH_SIZE, num_workers = num_workers, collate_fn = train_data._Stance_datacollator, shuffle = True)
val_loader = DataLoader(val_data, batch_size = BATCH_SIZE, num_workers = num_workers, collate_fn = val_data._Stance_datacollator, shuffle = True)


In [16]:
https://github.com/ufoym/imbalanced-dataset-sampler/blob/master/torchsampler/imbalanced.py

https://pytorch.org/docs/master/_modules/torch/utils/data/sampler.html#Sampler
https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/28?page=2

Unnamed: 0,user_id,user_government_stance,weights
0,1000048856523034624,0,0.000150
1,1000145648,0,0.000150
2,100017027,0,0.000150
3,1000217620027764738,0,0.000150
4,1000356578,0,0.000150
...,...,...,...
9995,998062725220700160,1,0.000302
9996,999123320615456773,1,0.000302
9997,999255230,1,0.000302
9998,999839718312480768,1,0.000302
