# Predict User Embeddings

Load Best Model

In [1]:
import torch, os

# Countries that can be processed; COUNTRY selects the one used by this run.
countries = ['Ecuador', 'Bolivia', 'Colombia', 'Chile']
COUNTRY = countries[-1]  # last entry of the list, i.e. 'Chile'

## Global Parameters
MAX_SEQ_LEN = 128  # handed to the tokenizer (max_len) and to the dataset config (max_seq_len)
MAX_TW_LEN = 15    # handed to the embeddings config (max_tweet_number) and the dataset config (max_tw_per_user)
BATCH_SIZE = 64
SEED = 1911
# The position of each entry is looked up with .index() to obtain the <cls>/<pad> indices.
INTERACTION_TYPES = ['<cls>', '<pad>', 'Original', 'Quote', 'Reply', 'Retweet']

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
from UserModules.ModelConfiguration import *
from UserModules.UserClassifier import User_Stance_Classifier

# --- Tweet encoder configuration ---
# NOTE(review): Model_Dir is an absolute path on one workstation; adjust it when running elsewhere.
tweet_enc_args = dict(
    Model_Dir=r'E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model',
    dropout=0.1,
    activation='Tanh',
    freeze_bert_embeddings=True,
)
tweetConfig = RoBERTaEncoderConfig(**tweet_enc_args)

# --- Embedding configuration ---
# The <cls>/<pad> indices and the number of tweet types are all derived from INTERACTION_TYPES.
emb_params = dict(
    cls_idx=INTERACTION_TYPES.index('<cls>'),
    pad_idx=INTERACTION_TYPES.index('<pad>'),
    max_tweet_number=MAX_TW_LEN,
    dropout=0.1,
    layer_norm_eps=1e-12,
    tweet_type_number=len(INTERACTION_TYPES),
    mask_embeddings=True,
)
embConfig = ModelEmbeddingsConfig(tweetConfig, **emb_params)

# --- User encoder configuration ---
# These are the default parameters in UserEncoderConfig.
# The keyword 'intermidiate_size' is kept exactly as spelled; presumably it matches the
# parameter name declared by UserEncoderConfig - verify before renaming.
user_params = dict(
    num_attention_heads=6,
    intermidiate_size=2048,
    num_encoder_layers=3,
    transformer_activation='gelu',
    user_activation='Tanh',
    dropout=0.1,
    initializer_range=0.02,
    model_embedder_version='v3',  # v3 leaves the CLS parameter for the type embeddings
)
userConfig = UserEncoderConfig(embConfig, **user_params)

# Instantiate the two-class classifier and move it to the selected device.
model = User_Stance_Classifier(num_classes=2, user_config=userConfig)
model.to(device)
# Ending the cell on a bare print() keeps the notebook from echoing the value of the last expression.
print()

Some weights of the model checkpoint at E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [3]:
# Folder holding the best checkpoint of the selected country.
best_dir = r'\{}'.format(COUNTRY)
checkpoint_path = os.path.join(best_dir, 'model_results.pth')

# map_location places the stored tensors on the device chosen for this run.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load(checkpoint_path, map_location=device)
print(f'Model loaded from <== {best_dir}')

# Restore the trained weights; the stored validation loss is shown as the cell output.
model.load_state_dict(state_dict['model_state_dict'])
state_dict['valid_loss']

Model loaded from <== E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Results\Best_Models\Chile


0.001947219159196558

Load Tokenizer

In [6]:
# Load BERT Tokenizer
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizerFast, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
import torch, numpy as np, random, os
from tokenizers.processors import RobertaProcessing, BertProcessing

#tokDir = r'/disk1/target_stance_classification/Data/RoBETO/roberta_es_tweet_tokenizer_bpe_50k' #Server Directory
tokDir = r'\RoBETO_Model\roberta_es_tweet_tokenizer_bpe_50k' #Office Directory

# Resolve the three tokenizer files inside tokDir before building the tokenizer.
vocab_file = os.path.join(tokDir, 'vocab.json')
merges_file = os.path.join(tokDir, 'merges.txt')
tokenizer_json = os.path.join(tokDir, 'es_tweet_tokenizer_bpe_50k.json')
tokenizer = RobertaTokenizerFast(vocab_file, merges_file, tokenizer_file=tokenizer_json, max_len=MAX_SEQ_LEN)

# BertProcessing is used instead of RobertaProcessing because it returns different IDs for the
# target and the reply: pairs are laid out BERT-style (<s>...</s>...</s>) rather than following
# the RoBERTa convention (<s>...</s></s>...</s>).
sep_pair = (tokenizer.eos_token, tokenizer.eos_token_id)  # separator: the end-of-sequence token
cls_pair = (tokenizer.bos_token, tokenizer.bos_token_id)  # classifier: the beginning-of-sequence token
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Special-token ids, kept as module-level constants.
PAD_ID = tokenizer.pad_token_id
CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id

Process Datasets

In [7]:
# Helper Function
import pandas as pd
from tqdm.notebook import tqdm, trange

def predict(model, data_loader):
    """Run inference over `data_loader` and collect predictions and user embeddings.

    Every batch is read as a mapping with the keys 'batched_ids', 'input_ids',
    'attention_mask', 'interaction_types', 'tweet_masks' and 'labels'. The four
    model inputs are moved to the module-level `device`; 'labels' may be None
    (unlabeled data), in which case None is recorded as the true label.

    Args:
        model: Module called as
            model(input_ids, attention_mask, interaction_types, tweet_masks)
            and returning a (logits, user_emb) pair. It is switched to eval mode.
        data_loader: Iterable yielding batches as described above.

    Returns:
        A (resultsDF, embeddings) tuple:
        - resultsDF: DataFrame with the columns 'user_id', 'true_label' and
          'predicted_label', filled in the order the loader yields the batches.
        - embeddings: CPU tensor with the per-batch `user_emb` outputs
          concatenated along dim 0, in the same order as the rows of
          resultsDF. An empty tensor when the loader yields no batches.
    """
    model.eval()
    resultsDF_list = {'user_id': [], 'true_label': [], 'predicted_label': []}
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            # Ids and labels are only recorded, so they are not moved to `device`.
            batched_ids, labels = batch['batched_ids'], batch['labels']
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            interaction_types = batch['interaction_types'].to(device)
            tweet_masks = batch['tweet_masks'].to(device)

            # Predict: the class with the highest logit along dim 1.
            logits, user_emb = model(input_ids, attention_mask, interaction_types, tweet_masks)
            preds = logits.argmax(dim=1)

            # Move each batch of embeddings to the CPU right away so that device
            # memory does not grow with the size of the dataset.
            embeddings.append(user_emb.detach().cpu())
            resultsDF_list['user_id'].extend(batched_ids)
            if labels is None:
                resultsDF_list['true_label'].extend([None] * len(batched_ids))
            else:
                resultsDF_list['true_label'].extend(labels.detach().cpu().tolist())
            resultsDF_list['predicted_label'].extend(preds.cpu().tolist())

    # torch.cat raises on an empty list, so an empty loader yields an empty tensor instead.
    embeddings = torch.cat(embeddings) if embeddings else torch.empty(0)
    resultsDF = pd.DataFrame(resultsDF_list)
    return resultsDF, embeddings

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

Weakly Labeled Data

In [None]:
from UserModules.StanceDataset import *
from torch.utils.data import Dataset, DataLoader

#workDir = r'/disk1/target_stance_classification/Data/Splits/Subsampled' # Server
workDir = r'' # Office
outDir = r'\{}'.format(COUNTRY)
# makedirs(exist_ok=True) also creates missing parent folders and does nothing when
# the folder already exists (os.mkdir fails if the parent folder is missing).
os.makedirs(outDir, exist_ok=True)
num_workers = 6

# Output prefix -> input file of each labeled split for the selected country.
datasets = {'test': r"2c-test_{}_dataframe.csv".format(COUNTRY), 'val': r"2b-validation_{}_dataframe.csv".format(COUNTRY), 'train': r"2a-train_{}_dataframe.csv".format(COUNTRY)}

for key, f in datasets.items():
    # Re-seed before every split so each one starts from the same random state.
    set_seed(SEED)
    print('## Working with File: ' + f)
    # Define the dataset for this split
    stance_params = {
        'user_file_name': os.path.join(workDir, f),
        'tokenizer': tokenizer,
        'interaction_categories': INTERACTION_TYPES,
        'max_tw_per_user': MAX_TW_LEN,
        'label_column': 'user_government_stance',
        'SEED': SEED,
        'max_seq_len':MAX_SEQ_LEN
    }
    dataConfig = StanceDatasetConfig(**stance_params)
    data = StanceDataset(dataConfig)
    # NOTE(review): shuffle=True is not required for inference. The saved rows and embeddings
    # stay aligned with each other because predict() collects both in iteration order.
    data_loader =  DataLoader(data, batch_size = BATCH_SIZE, num_workers=num_workers, collate_fn = data._Stance_datacollator, shuffle = True)

    resultsDF, embeddings = predict(model, data_loader)

    # Save Results: <split>_predictions.csv and <split>_embeddings.pt inside outDir
    resultsDF.to_csv(os.path.join(outDir, '{}_predictions.csv'.format(key)), index = False)
    torch.save(embeddings, os.path.join(outDir, '{}_embeddings.pt'.format(key)))

## Working with File: 2c-test_Chile_dataframe.csv


Predicting:   0%|          | 0/361 [00:00<?, ?it/s]

## Working with File: 2b-validation_Chile_dataframe.csv


Predicting:   0%|          | 0/397 [00:00<?, ?it/s]

## Working with File: 2a-train_Chile_dataframe.csv


Predicting:   0%|          | 0/3208 [00:00<?, ?it/s]

In [1]:
import pandas as pd, os


countries = ['Ecuador', 'Bolivia', 'Colombia', 'Chile']

# Accuracy (%) of the saved predictions for every country and split.
# Dedicated names are used on purpose: this cell must not reassign `workDir` or `datasets`,
# because the "Unlabeled Data" cell further down still reads its input files from `workDir`.
splits = ['train', 'val', 'test']
accuracy_by_country = {}
for country in countries:
    predictions_dir = r'\Main_Predictions\{}'.format(country)
    accuracy_by_country[country] = {}
    for split in splits:
        # user_id is read as text so the ids are not converted to numbers.
        dataDF = pd.read_csv(os.path.join(predictions_dir, '{}_predictions.csv'.format(split)), dtype = {'user_id': str})
        # Share of rows whose predicted label equals the true label, as a percentage.
        accuracy_by_country[country][split] = (dataDF.true_label == dataDF.predicted_label).sum() / len(dataDF) * 100
# One row per country, one column per split.
resultDF = pd.DataFrame.from_dict(accuracy_by_country, orient = 'index')
resultDF

Unnamed: 0,train,val,test
Ecuador,94.995262,95.174925,94.987999
Bolivia,93.899956,93.973768,94.045637
Colombia,95.570011,95.713757,95.615803
Chile,95.926491,95.950364,95.975567


Unlabeled Data

In [None]:
#workDir = r'/disk1/target_stance_classification/Data/Splits/Subsampled' # Server

# Output prefix -> input file with the first- and second-neighbor users of the selected country.
# NOTE(review): `workDir` and `outDir` are not defined in this cell; they must still hold the
# data and output folders assigned by an earlier cell - confirm before running.
datasets = {'first': r"2-{}_first_neighbor_df.csv".format(COUNTRY), 'second': r"2-{}_second_neighbor_df.csv".format(COUNTRY)}
num_workers = 6

# Settings shared by both files; label_column is None because this data is unlabeled.
# There is no 'SEED' entry here, unlike the labeled run - presumably the config default
# applies; TODO confirm.
shared_params = {
    'tokenizer': tokenizer,
    'interaction_categories': INTERACTION_TYPES,
    'max_tw_per_user': MAX_TW_LEN,
    'label_column': None,
    'max_seq_len':MAX_SEQ_LEN
}

for key, f in datasets.items():
    set_seed(SEED)
    print('## Working with File: ' + f)

    # Build the dataset for this file from its path plus the shared settings.
    stance_params = {'user_file_name': os.path.join(workDir, f), **shared_params}
    dataConfig = StanceDatasetConfig(**stance_params)
    data = StanceDataset(dataConfig)
    data_loader = DataLoader(
        data,
        batch_size = BATCH_SIZE,
        num_workers = num_workers,
        collate_fn = data._Stance_datacollator,
        shuffle = True,
    )

    resultsDF, embeddings = predict(model, data_loader)

    # Save Results: <key>_neighbor_predictions.csv and <key>_neighbor_embeddings.pt inside outDir
    predictions_path = os.path.join(outDir, '{}_neighbor_predictions.csv'.format(key))
    embeddings_path = os.path.join(outDir, '{}_neighbor_embeddings.pt'.format(key))
    resultsDF.to_csv(predictions_path, index = False)
    torch.save(embeddings, embeddings_path)

In [None]:
## Get 