# Predict User Embeddings

Load Best Model

In [1]:
import torch, os

# Countries covered by the study; COUNTRY picks the one whose best checkpoint
# is loaded further down in this notebook.
countries = [
    'Ecuador',
    'Colombia',
    'Bolivia',
    'Chile',
]
COUNTRY = countries[3]

## Global Parameters
MAX_SEQ_LEN = 128  # passed to the tokenizer (max_len) and to the dataset config (max_seq_len)
MAX_TW_LEN = 15    # passed as max_tweet_number / max_tw_per_user
BATCH_SIZE = 64
SEED = 1911
# The position of each entry is used as its index (see the '<cls>' / '<pad>' lookups below).
INTERACTION_TYPES = ['<cls>', '<pad>', 'Original', 'Quote', 'Reply', 'Retweet']

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
from UserModules.ModelConfiguration import *
from UserModules.UserClassifier import User_Stance_Classifier

# --- Tweet-level encoder configuration ---
tweet_enc_args = dict(
    Model_Dir=r'E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model',
    dropout=0.1,
    activation='Tanh',
    freeze_bert_embeddings=True,
)
tweetConfig = RoBERTaEncoderConfig(**tweet_enc_args)

# --- Embedding-layer configuration ---
# The special-token indices are derived from INTERACTION_TYPES so they stay in
# sync with the category list handed to the dataset.
emb_params = dict(
    cls_idx=INTERACTION_TYPES.index('<cls>'),
    pad_idx=INTERACTION_TYPES.index('<pad>'),
    max_tweet_number=MAX_TW_LEN,
    dropout=0.1,
    layer_norm_eps=1e-12,
    tweet_type_number=len(INTERACTION_TYPES),
    mask_embeddings=True,
)
embConfig = ModelEmbeddingsConfig(tweetConfig, **emb_params)

# --- User-level encoder configuration ---
# Per the original author's note, these are the default parameters in UserEncoderConfig.
user_params = dict(
    num_attention_heads=6,
    intermidiate_size=2048,  # spelling kept as-is: it is the keyword passed to UserEncoderConfig
    num_encoder_layers=3,
    transformer_activation='gelu',
    user_activation='Tanh',
    dropout=0.1,
    initializer_range=0.02,
    model_embedder_version='v3',  # v3 leaves the CLS parameter for the type embeddings
)
userConfig = UserEncoderConfig(embConfig, **user_params)

# Instantiate the two-class model and move it to the selected device.
model = User_Stance_Classifier(num_classes=2, user_config=userConfig)
model.to(device)
# Ending the cell with print() emits an empty line and keeps the value of
# `model.to(device)` from being the cell's displayed result.
print()

Some weights of the model checkpoint at E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [3]:
# Directory holding the best checkpoint of the classifier selected by COUNTRY.
best_dir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Results\Best_Models\{}'.format(COUNTRY)
checkpoint_file = os.path.join(best_dir, 'model_results.pth')

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location remaps the stored tensors onto the device selected for this session.
state_dict = torch.load(checkpoint_file, map_location=device)
print(f'Model loaded from <== {best_dir}')

# Despite its name, `state_dict` is the whole checkpoint dictionary: the model
# weights live under 'model_state_dict' and the stored validation loss under 'valid_loss'.
model.load_state_dict(state_dict['model_state_dict'])
state_dict['valid_loss']

Model loaded from <== E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Results\Best_Models\Chile


0.001947219159196558

Load Tokenizer

In [4]:
# Load BERT Tokenizer
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizerFast, AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
import torch, numpy as np, random, os
from tokenizers.processors import RobertaProcessing, BertProcessing

#tokDir = r'/disk1/target_stance_classification/Data/RoBETO/roberta_es_tweet_tokenizer_bpe_50k' #Server Directory
tokDir = r'E:\OneDrive\Research Group\Papers\Sudaka_BETO\Data\ROP_Task\RoBETO_Model\roberta_es_tweet_tokenizer_bpe_50k' #Office Directory

# Files that make up the BPE tokenizer.
vocab_file = os.path.join(tokDir, 'vocab.json')
merges_file = os.path.join(tokDir, 'merges.txt')
tokenizer_json = os.path.join(tokDir, 'es_tweet_tokenizer_bpe_50k.json')

tokenizer = RobertaTokenizerFast(vocab_file, merges_file, tokenizer_file = tokenizer_json, max_len = MAX_SEQ_LEN)

# BertProcessing is used instead of RobertaProcessing so that sequence pairs are
# joined BERT-style, <s> A </s> B </s>, rather than with RoBERTa's doubled
# separator, <s> A </s></s> B </s>. Per the original author's note, this makes the
# tokenizer return different IDs for the target and the reply.
sep_pair = (tokenizer.eos_token, tokenizer.eos_token_id)
cls_pair = (tokenizer.bos_token, tokenizer.bos_token_id)
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Special-token ids exposed as notebook-level constants.
PAD_ID = tokenizer.pad_token_id
CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id

Process Datasets

In [5]:
# Helper Function
import pandas as pd
from tqdm.notebook import tqdm, trange

def predict(model, data_loader):
    """Run `model` over every batch of `data_loader` and collect predictions and user embeddings.

    The model is switched to eval mode and executed under `torch.no_grad()`.
    Inputs are moved to the notebook-level `device` before the forward pass.

    Args:
        model: Callable as `model(input_ids, attention_mask, interaction_types, tweet_masks)`
            returning a `(logits, user_emb)` pair.
        data_loader: Iterable of dict batches with the keys 'batched_ids', 'input_ids',
            'attention_mask', 'interaction_types', 'tweet_masks' and 'labels'.
            `batch['labels']` may be None for unlabeled data.

    Returns:
        A `(resultsDF, embeddings)` tuple:
          * resultsDF: DataFrame with the columns 'user_id', 'true_label'
            (None for unlabeled batches) and 'predicted_label'.
          * embeddings: CPU tensor with the per-batch user embeddings concatenated along
            dim 0, in the same batch order as resultsDF (presumably one row per user —
            confirm against the model). An empty tensor if the loader yields no batches.
    """
    model.eval()
    results = {'user_id': [], 'true_label': [], 'predicted_label': []}
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            # Identifiers and labels are only recorded, so they are not moved to the device.
            batched_ids, labels = batch['batched_ids'], batch['labels']
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            interaction_types = batch['interaction_types'].to(device)
            tweet_masks = batch['tweet_masks'].to(device)

            # Predict: the class with the highest logit along dim 1.
            logits, user_emb = model(input_ids, attention_mask, interaction_types, tweet_masks)
            _, preds = torch.max(logits, dim=1)

            # Move each batch of embeddings off the accelerator right away; keeping all
            # of them on the GPU until the end makes memory grow with the dataset size.
            embeddings.append(user_emb.detach().cpu())
            results['user_id'].extend(batched_ids)
            if labels is None:
                results['true_label'].extend([None] * len(batched_ids))
            else:
                results['true_label'].extend(list(labels.detach().cpu().numpy()))
            results['predicted_label'].extend(list(preds.detach().cpu().numpy()))

    # torch.cat raises on an empty list, so an empty loader yields an empty tensor instead.
    embeddings = torch.cat(embeddings) if embeddings else torch.empty(0)
    resultsDF = pd.DataFrame(results)
    return resultsDF, embeddings

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

Weak Labeled Data

In [6]:
from UserModules.StanceDataset import *
from torch.utils.data import Dataset, DataLoader

#workDir = r'/disk1/target_stance_classification/Data/Splits/Subsampled' # Server
workDir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Splits\Subsampled' # Office
outDir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Results\Predictions\Cross_Predictions\{}_Classifier'.format(COUNTRY)
# makedirs(exist_ok=True) also creates missing parent folders and does not fail
# when the directory already exists (os.mkdir raised if the parent was missing).
os.makedirs(outDir, exist_ok=True)
num_workers = 6

# Apply the COUNTRY classifier to the labeled splits of every *other* country.
countries = ['Chile', 'Ecuador', 'Colombia', 'Bolivia']
for other_country in countries:
    if other_country == COUNTRY:
        continue
    print('### Predicting stances in country: ' + other_country)
    # split key -> CSV file name inside workDir
    datasets = {
        'train': r"2a-train_{}_dataframe.csv".format(other_country),
        'test': r"2c-test_{}_dataframe.csv".format(other_country),
        'val': r"2b-validation_{}_dataframe.csv".format(other_country),
    }

    for key, f in datasets.items():
        # Re-seed before each file so every split is processed from the same RNG state.
        set_seed(SEED)
        print(' ## Working with File: ' + f)
        # Define the dataset for this split
        stance_params = {
            'user_file_name': os.path.join(workDir, f),
            'tokenizer': tokenizer,
            'interaction_categories': INTERACTION_TYPES,
            'max_tw_per_user': MAX_TW_LEN,
            'label_column': 'user_government_stance',
            'SEED': SEED,
            'max_seq_len':MAX_SEQ_LEN
        }
        dataConfig = StanceDatasetConfig(**stance_params)
        data = StanceDataset(dataConfig)
        # Inference only: shuffling adds nothing here, so rows are kept in dataset order.
        # This makes the saved predictions/embeddings independent of the RNG state.
        data_loader = DataLoader(
            data,
            batch_size = BATCH_SIZE,
            num_workers = num_workers,
            collate_fn = data._Stance_datacollator,
            shuffle = False,
        )

        resultsDF, embeddings = predict(model, data_loader)

        # Save Results: row i of the CSV and row i of the embeddings file come from the same batch order.
        resultsDF.to_csv(os.path.join(outDir, '{}_{}_predictions.csv'.format(other_country, key)), index = False)
        torch.save(embeddings, os.path.join(outDir, '{}_{}_embeddings.pt'.format(other_country, key)))

### Predicting stances in country: Ecuador
 ## Working with File: 2a-train_Ecuador_dataframe.csv


Predicting:   0%|          | 0/973 [00:00<?, ?it/s]

 ## Working with File: 2c-test_Ecuador_dataframe.csv


Predicting:   0%|          | 0/111 [00:00<?, ?it/s]

 ## Working with File: 2b-validation_Ecuador_dataframe.csv


Predicting:   0%|          | 0/121 [00:00<?, ?it/s]

### Predicting stances in country: Colombia
 ## Working with File: 2a-train_Colombia_dataframe.csv


Predicting:   0%|          | 0/1367 [00:00<?, ?it/s]

 ## Working with File: 2c-test_Colombia_dataframe.csv


Predicting:   0%|          | 0/156 [00:00<?, ?it/s]

 ## Working with File: 2b-validation_Colombia_dataframe.csv


Predicting:   0%|          | 0/169 [00:00<?, ?it/s]

### Predicting stances in country: Bolivia
 ## Working with File: 2a-train_Bolivia_dataframe.csv


Predicting:   0%|          | 0/1428 [00:00<?, ?it/s]

 ## Working with File: 2c-test_Bolivia_dataframe.csv


Predicting:   0%|          | 0/160 [00:00<?, ?it/s]

 ## Working with File: 2b-validation_Bolivia_dataframe.csv


Predicting:   0%|          | 0/177 [00:00<?, ?it/s]

In [6]:
from UserModules.StanceDataset import *
from torch.utils.data import Dataset, DataLoader


Unlabeled Data

In [7]:
#workDir = r'/disk1/target_stance_classification/Data/Splits/Subsampled' # Server
workDir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Data\Splits\Unlabeled' # Office
outDir = r'E:\OneDrive\Research Group\Papers\Target_Stance_Classification\Results\Predictions\Cross_Predictions\{}_Classifier'.format(COUNTRY)
# Create the output folder here as well, so this cell does not depend on an
# earlier cell having created it.
os.makedirs(outDir, exist_ok=True)
num_workers = 4  # loop-invariant, so set once up front

for other_country in countries:
    if other_country == COUNTRY:
        continue
    # NOTE(review): this filter and the `key == 'first'` filter below look like leftovers
    # from re-running only the job that failed (see the "Second Failed" remark).
    # Remove both to regenerate every neighbor file.
    if other_country in {'Ecuador', 'Colombia'}:
        continue
    print('### Predicting stances in country: ' + other_country)
    # neighbor key -> CSV file name inside workDir
    datasets = {
        'first': r"2-{}_first_neighbor_df.csv".format(other_country),
        'second': r"2-{}_second_neighbor_df.csv".format(other_country),  # Second Failed
    }
    for key, f in datasets.items():
        if key == 'first':
            continue
        # Re-seed before each file so every run starts from the same RNG state.
        set_seed(SEED)
        print(' ## Working with File: ' + f)
        # Define the (unlabeled) dataset: no label column is available
        stance_params = {
            'user_file_name': os.path.join(workDir, f),
            'tokenizer': tokenizer,
            'interaction_categories': INTERACTION_TYPES,
            'max_tw_per_user': MAX_TW_LEN,
            'label_column': None,
            'max_seq_len':MAX_SEQ_LEN
        }

        dataConfig = StanceDatasetConfig(**stance_params)
        data = StanceDataset(dataConfig)
        # Inference only: shuffling adds nothing here, so rows are kept in dataset order.
        data_loader = DataLoader(
            data,
            batch_size = BATCH_SIZE,
            num_workers = num_workers,
            collate_fn = data._Stance_datacollator,
            shuffle = False,
        )

        resultsDF, embeddings = predict(model, data_loader)

        # Save Results: row i of the CSV and row i of the embeddings file come from the same batch order.
        resultsDF.to_csv(os.path.join(outDir, '{}_{}_neighbor_predictions.csv'.format(other_country, key)), index = False)
        torch.save(embeddings, os.path.join(outDir, '{}_{}_neighbor_embeddings.pt'.format(other_country, key)))

### Predicting stances in country: Bolivia
 ## Working with File: 2-Bolivia_second_neighbor_df.csv


Predicting:   0%|          | 0/1761 [00:00<?, ?it/s]

# Results


In [None]:
from IPython.display import display_html

def print_results(self):
    """Render the training and development rows of `self.resultsDF` side by side.

    Expects `self.output` to be usable as a context manager exposing
    `clear_output`, and `self.resultsDF` to have a `data_source` column whose
    values include 'training' and 'development'. The previous output is cleared
    and both tables are displayed as inline HTML separated by ten non-breaking spaces.
    """
    def _inline_table(source, caption):
        # Rows of one data source, without the selector column, styled to sit inline.
        subset = self.resultsDF[self.resultsDF.data_source == source].drop(columns = 'data_source')
        return subset.style.set_table_attributes("style='display:inline'").set_caption(caption)

    with self.output:
        self.output.clear_output(wait = True)
        train_styler = _inline_table('training', 'Training')
        dev_styler = _inline_table('development', 'Development')

        gap = "\xa0" * 10
        combined_html = train_styler._repr_html_() + gap + dev_styler._repr_html_()
        display_html(combined_html, raw=True)

In [21]:
import pandas as pd, os, numpy as np
from sklearn.metrics import  accuracy_score, f1_score

# Folder that contains one '<Country>_Classifier' sub-folder per trained classifier.
# With the empty placeholder the sub-folders are resolved relative to the working directory.
workDir = r''
main_classifier = ['Bolivia', 'Ecuador', 'Colombia', 'Chile']
countries = ['Ecuador', 'Bolivia', 'Colombia', 'Chile']
datasets = ['train', 'val', 'test']

for main_c in main_classifier:
    # os.path.join instead of a hard-coded '\' keeps the path valid on every OS.
    country_dir = os.path.join(workDir, '{}_Classifier'.format(main_c))
    # country -> {'accuracy', 'f1', 'majority'}; one row per evaluated country.
    result_dict = {}
    for country in countries:
        if main_c == country:
            continue
        # Pool the train/val/test predictions of this country into one evaluation set.
        true_parts, pred_parts = [], []
        for dt in datasets:
            dataDF = pd.read_csv(os.path.join(country_dir, '{}_{}_predictions.csv'.format(country, dt)), dtype = {'user_id': str})
            true_parts.append(dataDF.true_label.values)
            pred_parts.append(dataDF.predicted_label.values)

        true_lbl = np.concatenate(true_parts)
        pred_lbl = np.concatenate(pred_parts)
        # Majority-class baseline; assumes the labels are encoded as 0/1 so that the
        # mean is the share of class 1 — TODO confirm.
        maj = true_lbl.mean()
        result_dict[country] = {
            'accuracy': accuracy_score(true_lbl, pred_lbl),
            'f1': f1_score(true_lbl, pred_lbl, average = 'macro'),
            'majority': maj if maj > 0.5 else 1 - maj,
        }

    resultsDF = pd.DataFrame.from_dict(result_dict, orient = 'index')
    print(f' ############ {main_c} Classifier ############')
    display(resultsDF)

 ############ Bolivia Classifier ############


Unnamed: 0,accuracy,f1,majority
Ecuador,0.252541,0.214831,0.668103
Colombia,0.175506,0.173572,0.738234
Chile,0.099507,0.099499,0.868632


 ############ Ecuador Classifier ############


Unnamed: 0,accuracy,f1,majority
Bolivia,0.229658,0.211767,0.518435
Colombia,0.817378,0.783204,0.738234
Chile,0.921895,0.80918,0.868632


 ############ Colombia Classifier ############


Unnamed: 0,accuracy,f1,majority
Ecuador,0.812483,0.788866,0.668103
Bolivia,0.391104,0.353229,0.518435
Chile,0.849993,0.752241,0.868632


 ############ Chile Classifier ############


Unnamed: 0,accuracy,f1,majority
Ecuador,0.848792,0.833961,0.668103
Bolivia,0.318923,0.308817,0.518435
Colombia,0.875994,0.84855,0.738234


In [20]:

# Show the current `resultsDF`. The evaluation loop reassigns this name for every
# classifier, so only the table built last is still available here.
resultsDF

Unnamed: 0,accuracy,f1,majority
Ecuador,0.252541,0.214831,0.668103
Colombia,0.175506,0.173572,0.738234
Chile,0.099507,0.099499,0.868632


## Duplicates

In [3]:
import pandas as pd, os

resDir = r''
country = 'Bolivia'

def _read_predictions(file_name):
    """Read one predictions CSV from `resDir`, keeping `user_id` as a string."""
    return pd.read_csv(os.path.join(resDir, file_name), dtype = {'user_id': str})

# One frame per prediction file, loaded in the same order as the names on the left.
trainDF, valDF, testDF, firstNeighDF, secondNeighDF = (
    _read_predictions(file_name)
    for file_name in (
        'train_predictions.csv',
        'val_predictions.csv',
        'test_predictions.csv',
        'first_neighbor_predictions.csv',
        'second_neighbor_predictions.csv',
    )
)