In [None]:
import os
# Expose only GPU index 0 to CUDA. This lives in the first cell so the variable
# is already set when `torch` is imported in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [2]:
import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from utils.utils import load_and_prepare_data, prepare_for_training, preprocess
from tqdm import tqdm
# Register `progress_apply` on pandas objects; the NER cells further down use it.
tqdm.pandas()

from pandarallel import pandarallel

# Start the pandarallel worker pool that backs the `parallel_apply` calls.
pandarallel.initialize(
    progress_bar=True,
    nb_workers=30,
)

INFO: Pandarallel will run on 30 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


## Huggingface embeddings + cosine similarity

In [6]:
from methods.transformers.HF_emb_COSSIM.emb_distance import emb_distance

In [7]:
# CSV files that together form the dataset for this experiment.
data_labels = [
    'loaded_data_cleaned.csv',
    'loaded_data_2_cleaned.csv',
]
# Load them through the project helper, handing it `preprocess` as the second argument.
data = load_and_prepare_data(data_labels, preprocess)

In [8]:
# Rebinds `data` to the helper's output. Re-running this cell without first
# re-running the load cell above feeds already-prepared data back in.
data = prepare_for_training(data)

In [9]:
# Checkpoint identifiers evaluated by the embedding-distance loop below.
models = [
    'distilbert-base-multilingual-cased',
    'bert-base-multilingual-cased',
    'bert-base-multilingual-uncased',
    'xlm-mlm-xnli15-1024',
    'xlm-roberta-base',
    'xlm-roberta-large',
    'facebook/m2m100_418M',
]

In [11]:
# Run `emb_distance` for every checkpoint in `models` and keep the returned
# dict under the checkpoint name.
results = {}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
results_path = "./results/results_trans_part2.pickle"
# Create the output directory up front. Without it `open` fails inside the
# `try` below, the error is only printed, and no result file is ever written.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
# The loop variable is `model_name` (not `model`) so it cannot be confused with
# the NLI `model` object created later in the notebook.
for model_name in tqdm(models):
    print(f"Start {model_name}")
    # Drop cached GPU allocations left over from the previous checkpoint.
    torch.cuda.empty_cache()
    try:
        results[model_name] = emb_distance(model_name, data, device = device)
        print("Corr: ", results[model_name]["correlation"])
        # Re-dump the whole dict after each checkpoint so finished ones
        # survive a failure later in the loop.
        with open(results_path, "wb") as f:
            pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        # Deliberately best-effort: continue with the remaining checkpoints,
        # but report which one failed and with what exception type.
        print(f"{model_name} failed with {type(e).__name__}: {e}")

In [None]:
# Dump the per-model result dicts collected by the loop above.
print(results)

## Huggingface embeddings + FC + L2NORM

In [5]:
from methods.transformers.HF_emb_FCL2NORM.train import train_eval

In [6]:
# Same two CSV files as the previous section, reloaded from scratch.
data_labels = [
    'loaded_data_cleaned.csv',
    'loaded_data_2_cleaned.csv',
]
# `preprocess` is handed to the loader as its second argument.
data = load_and_prepare_data(data_labels, preprocess)

In [7]:
# Rebinds `data` in place of the loaded frame; not idempotent, so re-run the
# load cell above before re-running this one.
data = prepare_for_training(data)

In [8]:
# Pickled per-model settings (batch sizes, linear layer size, epoch count)
# that the train/eval loop below reads.
config_path = "methods/transformers/HF_emb_FCL2NORM/models_configs_HF_FC_L2NORM.pickle"

# NOTE(review): unpickling can execute arbitrary code; only load a config
# file that comes from a trusted source.
with open(config_path, "rb") as config_file:
    model_configs = pickle.load(config_file)

In [9]:
# Checkpoint identifiers for this section.
# NOTE(review): the loop below iterates `model_configs`, not this list.
models = [
    'distilbert-base-multilingual-cased',
    'bert-base-multilingual-cased',
    'bert-base-multilingual-uncased',
    'xlm-mlm-xnli15-1024',
    'xlm-roberta-base',
    'xlm-roberta-large',
    'facebook/m2m100_418M',
]

In [15]:
# Call `train_eval` once per entry of `model_configs` with `train = False`.
# `result_FC_L2Norm_cosim1` is passed into every call.
# NOTE(review): presumably `train_eval` fills that dict and `train = False`
# means "evaluate from saved checkpoints" — confirm in methods/transformers/HF_emb_FCL2NORM/train.py.
result_FC_L2Norm_cosim1 = {}
checkpoints_path = "./checkpoints/HF_FC_L2NORM"
figs_path = "./figs/HF_FC_L2NORM"
# Iterate name/config pairs directly: the old `enumerate` index was never used
# and each config was looked up four times per iteration.
for model_name, config in model_configs.items():
    train_eval(model_name, data, config["batch_size"], config["batch_size_val"],
               config["linear_layer_size"], config["num_epoch"],
               result_FC_L2Norm_cosim1, train = False, checkpoints_path = checkpoints_path, figs_path = figs_path)

## Huggingface embeddings + FC + Regression

In [3]:
from methods.transformers.HF_emb_FCReg.train import train_eval

In [4]:
# Reload the two CSV files; this section calls the loader without `preprocess`.
data_labels = [
    'loaded_data_cleaned.csv',
    'loaded_data_2_cleaned.csv',
]
data = load_and_prepare_data(data_labels)

In [5]:
# Prepare the frame with the "HF_emb_FCReg" method selector. Rebinds `data`,
# so re-run the load cell above before re-running this one.
data = prepare_for_training(data, method = "HF_emb_FCReg")

In [6]:
# Pickled per-model settings (batch sizes, linear layer size, epoch count)
# that the regression train/eval loop below reads.
config_path = "methods/transformers/HF_emb_FCReg/models_configs_HF_emb_FCReg.pickle"

# NOTE(review): unpickling can execute arbitrary code; only load a config
# file that comes from a trusted source.
with open(config_path, "rb") as config_file:
    model_configs = pickle.load(config_file)

In [7]:
# Checkpoint identifiers for this section.
# NOTE(review): the loop below iterates `model_configs`, not this list.
models = [
    'distilbert-base-multilingual-cased',
    'bert-base-multilingual-cased',
    'bert-base-multilingual-uncased',
    'xlm-mlm-xnli15-1024',
    'xlm-roberta-base',
    'xlm-roberta-large',
    'facebook/m2m100_418M',
]

In [10]:
# Call `train_eval` once per entry of `model_configs` with `train = False`.
# `result_HF_emb_FCReg` is passed into every call.
# NOTE(review): presumably `train_eval` fills that dict and `train = False`
# means "evaluate from saved checkpoints" — confirm in methods/transformers/HF_emb_FCReg/train.py.
result_HF_emb_FCReg = {}
checkpoints_path = "./checkpoints/HF_emb_FCReg"
figs_path = "./figs/HF_emb_FCReg"
# Iterate name/config pairs directly: the old `enumerate` index was never used
# and each config was looked up four times per iteration.
for model_name, config in model_configs.items():
    train_eval(model_name, data, config["batch_size"], config["batch_size_val"],
               config["linear_layer_size"], config["num_epoch"],
               result_HF_emb_FCReg, train = False, checkpoints_path = checkpoints_path, figs_path = figs_path)

## NLI

In [3]:
from methods.NLI.Basic.extractor import nli_extractor
from methods.NLI.Basic.train import train_eval

In [4]:
# Reload the two CSV files for the NLI experiment; no `preprocess` is passed here.
data_labels = [
    'loaded_data_cleaned.csv',
    'loaded_data_2_cleaned.csv',
]
data = load_and_prepare_data(data_labels)

In [5]:
# Prepare the frame with the "NLI" method selector. Rebinds `data`, so re-run
# the load cell above before re-running this one.
data = prepare_for_training(data, method = "NLI")

In [6]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hub checkpoint used for NLI scoring (name suggests XLM-RoBERTa large tuned on XNLI).
model_name = 'joeddav/xlm-roberta-large-xnli'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the sequence-classification model, then move its weights to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Options handed to `nli_extractor` in the next cell.
# NOTE(review): the key names match torch DataLoader keyword arguments, so
# they are presumably forwarded to a DataLoader — confirm in the extractor.
params = dict(batch_size=1, shuffle=False, num_workers=0)

In [8]:
# Run the NLI model over `data` with the tokenizer, loader options and device set up above.
# NOTE(review): `df_nli` is reassigned two cells below from `data.join(nli)`,
# where `nli` is read from ./checkpoints/NLI/nli_scores.csv. Whether that CSV
# is this call's saved output is not visible in this notebook.
df_nli = nli_extractor(data, model, tokenizer, params, device)

7582it [02:40, 47.14it/s]


In [6]:
# Load previously saved NLI scores and use the 'Unnamed: 0' column as the index.
# NOTE(review): that header is presumably an index written by an earlier `to_csv` — confirm.
nli = pd.read_csv("./checkpoints/NLI/nli_scores.csv", index_col='Unnamed: 0')

In [7]:
# Attach the NLI score columns to `data`, aligning rows on the index.
# This replaces the `df_nli` produced by the `nli_extractor` cell above.
df_nli = data.join(nli)

In [8]:
# `train_eval` here is the one imported from methods.NLI.Basic.train in this
# section; earlier sections bound the same name to different functions.
# The recorded output lists one correlation per regressor.
res = train_eval(df_nli)

LinearRegression correlation: 0.09630289373511501
Lasso correlation: 3.4153926468658365e-16
Ridge correlation: 0.09398195676364932
ElasticNet correlation: 3.4153926468658365e-16
DecisionTreeRegressor correlation: 0.6086772422091575
KNeighborsRegressor correlation: 0.32554466721559555
GradientBoostingRegressor correlation: 0.71488954402111


## NER

In [5]:
from methods.NER.ner_extractor import NerExtractor
from methods.NER.score_calculator import ScoreCounter
from methods.NER.train import train_eval

  from pandas import Panel


In [3]:
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from polyglot.text import Text
import spacy 

from tqdm import tqdm,trange
import pickle
import gc

In [4]:
# NER back-ends listed for this section.
extract_NER_methods = ["Huggingface", "Polyglot", "Spacy"]
# Active back-end: the first entry ("Huggingface").
method = extract_NER_methods[0]

In [5]:
# Reload the two CSV files for the NER experiment.
data_labels = [
    'loaded_data_cleaned.csv',
    'loaded_data_2_cleaned.csv',
]
# Only the Huggingface back-end gets `preprocess` applied at load time;
# every other back-end uses the loader's default.
load_kwargs = {"preprocess_func": preprocess} if method == "Huggingface" else {}
data = load_and_prepare_data(data_labels, **load_kwargs)

In [6]:
# Build the extractor for the back-end selected in `method`.
nerExtractor = NerExtractor(method = method)

In [29]:
# Extract named entities for both texts of every row with `extract3ner`.
# `nerExtractor.extract1ner` is the alternative that returns a single vector of NERs.
# Row fields are read by column label. The previous `x[0]` / `x[1]` relied on
# integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated.
if method == "Huggingface":
    # Huggingface runs in the main process with a tqdm progress bar.
    data["ner1"] = data[["url1_lang", "content1"]].progress_apply(
        lambda x: nerExtractor.extract3ner(x["content1"], text_lang = x["url1_lang"]), axis=1)
    data["ner2"] = data[["url2_lang", "content2"]].progress_apply(
        lambda x: nerExtractor.extract3ner(x["content2"], text_lang = x["url2_lang"]), axis=1)
else:
    # The other back-ends are spread over the pandarallel workers.
    data["ner1"] = data[["url1_lang", "content1"]].parallel_apply(
        lambda x: nerExtractor.extract3ner(x["content1"], text_lang = x["url1_lang"]), axis=1)
    data["ner2"] = data[["url2_lang", "content2"]].parallel_apply(
        lambda x: nerExtractor.extract3ner(x["content2"], text_lang = x["url2_lang"]), axis=1)


  2%|▏         | 164/7582 [00:29<08:55, 13.85it/s][A

In [7]:
# Load previously extracted NER results from disk.
# NOTE(review): the file name says Polyglot while `method` above is set to
# "Huggingface" — confirm this is the intended checkpoint.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open("./checkpoints/NER/Polyglot_ner_2parts.pickle", "rb") as ner_file:
    ners = pickle.load(ner_file)

In [8]:
# Copy both stored NER columns from the loaded checkpoint into `data`.
for ner_col in ('ner1', 'ner2'):
    data[ner_col] = ners[ner_col]
# Entity-type keys the scoring loops below iterate over.
keys = ['LOC', 'PER', 'ORG']

In [46]:
# fastText variant of the scorer: built with loadFastText = True and pointed at
# the ./fasttext_models/ directory.
scoreCounter = ScoreCounter(loadFastText = True, ft_models_path = './fasttext_models/')

In [None]:
# For every entity type, score each row's two entity collections with
# `fasttext_scores`, passing the language of each side.
# The tf-idf and transformers loops below write the same `sim_<key>` columns,
# so whichever runs last wins.
# Row fields are read by column label. The previous `x[0]`..`x[3]` relied on
# integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated.
for key in keys:
    data["sim_" + key] = data[["ner1", "ner2", "url1_lang", "url2_lang"]].progress_apply(
        lambda x: scoreCounter.fasttext_scores(x["ner1"][key], x["ner2"][key],
                                               x["url1_lang"], x["url2_lang"]),
        axis = 1)

In [13]:
# tf-idf variant of the scorer: built with needVocab = True over `data`
# (the recorded output shows it printing "Creating vocabulary...").
# Rebinding `scoreCounter` replaces the fastText scorer created above.
scoreCounter = ScoreCounter(needVocab = True, data = data)

100%|██████████| 7582/7582 [00:00<00:00, 173908.26it/s]
100%|██████████| 7582/7582 [00:00<00:00, 202498.75it/s]

Creating vocabulary...
Created





In [18]:
# For every entity type, score each row's two entity collections with `tf_idf_scores`.
# The result goes into the same `sim_<key>` columns the other scorers use.
# Row fields are read by column label. The previous `x[0]` / `x[1]` relied on
# integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated.
for key in keys:
    data["sim_" + key] = data[["ner1", "ner2"]].progress_apply(
        lambda x: scoreCounter.tf_idf_scores(x["ner1"][key], x["ner2"][key], key),
        axis = 1)

100%|██████████| 7582/7582 [00:02<00:00, 3720.30it/s]
100%|██████████| 7582/7582 [00:02<00:00, 3049.51it/s]
100%|██████████| 7582/7582 [00:01<00:00, 4000.79it/s]


In [9]:
# Transformer variant of the scorer, built on "bert-base-multilingual-uncased"
# (the recorded output shows the model being loaded at construction time).
# Rebinding `scoreCounter` replaces the scorer created above.
scoreCounter = ScoreCounter(loadTransformers = True, hf_model_name = "bert-base-multilingual-uncased")

Start loading transformer model


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded


In [11]:
# For every entity type, score each row's two entity collections with `transformers_scores`.
# The result goes into the same `sim_<key>` columns the other scorers use.
# Row fields are read by column label. The previous `x[0]` / `x[1]` relied on
# integer keys falling back to positions on a label-indexed Series, which
# pandas has deprecated.
for key in keys:
    data["sim_" + key] = data[["ner1", "ner2"]].progress_apply(
        lambda x: scoreCounter.transformers_scores(x["ner1"][key], x["ner2"][key]),
        axis = 1)

 25%|██▍       | 1882/7582 [00:50<02:05, 45.41it/s]

In [6]:
# Fit and score the regressors on the NER similarity features (`train_eval`
# is the one imported from methods.NER.train in this section).
# The original cell passed `df`, which no cell in this notebook defines, so it
# raised NameError on a fresh kernel. `data` is the frame that carries the
# `sim_*` columns computed above.
# NOTE(review): confirm `data` has every column `train_eval` expects.
res = train_eval(data)

LinearRegression correlation: 0.25716816501388384
Lasso correlation: 2.1396889616053544e-16
Ridge correlation: 0.257168566802717
ElasticNet correlation: 2.1396889616053544e-16
DecisionTreeRegressor correlation: 0.6479410747621763
KNeighborsRegressor correlation: 0.4105426013716557
GradientBoostingRegressor correlation: 0.47861064193091774
