In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # makes figs nicer!

import functools
import itertools
import os
import torch
import transformers
import pandas as pd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from tqdm import tqdm


from scipy.spatial.distance import cosine
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_distances


sns.set(style='whitegrid',font_scale=1.2)

In [8]:
tokenizer_xlm = transformers.AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base") 
tokenizer_ml_bert = transformers.AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")  
tokenizer_beto = transformers.AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")  
tokenizer_bert = transformers.AutoTokenizer.from_pretrained("bert-base-cased")  

In [9]:
### Faster to check membership in a set than a list
xlm_vocab = set(tokenizer_xlm.vocab.keys())
ml_bert_vocab = set(tokenizer_ml_bert.vocab.keys())
beto_vocab = set(tokenizer_beto.vocab.keys())
bert_vocab = set(tokenizer_bert.vocab.keys())

In [12]:
ml_bert_tokens = []
for t in tqdm(ml_bert_vocab):

    ml_bert_tokens.append({
        'token': t,
        'in_xlm': t in xlm_vocab,
        'in_beto': t in beto_vocab,
        'in_bert': t in bert_vocab
    })

  0%|          | 0/119547 [00:00<?, ?it/s]

In [13]:
df_ml_bert_tokens = pd.DataFrame(ml_bert_tokens)
df_ml_bert_tokens.head(3)

Unnamed: 0,token,in_xlm,in_beto,in_bert
0,Noreg,False,False,False
1,cambios,False,True,False
2,##šuje,False,False,False


In [16]:
df_ml_bert_tokens[['in_xlm', 'in_beto', 'in_bert']].mean()

in_xlm     0.165341
in_beto    0.096196
in_bert    0.161142
dtype: float64

In [22]:
# Calculate intersections and proportions for each pair
shared_stats = {}

# For each pair of vocab sets, calculate the intersection and proportions
for (vocab1_name, vocab1), (vocab2_name, vocab2) in [
    (('XLM', xlm_vocab), ('ML-BERT', ml_bert_vocab)),
    (('XLM', xlm_vocab), ('BETO', beto_vocab)),
    (('XLM', xlm_vocab), ('BERT', bert_vocab)),
    (('ML-BERT', ml_bert_vocab), ('BETO', beto_vocab)),
    (('ML-BERT', ml_bert_vocab), ('BERT', bert_vocab)),
    (('BETO', beto_vocab), ('BERT', bert_vocab))
]:
    intersection = vocab1.intersection(vocab2)
    count_shared = len(intersection)
    max_vocab_size = max(len(vocab1), len(vocab2))
    proportion_shared = count_shared / max_vocab_size

    # Store the results
    shared_stats[f'{vocab1_name} + {vocab2_name}'] = {
        'shared_count': count_shared,
        'proportion_of_larger_vocab': proportion_shared
    }

# Print the results
for pair, stats in shared_stats.items():
    print(f"{pair}: {stats['shared_count']} shared tokens, "
          f"{stats['proportion_of_larger_vocab']:.2%} of the larger vocabulary")

XLM + ML-BERT: 19766 shared tokens, 7.91% of the larger vocabulary
XLM + BETO: 3264 shared tokens, 1.31% of the larger vocabulary
XLM + BERT: 3677 shared tokens, 1.47% of the larger vocabulary
ML-BERT + BETO: 11500 shared tokens, 9.62% of the larger vocabulary
ML-BERT + BERT: 19264 shared tokens, 16.11% of the larger vocabulary
BETO + BERT: 5041 shared tokens, 16.26% of the larger vocabulary


In [40]:
import itertools

# Define the vocabularies in a dictionary
vocab_sets = {
    'XLM': xlm_vocab,
    'ML-BERT': ml_bert_vocab,
    'BETO': beto_vocab,
    'BERT': bert_vocab
}

# Calculate intersections and proportions for each pair dynamically
shared_stats = []

# Iterate over all combinations of vocab sets
for (vocab1_name, vocab1), (vocab2_name, vocab2) in itertools.combinations(vocab_sets.items(), 2):
    intersection = vocab1.intersection(vocab2)
    count_shared = len(intersection)
    max_vocab_size = max(len(vocab1), len(vocab2))
    proportion_shared = count_shared / max_vocab_size

    # Store the results
    # shared_stats[f'{vocab1_name} + {vocab2_name}'] = {
    #    'shared_count': count_shared,
    #    'proportion_of_larger_vocab': proportion_shared
    #}

    m1, m2 = sorted([vocab1_name, vocab2_name])

    shared_stats.append({
        'vocab1_name': vocab1_name,
        'vocab2_name': vocab2_name,
        'vocab1_size': len(vocab1),
        'vocab2_size': len(vocab2),
        'shared_count': count_shared,
        'proportion_of_larger_vocab': proportion_shared,
        'proportion_of_total_vocab': count_shared/(len(vocab1) + len(vocab2)),
        'max_vocab_size': max(len(vocab1), len(vocab2))
    })

In [41]:
pd.DataFrame(shared_stats)

Unnamed: 0,vocab1_name,vocab2_name,vocab1_size,vocab2_size,shared_count,proportion_of_larger_vocab,proportion_of_total_vocab,max_vocab_size
0,XLM,ML-BERT,250002,119547,19766,0.079063,0.053487,250002
1,XLM,BETO,250002,31002,3264,0.013056,0.011615,250002
2,XLM,BERT,250002,28996,3677,0.014708,0.013179,250002
3,ML-BERT,BETO,119547,31002,11500,0.096196,0.076387,119547
4,ML-BERT,BERT,119547,28996,19264,0.161142,0.129686,119547
5,BETO,BERT,31002,28996,5041,0.162602,0.084019,31002
