In [1]:
from datasets import Dataset

import transformers

from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    set_seed, 
)

import torch
import pandas as pd
import numpy as np
import itertools
import nltk
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from plotly import graph_objects as go
from collections import Counter
from scipy.stats import linregress

[nltk_data] Downloading package wordnet to /home/chistik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
set_seed(41)
cache_dir="./cache"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [3]:
def get_model_tokenizer(model_name):
    config = AutoConfig.from_pretrained(model_name)
    model = AutoModel.from_pretrained(
        model_name,
        config=config,
        cache_dir=cache_dir,
        torch_dtype=config.torch_dtype).to(device)
    
    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, use_fast=True, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, use_fast=True)
        
    if tokenizer.pad_token is None:
        if tokenizer.eos_token:
            tokenizer.pad_token = tokenizer.eos_token  
            print(f'Set pad_token to be equal to eos_token: {tokenizer.pad_token}')
        else:
            raise ValueError("The tokenizer does not have an eos_token set.")
    return model, tokenizer

In [4]:
def process_row(row):
    text = row['text']['en'].lower()
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    terms_with_counts = row['terms']
    flattened_terms = list(itertools.chain.from_iterable([[term] * count for term_dict in terms_with_counts for term in term_dict['term'].split('_') for count in [term_dict['count']]]))
    number_of_terms = len(flattened_terms)

    labels = []
    for word in lemmatized_tokens:
        if word in flattened_terms:
            labels.append('term')
            flattened_terms.remove(word)
        else:
            labels.append('not_term')
    assert len(tokens) == len(labels)        
    if number_of_terms > labels.count('term'):
        if number_of_terms - labels.count('term') > 5:
            return np.nan, np.nan
    return tokens, labels

In [5]:
label_list=['term', 'not_term']
label_to_id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label_to_id.items()}

In [6]:
padding = "max_length" 
max_length=512
def tokenize_and_align_labels(examples, label_all_tokens=False):
    tokenized_inputs = tokenizer(
        examples['text'],
        padding=padding,
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples['terms']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                if label_all_tokens:
                    label_ids.append(label_to_id[label[word_idx]])
                else:
                    label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [7]:
model_name = 'google-bert/bert-base-uncased'
model, tokenizer = get_model_tokenizer(model_name)

In [8]:
file_path='eng_texts_terms_science.jsonl' 
df = pd.read_json(path_or_buf=file_path, lines=True)

In [9]:
data = {
    'text': [],
    'terms': []
}

for index, row in df.iterrows():
    tokens, labels = process_row(row)
    data['text'].append(tokens)
    data['terms'].append(labels)

new_df = pd.DataFrame(data)
new_df.dropna(inplace=True)
len(new_df)

680

In [10]:
all_tokens = [token for sublist in new_df['text'] for token in sublist]
len(all_tokens), len(set(all_tokens))

(99267, 12930)

In [11]:
def flatten_words_labels(texts, labels):
    return [(word, label)
            for tokens, labels in zip(texts, labels)
            for word, label in zip(tokens, labels)]

all_words_labels = flatten_words_labels(new_df['text'], new_df['terms'])
words, labels = zip(*all_words_labels)
word_freq = Counter(words)
sorted_words = sorted(word_freq, key=word_freq.get, reverse=True) # sort by frequency
sorted_freqs = [word_freq[word] for word in sorted_words]
word_to_label = dict(all_words_labels)
sorted_labels = [word_to_label[word] for word in sorted_words]
df_plot = pd.DataFrame({
    'Word': sorted_words,
    'Frequency': sorted_freqs,
    'Rank': range(1, len(sorted_freqs) + 1),
    'Label': sorted_labels
})
# expected frequency of each word under Zipfian distribution
df_plot['Zipf_Frequency'] = df_plot['Frequency'][0] / df_plot['Rank']

color_mapping = {'term': 'blue', 'not_term': 'green'}
df_plot['Color'] = df_plot['Label'].map(color_mapping)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_plot['Rank'], y=df_plot['Frequency'], mode='markers',
    marker=dict(color=df_plot['Color'], opacity=0.4), text=df_plot['Word'],
    name='Frequency'
))

fig.add_trace(go.Scatter(
    x=df_plot['Rank'], y=df_plot['Zipf_Frequency'], mode='lines',
    line=dict(color='orange', width=2), name="Zipf's Law"
))

fig.update_layout(
    title="Zipf's Law Visualization", title_x=0.5,
    xaxis=dict(title='Rank of word (log scale)', type='log'),
    yaxis=dict(title='Frequency of word (log scale)', type='log'),
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99)
)

fig.show()

def calculate_zipf_metrics(df): 
    log_ranks = np.log(df['Rank']) 
    log_freqs = np.log(df['Frequency']) 
    slope, intercept, r_value, p_value, std_err = linregress(log_ranks, log_freqs) 
    return slope, intercept, r_value, p_value, std_err 

slope, intercept, r_value, p_value, std_err = calculate_zipf_metrics(df_plot)
# value close to -1 -> dataset follows Zipf's law; ideal slope would be -1
print(f"Slope: {slope}")
print(f"Intercept: {intercept}")
# r-value close to -1 suggests a strong negative linear relationship
print(f"Correlation coefficient (r): {r_value}")
# null hypothesis that the slope is zero (no relationship)
print(f"p-value: {p_value}")

In [None]:
dataset = Dataset.from_pandas(new_df)
dataset

Dataset({
    features: ['text', 'terms', '__index_level_0__'],
    num_rows: 680
})

In [None]:
train_dataset = dataset.map(
                tokenize_and_align_labels,
                batched=True,
                desc="Running tokenizer on train dataset",
            )

Running tokenizer on train dataset:   0%|          | 0/680 [00:00<?, ? examples/s]

In [None]:
print(train_dataset[0].keys())

dict_keys(['text', 'terms', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [None]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
class CustomDataset(Dataset):
        def __init__(self, input_ids, attention_masks, labels):
            self.input_ids = input_ids
            self.attention_masks = attention_masks
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            return {
                'input_ids': self.input_ids[idx],
                'attention_mask': self.attention_masks[idx],
                'labels': self.labels[idx]
            }

In [None]:
def extract_representations(data, model, model_type='encoder'):

    input_ids = [torch.tensor(d['input_ids']).unsqueeze(0) for d in data]
    attention_masks = [torch.tensor(d['attention_mask']).unsqueeze(0) for d in data]
    labels = [torch.tensor(d['labels']).unsqueeze(0) for d in data]  
        
    custom_dataset = CustomDataset(input_ids, attention_masks, labels)

    weighted_embeddings = []
    mean_embeddings = []
    embeddings = []
    all_hidden_states = []
    label_list = []

    model.eval()
    with torch.no_grad():  
        for sequence in tqdm(custom_dataset, desc="Processing"):

            input_ids = sequence['input_ids'].to(device)
            attention_mask = sequence['attention_mask'].to(device)
           
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states = outputs[0]
        
            if model_type != 'encoder':
                sequence_lengths = (torch.ne(input_ids, tokenizer.pad_token_id).sum(-1) - 1).to(device)
                # embedding from last token
                pooled_hidden_states = hidden_states[torch.arange(1, device=device), sequence_lengths]
                # Weighted-Mean-Pooling
                # https://stackoverflow.com/questions/76926025/sentence-embeddings-from-llama-2-huggingface-opensource
                cumsum = attention_mask.cumsum(dim=1)
                weights_for_non_padding = torch.where(attention_mask == 1, cumsum, attention_mask)
                sums = torch.sum(weights_for_non_padding, dim=-1).unsqueeze(-1)
                normalized_weights = weights_for_non_padding / sums # normalize so that weights sum to 1
                sentence_embedding = torch.sum(hidden_states * normalized_weights.unsqueeze(-1), dim=1)
                weighted_embeddings.append(sentence_embedding.cpu())
            else:
                pooled_hidden_states = outputs.last_hidden_state[:,0,:] # cls for bert

            # normal average
            num_tokens = torch.sum(attention_mask, dim=-1).unsqueeze(-1)
            mean_embedding = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1)
            mean_embedding = mean_embedding / num_tokens
            mean_embeddings.append(mean_embedding.cpu())

            embeddings.append(pooled_hidden_states.cpu())
            label_list.append(sequence['labels'])
            all_hidden_states.append(hidden_states.cpu())

    final_last_embeddings = torch.cat(embeddings, dim=0)
    final_mean_embeddings = torch.cat(mean_embeddings, dim=0)
    all_hidden_states = torch.cat(all_hidden_states, dim=0)
    label_list = torch.cat(label_list, dim=0)
    
    if model_type != 'encoder':
        final_weighted_embeddings = torch.cat(weighted_embeddings, dim=0)
    return all_hidden_states, final_last_embeddings, final_mean_embeddings, label_list

In [None]:
all_hidden_states, final_last_embeddings, final_mean_embeddings, label_list = extract_representations(train_dataset, model)

Processing: 100%|██████████| 680/680 [00:09<00:00, 70.58it/s]


In [None]:
assert final_last_embeddings.shape == final_mean_embeddings.shape

In [None]:
print(all_hidden_states.shape)
print(label_list.shape)

torch.Size([680, 512, 768])
torch.Size([680, 512])


In [None]:
def similarity(word_type='term', sentence_representation=final_last_embeddings):
    cosine_similarities = []
    num_terms = 0
    num_examples, max_tokens, emb_dim = all_hidden_states.size()
    for i in range(num_examples):
        term_mask = label_list[i] == label_to_id[word_type]  # a boolean mask
        term_embeddings = all_hidden_states[i][term_mask]
        num_terms += term_embeddings.shape[0]
        final_embedding = sentence_representation[i].view(1, -1)
        for token_embedding in term_embeddings:
            token_embedding = token_embedding.view(1, -1) 
            similarity = F.cosine_similarity(token_embedding, final_embedding)
            cosine_similarities.append(similarity.item())  
    return cosine_similarities

In [None]:
cosine_similarities = similarity()
np.average(cosine_similarities)

0.18263093057169646

In [None]:
cosine_similarities = similarity(word_type='not_term')
np.average(cosine_similarities)

0.19237815565088573

In [None]:
cosine_similarities = similarity(sentence_representation=final_mean_embeddings)
np.average(cosine_similarities)

0.5574284500752051

In [None]:
cosine_similarities = similarity(word_type='not_term', sentence_representation=final_mean_embeddings)
np.average(cosine_similarities)

0.5841845358850314