In [1]:
import datasets
from InstructorEmbedding import INSTRUCTOR
import torch
import collections
import imodelsx
import string
import numpy as np

## EDA of the dataset

In [2]:
#approach one using imodelsx
SST2 =imodelsx.data.load_huggingface_dataset('sst2')

In [3]:
#load ds
sst2 = datasets.load_dataset('sst2')

In [4]:
sst2

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [5]:
#dir(sst2)
sst2.column_names

{'train': ['idx', 'sentence', 'label'],
 'validation': ['idx', 'sentence', 'label'],
 'test': ['idx', 'sentence', 'label']}

In [6]:
sst2.num_rows

{'train': 67349, 'validation': 872, 'test': 1821}

In [7]:
#sample of first 10 reviews
sst2['train']['sentence'][0:10]

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films "]

In [8]:
sst2['train']['sentence'][1]

'contains no wit , only labored gags '

In [7]:
sst2['test']['sentence'][0]

'uneasy mishmash of styles and genres .'

## Compute embedding of first 10 sentence; no further training needed by 👨‍🏫

In [9]:
#dir(INSTRUCTOR)
model = INSTRUCTOR('hkunlp/instructor-large')

load INSTRUCTOR_Transformer
max_seq_length  512


In [10]:
from InstructorEmbedding import INSTRUCTOR
model = INSTRUCTOR('hkunlp/instructor-base')
sentence = sst2['train']['sentence'][0:10]
#instruction = "Sentiment classification"
embeddings = model.encode(sentence)
print(embeddings)


load INSTRUCTOR_Transformer
max_seq_length  512
[[ 0.02479305 -0.01686814  0.02904831 ...  0.00310309 -0.05065686
  -0.00042309]
 [-0.02396188 -0.00445384  0.05356156 ... -0.00792792 -0.03100418
   0.03949679]
 [-0.04197229 -0.00706967  0.04467202 ... -0.00451127 -0.01771464
   0.04970837]
 ...
 [-0.0043071  -0.01626733  0.00555013 ... -0.02947796 -0.02224189
   0.02196053]
 [-0.0187378  -0.01896087  0.05475057 ...  0.02185187 -0.02786592
   0.01402784]
 [-0.03885051 -0.01989633  0.05630855 ...  0.02359127 -0.04830608
   0.03353149]]


## Calculate Similarity of the initial two -group of five- sentences

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
sentences_a = sst2['train']['sentence'][0:5]
sentences_b = sst2['train']['sentence'][5:10]
embeddings_a = model.encode(sentences_a)
embeddings_b = model.encode(sentences_b)
similarities = cosine_similarity(embeddings_a,embeddings_b)
print(similarities)

[[0.78156936 0.7758671  0.80380815 0.7702233  0.7685855 ]
 [0.8368263  0.80808187 0.81879485 0.826748   0.8006947 ]
 [0.80458146 0.8399097  0.8168999  0.82596534 0.84078974]
 [0.7902657  0.82497215 0.81833506 0.800483   0.7910638 ]
 [0.83194566 0.8474531  0.78426355 0.7815983  0.84800655]]


## Feture importance 

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Define your sentences
sentences_a = sst2['train']['sentence'][0:5]
sentences_b = sst2['train']['sentence'][5:10]

# Tokenize and encode the sentences
encoded_group1 = tokenizer(sentences_a, padding=True, truncation=True, return_tensors="pt")
encoded_group2 = tokenizer(sentences_b, padding=True, truncation=True, return_tensors="pt")

# Calculate baseline similarity
with torch.no_grad():
    output_group1 = model(**encoded_group1).last_hidden_state
    output_group2 = model(**encoded_group2).last_hidden_state

    # Assuming you want to calculate cosine similarity, you can use torch's cosine similarity function
    similarity_score = torch.nn.functional.cosine_similarity(output_group1.mean(dim=1), output_group2.mean(dim=1), dim=1).mean()

# Now, perturb tokens and calculate similarity for each perturbed sentence
for sentence_index in range(len(sentences_a)):
    for token_index in range(len(encoded_group1["input_ids"][sentence_index])):
        # Create a copy of the original input
        perturbed_input = encoded_group1["input_ids"].clone()

        # Replace the token with a special token or padding token
        perturbed_input[sentence_index][token_index] = tokenizer.pad_token_id

        # Calculate similarity for the perturbed input
        with torch.no_grad():
            perturbed_output = model(input_ids=perturbed_input).last_hidden_state
            perturbed_similarity = torch.nn.functional.cosine_similarity(perturbed_output.mean(dim=1), output_group2.mean(dim=1), dim=1).mean()

        # Calculate the impact of this token on similarity
        impact = similarity_score - perturbed_similarity

        # Print or store the token and its impact
        print(f"Token: {tokenizer.decode(encoded_group1['input_ids'][sentence_index][token_index])}, Impact: {impact.item()}")


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Token: [CLS], Impact: 0.15352243185043335
Token: hide, Impact: 0.14318251609802246
Token: new, Impact: 0.14095744490623474
Token: secret, Impact: 0.1431151032447815
Token: ##ions, Impact: 0.14290404319763184
Token: from, Impact: 0.14425644278526306
Token: the, Impact: 0.14314311742782593
Token: parental, Impact: 0.1402343213558197
Token: units, Impact: 0.14379602670669556
Token: [SEP], Impact: 0.13814139366149902
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [PAD], Impact: 0.1437567174434662
Token: [CLS], Impact: 0.14500880241394043
Token: contains, Impact: 0.14539161324501038
Toke

In [12]:
import collections
with torch.no_grad():
    # generate ngrams up to trigrams
    sentences_a = sst2['train']['sentence'][0:5]
    sentences_b = sst2['train']['sentence'][5:10]
    d = collections.defaultdict(list)
    for i, text_i in enumerate([sentences_a, sentences_b]):
        texts = imodelsx.util.generate_ngrams_list(text_i, ngrams=3, all_ngrams=True)
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs).last_hidden_state.detach().cpu().numpy()
        embs = np.mean(outputs, axis=1).squeeze()
        embs_mean = np.mean(embs, axis=0)

        d['texts'].append(texts)
        d['embs'].append(embs)
        d['embs_mean'].append(embs_mean)

AttributeError: 'list' object has no attribute 'split'

## modified code to take care of the feature importance

In [45]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from collections import Counter

model = INSTRUCTOR('hkunlp/instructor-large')
sentences_a = sst2['train']['sentence'][0:10]
sentences_b = sst2['train']['sentence'][10:20]

def extract_ngrams(sentence, n):
    tokens = sentence.split()
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(gram) for gram in ngrams_list]

similarities = []

for sentence_a in sentences_a:
    for sentence_b in sentences_b:
        embeddings_a = model.encode(sentence_a)
        embeddings_b = model.encode(sentence_b)
        similarity = cosine_similarity(embeddings_a.reshape(1, -1), embeddings_b.reshape(1, -1))[0][0]
        
        # Perform n-gram averaging
        n = 2  # You can adjust the n-gram size
        ngrams_a = extract_ngrams(sentence_a, n)
        ngrams_b = extract_ngrams(sentence_b, n)
        
        # Calculate the importance of common n-grams
        common_ngrams = set(ngrams_a) & set(ngrams_b)
        ngram_importance = {ngram: ngrams_a.count(ngram) + ngrams_b.count(ngram) for ngram in common_ngrams}
        
        similarities.append((similarity, ngram_importance))

# Now you have a list of tuples containing similarity scores and n-gram importance dictionaries for each pair of sentences


load INSTRUCTOR_Transformer
max_seq_length  512


In [38]:
# Initialize the defaultdict
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large")
model = AutoModel.from_pretrained("hkunlp/instructor-large")


d = collections.defaultdict(list)

with torch.no_grad():
    # generate ngrams up to trigrams
    for i, sentence_list in enumerate([sentences_a, sentences_b]):
        for sentence in sentence_list:
            texts = imodelsx.util.generate_ngrams_list(sentence, ngrams=3, all_ngrams=True)
            inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
            outputs = model(**inputs).last_hidden_state.detach().cpu().numpy()
            embs = np.mean(outputs, axis=1).squeeze()
            embs_mean = np.mean(embs, axis=0)

            d['texts'].append(texts)
            d['embs'].append(embs)
            d['embs_mean'].append(embs_mean)

    # calculate feature importance for similarity
    denominator = calculate_denominator(d['embs_mean'][0], d['embs_mean'][1])
    d['imps'].append((d['embs'][0] @ d['embs_mean'][1]) / denominator)
    d['imps'].append((d['embs'][1] @ d['embs_mean'][0]) / denominator)

Some weights of T5Model were not initialized from the model checkpoint at hkunlp/instructor-large and are newly initialized: ['decoder.block.19.layer.1.layer_norm.weight', 'decoder.block.6.layer.0.SelfAttention.q.weight', 'decoder.block.10.layer.1.EncDecAttention.k.weight', 'decoder.block.13.layer.0.SelfAttention.v.weight', 'decoder.block.10.layer.0.layer_norm.weight', 'decoder.block.9.layer.0.SelfAttention.k.weight', 'decoder.block.22.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.1.EncDecAttention.o.weight', 'decoder.block.19.layer.2.DenseReluDense.wo.weight', 'decoder.block.23.layer.0.SelfAttention.o.weight', 'decoder.block.20.layer.1.layer_norm.weight', 'decoder.block.21.layer.2.DenseReluDense.wo.weight', 'decoder.block.11.layer.0.SelfAttention.k.weight', 'decoder.block.3.layer.2.DenseReluDense.wi.weight', 'decoder.block.12.layer.0.SelfAttention.o.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'decoder.block.6.layer.1.EncDecAttention.k.weight', 'decoder.block.1

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds