In [1]:
import datasets
from InstructorEmbedding import INSTRUCTOR

## Exploratory data analysis (EDA) of the SST-2 dataset

In [2]:
# Load the SST-2 dataset; the returned object holds the train / validation / test splits.
sst2 = datasets.load_dataset('sst2')

In [3]:
# List the column names available in each split.
sst2.column_names

{'train': ['idx', 'sentence', 'label'],
 'validation': ['idx', 'sentence', 'label'],
 'test': ['idx', 'sentence', 'label']}

In [4]:
# Number of examples in each split.
sst2.num_rows

{'train': 67349, 'validation': 872, 'test': 1821}

In [12]:
# Preview the first 100 training sentences.
# Rows are sliced first and the column is picked second, so only these 100 rows are
# read instead of selecting the whole 'sentence' column and slicing it afterwards.
sst2['train'][0:100]['sentence']

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
 "that 's far too tragic to merit such superficial treatment ",
 'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
 'of saucy ',
 "a depressed fifteen-year-old 's suicidal poetry ",
 "are more deeply thought through than in most ` right-thinking ' films ",
 'goes to absurd lengths ',
 "for those moviegoers who complain that ` they do n't make movies like they used to anymore ",
 "the part where nothing 's happening , ",
 'saw how bad this movie was ',
 'lend some dignity to a dumb story ',
 'the greatest musicians ',
 'cold movie ',
 'with his usual intelligence and s

In [6]:
# Second training sentence: index the row first, then pick the column, so a single
# row is read rather than the whole 'sentence' column.
sst2['train'][1]['sentence']

'contains no wit , only labored gags '

In [7]:
# First test sentence: index the row first, then pick the column, so a single
# row is read rather than the whole 'sentence' column.
sst2['test'][0]['sentence']

'uneasy mishmash of styles and genres .'

## INSTRUCTOR 👨‍🏫👩‍🏫

In [13]:
# Load the pretrained INSTRUCTOR embedding model; the cells below call `model.encode` on it.
model = INSTRUCTOR('hkunlp/instructor-large')

load INSTRUCTOR_Transformer
max_seq_length  512


In [16]:
# Embed the first 10 training sentences and print the resulting embedding matrix.
# No task instruction is passed to the model here; the raw sentences are encoded as-is.
# Rows are sliced before the column is selected so only these 10 rows are read.
sentence = sst2['train'][0:10]['sentence']
embeddings = model.encode(sentence)
print(embeddings)

[[-3.3520196e-02 -1.4751182e-02 -2.1812499e-02 ... -5.3077709e-02
  -1.9894231e-02  4.6290603e-02]
 [-4.0275887e-02  9.5391721e-03 -2.0763531e-02 ... -5.9346016e-02
   1.6357798e-02  3.0490218e-02]
 [-3.5560861e-02 -2.3208736e-03 -1.3518459e-02 ... -2.9144358e-02
   2.5669584e-02  6.4498499e-02]
 ...
 [-2.8902860e-02  9.5078964e-03 -1.9643229e-02 ... -3.4858063e-02
   3.1831473e-02  4.5890577e-02]
 [-3.0532386e-02 -3.9113317e-02 -1.6344437e-02 ... -3.8398307e-02
   2.1053439e-03  3.6098324e-02]
 [-3.3510122e-02 -2.5493458e-03 -1.9426448e-02 ... -6.0843486e-02
   8.7424909e-05  4.9884342e-02]]


## Calculate Similarity

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
# Compare training sentences 0-9 against training sentences 10-19.
# Rows are sliced before the column is selected so only the needed rows are read.
sentences_a = sst2['train'][0:10]['sentence']
sentences_b = sst2['train'][10:20]['sentence']
embeddings_a = model.encode(sentences_a)
embeddings_b = model.encode(sentences_b)
# similarities[i][j] is the cosine similarity between sentences_a[i] and sentences_b[j].
similarities = cosine_similarity(embeddings_a, embeddings_b)
print(similarities)

[[0.791337   0.7545798  0.792951   0.76039135 0.7968184  0.76513994
  0.77183414 0.7897898  0.7880728  0.77604973]
 [0.8359742  0.79563093 0.83904195 0.8281418  0.85586786 0.7920675
  0.81078637 0.83478475 0.8178705  0.8116954 ]
 [0.8400682  0.81742257 0.80808955 0.7690075  0.83724487 0.821731
  0.81950223 0.84481335 0.80259347 0.83987105]
 [0.8199604  0.7720649  0.85261905 0.74618435 0.7806753  0.7816093
  0.7777296  0.83189476 0.79248047 0.8085593 ]
 [0.8210979  0.8378202  0.7874433  0.84053195 0.8511729  0.762864
  0.82096875 0.7945789  0.79173076 0.79623735]
 [0.82384497 0.7801317  0.8040639  0.79008037 0.8473133  0.78129834
  0.7880322  0.7888309  0.78303826 0.82346255]
 [0.8051164  0.8481436  0.79881895 0.7911173  0.81736356 0.7747912
  0.8397882  0.82004267 0.76506215 0.81981665]
 [0.8505894  0.8004862  0.8175191  0.7937734  0.83820117 0.8168696
  0.81293756 0.85264456 0.8296662  0.8161603 ]
 [0.8235054  0.7667987  0.79704994 0.77834255 0.8182266  0.78134567
  0.78994125 0.80602

## Modified code to account for feature importance

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from collections import Counter

# The INSTRUCTOR `model` loaded earlier in the notebook is reused here: loading the same
# 'hkunlp/instructor-large' checkpoint a second time only repeats an expensive load.
# Rows are sliced before the column is selected so only the needed rows are read.
sentences_a = sst2['train'][0:10]['sentence']
sentences_b = sst2['train'][10:20]['sentence']

def extract_ngrams(sentence, n):
    """Return the word n-grams of `sentence` as space-joined strings.

    The sentence is tokenised by splitting on whitespace, and every group of
    `n` tokens produced by `ngrams` is joined back together with single spaces.

    Args:
        sentence: Text to split into whitespace-separated tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list with one string per n-gram.
    """
    words = sentence.split()
    return list(map(' '.join, ngrams(words, n)))

# Size of the word n-grams compared between sentences (adjustable).
n = 2

# Encode each sentence list once as a batch. The previous version re-encoded both
# sentences of every pair inside the nested loop, i.e. 2 * len(a) * len(b) model calls.
# The scores should match the per-pair computation up to floating-point rounding.
embeddings_a = model.encode(sentences_a)
embeddings_b = model.encode(sentences_b)

# similarity_matrix[i, j] is the cosine similarity of sentences_a[i] and sentences_b[j].
similarity_matrix = cosine_similarity(embeddings_a, embeddings_b)

# Extract and count each sentence's n-grams once; Counter lookups replace the repeated
# list.count() scans that were done for every pair.
ngram_counts_a = [Counter(extract_ngrams(text, n)) for text in sentences_a]
ngram_counts_b = [Counter(extract_ngrams(text, n)) for text in sentences_b]

# One (similarity, ngram_importance) tuple per sentence pair, ordered with sentences_a
# as the outer loop and sentences_b as the inner loop.
similarities = []
for i, counts_a in enumerate(ngram_counts_a):
    for j, counts_b in enumerate(ngram_counts_b):
        # Importance of a shared n-gram = its number of occurrences in both sentences combined.
        common_ngrams = counts_a.keys() & counts_b.keys()
        ngram_importance = {ngram: counts_a[ngram] + counts_b[ngram] for ngram in common_ngrams}

        similarities.append((similarity_matrix[i, j], ngram_importance))

# Now you have a list of tuples containing similarity scores and n-gram importance dictionaries for each pair of sentences


load INSTRUCTOR_Transformer
max_seq_length  512


In [33]:
# NOTE(review): this cell does not run as written; its recorded output is
# "NameError: name 'torch' is not defined". Besides `torch`, the names `defaultdict`,
# `imodelsx`, `tokenizer`, `np`, `text0`, `text1` and `calculate_denominator` are not
# imported or defined in the visible part of this notebook. TODO confirm their source.
# NOTE(review): `model` is an INSTRUCTOR instance in the cells above, but it is called here
# as `model(**inputs).last_hidden_state`; presumably a Hugging Face transformer model
# matching `tokenizer` is expected. Verify before running.
with torch.no_grad():
    # Per-text results are collected in `d` under the keys 'texts', 'embs' and 'embs_mean'.
    # n-grams are generated with ngrams=3, all_ngrams=True (i.e. up to trigrams).
    d = defaultdict(list)
    for i, text_i in enumerate([text0, text1]):
        texts = imodelsx.util.generate_ngrams_list(text_i, ngrams=3, all_ngrams=True)
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs).last_hidden_state.detach().cpu().numpy()
        # Mean over axis 1: presumably the token axis, giving one vector per n-gram (TODO confirm).
        embs = np.mean(outputs, axis=1).squeeze()
        # Mean over axis 0: presumably averages the n-gram vectors into one vector per text.
        embs_mean = np.mean(embs, axis=0)

        d['texts'].append(texts)
        d['embs'].append(embs)
        d['embs_mean'].append(embs_mean)

    # Feature importance for similarity: each text's n-gram embeddings are multiplied with
    # the other text's mean embedding and divided by a shared `denominator`.
    denominator = calculate_denominator(d['embs_mean'][0], d['embs_mean'][1])
    d['imps'].append((d['embs'][0] @ d['embs_mean'][1]) / denominator)
    d['imps'].append((d['embs'][1] @ d['embs_mean'][0]) / denominator)


NameError: name 'torch' is not defined

In [34]:
import datetime

In [35]:
# List the names defined in the datetime module.
dir(datetime)

['MAXYEAR',
 'MINYEAR',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'date',
 'datetime',
 'datetime_CAPI',
 'sys',
 'time',
 'timedelta',
 'timezone',
 'tzinfo']

In [None]:
# datetime.datetime() cannot be called without arguments (year, month and day are
# required, so the bare call raises TypeError); take the current local timestamp instead.
datetime.datetime.now()