In [59]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from accelerate import Accelerator
from sentence_transformers import SentenceTransformer
from torch import cuda, nn

MAX_LENGTH = 80
NUM_OF_OUTPUTS = 50
BATCH_SIZE = 4

first_model_checkpoint = "royweiss1/T5_FirstSentences"
first_model = AutoModelForSeq2SeqLM.from_pretrained(first_model_checkpoint)
first_tokenizer = AutoTokenizer.from_pretrained(first_model_checkpoint)

cuda.empty_cache()
accelerator = Accelerator(cpu=False)
print("-------Device:", accelerator.device)
first_model = first_model.to(accelerator.device)

def generate_first(encodings):
    
    inputs = first_tokenizer(encodings, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt")

    inputs = {k: v.to(accelerator.device) for k, v in inputs.items()}

    # Generate text using the model on the same device
    outputs = first_model.generate(
        **inputs,
        max_length=MAX_LENGTH,
        output_scores=True,
        return_dict_in_generate=True,
        no_repeat_ngram_size=2,
        top_k=50,
        num_beam_groups=16,
        num_beams=NUM_OF_OUTPUTS,
        diversity_penalty=0.8,
        num_return_sequences=NUM_OF_OUTPUTS
    )

    sequences = outputs.sequences
    sequence_scores = outputs.sequences_scores
    
    sorted_indices = sequence_scores.argsort(descending=True)

    # Extract the sequences and scores based on the sorted indices
    sorted_sequences = sequences[sorted_indices]

    # Convert the sorted sequences to a readable format (e.g., string)
    sorted_texts = [first_tokenizer.decode(seq, skip_special_tokens=True) for seq in sorted_sequences]

    return sorted_texts

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-------Device: cuda


In [61]:
lst_of_lengths = [955, 971,985, 1011,1017,1033,1051,1059,1081,1107,1117,1129,1201,1219,1229] # TODO: fill

def parse_packet_lengths(lengths):
    token_lengths = []
    for i in range(1, len(lengths)):
        token_lengths.append((lengths[i] - lengths[i-1])//2)
    return token_lengths
        
token_lens = parse_packet_lengths(lst_of_lengths)
        
print(token_lens)

[8, 7, 13, 3, 8, 9, 4, 11, 13, 5, 6, 36, 9, 5]


In [None]:
def heuristic(lengths):
    sentences = []
    index = 0
    tokens_in_streak = 0
    while index < len(lengths):
        if tokens_in_streak >= 10 and lengths[index] == 1:
            if lengths[index-1] == 3:
                sentences.append(lengths[:tokens_in_streak])
                lengths = lengths[tokens_in_streak:]
            elif lengths[index-1] == 1:
                sentences.append(lengths[:tokens_in_streak-1])
                lengths = lengths[tokens_in_streak-1:]
                index -= 1
            else:
                sentences.append(lengths[:tokens_in_streak+1])
                lengths = lengths[tokens_in_streak+1:]
                index += 1
            tokens_in_streak = 0
        else:
            index += 1
            tokens_in_streak += 1
    else:
        if tokens_in_streak > 0:
            sentences.append(lengths)
    return sentences

In [62]:
def make_input(lst_lengths):
    lst_str = " ".join([f"_{i}" for i in lst_lengths])
    return f"Translate the Special Tokens to English. \nSpecial Tokens:{lst_str}"

# Decrypt model message
token_lens = heuristic(token_lens)[0] # take the first sentence based on the heuristic
outputs = generate_first([make_input(token_lens)]) # output is sorted by the model's confidence!!!!

for rank, output in enumerate(outputs):
    print(f"Rank: {rank+1}. Output: {output}")

Rank: 0. Output: several recent advancements in machine learning and artificial intelligence that could be a game-changing tool
Rank: 1. Output: several recent developments in machine learning and artificial intelligence that could be of interest to
Rank: 2. Output: park managers collaborate on several projects for maximizing recreational play value in a specific area.
Rank: 3. Output: firms employ standardized or uniform policies for addressing intersectionality issues in a specific area.
Rank: 4. Output: ertiveness allows participants to express emotions and experience interactions with their co-workers more
Rank: 5. Output: ers utilize cryptography to protect personal and financial transactions from fraud or ID theft.
Rank: 6. Output: land restoration efforts at Niagara Falls are constantly integrating with those of the national park
Rank: 7. Output: ethical issues pertaining to genetic research are considered intersectionality since it is research that
Rank: 8. Output: could be gen

In [None]:
model_sentence_transformers = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v1')
model_sentence_transformers = model_sentence_transformers.to(accelerator.device) 
def compute_metrics(reference_sentence, sentence_to_compare):
    embed_pred = model_sentence_transformers.encode([sentence_to_compare], convert_to_tensor=True)
    embed_reference = model_sentence_transformers.encode([reference_sentence], convert_to_tensor=True)

    # Compute cosine similarity
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    sen_trans_score = cos(embed_pred, embed_reference)
    cosine_score = tuple(sen_trans_score.detach().cpu().numpy())
    return float(f"{cosine_score:.3f}")


In [None]:
Original_LLM_Response = "FILL HERE"
for rank, output in enumerate(outputs):
    print(f"Rank: {rank+1}. Phi Score: {compute_metrics(Original_LLM_Response, output)}")