The implementation below loads a pre-trained BART model and custom vocabularies for code, nl, and ast. However, only the vocab for code is used. The configuration and model are loaded from the model_dir. 
First, input text is encoded using the custom vocabulary and then it uses the configured BART model to generate candidate sequences with beam search. And finally, the output sequences is decoded along with their probabilities.


In [2]:
import os
import torch
from transformers import BartForConditionalGeneration, BartConfig
from data.vocab import load_vocab

def load_model_and_vocab(model_dir, vocab_dir):
    config_path = os.path.join(model_dir, 'config.json')
    model_path = os.path.join(model_dir, 'model.safetensors')
    config = BartConfig.from_json_file(config_path)
    model = BartForConditionalGeneration.from_pretrained(model_path, config=config)
    model.eval()  

    code_vocab = load_vocab(vocab_dir, "code")
    nl_vocab = load_vocab(vocab_dir, "nl")
    ast_vocab = load_vocab(vocab_dir, "ast")
    
    return model, code_vocab#, nl_vocab, ast_vocab

def generate_candidates(model, input_text, vocab, num_beams=5, max_length=50):
    input_ids, attention_mask = vocab.encode_sequence(input_text)
    input_ids = torch.tensor([input_ids])  
    attention_mask = torch.tensor([attention_mask])

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        output_scores=True,
        return_dict_in_generate=True
    )
    
    candidates = vocab.decode_batch(outputs.sequences.tolist())
    probabilities = torch.softmax(outputs.sequences_scores, dim=0).tolist()
    
    return candidates, probabilities

def main():
    model_dir = "/home/user1-system11/Documents/research-shradha/deploy-spt-code/spt-code/outputs/pre_train_20241125_001741/models"  
    vocab_dir = "/home/user1-system11/Documents/research-shradha/deploy-spt-code/spt-code/outputs/pre_train_20241125_001741/vocabs"  
    
    incomplete_code = "public void writeLock() {    this.fsLock.longReadLock().lock();     [MSK] .lock();}" 
    
    model, code_vocab = load_model_and_vocab(model_dir, vocab_dir)
    print("Code 1 tokenized input:", code_vocab.encode_sequence(incomplete_code))
    
    candidates, probabilities = generate_candidates(model, incomplete_code, code_vocab)
    
    for idx, (candidate, prob) in enumerate(zip(candidates, probabilities)):
        print(f"Candidate {idx + 1}: {candidate} (Probability: {prob:.2%})")

if __name__ == "__main__":
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Code 1 tokenized input: ([151, 205, 3098, 9, 10, 64, 196, 15, 943, 389, 15, 333, 3813, 9, 10, 15, 389, 9, 10, 28, 34, 383, 48, 35, 15, 389, 9, 10, 28, 66], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
Candidate 1: this . lock (Probability: 28.29%)
Candidate 2: this . writelock (Probability: 19.12%)
Candidate 3: this . readwritelock (Probability: 19.01%)
Candidate 4: this . lock lock (Probability: 18.09%)
Candidate 5: this . fs lock (Probability: 15.49%)


Login Huggingface

In [1]:
from huggingface_hub import login

login(token="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")  # Replace with your Hugging Face access token


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/user1-system11/.cache/huggingface/token
Login successful


  from .autonotebook import tqdm as notebook_tqdm


Load the tokenizer from a JSON file located in saved vocabs directory (use code_tokenizer.json) and save the tokenizer and its metadata to a specific directory and then upload the tokenizer to the Hugging Face Hub 

In [4]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="/home/user1-system11/Documents/research-shradha/deploy-spt-code/spt-code/outputs/pre_train_20241125_001741/vocabs/code/code_tokenizer.json"
)

# Save the tokenizer and all relevant files to a directory
tokenizer.save_pretrained("/home/user1-system11/Documents/research-shradha/deploy-spt-code/spt-code/outputs/pre_train_20241125_001741/vocabs/code")

# Push the tokenizer to the Hugging Face Hub
tokenizer.push_to_hub("llm-coding-tasks-tokenizer")


CommitInfo(commit_url='https://huggingface.co/shradha01/llm-coding-tasks-tokenizer/commit/d3934091da68e08bea0ab0c238a5f9784bd88cf1', commit_message='Upload tokenizer', commit_description='', oid='d3934091da68e08bea0ab0c238a5f9784bd88cf1', pr_url=None, pr_revision=None, pr_num=None)

load a model saved in a specific directory and push it to the Hugging Face Hub.

In [5]:
from transformers import AutoModel

model_dir = "/home/user1-system11/Documents/research-shradha/deploy-spt-code/spt-code/outputs/pre_train_20241125_001741/models"  
repo_name = "llm-coding-tasks-model"  

model = AutoModel.from_pretrained(model_dir)

model.push_to_hub(repo_name)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
model.safetensors: 100%|██████████| 1.05G/1.05G [00:20<00:00, 50.7MB/s]


CommitInfo(commit_url='https://huggingface.co/shradha01/llm-coding-tasks-model/commit/41654e68ebea92f0637f9323d501e37d3b4a01b7', commit_message='Upload model', commit_description='', oid='41654e68ebea92f0637f9323d501e37d3b4a01b7', pr_url=None, pr_revision=None, pr_num=None)

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

def main():
    model_name = "shradha01/llm-coding-tasks-model"  
    tokenizer_name = "shradha01/llm-coding-tasks-tokenizer"  

    print("Loading model and tokenizer...")
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        print("Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        return

    print("Initializing pipeline...")
    try:
        code_completion_pipeline = pipeline(
            "text2text-generation", model=model, tokenizer=tokenizer
        )
    except Exception as e:
        print(f"Error initializing pipeline: {e}")
        return

    incomplete_code = "public void writeLock() { this.fsLock.longReadLock().lock(); [MSK] .lock(); }"
    print("\nInput code:", incomplete_code)

    print("\nTokenization debug:")
    encoded = tokenizer(incomplete_code, return_tensors="pt")
    print("Input IDs:", encoded["input_ids"])
    print("Decoded Input:", tokenizer.decode(encoded["input_ids"][0]))

    print("\nGenerating predictions with probabilities...")
    try:
        # Generate outputs with model directly to access logits
        outputs = model.generate(
            encoded["input_ids"],
            max_length=50,
            num_beams=5,
            num_return_sequences=5,
            return_dict_in_generate=True,
            output_scores=True
        )

        # Decode generated sequences
        decoded_candidates = [
            tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs.sequences
        ]

        # Calculate probabilities from logits
        probs = []
        for sequence_idx, score in enumerate(outputs.sequences_scores):
            prob = torch.exp(score).item()  # Convert log score to probability
            probs.append(prob)

        # Display candidates and their probabilities
        for idx, (candidate, prob) in enumerate(zip(decoded_candidates, probs)):
            print(f"Candidate {idx + 1}: {candidate.strip()} (Probability: {prob:.4f})")

    except Exception as e:
        print(f"Error during prediction: {e}")

if __name__ == "__main__":
    main()

#output: this.fsLock.writeLock()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Loading model and tokenizer...
Model and tokenizer loaded successfully!
Initializing pipeline...

Input code: public void writeLock() { this.fsLock.longReadLock().lock(); [MSK] .lock(); }

Tokenization debug:
Input IDs: tensor([[ 151,  205, 3098,    9,   10,   64,  196,   15,  943,  389,   15,  333,
         3813,    9,   10,   15,  389,    9,   10,   28,    4,   15,  389,    9,
           10,   28,   66]])
Decoded Input: public void writelock ( ) { this. fs lock. long readlock ( ). lock ( ) ; [MSK]. lock ( ) ; }

Generating predictions with probabilities...
Candidate 1: this. lock (Probability: 0.5484)
Candidate 2: this. writelock (Probability: 0.3635)
Candidate 3: this. locks (Probability: 0.2878)
Candidate 4: this. constants (Probability: 0.2645)
Candidate 5: timeunit. milliseconds (Probability: 0.2481)


In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

def main():
    model_name = "shradha01/llm-coding-tasks-model"  
    tokenizer_name = "shradha01/llm-coding-tasks-tokenizer"  

    print("Loading model and tokenizer...")
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        print("Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        return

    print("Initializing pipeline...")
    try:
        code_completion_pipeline = pipeline(
            "text2text-generation", model=model, tokenizer=tokenizer
        )
    except Exception as e:
        print(f"Error initializing pipeline: {e}")
        return

    
    incomplete_code = "@ override public int hashCode ( ) { final long v = currentValue . get ( ) ; return ( int ) v ^ ( int ) ( [MSK] ) ; }"
    print("\nInput code:", incomplete_code)

    print("\nTokenization debug:")
    encoded = tokenizer(incomplete_code, return_tensors="pt")
    print("Input IDs:", encoded["input_ids"])
    print("Decoded Input:", tokenizer.decode(encoded["input_ids"][0]))

    print("\nGenerating predictions with probabilities...")
    try:
        # Generate outputs with model directly to access logits
        outputs = model.generate(
            encoded["input_ids"],
            max_length=50,
            num_beams=5,
            num_return_sequences=5,
            return_dict_in_generate=True,
            output_scores=True
        )

        # Decode generated sequences
        decoded_candidates = [
            tokenizer.decode(output, skip_special_tokens=True)
            for output in outputs.sequences
        ]

        # Calculate probabilities from logits
        probs = []
        for sequence_idx, score in enumerate(outputs.sequences_scores):
            prob = torch.exp(score).item()  # Convert log score to probability
            probs.append(prob)

        # Display candidates and their probabilities
        for idx, (candidate, prob) in enumerate(zip(decoded_candidates, probs)):
            print(f"Candidate {idx + 1}: {candidate.strip()} (Probability: {prob:.4f})")

    except Exception as e:
        print(f"Error during prediction: {e}")

if __name__ == "__main__":
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Loading model and tokenizer...
Model and tokenizer loaded successfully!
Initializing pipeline...

Input code: @ override public int hashCode ( ) { final long v = currentValue . get ( ) ; return ( int ) v ^ ( int ) ( [MSK] ) ; }

Tokenization debug:
Input IDs: tensor([[  33,  476,  151,  125, 2202,    9,   10,   64,  161,  333,   59,   30,
         5257,   15,   87,    9,   10,   28,  120,    9,  125,   10,   59,   36,
            9,  125,   10,    9,    4,   10,   28,   66]])
Decoded Input: @ override public int hashcode ( ) { final long v = currentvalue. get ( ) ; return ( int ) v ^ ( int ) ( [MSK] ) ; }

Generating predictions with probabilities...
Candidate 1: super. hashcode ( ) (Probability: 0.6413)
Candidate 2: long. min_value (Probability: 0.5478)
Candidate 3: v > > > 32 (Probability: 0.3460)
Candidate 4: long. height (Probability: 0.2960)
Candidate 5: long. max_value (Probability: 0.2699)
