In [11]:
import os

model_path = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241112_201020/models/"
print(os.listdir(model_path))

['training_args.bin', 'model.safetensors', 'cap', 'generation_config.json', 'config.json']


In [32]:
import os
from transformers import BartConfig
from models.bart import BartForClassificationAndGeneration
from data.vocab import load_vocab

# Paths to the model and vocab directories
model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/models"
vocab_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/vocabs"

# Load model configuration and weights
config_path = os.path.join(model_dir, "config.json")
model_weights_path = os.path.join(model_dir, "model.safetensors")

config = BartConfig.from_json_file(config_path)
model = BartForClassificationAndGeneration.from_pretrained(model_weights_path, config=config)

# Load vocabularies
code_vocab = load_vocab(vocab_root=vocab_dir, name="code")
ast_vocab = load_vocab(vocab_root=vocab_dir, name="ast")
nl_vocab = load_vocab(vocab_root=vocab_dir, name="nl")

print(f"Model and vocabularies loaded successfully.")
print(f"Code vocab size: {len(code_vocab)}")
print(f"AST vocab size: {len(ast_vocab)}")
print(f"NL vocab size: {len(nl_vocab)}")


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Model and vocabularies loaded successfully.
Code vocab size: 919
AST vocab size: 34
NL vocab size: 1878


In [33]:
print(dir(code_vocab))

['EOS_TOKEN', 'MSK_TOKEN', 'PAD_TOKEN', 'SEP_TOKEN', 'SOS_TOKEN', 'START_VOCAB', 'UNK_TOKEN', '_Vocab__special_symbols', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_special_symbols', 'decode', 'decode_batch', 'encode_batch', 'encode_sequence', 'eos_processor', 'get_eos_index', 'get_index', 'get_mask_index', 'get_pad_index', 'get_sos_index', 'get_token', 'get_unk_index', 'ignore_case', 'index_offset', 'method', 'name', 'num_special_token', 'pad_token_id', 'restore_index', 'save', 'save_pickle', 'save_pretrained', 'sep_processor', 'sos_processor', 'tokenizer', 'transfer_index']


In [26]:
import os
import torch
from transformers import BartForConditionalGeneration, BartConfig
from data.vocab import load_vocab

def load_model_and_vocab(model_dir, vocab_dir):
    config_path = os.path.join(model_dir, 'config.json')
    model_path = os.path.join(model_dir, 'model.safetensors')
    config = BartConfig.from_json_file(config_path)
    model = BartForConditionalGeneration.from_pretrained(model_path, config=config)
    model.eval()  

    code_vocab = load_vocab(vocab_dir, "code")
    nl_vocab = load_vocab(vocab_dir, "nl")
    ast_vocab = load_vocab(vocab_dir, "ast")
    
    return model, code_vocab#, nl_vocab, ast_vocab

def generate_candidates(model, input_text, vocab, num_beams=5, max_length=50):
    input_ids, attention_mask = vocab.encode_sequence(input_text)
    input_ids = torch.tensor([input_ids])  
    attention_mask = torch.tensor([attention_mask])

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        output_scores=True,
        return_dict_in_generate=True
    )
    
    candidates = vocab.decode_batch(outputs.sequences.tolist())
    probabilities = torch.softmax(outputs.sequences_scores, dim=0).tolist()
    
    return candidates, probabilities

def main():
    model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models"  
    vocab_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/vocabs"  
    
    incomplete_code = "public void writeLock() {    this.fsLock.longReadLock().lock();     [MSK] .lock();}" 
    
    model, code_vocab = load_model_and_vocab(model_dir, vocab_dir)
    print("Code 1 tokenized input:", code_vocab.encode_sequence(incomplete_code))
    
    candidates, probabilities = generate_candidates(model, incomplete_code, code_vocab)
    
    for idx, (candidate, prob) in enumerate(zip(candidates, probabilities)):
        print(f"Candidate {idx + 1}: {candidate} (Probability: {prob:.2%})")

if __name__ == "__main__":
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Code 1 tokenized input: ([148, 209, 2909, 13, 14, 69, 197, 19, 923, 383, 19, 335, 3651, 13, 14, 19, 383, 13, 14, 32, 38, 46659, 40, 19, 383, 13, 14, 32, 71], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
Candidate 1: this . readwritelock . readlock ( ) (Probability: 30.31%)
Candidate 2: this . readwritelock . writelock ( ) (Probability: 21.15%)
Candidate 3: this . parents (Probability: 16.34%)
Candidate 4: this . readwritelock . getapplication lock ( ) (Probability: 16.31%)
Candidate 5: this . delegate (Probability: 15.90%)


In [None]:
from huggingface_hub import login

login(token="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")  # Replace with your Hugging Face access token


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/user1-system11/.cache/huggingface/token
Login successful


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import create_repo

repo_name = "code_completion_tokenizer"
create_repo(repo_id=repo_name, token="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", private=False)  # Adjust `private` as needed


RepoUrl('https://huggingface.co/shradha01/code_completion_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='shradha01/code_completion_tokenizer')

In [10]:
from tokenizers import Tokenizer

# Load the individual tokenizers
ast_tokenizer = Tokenizer.from_file("/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241116_103947/multi-tokenizer/ast_tokenizer.json")
code_tokenizer = Tokenizer.from_file("/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241116_103947/multi-tokenizer/code_tokenizer.json")
nl_tokenizer = Tokenizer.from_file("/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241116_103947/multi-tokenizer/nl_tokenizer.json")


In [11]:
ast_vocab = ast_tokenizer.get_vocab()
code_vocab = code_tokenizer.get_vocab()
nl_vocab = nl_tokenizer.get_vocab()

In [12]:

merged_vocab = {**ast_vocab, **code_vocab, **nl_vocab}  # Combines dictionaries


In [13]:

merged_vocab = {token: idx for idx, (token, _) in enumerate(merged_vocab.items())}


In [15]:
from tokenizers.models import WordLevel
from tokenizers import Tokenizer

unified_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))

special_tokens = ["[PAD]", "[SOS]", "[EOS]", "[SEP]", "[MSK]"]
unified_tokenizer.model = WordLevel(unk_token="[UNK]")

unified_tokenizer.model = WordLevel(unk_token="[UNK]", vocab=merged_vocab)


In [16]:

for token in special_tokens:
    if token not in merged_vocab:
        merged_vocab[token] = len(merged_vocab)


In [17]:

unified_tokenizer.enable_padding(pad_id=merged_vocab["[PAD]"], pad_token="[PAD]")

encoded = unified_tokenizer.encode("This is a test sentence.")
print("Encoded:", encoded.ids)

decoded = unified_tokenizer.decode(encoded.ids)
print("Decoded:", decoded)


Encoded: [52]
Decoded: [UNK]


In [18]:
unified_tokenizer.save("/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241116_103947/multi-tokenizer/unified_tokenizer.json")

In [19]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241116_103947/multi-tokenizer/unified_tokenizer.json")
tokenizer.push_to_hub("multi-purpose-tokenizer")


  _torch_pytree._register_pytree_node(


CommitInfo(commit_url='https://huggingface.co/shradha01/multi-purpose-tokenizer/commit/c7fa6ea69d507c600b1d6a352b86c15c35105ca4', commit_message='Upload tokenizer', commit_description='', oid='c7fa6ea69d507c600b1d6a352b86c15c35105ca4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/vocabs/code/code-tokenizer.json")
tokenizer.push_to_hub("code-tokenizer")

In [6]:
from transformers import PreTrainedTokenizerFast

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/vocabs/code/code_tokenizer.json"
)

# Save the tokenizer and all relevant files to a directory
tokenizer.save_pretrained("/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/vocabs/code")

# Push the tokenizer to the Hugging Face Hub
tokenizer.push_to_hub("code-tokenizer-01")


CommitInfo(commit_url='https://huggingface.co/shradha01/code-tokenizer-01/commit/761df7e731c83678eced702a1958faa589831af5', commit_message='Upload tokenizer', commit_description='', oid='761df7e731c83678eced702a1958faa589831af5', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from transformers import AutoModel

model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models"  
repo_name = "Code-completion-01"  

model = AutoModel.from_pretrained(model_dir)

model.push_to_hub(repo_name)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
model.safetensors: 100%|██████████| 1.05G/1.05G [00:23<00:00, 44.5MB/s]


CommitInfo(commit_url='https://huggingface.co/shradha01/Code-completion-01/commit/c2609253f8a45a249c48993be8d29ae6ed3b17ad', commit_message='Upload model', commit_description='', oid='c2609253f8a45a249c48993be8d29ae6ed3b17ad', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Load the model and tokenizer from the Hugging Face Hub
model_name = "shradha01/Code-completion-01"
tokenizer_name = "shradha01/code-tokenizer-01"#"shradha01/multi-purpose-tokenizer"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


In [13]:

code_completion_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

In [14]:
incomplete_code = "public void writeLock() {    this.fsLock.longReadLock().lock();     [MSK] .lock();}" 

In [15]:
encoded = tokenizer(incomplete_code, return_tensors="pt")
print("Token IDs:", encoded.input_ids)
print("Tokens:", [tokenizer.decode([id]) for id in encoded.input_ids[0]])

if any(id is None for id in encoded.input_ids[0]):
    print("Found out-of-vocabulary tokens!")


Token IDs: tensor([[ 148,  209, 2909,   13,   14,   69,  197,   19,  923,  383,   19,  335,
         3651,   13,   14,   19,  383,   13,   14,   32,    4,   19,  383,   13,
           14,   32,   71]])
Tokens: ['public', 'void', 'writelock', '(', ')', '{', 'this', '.', 'fs', 'lock', '.', 'long', 'readlock', '(', ')', '.', 'lock', '(', ')', ';', '[MSK]', '.', 'lock', '(', ')', ';', '}']


In [16]:
# if "[MSK]" not in tokenizer.get_vocab():
#     print("`[MSK]` token is missing from the vocabulary!")


In [17]:

# if "[MSK]" not in tokenizer.get_vocab():
#     tokenizer.add_tokens(["[MSK]"])
#     model.resize_token_embeddings(len(tokenizer))


In [18]:
# encoded_input = tokenizer(incomplete_code, return_tensors="pt")
# print("Input IDs:", encoded_input.input_ids)
# print("Tokens:", [tokenizer.decode([token_id]) for token_id in encoded_input.input_ids[0]])


In [19]:

tokenizer_vocab_size = len(tokenizer)
print("Tokenizer vocabulary size:", tokenizer_vocab_size)

model_vocab_size = model.config.vocab_size
print("Model vocabulary size:", model_vocab_size)

if tokenizer_vocab_size != model_vocab_size:
    model.resize_token_embeddings(tokenizer_vocab_size)
    print("Resized model embeddings to match tokenizer vocabulary size.")


Tokenizer vocabulary size: 50000
Model vocabulary size: 80092
Resized model embeddings to match tokenizer vocabulary size.


In [20]:

encoded_input = tokenizer(incomplete_code, return_tensors="pt")
print("Input IDs:", encoded_input.input_ids)
print("Decoded Tokens:", tokenizer.decode(encoded_input.input_ids[0]))


Input IDs: tensor([[ 148,  209, 2909,   13,   14,   69,  197,   19,  923,  383,   19,  335,
         3651,   13,   14,   19,  383,   13,   14,   32,    4,   19,  383,   13,
           14,   32,   71]])
Decoded Tokens: public void writelock ( ) { this. fs lock. long readlock ( ). lock ( ) ; [MSK]. lock ( ) ; }


In [21]:

candidates = code_completion_pipeline(
    incomplete_code,
    max_length=10,
    num_return_sequences=5,
    num_beams=5
)

for idx, candidate in enumerate(candidates):
    print(f"Candidate {idx + 1}: {candidate['generated_text']}")


Candidate 1: new public lock ( )
Candidate 2: new public lock ( 10 )
Candidate 3: new public lock ( lock )
Candidate 4: new public lock ( 10 , this .
Candidate 5: new public lock ( 10 , timeunit .


In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

def main():
    model_name = "shradha01/Code-completion-01"  
    tokenizer = "shradha01/code-tokenizer-01"  

    print("Loading model and tokenizer...")
    try:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

        special_tokens = {"additional_special_tokens": ["[MSK]"]}
        tokenizer.add_special_tokens(special_tokens)
        model.resize_token_embeddings(len(tokenizer))

        print("Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        return

    print("Initializing pipeline...")
    try:
        code_completion_pipeline = pipeline(
            "text2text-generation", model=model, tokenizer=tokenizer
        )
    except Exception as e:
        print(f"Error initializing pipeline: {e}")
        return

    incomplete_code = "public void writeLock() { this.fsLock.longReadLock().lock(); [MSK] .lock(); }"
    print("\nInput code:", incomplete_code)

    print("\nTokenization debug:")
    encoded = tokenizer(incomplete_code, return_tensors="pt")
    print("Input IDs:", encoded["input_ids"])
    print("Decoded Input:", tokenizer.decode(encoded["input_ids"][0]))

    if tokenizer.convert_tokens_to_ids("[MSK]") == tokenizer.unk_token_id:
        print("Warning: `[MSK]` token is not recognized. Adding it to the tokenizer.")
        tokenizer.add_special_tokens({"additional_special_tokens": ["[MSK]"]})
        model.resize_token_embeddings(len(tokenizer))

    print("\nGenerating predictions...")
    try:
        candidates = code_completion_pipeline(
            incomplete_code,
            max_length=50,  
            num_beams=5,    
            num_return_sequences=5, 
            temperature=0.7,  
            top_k=50,         
            top_p=0.95        
        )

        for idx, candidate in enumerate(candidates):
            print(f"Candidate {idx + 1}: {candidate['generated_text']}")

    except Exception as e:
        print(f"Error during prediction: {e}")

if __name__ == "__main__":
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Loading model and tokenizer...


Model and tokenizer loaded successfully!
Initializing pipeline...

Input code: public void writeLock() { this.fsLock.longReadLock().lock(); [MSK] .lock(); }

Tokenization debug:
Input IDs: tensor([[ 148,  209, 2909,   13,   14,   69,  197,   19,  923,  383,   19,  335,
         3651,   13,   14,   19,  383,   13,   14,   32,    4,   19,  383,   13,
           14,   32,   71]])
Decoded Input: public void writelock ( ) { this. fs lock. long readlock ( ). lock ( ) ; [MSK]. lock ( ) ; }

Generating predictions...




Candidate 1: new public lock ( )
Candidate 2: new public lock ( 10 )
Candidate 3: new public lock ( 10 , this . lock )
Candidate 4: new public lock ( 10 , timeunit . milliseconds )
Candidate 5: new public lock ( lock )


In [33]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import torch.nn.functional as F


def main():
    # Define repository and subfolder
    model_name = "shradha01/Code-completion-01"  # Main model repository
    tokenizer_name = "shradha01/code-tokenizer-01"  # Subfolder for tokenizer

    print("Loading model and tokenizer...")
    try:
        # Load model
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        special_tokens = {"additional_special_tokens": ["[MSK]"]}
        tokenizer.special_tokens_map["[MSK]"] = 46659
        if tokenizer.pad_token is None:
            print("Padding token not found. Adding [PAD] as pad_token.")
            special_tokens["pad_token"] = "[PAD]"

        tokenizer.add_special_tokens(special_tokens)

        # Resize the model embeddings to accommodate new tokens
        model.resize_token_embeddings(len(tokenizer))

        print("Model and tokenizer loaded successfully!")

    except Exception as e:
        print(f"Error loading model or tokenizer: {e}")
        return

    # Example input
    incomplete_code = "public Token<? extends TokenIdentifier> getToken(Text alias) {    return  [MSK] ;}"
    print("\nInput code:", incomplete_code)

    # Tokenize input
    inputs = tokenizer(
            incomplete_code,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=model.config.max_position_embeddings,
        )
    incomplete_code = incomplete_code.replace("[MSK]", "38 [MSK] 40")

    print("\nTokenized Input IDs:", inputs["input_ids"].tolist())
    print("Attention Mask:", inputs["attention_mask"].tolist())
    print("Special Tokens:", tokenizer.all_special_tokens_extended)


    # Generate predictions with beam search
    print("\nGenerating predictions with probabilities...")
    try:
        # Perform beam search to get multiple candidates
        output_sequences = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=50,  # Maximum length of the generated sequence
            num_beams=5,    # Beam search for diverse outputs
            num_return_sequences=5,  # Number of candidates
            return_dict_in_generate=True,
            output_scores=True,
        )

        # Decode the candidates
        candidates = [
            tokenizer.decode(seq, skip_special_tokens=True)
            for seq in output_sequences["sequences"]
        ]

        # Calculate probabilities from scores
        scores = output_sequences["sequences_scores"]  # Log-probabilities
        probabilities = F.softmax(scores, dim=0)  # Convert to probabilities

        # Display results with probabilities
        for idx, (candidate, prob) in enumerate(zip(candidates, probabilities)):
            print(f"Candidate {idx + 1}: {candidate} (Probability: {prob:.4f})")

    except Exception as e:
        print(f"Error during prediction: {e}")


if __name__ == "__main__":
    main()

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Loading model and tokenizer...


Padding token not found. Adding [PAD] as pad_token.
Model and tokenizer loaded successfully!

Input code: public Token<? extends TokenIdentifier> getToken(Text alias) {    return  [MSK] ;}

Tokenized Input IDs: [[148, 465, 33, 36, 872, 465, 1451, 35, 3970, 13, 497, 1494, 14, 69, 121, 4, 32, 71]]
Attention Mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
Special Tokens: [AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), AddedToken("[MSK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True)]

Generating predictions with probabilities...
Candidate 1: public token identifier. gettoken ( alias ) (Probability: 0.2223)
Candidate 2: public token identifier. parse ( text alias ) (Probability: 0.2213)
Candidate 3: public token identifier. to identifier ( alias ) (Probability: 0.2059)
Candidate 4: public token identifier. gettoken ( text alias ) (Probability: 0.1831)
Candidate 5: public token identifier. t

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# def main():
#     # Define repository and subfolder
#     model_name = "claudios/sptcode"  # Main model repository
#     tokenizer_subfolder = "code_tokenizer_fast"  # Subfolder for tokenizer

#     print("Loading model and tokenizer...")
#     try:
#         # Load model
#         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#         # Load tokenizer from subfolder
#         tokenizer = AutoTokenizer.from_pretrained(model_name, subfolder=tokenizer_subfolder)

#         # Add special tokens if needed
#         special_tokens = {"additional_special_tokens": ["[MSK]"]}
#         tokenizer.add_special_tokens(special_tokens)
#         model.resize_token_embeddings(len(tokenizer))

#         print("Model and tokenizer loaded successfully!")
#     except Exception as e:
#         print(f"Error loading model or tokenizer: {e}")
#         return

#     # Initialize pipeline
#     print("Initializing pipeline...")
#     try:
#         code_completion_pipeline = pipeline(
#             "text2text-generation", model=model, tokenizer=tokenizer
#         )
#     except Exception as e:
#         print(f"Error initializing pipeline: {e}")
#         return

#     # Example input
#     incomplete_code = "public void writeLock() { this.fsLock.longReadLock().lock(); [MSK] .lock(); }"
#     print("\nInput code:", incomplete_code)

#     # Debug tokenization
#     print("\nTokenization debug:")
#     encoded = tokenizer(incomplete_code, return_tensors="pt")
#     print("Input IDs:", encoded["input_ids"])
#     print("Decoded Input:", tokenizer.decode(encoded["input_ids"][0]))

#     # Check if `[MSK]` is recognized
#     if tokenizer.convert_tokens_to_ids("[MSK]") == tokenizer.unk_token_id:
#         print("Warning: `[MSK]` token is not recognized. Adding it to the tokenizer.")
#         tokenizer.add_special_tokens({"additional_special_tokens": ["[MSK]"]})
#         model.resize_token_embeddings(len(tokenizer))

#     # Generate predictions
#     print("\nGenerating predictions...")
#     try:
#         candidates = code_completion_pipeline(
#             incomplete_code,
#             max_length=50,  # Maximum length of the generated sequence
#             num_beams=5,    # Beam search for diverse outputs
#             num_return_sequences=5,  # Number of candidates
#             temperature=0.7,  # Adjust randomness
#             top_k=50,         # Limit the sampling pool
#             top_p=0.95        # Nucleus sampling
#         )

#         # Display results
#         for idx, candidate in enumerate(candidates):
#             print(f"Candidate {idx + 1}: {candidate['generated_text']}")
#     except Exception as e:
#         print(f"Error during prediction: {e}")

# if __name__ == "__main__":
#     main()
