In [11]:
import os

model_path = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241112_201020/models/"
print(os.listdir(model_path))

['training_args.bin', 'model.safetensors', 'cap', 'generation_config.json', 'config.json']


In [4]:
import os
from transformers import BartConfig
from models.bart import BartForClassificationAndGeneration
from data.vocab import load_vocab

# Paths to the model and vocab directories
model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/models"
vocab_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/vocabs"

# Load model configuration and weights
config_path = os.path.join(model_dir, "config.json")
model_weights_path = os.path.join(model_dir, "model.safetensors")

config = BartConfig.from_json_file(config_path)
model = BartForClassificationAndGeneration.from_pretrained(model_weights_path, config=config)

# Load vocabularies
code_vocab = load_vocab(vocab_root=vocab_dir, name="code")
ast_vocab = load_vocab(vocab_root=vocab_dir, name="ast")
nl_vocab = load_vocab(vocab_root=vocab_dir, name="nl")

print(f"Model and vocabularies loaded successfully.")
print(f"Code vocab size: {len(code_vocab)}")
print(f"AST vocab size: {len(ast_vocab)}")
print(f"NL vocab size: {len(nl_vocab)}")


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Model and vocabularies loaded successfully.
Code vocab size: 919
AST vocab size: 34
NL vocab size: 1878


In [7]:
print(dir(code_vocab))

['EOS_TOKEN', 'MSK_TOKEN', 'PAD_TOKEN', 'SEP_TOKEN', 'SOS_TOKEN', 'START_VOCAB', 'UNK_TOKEN', '_Vocab__special_symbols', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_special_symbols', 'decode', 'decode_batch', 'encode_batch', 'encode_sequence', 'eos_processor', 'get_eos_index', 'get_index', 'get_mask_index', 'get_pad_index', 'get_sos_index', 'get_token', 'get_unk_index', 'ignore_case', 'index_offset', 'method', 'name', 'num_special_token', 'pad_token_id', 'restore_index', 'save', 'save_pickle', 'save_pretrained', 'sep_processor', 'sos_processor', 'tokenizer', 'transfer_index']


In [6]:
import os
import torch
from transformers import BartForConditionalGeneration, BartConfig
from data.vocab import load_vocab

def load_model_and_vocab(model_dir, vocab_dir):
    config_path = os.path.join(model_dir, 'config.json')
    model_path = os.path.join(model_dir, 'model.safetensors')
    config = BartConfig.from_json_file(config_path)
    model = BartForConditionalGeneration.from_pretrained(model_path, config=config)
    model.eval()  

    code_vocab = load_vocab(vocab_dir, "code")
    nl_vocab = load_vocab(vocab_dir, "nl")
    ast_vocab = load_vocab(vocab_dir, "ast")
    
    return model, code_vocab, nl_vocab, ast_vocab

def generate_candidates(model, input_text, code_vocab, num_beams=5, max_length=50):
    input_ids, attention_mask = code_vocab.encode_sequence(input_text)
    input_ids = torch.tensor([input_ids])  
    attention_mask = torch.tensor([attention_mask])

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_beams,
        output_scores=True,
        return_dict_in_generate=True
    )
    
    candidates = code_vocab.decode_batch(outputs.sequences.tolist())
    probabilities = torch.softmax(outputs.sequences_scores, dim=0).tolist()
    
    return candidates, probabilities

def main():
    model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models"  
    vocab_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/vocabs"  
    
    incomplete_code = "public int add(int a, int b) { return" 
    
    model, code_vocab, nl_vocab, ast_vocab = load_model_and_vocab(model_dir, vocab_dir)
    
    candidates, probabilities = generate_candidates(model, incomplete_code, code_vocab)
    
    for idx, (candidate, prob) in enumerate(zip(candidates, probabilities)):
        print(f"Candidate {idx + 1}: {candidate} (Probability: {prob:.2%})")

if __name__ == "__main__":
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


Candidate 1: math . min ( a , b , a + b ) (Probability: 22.00%)
Candidate 2: math . min ( a , b ) + b (Probability: 20.53%)
Candidate 3: add ( a , a , b , a , b ) (Probability: 20.07%)
Candidate 4: add ( a , a , b , b ) (Probability: 18.84%)
Candidate 5: addfile ( a , a , b , b , b ) (Probability: 18.55%)


In [13]:
from transformers import pipeline

model_name = "shradha01/code-completion"  
code_completion_pipeline = pipeline("text2text-generation", model=model_name)

incomplete_code = """
public class Main {
"""

output = code_completion_pipeline(incomplete_code, max_length=100, num_return_sequences=1)

print("Predicted Completion:")
print(output[0]["generated_text"])


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


OSError: Can't load tokenizer for 'shradha01/code-completion'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'shradha01/code-completion' is the correct path to a directory containing all relevant files for a BartTokenizerFast tokenizer.

In [12]:
from huggingface_hub import Repository, HfApi

def push_model_to_huggingface(model_dir: str, repo_name: str, organization: str = None):

    from huggingface_hub import HfApi, Repository

    api = HfApi()

    repo_id = "shradha01/code-completion-model"
    try:
        api.create_repo(repo_id=repo_id, exist_ok=True)
    except Exception as e:
        print(f"Error creating repository: {e}")
        return

    # Push model
    repo = Repository(local_dir=model_dir, clone_from=f"https://huggingface.co/{repo_id}")
    repo.push_to_hub(commit_message="Add fine-tuned code completion model")

    print(f"Model pushed to Hugging Face Hub at: https://huggingface.co/{repo_id}")

# Usage
model_directory = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models"
repository_name = "code-completion-model"
organization_name = "shradha01"  # Optional, replace with your Hugging Face organization name if any
push_model_to_huggingface(model_directory, repository_name, organization_name)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.


OSError: Tried to clone a repository in a non-empty folder that isn't a git repository ('/home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models'). If you really want to do this, do it manually:
 cd /home/user1-system11/Documents/research-shradha/CODE-SPT-Code-TreeSitterV3/SPT-Code/outputs/pre_train_20241105_105459/models && git init && git remote add origin && git pull origin main
 or clone repo to a new folder and move your existing files there afterwards.

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import BartConfig, GenerationConfig
from models.bart import BartForClassificationAndGeneration
from data.vocab import load_vocab
from utils.trainer import CodeTrainer
from utils.callbacks import LogStateCallBack
from data.data_collator import collate_fn
import os


class UserInputDataset(Dataset):
    """
    A simple dataset for wrapping user-provided input code snippets for prediction.
    """
    def __init__(self, input_code, code_vocab, max_length=128):
        """
        Args:
            input_code (str): The user-provided incomplete code snippet.
            code_vocab (Vocab): Pretrained vocabulary for code tokens.
            max_length (int): Maximum length for tokenized sequences.
        """
        self.max_length = max_length
        self.code_vocab = code_vocab
        self.pad_id = code_vocab.get_pad_index()  # Padding token index
        self.unk_id = code_vocab.get_unk_index()  # Unknown token index

        # Tokenize the input code snippet
        input_tokens = input_code.split()
        self.input_ids = [
            code_vocab.get_index(token) if code_vocab.get_index(token) is not None else self.unk_id
            for token in input_tokens
        ]

        # Create attention mask (1 for valid tokens, 0 for padding)
        self.attention_mask = [1] * len(self.input_ids)

        # Pad or truncate to max_length
        if len(self.input_ids) < self.max_length:
            self.input_ids += [self.pad_id] * (self.max_length - len(self.input_ids))
            self.attention_mask += [0] * (self.max_length - len(self.attention_mask))
        else:
            self.input_ids = self.input_ids[:self.max_length]
            self.attention_mask = self.attention_mask[:self.max_length]

    def __len__(self):
        return 1  # Single user input case

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask, dtype=torch.long)
        }


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
import enums
print("MODEL_MODE_GEN:", enums.MODEL_MODE_GEN)
print("MODEL_MODE_CLS:", enums.MODEL_MODE_CLS)
print("MODEL_MODE_SEARCH:", enums.MODEL_MODE_SEARCH)

MODEL_MODE_GEN: bart_gen
MODEL_MODE_CLS: bart_cls
MODEL_MODE_SEARCH: bart_search


In [16]:
from transformers import Seq2SeqTrainingArguments, IntervalStrategy, SchedulerType
from transformers import EarlyStoppingCallback

def run_custom_completion(args, model_dir, vocab_dir, input_code):
    """
    Run custom code completion using Seq2SeqTrainingArguments and CodeTrainer.

    Args:
        args: Arguments with necessary configurations.
        model_dir: Path to the saved model.
        vocab_dir: Path to the vocabularies.
        input_code: User-provided incomplete code.

    Returns:
        List: Generated completions.
    """
    # Load Model
    config = BartConfig.from_pretrained(os.path.join(model_dir, "config.json"))
    model = BartForClassificationAndGeneration.from_pretrained(
        os.path.join(model_dir, "model.safetensors"),
        config=config
    )
    model.set_model_mode(enums.MODEL_MODE_GEN)

    # Assign generation_config directly to the model
    model.generation_config = GenerationConfig(
        max_length=50,
        num_beams=5,
        early_stopping=True
    )

    # Load Vocabularies
    code_vocab = load_vocab(vocab_dir, "code")
    nl_vocab = load_vocab(vocab_dir, "nl")
    ast_vocab = load_vocab(vocab_dir, "ast")

    # Prepare Dataset
    dataset = UserInputDataset(input_code=input_code, code_vocab=code_vocab, max_length=args.max_code_len)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, collate_fn=lambda batch: collate_fn(batch, args, "completion", code_vocab, nl_vocab, ast_vocab)
    )


     # Initialize Trainer without relying on args.generation_config
    trainer = CodeTrainer(
        main_args=args,
        code_vocab=code_vocab,
        ast_vocab=ast_vocab,
        nl_vocab=nl_vocab,
        task="completion",
        model=model,
        data_collator=None,
        tokenizer=nl_vocab,
        compute_metrics=None,  # Metrics are not needed for single prediction
        callbacks=[LogStateCallBack()]
    )

    # Generate Predictions
    results = trainer.predict(test_dataset=data_loader)
    predictions = results.predictions

    # Decode Predictions
    decoded_completions = [code_vocab.decode(output.tolist()) for output in predictions]
    return decoded_completions



In [17]:

# Example Usage
class Args:
    max_code_len = 128
    # Add other required arguments

# Define paths
model_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/models"
vocab_dir = "/home/user1-system11/Documents/research-shradha/CODE-SPT-Code/spt-code/outputs/pre_train_20241115_232553/vocabs"

# User-provided incomplete code snippet
input_code = "for (int i = 0; i < n; i++) {"

# Initialize arguments
args = Args()

# Run the completion function
completions = run_custom_completion(args, model_dir, vocab_dir, input_code)

# Display the results
print("\nGenerated Completions:")
for i, completion in enumerate(completions, start=1):
    print(f"Completion {i}: {completion}")

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


AttributeError: 'TrainingArguments' object has no attribute 'generation_config'