In [2]:
#%pip install peft datasets ipywidgets bitsandbytes transformers optimum[openvino] nncf optimum[ipex]

In [None]:
#%pip freeze > requirements.txt

In [4]:
#%pip install -r requirements.txt

In [5]:
# Model identifier handed to the from_pretrained() calls in the cells below.
# Swap in the commented line to use the local fine-tuned directory instead.
# model_name = 'data/fine_tuned_qwen'
model_name = 'Qwen/Qwen2.5-Coder-0.5B'

In [None]:
import torch
import openvino as ov
# Backend selection: CUDA when torch can see a GPU, otherwise OpenVINO.
useOpenVino = not torch.cuda.is_available()
if useOpenVino:
    # Show every device OpenVINO reports and take the last entry of that list.
    print('OpenVINO Devices: ', ov.Core().available_devices)
    ovDevice = ov.Core().available_devices[-1]
    print("Using OpenVINO device " + ovDevice)
else:
    # Tensors created without an explicit device now land on the first GPU.
    torch.set_default_device('cuda:0')
print('CUDA available: ' + str(torch.cuda.is_available()))
print("Using OpenVino: " + str(useOpenVino))


In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Plain transformers path; skipped when the OpenVINO backend was selected above.
if not useOpenVino:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from transformers import AutoTokenizer, pipeline

# OpenVINO path: load the model with an 8-bit weight quantization config, move
# it to the selected device, and only then compile it.
if useOpenVino:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    quantization_config = OVWeightQuantizationConfig(bits=8)
    model = OVModelForCausalLM.from_pretrained(
        model_name,
        library='transformers',
        compile=False,  # compile() is called explicitly after .to(ovDevice)
        use_cache=False,
        quantization_config=quantization_config,
    ).to(ovDevice)
    model.compile()
#model.save_pretrained("data/test")

In [None]:
import time
import torch
from transformers import EosTokenCriteria, MaxLengthCriteria

# Resolve the ids of the four fill-in-the-middle control tokens in one lookup.
(fim_prefix_id, fim_suffix_id, fim_middle_id, fim_pad_id) = tokenizer.convert_tokens_to_ids(
    ["<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>", "<|fim_pad|>"]
)

# Use the tokenizer's pad token for generation.
model.generation_config.pad_token_id = tokenizer.pad_token_id

def generate_response(prefix, suffix, max_new_tokens=20, skip_special_tokens=False):
    """Generate the text that fills the gap between ``prefix`` and ``suffix``.

    The prompt is the token sequence
    ``<|fim_prefix|> prefix <|fim_suffix|> suffix <|fim_middle|>``; generation
    stops at the tokenizer's EOS token or at ``<|fim_pad|>``. One timing line
    is printed per call.

    NOTE(review): this is called from HTTP handler threads; whether
    ``model.generate`` may be called concurrently is not established here.

    Args:
        prefix: Text placed before the gap.
        suffix: Text placed after the gap (may be empty).
        max_new_tokens: Upper bound on generated tokens. Defaults to 20, the
            value that used to be hard-coded.
        skip_special_tokens: Forwarded to ``tokenizer.decode``. The default
            ``False`` keeps the previous output, which may include stop tokens.

    Returns:
        The decoded newly generated tokens, without the prompt.
    """
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    start = time.perf_counter()

    prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]
    suffix_ids = tokenizer(suffix, add_special_tokens=False)["input_ids"]

    # Single-row batch holding the full fill-in-the-middle prompt.
    input_ids = torch.tensor(
        [[fim_prefix_id] + prefix_ids + [fim_suffix_id] + suffix_ids + [fim_middle_id]],
        dtype=torch.int64,
    )
    # No padding is present, so every position is attended to.
    attention_mask = torch.ones_like(input_ids)
    input_token_count = input_ids.shape[1]

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        eos_token_id=[tokenizer.eos_token_id, fim_pad_id],
    )
    # generate() returns prompt + continuation; keep only the continuation.
    outputs = outputs[:, input_token_count:]
    output_token_count = outputs.shape[-1]

    # Measure once so the milliseconds and tokens/s figures agree, and guard
    # against a zero reading from a coarse clock.
    elapsed = time.perf_counter() - start
    total_tokens = output_token_count + input_token_count
    tokens_per_second = total_tokens / elapsed if elapsed > 0 else float("inf")
    print("Total input tokens: {} Output token count: {} Time: {}ms Token/s {}".format(input_token_count, output_token_count, 1000*elapsed, tokens_per_second))
    return tokenizer.decode(outputs[0], skip_special_tokens=skip_special_tokens)
# Smoke test: a long repeated prefix ending in a JS comment, with an empty suffix.
response = generate_response(
    'print("Total input tokens: {} Output token count:'*10+'\n// Log Hello World\nconsole.',
    '',
)
print(f"Response: {response}")

In [10]:
from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
import json
import ipywidgets as widgets


class Handler(BaseHTTPRequestHandler):
    """HTTP front end for ``generate_response``.

    * ``POST`` with a JSON object ``{"prefix": str, "suffix": str}`` returns
      the generated completion as UTF-8 text.
    * ``GET`` (any path) acknowledges the request and stops the server.
    """

    def _send_text(self, status, body):
        """Send ``body`` (bytes) as a complete plain-text response."""
        self.send_response(status)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Reply to the client, then shut down the server that owns this handler."""
        # Answer first so the client does not just see a dropped connection.
        self._send_text(200, b'Shutting down')
        # self.server is the server instance that created this handler.
        # shutdown() blocks until serve_forever() returns, which is fine here
        # because ThreadingHTTPServer runs each handler in its own thread.
        self.server.shutdown()
        self.server.server_close()

    def do_POST(self):
        """Run a completion for the JSON request body and return it as text.

        Responds 400 when the body is missing, is not valid UTF-8 JSON, or is
        not an object with string ``prefix`` and ``suffix`` fields. Responds
        500 (and re-raises) when generation itself fails.
        """
        try:
            content_len = int(self.headers.get('content-length', 0))
        except ValueError:
            content_len = 0
        if content_len <= 0:
            # rfile.read(-1) would block until the client closes the connection.
            self._send_text(400, b'Missing request body')
            return
        post_body = self.rfile.read(content_len)

        # Parse the JSON data
        try:
            request = json.loads(post_body.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f"Error decoding JSON: {e}")
            self._send_text(400, b'Invalid JSON data')
            return

        prefix = request.get('prefix') if isinstance(request, dict) else None
        suffix = request.get('suffix') if isinstance(request, dict) else None
        if not isinstance(prefix, str) or not isinstance(suffix, str):
            self._send_text(400, b'Request must be a JSON object with string "prefix" and "suffix"')
            return

        # Generate before sending the status line so failures are not reported as 200.
        try:
            response = generate_response(prefix, suffix)
        except Exception:
            self._send_text(500, b'Generation failed')
            raise  # let the server log the traceback as before
        self._send_text(200, response.encode('utf8'))

# Serve completion requests on port 4444 until a GET request (or a kernel
# interrupt) stops the loop.
# NOTE(review): '0.0.0.0' listens on every interface and the endpoint has no
# authentication; consider '127.0.0.1' if only local clients connect.
server = ThreadingHTTPServer(('0.0.0.0', 4444), Handler)
try:
    server.serve_forever()
finally:
    # Always release the listening socket so the cell can be re-run without
    # hitting "address already in use"; closing twice is harmless.
    server.server_close()


In [None]:
# Manual check: POST a prefix/suffix pair to the completion server on port 4444.
# NOTE(review): server.serve_forever() in the server cell blocks this kernel, so
# this presumably has to be run from a terminal or another kernel — confirm.
!curl http://localhost:4444/200 -d '{"prefix":"//log Hello World\nconsole.","suffix":"}"}'

In [None]:
from peft import PeftModel
# Fold the PEFT adapter at `model_name` into the base Qwen weights, then write
# the merged model and the tokenizer to data/test.
# NOTE(review): `model_name` presumably has to be an adapter directory here
# (e.g. the commented-out 'data/fine_tuned_qwen'), not the base model id — confirm.
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-0.5B",
    use_cache=True,
)
merged_model = PeftModel.from_pretrained(base_model, model_name).merge_and_unload()
merged_model.save_pretrained("data/test")
tokenizer.save_pretrained("data/test")

In [None]:
# Export the model saved in data/test to ONNX (text-generation task); output goes to data/onnx.
!optimum-cli export onnx --model data/test --task text-generation data/onnx

In [None]:
# Export the model saved in data/test to OpenVINO with fp16 weights; output goes to data/ov_model/.
!optimum-cli export openvino --model data/test --task text-generation --weight-format fp16  data/ov_model/

git clone https://github.com/ggerganov/llama.cpp --depth=1
cd llama.cpp/
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
python convert_hf_to_gguf.py ../open-auto-complete/fine-tune/data/test/ --outfile qwen_fine.gguf


/llm/ollama/ollama create --quantize q4_K_M qwen_tuned