In [None]:
# Install the notebook's dependencies into the kernel's environment.
# No versions are pinned here, so the resolved versions depend on when this runs.
%pip install peft datasets ipywidgets bitsandbytes transformers[onnx] optimum[openvino] optimum[ipex]

In [None]:
# Snapshot the currently installed package versions into requirements.txt
# (overwrites any existing file of that name in the working directory).
%pip freeze > requirements.txt

In [None]:
# Install the versions recorded in requirements.txt.
# NOTE(review): when run right after the freeze cell this presumably changes nothing;
# it looks intended for recreating the environment on another machine - confirm.
%pip install -r requirements.txt

In [None]:
# Path handed to from_pretrained() by the model/tokenizer loading cell and used
# as the PEFT adapter location by the merge cell.
model_name = "data/fine_tuned_qwen"

In [None]:
import torch
import openvino as ov
# Pick the inference device: CUDA when torch can see it, otherwise the last
# device reported by the OpenVINO runtime.
has_cuda = torch.cuda.is_available()
useOpenVino = not has_cuda
if has_cuda:
    device = 'cuda:0'
else:
    ov_devices = ov.Core().available_devices
    print('OpenVINO Devices: ', ov_devices)
    device = ov_devices[-1]

# Summarise the decision so it is visible in the cell output.
print('CUDA available: ' + str(has_cuda))
print("Using OpenVino: " + str(useOpenVino))
print("Using Device " + device)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# `device` holds an OpenVINO device name (e.g. 'CPU', 'GPU.1') when CUDA is not
# available. torch does not accept those strings in `.to()`, so the PyTorch model
# falls back to the CPU in that case; the OpenVINO model is loaded in its own cell.
torch_device = 'cpu' if useOpenVino else device
model = AutoModelForCausalLM.from_pretrained(model_name).to(torch_device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import openvino as ov
# Create an OpenVINO runtime handle. The bare expression on the last line makes
# the notebook display the list of device names the runtime reports.
core = ov.Core()
core.available_devices

In [None]:
import openvino as ov
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from transformers import AutoTokenizer, pipeline

# Directory holding the merged model; it is written by the PEFT merge cell
# further down, so that cell has to have been run at least once beforehand.
OV_MODEL_DIR = "data/test"
# Device to use when present; other hosts fall back instead of failing.
PREFERRED_OV_DEVICE = 'GPU.1'

ov_devices = ov.Core().available_devices
if PREFERRED_OV_DEVICE in ov_devices:
    ov_device = PREFERRED_OV_DEVICE
else:
    # Any GPU the runtime reports, otherwise the CPU.
    ov_device = next((name for name in ov_devices if name.startswith('GPU')), 'CPU')
print("OpenVINO inference device: " + ov_device)

# 4-bit weight-only quantization applied while loading.
quantization_config = OVWeightQuantizationConfig(bits=4)
# compile=False defers compilation until the target device has been set.
model = OVModelForCausalLM.from_pretrained(
    OV_MODEL_DIR,
    library='transformers',
    compile=False,
    use_cache=False,
    quantization_config=quantization_config,
).to(ov_device)
tokenizer = AutoTokenizer.from_pretrained(OV_MODEL_DIR)
model.compile()
# Optional: persist the quantized OpenVINO model (left disabled).
#model.save_pretrained("data/test")

In [None]:
import time
import torch
# Resolve the vocabulary ids of the three fill-in-the-middle control tokens once,
# so generate_response() can splice them into the prompt as plain integers.
fim_prefix_id, fim_suffix_id, fim_middle_id = (
    tokenizer.convert_tokens_to_ids(fim_token)
    for fim_token in ("<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>")
)

def generate_response(prefix, suffix, max_new_tokens=50):
    """Generate a fill-in-the-middle completion for the gap between two code snippets.

    The prompt is laid out as
    ``<|fim_prefix|> prefix <|fim_suffix|> suffix <|fim_middle|>`` using the
    module-level ``tokenizer``, ``model`` and ``fim_*_id`` values.

    Args:
        prefix: Text before the insertion point.
        suffix: Text after the insertion point (may be empty).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        stripped). Special tokens are not filtered out of the decoded text.

    Side effects:
        Prints token counts, elapsed milliseconds and tokens per second, where the
        rate counts prompt and generated tokens together.
    """
    start = time.perf_counter()

    # Tokenize without special tokens; the FIM markers are added explicitly below.
    prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]
    suffix_ids = tokenizer(suffix, add_special_tokens=False)["input_ids"]
    prompt_ids = [fim_prefix_id] + prefix_ids + [fim_suffix_id] + suffix_ids + [fim_middle_id]

    # Place the tensors where the model says it lives. The notebook-level `device`
    # may hold an OpenVINO device name (e.g. 'GPU.1'), which torch cannot parse.
    target_device = getattr(model, "device", None)
    if not isinstance(target_device, torch.device):
        target_device = torch.device("cpu")

    input_ids = torch.tensor([prompt_ids], dtype=torch.int64).to(target_device)
    inputs = {
        'input_ids': input_ids,
        # Single unpadded sequence: every position is attended to.
        'attention_mask': torch.ones_like(input_ids),
    }
    input_token_count = input_ids.shape[1]

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    outputs = outputs[:, input_token_count:]
    output_token_count = outputs.shape[-1]

    # Measure once so the reported time and rate refer to the same interval.
    elapsed = max(time.perf_counter() - start, 1e-9)
    print("Total input tokens: {} Output token count: {} Time: {}ms Token/s {}".format(
        input_token_count,
        output_token_count,
        1000 * elapsed,
        (output_token_count + input_token_count) / elapsed,
    ))
    return tokenizer.decode(outputs[0])
# Smoke test: a long repeated prefix followed by a short JS snippet to complete,
# with an empty suffix.
example_prefix = 'print("Total input tokens: {} Output token count:' * 200 + '\n// Log Hello World\nconsole.'
response = generate_response(example_prefix, '')
print(f"Response: {response}")

In [None]:
from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
import json

class Handler(BaseHTTPRequestHandler):
    """HTTP handler exposing ``generate_response`` as a JSON POST endpoint.

    Request body: a JSON object with string fields ``prefix`` and ``suffix``.
    Responses (all ``text/plain; charset=utf-8``):
        200 - the generated completion
        400 - bad Content-Length, undecodable/invalid JSON, or missing fields
        500 - generation raised an exception
    The request path is not inspected.
    """

    def _send_text(self, status, body):
        """Send ``body`` (bytes) as a complete plain-text response with ``status``."""
        self.send_response(status)
        self.send_header('Content-Type', 'text/plain; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_POST(self):
        """Parse the JSON request, run generation and write the response."""
        # Validate the declared length before reading: a negative value would
        # make rfile.read() wait for end-of-stream.
        try:
            content_len = int(self.headers.get('content-length', 0))
        except (TypeError, ValueError):
            content_len = -1
        if content_len < 0:
            self._send_text(400, b'Invalid Content-Length header')
            return
        post_body = self.rfile.read(content_len)

        # Parse the JSON data
        try:
            request = json.loads(post_body.decode('utf-8'))
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f"Error decoding JSON: {e}")
            self._send_text(400, b'Invalid JSON data')
            return

        prefix = request.get('prefix') if isinstance(request, dict) else None
        suffix = request.get('suffix') if isinstance(request, dict) else None
        if not isinstance(prefix, str) or not isinstance(suffix, str):
            self._send_text(400, b'Request must be a JSON object with string "prefix" and "suffix" fields')
            return

        # Generate before committing to a status line, so a failure is reported
        # as 500 rather than as an empty 200.
        try:
            response = generate_response(prefix, suffix)
        except Exception as e:
            print(f"Error generating response: {e}")
            self._send_text(500, b'Generation failed')
            return

        self._send_text(200, response.encode('utf8'))


def run(host='0.0.0.0', port=4444):
    """Serve completion requests until the kernel is interrupted.

    Args:
        host: Interface to bind. The default '0.0.0.0' listens on all interfaces;
            the endpoint has no authentication, so pass '127.0.0.1' to keep it local.
        port: TCP port to listen on.

    The call blocks. The ``with`` block closes the listening socket on exit, so
    the cell can be re-run after an interrupt without "Address already in use".
    """
    with ThreadingHTTPServer((host, port), Handler) as server:
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            # Interrupting the kernel is the normal way to stop the server.
            print("Server stopped")
run()

In [None]:
# Smoke-test the completion endpoint: POST a JSON body with "prefix" and "suffix"
# to the server on port 4444. The handler does not read the URL path, so "/200"
# has no effect on the result.
# NOTE(review): the server cell blocks while serving, so this presumably has to be
# run from a terminal or a second kernel - confirm.
!curl http://localhost:4444/200 -d '{"prefix":"//log Hello World\nconsole.","suffix":"}"}'

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Merge the fine-tuned PEFT adapter stored at `model_name` into the base model and
# write a standalone checkpoint to data/test (the input of the export cells and of
# the OpenVINO loading cell).
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-0.5B", use_cache=True)
peft_model = PeftModel.from_pretrained(base_model, model_name)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("data/test")

# Load the tokenizer explicitly instead of saving whichever `tokenizer` another
# cell happened to leave in the kernel, so this cell works on a fresh kernel.
merge_tokenizer = AutoTokenizer.from_pretrained(model_name)
merge_tokenizer.save_pretrained("data/test")

In [None]:
# Export the merged checkpoint in data/test to ONNX (text-generation task),
# writing the result to data/onnx.
!optimum-cli export onnx --model data/test --task text-generation data/onnx

In [None]:
# Export the merged checkpoint in data/test to OpenVINO format with int4 weights
# (text-generation task), writing the result to data/ov_model/.
!optimum-cli export openvino --model data/test --task text-generation --weight-format int4  data/ov_model/

Convert the merged model in `data/test` to GGUF with llama.cpp (run these in a terminal, not in the notebook):

```bash
git clone https://github.com/ggerganov/llama.cpp --depth=1
cd llama.cpp/
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
python convert_hf_to_gguf.py ../open-auto-complete/fine-tune/data/test/ --outfile qwen_fine.gguf
```


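Create a `q4_K_M`-quantized Ollama model named `qwen_tuned`. `ollama create` reads a `Modelfile` from the current directory unless `-f` is given, so that file must reference the GGUF produced above:

```bash
/llm/ollama/ollama create --quantize q4_K_M qwen_tuned
```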