From 7ece012d8a0006741c242aac081c33d9e797f065 Mon Sep 17 00:00:00 2001
From: shewu-quic
Date: Mon, 3 Feb 2025 14:03:43 +0800
Subject: [PATCH 1/4] Qualcomm AI Engine Direct - Enable AR-N mode to process prompt in hybrid mode

Summary:
- Add `max_seq_len` to refer to the maximum number of tokens that the model can process and consider at once when generating predictions/responses.
- Add `prefill_ar_len` to determine the number of tokens to consume and the number of logits to produce for the prompt processor in hybrid mode.
- Remove prefill mode
---
 examples/qualcomm/oss_scripts/llama/README.md |  15 +-
 examples/qualcomm/oss_scripts/llama/llama.py  | 237 +++---
 .../oss_scripts/llama/model/static_llama.py   |  60 +-
 .../oss_scripts/llama/qnn_llama_runner.cpp    |   6 +-
 .../oss_scripts/llama/runner/io_manager.cpp   | 690 ++++++++++++------
 .../oss_scripts/llama/runner/io_manager.h     |  74 +-
 .../oss_scripts/llama/runner/runner.cpp       | 123 ++--
 .../oss_scripts/llama/runner/runner.h         |   7 +-
 exir/lowered_backend_module.py                |   2 +-
 9 files changed, 757 insertions(+), 457 deletions(-)

diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md
index 439278cb424..4532e7656f7 100644
--- a/examples/qualcomm/oss_scripts/llama/README.md
+++ b/examples/qualcomm/oss_scripts/llama/README.md
@@ -8,11 +8,10 @@ This file provides you the instructions to run LLAMA model with different parame
 We offer the following modes to execute the model:
 
-Prefill Mode: This is also known as batch prefill mode, where the model takes in a list of tokens as input and generates the next token along with the key-value (KV) cache for all tokens. This mode is efficient for encoding the user's prompt.
-
 KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt.
 
-Hybrid Mode: Hybrid mode leverages the strengths of both batch prefill and KV cache modes to optimize token generation speed. Initially, it uses prefill mode to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens.
+Hybrid Mode: Hybrid mode leverages the strengths of both the AR-N model and KV cache mode to optimize token generation speed. Initially, it uses the AR-N model to efficiently generate the prompt's key-value (KV) cache. Then, it switches to KV cache mode, which excels at generating subsequent tokens.
+ - AR-N model: The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use it to process the prompt and generate the key-value (KV) cache; it serves as the prompt processor in hybrid mode.
 
 ## Instructions
@@ -50,13 +49,13 @@ At the end of this step, users should have the following files ready: `consolida
 ### Step3: Run default examples using hybrid mode.
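Editor's note: the sketch below is illustrative only and is not part of this patch or the repo; `split_prompt_into_ar_blocks` and `pad_id` are hypothetical names. It shows the intuition behind the AR-N prompt processor that hybrid mode runs before switching to KV cache mode: the prompt is consumed in fixed-size blocks of `prefill_ar_len` tokens, and each block yields `prefill_ar_len` logits plus KV cache entries.

```python
# Minimal sketch (hypothetical, not repo code): chunk a prompt into AR-N
# blocks of prefill_ar_len tokens. The first block is left-padded, roughly
# mirroring how the calibration loop in llama.py (later in this patch) pads
# when fewer than ar_len tokens are available.
def split_prompt_into_ar_blocks(prompt_tokens, prefill_ar_len, pad_id=0):
    padding = (-len(prompt_tokens)) % prefill_ar_len
    padded = [pad_id] * padding + list(prompt_tokens)
    return [
        padded[i : i + prefill_ar_len]
        for i in range(0, len(padded), prefill_ar_len)
    ]

# A 5-token prompt with prefill_ar_len=4 becomes two blocks:
# [[0, 0, 0, 101], [102, 103, 104, 105]]
print(split_prompt_into_ar_blocks([101, 102, 103, 104, 105], 4))
```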
#### LLAMA2 ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "Once upon a time" +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M.pt --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --llama_model stories110m --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "Once upon a time" ``` #### LLAMA3.2 Default example using hybrid mode. ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" ``` ### KV Cache update mechanism @@ -109,16 +108,16 @@ We have two distinct mechanisms for updating the key-value (KV) cache, which can ### Additional Configs when running the script If you would like to compile the model only, we have provided the flag `--compile_only`. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --compile_only +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --compile_only ``` On the other hand, if you already have a pre-compiled .pte model, you can perform inference by providing the flag `--pre_gen_pte` and specifying the folder that contains the .pte model. Taking LLAMA3.2 as an example: ```bash -python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} +python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --pre_gen_pte ${FOLDER_TO_PRE_GEN_PTE} ``` You can select the KV Cache update mechanism at runtime by setting the `KV_UPDATER` variable to either "shift_pointer" or "smart_mask". By default, it is set to "smart_mask". 
`KV_UPDATER` = "shift_pointer"
```bash
-python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_seq_len 32 --kv_seq_len 128 --prompt "what is 1+1" --kv_updator ${KV_UPDATER}
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --kv_updater ${KV_UPDATER}
```

diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index e853812a949..9cad2499730 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -89,32 +89,38 @@ logging.getLogger().setLevel(logging.INFO)
 
 
-def smart_mask_updator(atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches):
-    for i, k_cache in enumerate(k_caches):
-        k_cache[:, :, pos] = new_k_caches[i][:, :, 0]
+def smart_mask_updater(
+    ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+):
+    # Update the KV cache input for the next inference when the position exceeds the autoregressive length.
+    if pos >= ar_len:
+        for i, k_cache in enumerate(k_caches):
+            k_cache[:, :, pos - ar_len] = new_k_caches[i][:, :, 0]
 
-    for i, v_cache in enumerate(v_caches):
-        v_cache[:, pos, :] = new_v_caches[i]
+        for i, v_cache in enumerate(v_caches):
+            v_cache[:, pos - ar_len, :] = new_v_caches[i][:, 0, :]
+        atten_mask[:, :, pos - ar_len] = 0
 
-    atten_mask[0][pos] = 0
     pos += 1
     return (atten_mask, pos, k_caches, v_caches)
 
 
-def shift_pointer_updator(
-    atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+def shift_pointer_updater(
+    ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
 ):
-    k_caches = [
-        torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
-        for i, k_cache in enumerate(k_caches)
-    ]
-    v_caches = [
-        torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
-        for i, v_cache in enumerate(v_caches)
-    ]
+    # Update the KV cache input for the next inference when the position exceeds the autoregressive length.
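+    # (Editorial note, not in the original patch) While pos < ar_len the
+    # prompt is still being consumed inside the current AR-N window, so
+    # nothing is written back to the cache inputs; only `pos` advances.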
+ if pos >= ar_len: + k_caches = [ + torch.cat([k_cache[:, :, 1:], new_k_caches[i][:, :, :1]], dim=-1) + for i, k_cache in enumerate(k_caches) + ] + v_caches = [ + torch.cat([v_cache[:, 1:, :], new_v_caches[i][:, :1, :]], dim=1) + for i, v_cache in enumerate(v_caches) + ] + atten_mask[:, :, -pos - 1] = 0 pos += 1 - atten_mask[0][-pos - 1] = 0 return (atten_mask, pos, k_caches, v_caches) @@ -123,15 +129,15 @@ def _kv_calibrate( user_prompts, module: torch.fx.GraphModule, tokenizer, + ar_len=1, max_seq_len=512, - updator=smart_mask_updator, + updater=smart_mask_updater, use_i64_token=False, ): _, atten_mask, _, k_caches, v_caches = example_inputs # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int32) - max_cache_len = max_seq_len - 1 + all_pos = torch.arange(0, max_seq_len, 1, dtype=torch.int32).unsqueeze(0) token_list = [] # Llama2 tokenizer has no special tokens @@ -144,21 +150,50 @@ def _kv_calibrate( else: raise RuntimeError("Unkown tokenizer") + pos = len(token_list) if len(token_list) < ar_len else ar_len + dtype = torch.int64 if use_i64_token else torch.int32 + with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: - dtype = torch.int64 if use_i64_token else torch.int32 - token = torch.full((1, 1), token_list[pos], dtype=dtype) + while token_list[-1] != tokenizer.eos_id and pos < max_seq_len: + tmp_token_list = torch.tensor( + token_list[pos - ar_len : pos], dtype=dtype + ).reshape(1, -1) + tmp_pos = all_pos[:, pos - ar_len : pos] + tmp_atten_mask = atten_mask + if pos < ar_len: + tmp_token_list = torch.cat( + [ + torch.zeros((1, ar_len - pos), dtype=dtype), + torch.tensor(token_list, dtype=dtype).reshape(1, -1), + ], + dim=1, + ) + tmp_pos = torch.cat( + [ + torch.zeros((1, ar_len - pos), dtype=torch.int32), + all_pos[:, :pos], + ], + dim=1, + ) + tmp_atten_mask = torch.cat( + [ + torch.ones(1, ar_len, max_seq_len - pos) * -255.0, + atten_mask[:, :, -pos:], + ], + dim=-1, + ) + logits, new_k_caches, new_v_caches = module( - token, - atten_mask, - torch.full((1, 1), pos), + tmp_token_list, + tmp_atten_mask, + tmp_pos, *k_caches, *v_caches, ) - atten_mask, pos, k_caches, v_caches = updator( - atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches + atten_mask, pos, k_caches, v_caches = updater( + ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches ) - if pos >= len(token_list): + if pos > len(token_list): token_list.append(torch.argmax(logits[:, -1], dim=-1).item()) print(f"kv calibration data:\n{tokenizer.decode(token_list)}") @@ -173,7 +208,6 @@ def _prefill_calibrate( use_i64_token=False, ): _, atten_mask = example_inputs - max_cache_len = max_seq_len - 1 # TODO: change criteria & support batch inputs if necessary @@ -192,20 +226,24 @@ def _prefill_calibrate( dtype = torch.int64 if use_i64_token else torch.int32 with torch.no_grad(): - while token_list[-1] != tokenizer.eos_id and pos < max_cache_len: + while token_list[-1] != tokenizer.eos_id and pos < max_seq_len: tmp_token_list = torch.tensor(token_list, dtype=dtype).reshape(1, -1) - if pos < max_cache_len: + if pos < max_seq_len: tmp_token_list = torch.cat( [ tmp_token_list, - torch.zeros((1, max_cache_len - pos), dtype=dtype), + torch.zeros((1, max_seq_len - pos), dtype=dtype), ], dim=1, ) - logits, new_k_caches, new_v_caches = module( + results = module( tmp_token_list, atten_mask, ) + if len(results) == 3: + logits, new_k_caches, new_v_caches = results + elif len(results) == 1: + logits = results 
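+                # (Editorial note, not in the original patch) When the graph
+                # is exported with output_cache=False it returns the logits
+                # tensor directly; at batch size 1 that tensor has len() == 1,
+                # so `results` is already the logits and needs no unpacking.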
token_list.append(torch.argmax(logits[:, pos - 1], dim=-1).item()) pos += 1 @@ -217,8 +255,9 @@ def calibrate( user_prompts, module: torch.fx.GraphModule, tokenizer, + ar_len=1, max_seq_len=512, - kv_updator=smart_mask_updator, + kv_updater=smart_mask_updater, use_i64_token=False, ): if len(example_inputs) == 2: @@ -236,8 +275,9 @@ def calibrate( user_prompts, module, tokenizer, + ar_len, max_seq_len, - updator=kv_updator, + updater=kv_updater, use_i64_token=use_i64_token, ) else: @@ -268,56 +308,36 @@ def _tag_ios(self, gm: torch.fx.GraphModule, fixed_point_type): # shape of k caches and v caches kv_cache_shape = { - # single head, kv mode input + # single head, kv input (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"]), (self.llama_meta["get_max_seq_len"], self.llama_meta["get_head_dim"]), - # single head, kv mode output - (self.llama_meta["get_head_dim"], 1), - (1, self.llama_meta["get_head_dim"]), - # single head, bert mode - (self.llama_meta["get_head_dim"], self.llama_meta["get_max_seq_len"] - 1), - (self.llama_meta["get_max_seq_len"] - 1, self.llama_meta["get_head_dim"]), + # single head, kv output + (self.llama_meta["get_head_dim"], self.llama_meta["get_ar_len"]), + (self.llama_meta["get_ar_len"], self.llama_meta["get_head_dim"]), } io_shape = { - # kv mode + # logit output ( self.llama_meta["get_max_batch_size"], - 1, - self.llama_meta["get_vocab_size"], - ), - # bert mode - ( - self.llama_meta["get_max_batch_size"], - self.llama_meta["get_max_seq_len"] - 1, + self.llama_meta["get_ar_len"], self.llama_meta["get_vocab_size"], ), } atten_mask_shape = { - # kv mode - (self.llama_meta["get_max_batch_size"], self.llama_meta["get_max_seq_len"]), - # bert mode ( - self.llama_meta["get_max_seq_len"] - 1, - self.llama_meta["get_max_seq_len"] - 1, + self.llama_meta["get_max_batch_size"], + self.llama_meta["get_ar_len"], + self.llama_meta["get_max_seq_len"], ), } freq_shape = { - # kv mode - (1, self.llama_meta["get_head_dim"] // 2), - # bert mode - ( - self.llama_meta["get_max_seq_len"] - 1, - self.llama_meta["get_head_dim"] // 2, - ), + (self.llama_meta["get_ar_len"], self.llama_meta["get_head_dim"] // 2), } freq_op = { - # kv mode exir_ops.edge.aten.select.int, - # bert mode - exir_ops.edge.aten.slice_copy.Tensor, } for n in gm.graph.nodes: @@ -376,8 +396,9 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()): args.prompt, fx_graph_module, tokenizer=tokenizer, + ar_len=self.llama_meta["get_ar_len"], max_seq_len=self.llama_meta["get_max_seq_len"], - kv_updator=args.kv_updator, + kv_updater=args.kv_updater, use_i64_token=args.embedding_quantize is not None, ) @@ -467,12 +488,14 @@ def compile(args, pte_filename, tokenizer): kv_config = ModelArgs(**json.load(f)) # TODO: support batch inputs if necessary kv_config.max_batch_size = 1 - kv_config.max_seq_len = args.kv_seq_len + kv_config.max_seq_len = args.max_seq_len kv_config.use_kv_cache = True prefill_config = copy.copy(kv_config) - prefill_config.max_seq_len = args.prefill_seq_len - prefill_config.use_kv_cache = False + prefill_config.max_seq_len = args.max_seq_len + prefill_config.use_kv_cache = ( + False if args.max_seq_len == args.prefill_ar_len else True + ) state_dict = torch.load( args.checkpoint, weights_only=True, map_location="cpu", mmap=True @@ -484,27 +507,29 @@ def compile(args, pte_filename, tokenizer): if args.model_mode == "kv": llama_instance_list.append( LlamaModel( - kv_config, output_new_cache_only=True, use_i64_token=use_i64_token - ) - ) - elif args.model_mode == "prefill": - 
llama_instance_list.append( - LlamaModel( - prefill_config, - output_new_cache_only=False, + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, use_i64_token=use_i64_token, ) ) elif args.model_mode == "hybrid": llama_instance_list.append( LlamaModel( - kv_config, output_new_cache_only=True, use_i64_token=use_i64_token + kv_config, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=use_i64_token, ) ) llama_instance_list.append( LlamaModel( prefill_config, - output_new_cache_only=False, + ar_len=args.prefill_ar_len, + output_new_cache_only=True, + output_cache=True, use_i64_token=use_i64_token, ) ) @@ -606,7 +631,7 @@ def compile(args, pte_filename, tokenizer): start_lowering_ts = time.time() quant_attrs = None - if args.model_mode in ["kv", "prefill"]: + if args.model_mode in ["kv"]: llama_instance_list[0].lowering_modules( args.artifact, fixed_point_type, @@ -783,12 +808,10 @@ def compile(args, pte_filename, tokenizer): def inference(args, quant_attrs, pte_filename, runtime_tokenizer_path, pre_gen_pte=""): workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama" - if args.model_mode == "prefill": + if args.model_mode == "kv": eval_mode = 0 - elif args.model_mode == "kv": - eval_mode = 1 elif args.model_mode == "hybrid": - eval_mode = 2 + eval_mode = 1 else: raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") @@ -807,7 +830,7 @@ def post_process(): with open(f"{args.artifact}/outputs/outputs.txt", "r") as f: outputs.append(f.read()) - seq_len = args.prefill_seq_len if args.model_mode == "prefill" else args.kv_seq_len + seq_len = args.max_seq_len runner_args = " ".join( [ f'--prompt "{args.prompt}"', @@ -824,9 +847,9 @@ def post_process(): # x86 emulator is intended for CI and not performance. Check only the first few tokens. seq_len = min(seq_len, 16) - if args.kv_updator == smart_mask_updator: + if args.kv_updater == smart_mask_updater: logging.warning( - "x86 only support ShiftPointer, overwrite kv_updator to ShiftPointer" + "x86 only support ShiftPointer, overwrite kv_updater to ShiftPointer" ) qnn_sdk = os.getenv("QNN_SDK_ROOT") @@ -839,7 +862,7 @@ def post_process(): f"--model_path {pte_path}", f"--seq_len {seq_len}", f"--output_path {args.artifact}/outputs/outputs.txt", - f"--kv_updator ShiftPointer", + f"--kv_updater ShiftPointer", runner_args, ] ) @@ -859,7 +882,7 @@ def post_process(): f"--model_path {pte_filename}.pte", f"--seq_len {seq_len}", "--output_path outputs/outputs.txt", - f"--kv_updator {'SmartMask' if args.kv_updator == smart_mask_updator else 'ShiftPointer'}", + f"--kv_updater {'SmartMask' if args.kv_updater == smart_mask_updater else 'ShiftPointer'}", runner_args, ] ) @@ -998,28 +1021,28 @@ def _build_parser(): parser.add_argument( "--model_mode", - help="Export and inference prefill mode, kv mode or hybrid mode", + help="Export and inference kv mode or hybrid mode", default="kv", - choices=["prefill", "kv", "hybrid"], + choices=["kv", "hybrid"], type=str, ) parser.add_argument( - "--prefill_seq_len", - help="Ouput sequence length for llama. Use this option for prefill or hybrid mode", - default=32, + "--max_seq_len", + help="This refers to maximum number of tokens that the model can process & consider at once to generate predictions/responses.", + default=512, type=int, ) parser.add_argument( - "--kv_seq_len", - help="Ouput sequence length for llama. 
Use this option for kv or hybrid mode",
-        default=512,
+        "--prefill_ar_len",
+        help="The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use this option to process the prompt and generate the key-value (KV) cache, which serves as the prompt processor in hybrid mode.",
+        default=32,
         type=int,
     )
 
     parser.add_argument(
-        "--kv_updator",
+        "--kv_updater",
         help="Choose how to update kv cache during runtime",
         choices=["smart_mask", "shift_pointer"],
         default="smart_mask",
@@ -1045,12 +1068,10 @@ def export_llama(args) -> None:
 
     if args.model_mode == "kv":
         pte_filename = "kv_llama_qnn"
-    elif args.model_mode == "prefill":
-        pte_filename = "prefill_llama_qnn"
     elif args.model_mode == "hybrid":
         assert (
-            args.kv_seq_len >= args.prefill_seq_len
-        ), "Please ensure kv_seq_len is >= prefill_seq_len"
+            args.max_seq_len >= args.prefill_ar_len
+        ), "Please ensure max_seq_len is >= prefill_ar_len"
         pte_filename = "hybrid_llama_qnn"
     else:
         raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")
@@ -1073,13 +1094,13 @@ def export_llama(args) -> None:
     else:
         raise RuntimeError(f"Unknown llama_model: {args.llama_model}.")
 
-    if args.kv_updator == "smart_mask":
+    if args.kv_updater == "smart_mask":
         args.shared_buffer = True
-        args.kv_updator = smart_mask_updator
-    elif args.kv_updator == "shift_pointer":
-        args.kv_updator = shift_pointer_updator
+        args.kv_updater = smart_mask_updater
+    elif args.kv_updater == "shift_pointer":
+        args.kv_updater = shift_pointer_updater
     else:
-        exit(f"Using an unkown kv update {args.kv_updator}")
+        exit(f"Using an unknown kv update {args.kv_updater}")
 
     if args.pre_gen_pte:
         quant_attrs = json.load(
diff --git a/examples/qualcomm/oss_scripts/llama/model/static_llama.py b/examples/qualcomm/oss_scripts/llama/model/static_llama.py
index 253abc9578c..09cc7504224 100755
--- a/examples/qualcomm/oss_scripts/llama/model/static_llama.py
+++ b/examples/qualcomm/oss_scripts/llama/model/static_llama.py
@@ -153,10 +153,7 @@ def forward_sha(
         y = y.reshape(bsz, seq_len, -1)
 
         if self.output_new_cache_only:
-            if k_caches and v_caches:
-                return y, k, v
-            # batch_prefill mode.
Consider to remove, it's not really used - return y, k[-1], v[-1] + return y, k, v return y, kh, vh @@ -298,7 +295,12 @@ def forward( class LlamaModel(nn.Module): def __init__( - self, config: ModelArgs, output_new_cache_only=True, use_i64_token=False + self, + config: ModelArgs, + ar_len=1, + output_new_cache_only=True, + output_cache=True, + use_i64_token=False, ): super().__init__() self.dim = config.dim @@ -311,8 +313,10 @@ def __init__( self.vocab_size = config.vocab_size self.rope_freq_base = config.rope_freq_base self.use_kv_cache = config.use_kv_cache + self.ar_len = ar_len self.output_new_cache_only = output_new_cache_only self.use_i64_token = use_i64_token + self.output_cache = output_cache self.layers = nn.ModuleList( [ @@ -359,10 +363,10 @@ def forward( output_v_cache = [] # following tensors should be invariant across batches freqs_cos = ( - self.freqs_cos[input_pos][0] if self.use_kv_cache else self.freqs_cos[:-1] + self.freqs_cos[input_pos][0] if self.use_kv_cache else self.freqs_cos ) freqs_sin = ( - self.freqs_sin[input_pos][0] if self.use_kv_cache else self.freqs_sin[:-1] + self.freqs_sin[input_pos][0] if self.use_kv_cache else self.freqs_sin ) hidden_states = self.tok_embeddings(tokens) @@ -388,19 +392,36 @@ def forward( hidden_states = self.norm(hidden_states) logits = self.output(hidden_states) - return logits, output_k_cache, output_v_cache + if self.output_cache: + return logits, output_k_cache, output_v_cache + return logits def get_example_inputs(self, use_kv_cache=True): dtype = torch.int64 if self.use_i64_token else torch.int32 - if use_kv_cache: - tokens = torch.randint( - self.vocab_size, (self.max_batch_size, 1), dtype=dtype - ) + tokens = torch.randint( + self.vocab_size, (self.max_batch_size, self.ar_len), dtype=dtype + ) - pos_ids = torch.zeros((self.max_batch_size, 1), dtype=torch.int32) + atten_mask = torch.full((self.ar_len, self.ar_len), torch.tensor(-255.0)) + mask_cond = torch.arange(atten_mask.size(-1)) + atten_mask.masked_fill_( + mask_cond < (mask_cond + 1).view(atten_mask.size(-1), 1), 0 + ) + if self.max_seq_len != self.ar_len: + atten_mask = torch.cat( + [ + torch.ones(self.ar_len, self.max_seq_len - self.ar_len) * -255.0, + atten_mask, + ], + dim=-1, + ) + atten_mask = atten_mask[None, :, :].expand( + self.max_batch_size, self.ar_len, self.max_seq_len + ) + if use_kv_cache: + pos_ids = torch.zeros((self.max_batch_size, self.ar_len), dtype=torch.int32) k_cache, v_cache = [], [] - atten_mask = torch.full((self.max_batch_size, self.max_seq_len), -255.0) - atten_mask[:, -1] = 0 + for _ in range(self.n_layers): for _ in range(self.n_kv_heads): # transpose first to decrease the runtime efforts @@ -408,13 +429,13 @@ def get_example_inputs(self, use_kv_cache=True): torch.zeros( self.max_batch_size, self.head_dim, - self.max_seq_len - 1, + self.max_seq_len - self.ar_len, ) ) v_cache.append( torch.zeros( self.max_batch_size, - self.max_seq_len - 1, + self.max_seq_len - self.ar_len, self.head_dim, ) ) @@ -426,10 +447,6 @@ def get_example_inputs(self, use_kv_cache=True): v_cache, ) - max_promp = self.max_seq_len - 1 - tokens = torch.arange(0, max_promp, 1, dtype=dtype).unsqueeze(0) - atten_mask = torch.triu(torch.rand((max_promp, max_promp)), 1) - atten_mask[atten_mask != 0] = -255 return ( tokens, atten_mask, @@ -438,6 +455,7 @@ def get_example_inputs(self, use_kv_cache=True): def get_metadata(self): # TODO: modify this when enabling LLAMA 7B return { + "get_ar_len": self.ar_len, "get_bos_id": 1, "get_eos_id": 2, "get_dim": self.dim, diff --git 
a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
index 1bc90a11f9d..0a1635223e6 100644
--- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
+++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -48,11 +48,11 @@ DEFINE_int32(
 DEFINE_int32(
     eval_mode,
     1,
-    "0: PromptProcessor(prefill) / 1: TokenGenerator(kv) / 2: HybridMode (prefill+kv)");
+    "0: TokenGenerator(kv) / 1: HybridMode (prefill+kv)");
 DEFINE_double(logits_scale, 0.0, "Logits scale");
 DEFINE_int32(logits_offset, 0, "Logits offset");
 DEFINE_string(
-    kv_updator,
+    kv_updater,
     "How to update kv cache. Choose between SmartMask and ShiftPointer",
     "SmartMask");
@@ -67,7 +67,7 @@ int main(int argc, char** argv) {
       FLAGS_logits_offset,
       FLAGS_temperature,
       FLAGS_eval_mode,
-      FLAGS_kv_updator);
+      FLAGS_kv_updater);
   std::vector<char> buf;
   buf.reserve(5 * FLAGS_seq_len); // assume each token is around 5 char
   std::ofstream fout(FLAGS_output_path.c_str());
diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
index badaea0ca73..1c83637f8d7 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
@@ -54,7 +54,10 @@ std::vector<Tensor> IoMgrBase::get_output_tensors(
 ShiftPointerIoMgr::ShiftPointerIoMgr(
     std::vector<std::shared_ptr<Module>>& modules,
+    int32_t context_len,
+    int32_t prefill_ar_len,
     int32_t prefill_cache_len,
+    int32_t kv_ar_len,
     int32_t kv_cache_len,
     int32_t vocab_size,
     int32_t num_layers,
@@ -66,7 +69,10 @@ ShiftPointerIoMgr::ShiftPointerIoMgr(
     const bool use_int64_token)
     : IoMgrBase(modules),
       shard_layers_({num_layers}),
+      context_len_(context_len),
+      kv_ar_len_(kv_ar_len),
       kv_cache_len_(kv_cache_len),
+      prefill_ar_len_(prefill_ar_len),
       prefill_cache_len_(prefill_cache_len),
       vocab_size_(vocab_size),
       num_layers_(num_layers),
@@ -75,7 +81,8 @@ ShiftPointerIoMgr::ShiftPointerIoMgr(
       eval_mode_(eval_mode),
       prefill_forward_name_(prefill_forward_name),
       kv_forward_name_(kv_forward_name),
-      use_int64_token_(use_int64_token) {
+      use_int64_token_(use_int64_token),
+      is_bert_(prefill_cache_len_ == 0) {
   if (!prefill_forward_name_.empty()) {
     input_tensors_[prefill_forward_name_] =
         std::vector<std::vector<TensorImpl*>>(modules.size());
@@ -113,15 +120,14 @@ void ShiftPointerIoMgr::init_io() {
   IO* ptr = static_cast<IO*>(data_ptr_.get());
   std::memset(ptr, 0, sizeof(IO));
 
-  int32_t max_cache_len = std::max(kv_cache_len_, prefill_cache_len_);
-  int32_t k_in_size = (head_dim_ + 1) * max_cache_len;
-  int32_t v_cache_size = (num_heads_ + 1) * max_cache_len * head_dim_;
-  int32_t k_cache_out_size = num_heads_ * head_dim_;
-  if (eval_mode_ == EvalMode::kHybrid || eval_mode_ == EvalMode::kPrefill) {
-    k_cache_out_size *= prefill_cache_len_;
-  }
+  int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_);
+  int32_t k_in_size = (head_dim_ + 1) * kv_cache_len_;
+  // Use context length to prevent exceeding the range when the AR-N model
+  // updates the last block in hybrid mode.
+  int32_t v_cache_size = (num_heads_ + 1) * context_len_ * head_dim_;
+  int32_t k_cache_out_size = num_heads_ * max_ar_len * head_dim_;
 
-  // Init kv vector shape, general enough to be shared across all 3 modes.
+  // Init kv vector shape, general enough to be shared across all modes.
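+  // (Editorial note, inferred, not in the original patch) The "+ 1" terms
+  // appear to reserve one spare row/head of headroom so shift-pointer
+  // updates can slide the cache window without stepping past the buffer,
+  // and k_cache_out_size uses max_ar_len so one output buffer serves both
+  // the AR-N prompt processor and the single-token KV generator.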
ptr->k_cache_out.reserve(num_layers_); ptr->v_cache.reserve(num_layers_); for (int layer = 0; layer < num_layers_; layer++) { @@ -130,14 +136,15 @@ void ShiftPointerIoMgr::init_io() { } auto init_prefill = [&]() { - ptr->prefill_input_toks.resize(prefill_cache_len_); - ptr->prefill_atten_mask.resize(prefill_cache_len_ * prefill_cache_len_); - ptr->prefill_logits.resize(prefill_cache_len_ * vocab_size_); + ptr->prefill_input_toks.resize(prefill_ar_len_, 0); + ptr->prefill_input_pos.resize(prefill_ar_len_, 0); + ptr->prefill_attention_mask.resize((prefill_ar_len_ * context_len_), 0); + ptr->prefill_logits.resize(prefill_ar_len_ * vocab_size_); }; auto init_kv = [&]() { - ptr->kv_logits.resize(vocab_size_); - ptr->kv_attention_mask.resize((kv_cache_len_ + 1), 0); + ptr->kv_logits.resize(kv_ar_len_ * vocab_size_); + ptr->kv_attention_mask.resize((kv_ar_len_ * context_len_), 0); ptr->k_cache.reserve(num_layers_); for (int layer = 0; layer < num_layers_; layer++) { ptr->k_cache.emplace_back(); @@ -149,9 +156,6 @@ void ShiftPointerIoMgr::init_io() { }; switch (eval_mode_) { - case EvalMode::kPrefill: - init_prefill(); - break; case EvalMode::kKVCached: init_kv(); break; @@ -177,37 +181,38 @@ void ShiftPointerIoMgr::prepare_kv_io( IO* ptr = static_cast(data_ptr_.get()); // [I]: input_tokens - Result input_tok = methods_meta[0]->input_tensor_meta(0); - input_tok_ = std::make_unique( - input_tok->scalar_type(), - input_tok->sizes().size(), - const_cast(input_tok->sizes().data()), - &ptr->input_tok, - const_cast(input_tok->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); + Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); + kv_input_toks_ = std::make_unique( + kv_input_toks->scalar_type(), + kv_input_toks->sizes().size(), + const_cast(kv_input_toks->sizes().data()), + &ptr->kv_input_toks, + const_cast(kv_input_toks->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); // [I]: atten_mask - Result atten_mask = methods_meta[0]->input_tensor_meta(1); - attention_mask_ = std::make_unique( - atten_mask->scalar_type(), - atten_mask->sizes().size(), - const_cast(atten_mask->sizes().data()), + Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); + kv_attention_mask_ = std::make_unique( + kv_attention_mask->scalar_type(), + kv_attention_mask->sizes().size(), + const_cast(kv_attention_mask->sizes().data()), ptr->kv_attention_mask.data(), - const_cast(atten_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); + const_cast( + kv_attention_mask->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); // [I]: input_pos - Result input_pos = methods_meta[0]->input_tensor_meta(2); - input_pos_ = std::make_unique( - input_pos->scalar_type(), - input_pos->sizes().size(), - const_cast(input_pos->sizes().data()), - &ptr->input_pos, - const_cast(input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); + Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); + kv_input_pos_ = std::make_unique( + kv_input_pos->scalar_type(), + kv_input_pos->sizes().size(), + const_cast(kv_input_pos->sizes().data()), + &ptr->kv_input_pos, + const_cast(kv_input_pos->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); // [I] kv_cache - int index = 3; // bypass input_tokens, input_pos, atten_mask + int index = 3; // bypass input_tokens, atten_mask, input_pos for (int 
offset = 0, shard_index = 0, v_stride = kv_cache_len_ * head_dim_; shard_index < modules_.size(); offset += shard_layers_[shard_index], shard_index++) { @@ -304,7 +309,7 @@ void ShiftPointerIoMgr::prepare_prefill_io( IO* ptr = static_cast(data_ptr_.get()); - // [I]: pre_input_tokens + // [I]: prefill_input_tokens Result prefill_input_toks = methods_meta[0]->input_tensor_meta(0); prefill_input_toks_ = std::make_unique( prefill_input_toks->scalar_type(), @@ -314,25 +319,81 @@ void ShiftPointerIoMgr::prepare_prefill_io( const_cast( prefill_input_toks->dim_order().data())); input_tensors_[prefill_forward_name_][0].push_back(prefill_input_toks_.get()); - // [I]: prefill_attn_mask - for (int i = 0; i < prefill_cache_len_; ++i) { - for (int j = 0; j < prefill_cache_len_; ++j) { - if (i < j) { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 0; - } else { - ptr->prefill_atten_mask[i * prefill_cache_len_ + j] = 65535; + // [I]: prefill_attention_mask + for (int i = 0; i < prefill_ar_len_; ++i) { + for (int j = 0, + offset = i * context_len_ + (context_len_ - prefill_ar_len_); + j < prefill_ar_len_; + ++j) { + if (i >= j) { + ptr->prefill_attention_mask[j + offset] = 65535; } } } - Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); - prefill_attn_mask_ = std::make_unique( - prefill_atten_mask->scalar_type(), - prefill_atten_mask->sizes().size(), - const_cast(prefill_atten_mask->sizes().data()), - ptr->prefill_atten_mask.data(), + Result prefill_attention_mask = + methods_meta[0]->input_tensor_meta(1); + prefill_attention_mask_ = std::make_unique( + prefill_attention_mask->scalar_type(), + prefill_attention_mask->sizes().size(), + const_cast( + prefill_attention_mask->sizes().data()), + ptr->prefill_attention_mask.data(), const_cast( - prefill_atten_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); + prefill_attention_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_attention_mask_.get()); + + if (!is_bert_) { + // [I]: prefill_input_pos + Result prefill_input_pos = + methods_meta[0]->input_tensor_meta(2); + prefill_input_pos_ = std::make_unique( + prefill_input_pos->scalar_type(), + prefill_input_pos->sizes().size(), + const_cast(prefill_input_pos->sizes().data()), + ptr->prefill_input_pos.data(), + const_cast( + prefill_input_pos->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_input_pos_.get()); + + // [I] kv_cache + int index = 3; // bypass input_tokens, atten_mask, input_pos + // Add prefill offset to align the v_out pointer with the decode model. + for (int offset = 0, + shard_index = 0, + v_stride = kv_cache_len_ * head_dim_, + prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; + shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_[prefill_forward_name_] + : v_cache_in_[prefill_forward_name_]); + void* cache_ptr = (cache_group == 0) + ? 
static_cast(ptr->k_cache[layer + offset][head].data()) + : static_cast( + ptr->v_cache[layer + offset].data() + head * v_stride + + prefill_offset); + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + input_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } + } // [O]: logits int logit_index = 0; Result logits = @@ -348,18 +409,11 @@ void ShiftPointerIoMgr::prepare_prefill_io( // [O] kv_cache int index = 1; - // prefill_k_stride should be equal to prefill_v_stride in prefill mode. // In hybrid mode, we use kv mode cache len for v stride since we want to // update prefill's result onto kv modes input. - int32_t prefill_k_stride = prefill_cache_len_ * head_dim_; - int32_t prefill_v_stride = - std::max(prefill_cache_len_, kv_cache_len_) * head_dim_; + int32_t prefill_k_stride = prefill_ar_len_ * head_dim_; + int32_t prefill_v_stride = kv_cache_len_ * head_dim_; - if (eval_mode_ == EvalMode::kPrefill) { - ET_CHECK_MSG( - prefill_k_stride == prefill_v_stride, - "prefill_k_stride should be equal to prefill_v_stride"); - } for (int offset = 0, shard_index = 0; shard_index < modules_.size(); offset += shard_layers_[shard_index], shard_index++) { for (int cache_group = 0; cache_group < 2; ++cache_group) { @@ -397,13 +451,11 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( int64_t pos, std::vector>& output_tensors) { ET_CHECK_MSG(kv_cache_len_ != 0, "k_cache_len_ should not equal to 0"); - ET_CHECK_MSG( - prefill_cache_len_ != 0, "prefill_cache_len_ should not equal to 0"); IO* ptr = static_cast(data_ptr_.get()); - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? cur_token : static_cast(cur_token); - ptr->input_pos = static_cast(pos); + ptr->kv_input_pos = static_cast(pos); // If prompt len is 30, prefill will handle to pos = 30. // At this point, pos should be 31. for (int i = 0; i < pos + 1; i++) { @@ -435,17 +487,29 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( } } + // Update k_cache std::vector>& k_cache_in = k_cache_in_[kv_forward_name_]; std::vector>& k_cache_out = k_cache_out_[prefill_forward_name_]; + // copy from last to prevent from overwriting values + size_t copied_size = pos * sizeof(uint8_t); for (int i = 0; i < k_cache_in.size(); ++i) { uint8_t* ptr_in = k_cache_in[i]->mutable_data(); - const uint8_t* ptr_out = k_cache_out[i]->data(); - for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; - ++j, offset += kv_cache_len_) { - for (int k = 0, k_stride = j * prefill_cache_len_; k < pos; k++) { - ptr_in[offset + k] = ptr_out[k_stride + k]; + if (is_bert_) { + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = kv_cache_len_; j < head_dim_; + ++j, offset += kv_cache_len_) { + for (int k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + } else { + for (int j = head_dim_; j > -1; --j) { + memcpy( + ptr_in + j * kv_cache_len_, + ptr_in + j * prefill_cache_len_, + copied_size); } } k_cache_in[i]->set_data(ptr_in + pos); @@ -458,10 +522,10 @@ void ShiftPointerIoMgr::update_kv_io( std::vector>& output_tensors) { IO* ptr = static_cast(data_ptr_.get()); // update input_tok - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? 
cur_token : static_cast(cur_token); // update position_ids - ptr->input_pos = static_cast(pos); + ptr->kv_input_pos = static_cast(pos); // update causal mask for next token ptr->kv_attention_mask[kv_cache_len_ - pos] = 65535; @@ -505,47 +569,102 @@ void ShiftPointerIoMgr::update_prefill_io( int64_t cur_token, int64_t pos, std::vector>& output_tensors) { + (void)cur_token; (void)output_tensors; IO* ptr = static_cast(data_ptr_.get()); - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. - if (use_int64_token_) { - ptr->prefill_input_toks[pos] = cur_token; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[pos] = static_cast(cur_token); + + if (!is_bert_) { + // update v_cache + auto& v_cache_in = v_cache_in_[prefill_forward_name_]; + auto& v_cache_out = v_cache_out_[prefill_forward_name_]; + for (int i = 0; i < v_cache_in.size(); i++) { + v_cache_in[i]->set_data( + v_cache_in[i]->mutable_data() + prefill_ar_len_ * head_dim_); + v_cache_out[i]->set_data( + v_cache_out[i]->mutable_data() + + prefill_ar_len_ * head_dim_); + } + + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + prefill_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + auto& k_cache_in = k_cache_in_[prefill_forward_name_]; + auto& k_cache_out = k_cache_out_[prefill_forward_name_]; + // update k_cache by single thread, this part is cpu cache sensitive + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = prefill_cache_len_; j < head_dim_; + ++j, offset += prefill_cache_len_) { + for (int k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; + k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + k_cache_in[i]->set_data(ptr_in + prefill_ar_len_); + } } } void ShiftPointerIoMgr::fill_prefill_toks( + int64_t start_pos, std::vector& prompt_tokens) { IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prompt_tokens.size(); i++) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. - if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks.data()); - prefill_input_toks_ptr[i] = static_cast(prompt_tokens[i]); + for (int i = 0; i < prefill_ar_len_; i++) { + if (!is_bert_) { + ptr->prefill_input_pos[i] = start_pos + i; + } + + if (start_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. 
+ if (use_int64_token_) { + ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; + } else { + int32_t* prefill_input_toks_ptr = + reinterpret_cast(ptr->prefill_input_toks.data()); + prefill_input_toks_ptr[i] = + static_cast(prompt_tokens[start_pos + i]); + } + } + if (start_pos >= prefill_ar_len_) { + for (int j = 0, + offset = i * context_len_ + + (context_len_ - prefill_ar_len_ - start_pos); + j < prefill_ar_len_; + ++j) { + ptr->prefill_attention_mask[offset + j] = 65535; + } } } } void ShiftPointerIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { IO* ptr = static_cast(get_mutable_ptr()); - ptr->input_tok = + ptr->kv_input_toks = use_int64_token_ ? cur_token : static_cast(cur_token); + ptr->kv_input_pos = static_cast(pos); + ; ptr->kv_attention_mask[kv_cache_len_] = 65535; } SmartMaskIoMgr::SmartMaskIoMgr( std::vector>& modules, + int32_t context_len, + int32_t prefill_ar_len, int32_t prefill_cache_len, + int32_t kv_ar_len, int32_t kv_cache_len, int32_t vocab_size, int32_t num_layers, @@ -557,7 +676,10 @@ SmartMaskIoMgr::SmartMaskIoMgr( const bool use_int64_token) : IoMgrBase(modules), shard_layers_({num_layers}), + context_len_(context_len), + kv_ar_len_(kv_ar_len), kv_cache_len_(kv_cache_len), + prefill_ar_len_(prefill_ar_len), prefill_cache_len_(prefill_cache_len), vocab_size_(vocab_size), num_layers_(num_layers), @@ -566,12 +688,17 @@ SmartMaskIoMgr::SmartMaskIoMgr( eval_mode_(eval_mode), prefill_forward_name_(prefill_forward_name), kv_forward_name_(kv_forward_name), - use_int64_token_(use_int64_token) { + use_int64_token_(use_int64_token), + is_bert_(prefill_cache_len == 0) { if (!prefill_forward_name_.empty()) { input_tensors_[prefill_forward_name_] = std::vector>(modules.size()); output_tensors_[prefill_forward_name_] = std::vector>(modules.size()); + k_cache_in_[prefill_forward_name_] = + std::vector>(); + v_cache_in_[prefill_forward_name_] = + std::vector>(); k_cache_out_[prefill_forward_name_] = std::vector>(); v_cache_out_[prefill_forward_name_] = @@ -597,20 +724,20 @@ SmartMaskIoMgr::SmartMaskIoMgr( } std::unordered_map SmartMaskIoMgr::get_io_elements() { - size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); - size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * cache_len; - size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_; + int32_t max_ar_len = std::max(kv_ar_len_, prefill_ar_len_); + size_t cache_in_ele = num_layers_ * num_heads_ * head_dim_ * kv_cache_len_; + size_t cache_out_ele = num_layers_ * num_heads_ * head_dim_ * max_ar_len; return std::unordered_map{ - {"input_tok_ele", 1}, - {"input_pos_ele", 1}, + {"kv_input_toks_ele", kv_ar_len_}, + {"kv_input_pos_ele", kv_ar_len_}, {"cache_in_ele", cache_in_ele}, {"cache_out_ele", cache_out_ele}, - // 1 for the input prompt - {"atten_mask_ele", cache_len + 1}, - {"kv_logits_ele", vocab_size_}, - {"prefill_input_toks_ele", prefill_cache_len_}, - {"prefill_atten_mask_ele", prefill_cache_len_ * prefill_cache_len_}, - {"prefill_logits_ele", prefill_cache_len_ * vocab_size_}}; + {"kv_attention_mask_ele", kv_ar_len_ * context_len_}, + {"kv_logits_ele", kv_ar_len_ * vocab_size_}, + {"prefill_input_toks_ele", prefill_ar_len_}, + {"prefill_input_pos_ele", prefill_ar_len_}, + {"prefill_attention_mask_ele", prefill_ar_len_ * context_len_}, + {"prefill_logits_ele", prefill_ar_len_ * vocab_size_}}; } std::unordered_map SmartMaskIoMgr::get_io_bytes() { @@ -623,21 +750,23 @@ std::unordered_map SmartMaskIoMgr::get_io_bytes() { byte % static_cast(alignment)); }; return std::unordered_map{ - 
{"input_tok_bytes", - align(element_map["input_tok_ele"] * sizeof(int32_t))}, - {"input_pos_bytes", - align(element_map["input_pos_ele"] * sizeof(int32_t))}, + {"kv_input_toks_bytes", + align(element_map["kv_input_toks_ele"] * sizeof(int32_t))}, + {"kv_input_pos_bytes", + align(element_map["kv_input_pos_ele"] * sizeof(int32_t))}, {"cache_in_bytes", align(element_map["cache_in_ele"] * sizeof(uint8_t))}, {"cache_out_bytes", align(element_map["cache_out_ele"] * sizeof(uint8_t))}, - {"atten_mask_bytes", - align(element_map["atten_mask_ele"] * sizeof(uint16_t))}, + {"kv_attention_mask_bytes", + align(element_map["kv_attention_mask_ele"] * sizeof(uint16_t))}, {"kv_logits_bytes", align(element_map["kv_logits_ele"] * sizeof(uint16_t))}, {"prefill_input_toks_bytes", align(element_map["prefill_input_toks_ele"] * sizeof(int32_t))}, - {"prefill_atten_mask_bytes", - align(element_map["prefill_atten_mask_ele"] * sizeof(uint16_t))}, + {"prefill_input_pos_bytes", + align(element_map["prefill_input_pos_ele"] * sizeof(int32_t))}, + {"prefill_attention_mask_bytes", + align(element_map["prefill_attention_mask_ele"] * sizeof(uint16_t))}, {"prefill_logits_bytes", align(element_map["prefill_logits_ele"] * sizeof(uint16_t))}}; } @@ -654,10 +783,10 @@ void SmartMaskIoMgr::IO::init_io_ptrs( for (const auto& iter : io_bytes_map) { std::string key = iter.first; size_t size = iter.second; - if (key == "input_tok_bytes") { - input_tok = reinterpret_cast(cur_ptr); - } else if (key == "input_pos_bytes") { - input_pos = reinterpret_cast(cur_ptr); + if (key == "kv_input_toks_bytes") { + kv_input_toks = reinterpret_cast(cur_ptr); + } else if (key == "kv_input_pos_bytes") { + kv_input_pos = reinterpret_cast(cur_ptr); } else if (key == "cache_in_bytes" || key == "cache_out_bytes") { auto& k_cache_ref = (key == "cache_in_bytes") ? k_cache : k_cache_out; auto& v_cache_ref = (key == "cache_in_bytes") ? 
v_cache : v_cache_out; @@ -679,14 +808,16 @@ void SmartMaskIoMgr::IO::init_io_ptrs( } } continue; - } else if (key == "atten_mask_bytes") { + } else if (key == "kv_attention_mask_bytes") { kv_attention_mask = reinterpret_cast(cur_ptr); } else if (key == "kv_logits_bytes") { kv_logits = reinterpret_cast(cur_ptr); } else if (key == "prefill_input_toks_bytes") { prefill_input_toks = reinterpret_cast(cur_ptr); - } else if (key == "prefill_atten_mask_bytes") { - prefill_atten_mask = reinterpret_cast(cur_ptr); + } else if (key == "prefill_input_pos_bytes") { + prefill_input_pos = reinterpret_cast(cur_ptr); + } else if (key == "prefill_attention_mask_bytes") { + prefill_attention_mask = reinterpret_cast(cur_ptr); } else if (key == "prefill_logits_bytes") { prefill_logits = reinterpret_cast(cur_ptr); } else { @@ -720,15 +851,10 @@ void SmartMaskIoMgr::init_io() { std::unordered_map io_bytes_map = get_io_bytes(); switch (eval_mode_) { - case EvalMode::kPrefill: - io_bytes_map.erase("input_tok_bytes"); - io_bytes_map.erase("input_pos_bytes"); - io_bytes_map.erase("atten_mask_bytes"); - io_bytes_map.erase("kv_logits_bytes"); - break; case EvalMode::kKVCached: io_bytes_map.erase("prefill_input_toks_bytes"); - io_bytes_map.erase("prefill_atten_mask_bytes"); + io_bytes_map.erase("prefill_input_pos_bytes"); + io_bytes_map.erase("prefill_attention_mask_bytes"); io_bytes_map.erase("prefill_logits_bytes"); break; case EvalMode::kHybrid: @@ -774,53 +900,55 @@ void SmartMaskIoMgr::prepare_kv_io( std::unordered_map io_bytes_map = get_io_bytes(); // [I]: input_tokens - Result input_tok = methods_meta[0]->input_tensor_meta(0); - input_tok_ = std::make_unique( - input_tok->scalar_type(), - input_tok->sizes().size(), - const_cast(input_tok->sizes().data()), - ptr->input_tok, - const_cast(input_tok->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_tok_.get()); + Result kv_input_toks = methods_meta[0]->input_tensor_meta(0); + kv_input_toks_ = std::make_unique( + kv_input_toks->scalar_type(), + kv_input_toks->sizes().size(), + const_cast(kv_input_toks->sizes().data()), + ptr->kv_input_toks, + const_cast(kv_input_toks->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_toks_.get()); ptr->add_custom_mem_info( - ptr->input_tok, - io_bytes_map["input_tok_bytes"], - input_tok->scalar_type(), - input_tok.get()); + ptr->kv_input_toks, + io_bytes_map["kv_input_toks_bytes"], + kv_input_toks->scalar_type(), + kv_input_toks.get()); // [I]: atten_mask - Result atten_mask = methods_meta[0]->input_tensor_meta(1); - attention_mask_ = std::make_unique( - atten_mask->scalar_type(), - atten_mask->sizes().size(), - const_cast(atten_mask->sizes().data()), + std::fill_n(ptr->kv_attention_mask, kv_ar_len_ * context_len_, 0); + Result kv_attention_mask = methods_meta[0]->input_tensor_meta(1); + kv_attention_mask_ = std::make_unique( + kv_attention_mask->scalar_type(), + kv_attention_mask->sizes().size(), + const_cast(kv_attention_mask->sizes().data()), ptr->kv_attention_mask, - const_cast(atten_mask->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(attention_mask_.get()); + const_cast( + kv_attention_mask->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_attention_mask_.get()); ptr->add_custom_mem_info( ptr->kv_attention_mask, - io_bytes_map["atten_mask_bytes"], - atten_mask->scalar_type(), - atten_mask.get()); + io_bytes_map["kv_attention_mask_bytes"], + kv_attention_mask->scalar_type(), + kv_attention_mask.get()); // [I]: input_pos - 
Result input_pos = methods_meta[0]->input_tensor_meta(2); - input_pos_ = std::make_unique( - input_pos->scalar_type(), - input_pos->sizes().size(), - const_cast(input_pos->sizes().data()), - ptr->input_pos, - const_cast(input_pos->dim_order().data())); - input_tensors_[kv_forward_name_][0].push_back(input_pos_.get()); + Result kv_input_pos = methods_meta[0]->input_tensor_meta(2); + kv_input_pos_ = std::make_unique( + kv_input_pos->scalar_type(), + kv_input_pos->sizes().size(), + const_cast(kv_input_pos->sizes().data()), + ptr->kv_input_pos, + const_cast(kv_input_pos->dim_order().data())); + input_tensors_[kv_forward_name_][0].push_back(kv_input_pos_.get()); ptr->add_custom_mem_info( - ptr->input_pos, - io_bytes_map["input_pos_bytes"], - input_pos->scalar_type(), - input_pos.get()); + ptr->kv_input_pos, + io_bytes_map["kv_input_pos_bytes"], + kv_input_pos->scalar_type(), + kv_input_pos.get()); // [I] kv_cache size_t layered_head_count = num_layers_ * num_heads_; - int index = 3; // bypass input_tokens, input_pos, atten_mask + int index = 3; // bypass input_tokens, atten_mask, input_pos for (int offset = 0, shard_index = 0; shard_index < modules_.size(); offset += shard_layers_[shard_index], shard_index++) { for (int cache_group = 0; cache_group < 2; ++cache_group) { @@ -915,10 +1043,10 @@ void SmartMaskIoMgr::update_kv_io( IO* ptr = static_cast(data_ptr_.get()); size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); // update input_tok - *ptr->input_tok = + *ptr->kv_input_toks = use_int64_token_ ? cur_token : static_cast(cur_token); // update position_ids - *ptr->input_pos = static_cast(pos); + *ptr->kv_input_pos = static_cast(pos); // update smart mask for previous cache ptr->kv_attention_mask[pos] = 65535; @@ -975,30 +1103,92 @@ void SmartMaskIoMgr::prepare_prefill_io( executorch::aten::ScalarType::Int, prefill_input_toks.get()); - // [I]: prefill_attn_mask - for (int i = 0; i < cache_len; ++i) { - for (int j = 0; j < cache_len; ++j) { + // [I]: prefill_attention_mask + for (int i = 0; i < prefill_ar_len_; ++i) { + for (int j = 0, + offset = i * context_len_ + (context_len_ - prefill_ar_len_); + j < prefill_ar_len_; + ++j) { if (i < j) { - ptr->prefill_atten_mask[i * cache_len + j] = 0; + ptr->prefill_attention_mask[j + offset] = 0; } else { - ptr->prefill_atten_mask[i * cache_len + j] = 65535; + ptr->prefill_attention_mask[j + offset] = 65535; } } } - Result prefill_atten_mask = methods_meta[0]->input_tensor_meta(1); - prefill_attn_mask_ = std::make_unique( - prefill_atten_mask->scalar_type(), - prefill_atten_mask->sizes().size(), - const_cast(prefill_atten_mask->sizes().data()), - ptr->prefill_atten_mask, + Result prefill_attention_mask = + methods_meta[0]->input_tensor_meta(1); + prefill_attention_mask_ = std::make_unique( + prefill_attention_mask->scalar_type(), + prefill_attention_mask->sizes().size(), + const_cast( + prefill_attention_mask->sizes().data()), + ptr->prefill_attention_mask, const_cast( - prefill_atten_mask->dim_order().data())); - input_tensors_[prefill_forward_name_][0].push_back(prefill_attn_mask_.get()); + prefill_attention_mask->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_attention_mask_.get()); ptr->add_custom_mem_info( - ptr->prefill_atten_mask, - io_bytes_map["prefill_atten_mask_bytes"], + ptr->prefill_attention_mask, + io_bytes_map["prefill_attention_mask_bytes"], executorch::aten::ScalarType::Bits16, - prefill_atten_mask.get()); + prefill_attention_mask.get()); + + if (!is_bert_) { + // [I]: prefill_input_pos 
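+    // (Editorial note, not in the original patch) A BERT-style prefill
+    // (prefill_cache_len_ == 0) attends only within its own block, so it
+    // takes no position ids or KV-cache inputs; these tensors are wired up
+    // only when the prompt processor also reads cached earlier blocks.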
+ Result prefill_input_pos = + methods_meta[0]->input_tensor_meta(2); + prefill_input_pos_ = std::make_unique( + prefill_input_pos->scalar_type(), + prefill_input_pos->sizes().size(), + const_cast(prefill_input_pos->sizes().data()), + ptr->prefill_input_pos, + const_cast( + prefill_input_pos->dim_order().data())); + input_tensors_[prefill_forward_name_][0].push_back( + prefill_input_pos_.get()); + ptr->add_custom_mem_info( + ptr->prefill_input_pos, + io_bytes_map["prefill_input_pos_bytes"], + prefill_input_pos->scalar_type(), + prefill_input_pos.get()); + + // [I] kv_cache + size_t layered_head_count = num_layers_ * num_heads_; + int index = 3; // bypass input_tokens, atten_mask, input_pos + for (int offset = 0, shard_index = 0; shard_index < modules_.size(); + offset += shard_layers_[shard_index], shard_index++) { + for (int cache_group = 0; cache_group < 2; ++cache_group) { + for (int layer = 0; layer < shard_layers_[shard_index]; ++layer) { + for (int head = 0; head < num_heads_; ++head, ++index) { + Result kv_cache = + methods_meta[shard_index]->input_tensor_meta(index); + std::vector>& cache = + (cache_group == 0 ? k_cache_in_[prefill_forward_name_] + : v_cache_in_[prefill_forward_name_]); + uint8_t* cache_ptr = (cache_group == 0) + ? ptr->k_cache[layer + offset][head] + : ptr->v_cache[layer + offset][head]; + + cache.emplace_back(std::make_unique( + kv_cache->scalar_type(), + kv_cache->sizes().size(), + const_cast(kv_cache->sizes().data()), + cache_ptr, + const_cast( + kv_cache->dim_order().data()))); + ptr->add_custom_mem_info( + cache_ptr, + io_bytes_map["cache_in_bytes"] / layered_head_count, + kv_cache->scalar_type(), + kv_cache.get()); + input_tensors_[prefill_forward_name_][shard_index].push_back( + cache.back().get()); + } + } + } + } + } // [O]: logits int logit_index = 0; @@ -1031,8 +1221,8 @@ void SmartMaskIoMgr::prepare_prefill_io( (cache_group == 0 ? k_cache_out_[prefill_forward_name_] : v_cache_out_[prefill_forward_name_]); void* cache_ptr = (cache_group == 0) - ? ptr->k_cache[layer + offset][head] - : ptr->v_cache[layer + offset][head]; + ? ptr->k_cache_out[layer + offset][head] + : ptr->v_cache_out[layer + offset][head]; cache.emplace_back(std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), @@ -1042,7 +1232,7 @@ void SmartMaskIoMgr::prepare_prefill_io( kv_cache->dim_order().data()))); ptr->add_custom_mem_info( cache_ptr, - io_bytes_map["cache_in_bytes"] / layered_head_count, + io_bytes_map["cache_out_bytes"] / layered_head_count, executorch::aten::ScalarType::Byte, kv_cache.get()); output_tensors_[prefill_forward_name_][shard_index].push_back( @@ -1059,24 +1249,50 @@ void SmartMaskIoMgr::update_prefill_to_kv_io( std::vector>& output_tensors) { IO* ptr = static_cast(data_ptr_.get()); - *ptr->input_tok = + *ptr->kv_input_toks = use_int64_token_ ? 
cur_token : static_cast(cur_token); - *ptr->input_pos = static_cast(pos); + *ptr->kv_input_pos = static_cast(pos); // pos means the cur_token pos for (int i = 0; i < pos; i++) { ptr->kv_attention_mask[i] = 65535; } - // Update K is enough, copy from last to prevent from overwriting values - size_t copied_size = prefill_cache_len_ * sizeof(uint8_t); - for (int l = 0; l < num_layers_; l++) { - for (int h = 0; h < num_heads_; h++) { - uint8_t* k_cache = ptr->k_cache[l][h]; - for (int hd = head_dim_ - 1; hd > -1; hd--) { - memcpy( - k_cache + (kv_cache_len_ * hd), - k_cache + (prefill_cache_len_ * hd), - copied_size); + if (is_bert_) { + // update v_cache + auto& v_cache_in = v_cache_in_[kv_forward_name_]; + auto& v_cache_out = v_cache_out_[prefill_forward_name_]; + // update v_cache by single thread, this part is cpu cache sensitive + size_t copied_size = kv_cache_len_ * head_dim_ * sizeof(uint8_t); + for (int i = 0; i < v_cache_in.size(); ++i) { + uint8_t* ptr_in = v_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = v_cache_out[i]->data(); + memcpy(ptr_in, ptr_out, copied_size); + } + + auto& k_cache_in = k_cache_in_[kv_forward_name_]; + auto& k_cache_out = k_cache_out_[prefill_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = 0; j < head_dim_; + ++j, offset += kv_cache_len_) { + for (size_t k = 0, k_stride = j * prefill_ar_len_; k < pos; k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + } + } else { + // Update K is enough, copy from last to prevent from overwriting values + size_t copied_size = pos * sizeof(uint8_t); + for (int l = 0; l < num_layers_; l++) { + for (int h = 0; h < num_heads_; h++) { + uint8_t* k_cache = ptr->k_cache[l][h]; + for (int hd = head_dim_ - 1; hd > -1; hd--) { + memcpy( + k_cache + (kv_cache_len_ * hd), + k_cache + (prefill_cache_len_ * hd), + copied_size); + } } } } @@ -1088,37 +1304,71 @@ void SmartMaskIoMgr::update_prefill_io( std::vector>& output_tensors) { (void)output_tensors; IO* ptr = static_cast(data_ptr_.get()); - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. 
- if (use_int64_token_) { - ptr->prefill_input_toks[pos] = cur_token; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks); - prefill_input_toks_ptr[pos] = static_cast(cur_token); + + if (!is_bert_) { + // update v_cache + auto& v_cache_in = v_cache_in_[prefill_forward_name_]; + auto& v_cache_out = v_cache_out_[prefill_forward_name_]; + // update v_cache by single thread, this part is cpu cache sensitive + size_t copied_size = prefill_ar_len_ * head_dim_ * sizeof(uint8_t); + for (int i = 0; i < v_cache_in.size(); ++i) { + uint8_t* ptr_in = + v_cache_in[i]->mutable_data() + pos * head_dim_; + const uint8_t* ptr_out = v_cache_out[i]->data(); + memcpy(ptr_in, ptr_out, copied_size); + } + + auto& k_cache_in = k_cache_in_[prefill_forward_name_]; + auto& k_cache_out = k_cache_out_[prefill_forward_name_]; + for (int i = 0; i < k_cache_in.size(); ++i) { + uint8_t* ptr_in = k_cache_in[i]->mutable_data(); + const uint8_t* ptr_out = k_cache_out[i]->data(); + for (size_t j = 0, offset = pos; j < head_dim_; + ++j, offset += prefill_cache_len_) { + for (size_t k = 0, k_stride = j * prefill_ar_len_; k < prefill_ar_len_; + k++) { + ptr_in[offset + k] = ptr_out[k_stride + k]; + } + } + } } } -void SmartMaskIoMgr::fill_prefill_toks(std::vector& prompt_tokens) { +void SmartMaskIoMgr::fill_prefill_toks( + int64_t start_pos, + std::vector& prompt_tokens) { IO* ptr = static_cast(get_mutable_ptr()); - for (int i = 0; i < prompt_tokens.size(); i++) { - // Support CPU 4-bit embedding, which requires int64 input. - // However, for QNN embedding, only int32 input is needed. - // Therefore, we need to cast to the correct type to write the data. - if (use_int64_token_) { - ptr->prefill_input_toks[i] = prompt_tokens[i]; - } else { - int32_t* prefill_input_toks_ptr = - reinterpret_cast(ptr->prefill_input_toks); - prefill_input_toks_ptr[i] = static_cast(prompt_tokens[i]); + for (int i = 0; i < prefill_ar_len_; i++) { + if (!is_bert_) { + ptr->prefill_input_pos[i] = start_pos + i; + } + + if (start_pos + i < prompt_tokens.size()) { + // Support CPU 4-bit embedding, which requires int64 input. + // However, for QNN embedding, only int32 input is needed. + // Therefore, we need to cast to the correct type to write the data. + if (use_int64_token_) { + ptr->prefill_input_toks[i] = prompt_tokens[start_pos + i]; + } else { + int32_t* prefill_input_toks_ptr = + reinterpret_cast(ptr->prefill_input_toks); + prefill_input_toks_ptr[i] = + static_cast(prompt_tokens[start_pos + i]); + } + } + if (start_pos >= prefill_ar_len_) { + for (int j = 0, offset = i * context_len_ + (start_pos - prefill_ar_len_); + j < prefill_ar_len_; + ++j) { + ptr->prefill_attention_mask[offset + j] = 65535; + } } } } void SmartMaskIoMgr::fill_kv_tok_mask(int64_t pos, int64_t cur_token) { IO* ptr = static_cast(get_mutable_ptr()); - *ptr->input_tok = + *ptr->kv_input_toks = use_int64_token_ ? 
cur_token : static_cast(cur_token); ptr->kv_attention_mask[kv_cache_len_] = 65535; } diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h index 3a59ab6924e..f1887b99280 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h @@ -23,8 +23,7 @@ namespace example { enum EvalMode { - kPrefill = 0, - kKVCached, + kKVCached = 0, kHybrid, kUnsupported, }; @@ -42,7 +41,9 @@ class IoMgrBase { const std::vector< executorch::runtime::Result>& methods_meta) = 0; - virtual void fill_prefill_toks(std::vector& prompt_tokens) = 0; + virtual void fill_prefill_toks( + int64_t start_pos, + std::vector& prompt_tokens) = 0; virtual void fill_kv_tok_mask(int64_t pos, int64_t cur_token) = 0; virtual void update_prefill_to_kv_io( int64_t cur_token, @@ -81,7 +82,10 @@ class ShiftPointerIoMgr : public IoMgrBase { public: ShiftPointerIoMgr( std::vector>& modules, + int32_t context_len, + int32_t prefill_ar_len, int32_t prefill_cache_len, + int32_t kv_ar_len, int32_t kv_cache_len, int32_t vocab_size, int32_t num_layers, @@ -101,7 +105,9 @@ class ShiftPointerIoMgr : public IoMgrBase { const std::vector< executorch::runtime::Result>& methods_meta) override; - void fill_prefill_toks(std::vector& prompt_tokens) override; + void fill_prefill_toks( + int64_t start_pos, + std::vector& prompt_tokens) override; void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; void update_prefill_to_kv_io( int64_t cur_token, @@ -119,25 +125,26 @@ class ShiftPointerIoMgr : public IoMgrBase { std::vector>& output_tensors) override; struct IO { - int64_t input_tok; - int32_t input_pos; + int64_t kv_input_toks; + int32_t kv_input_pos; std::vector>> k_cache; std::vector> v_cache; std::vector> k_cache_out; std::vector kv_attention_mask; std::vector kv_logits; std::vector prefill_input_toks; - std::vector prefill_atten_mask; + std::vector prefill_input_pos; + std::vector prefill_attention_mask; std::vector prefill_logits; }; private: - std::unique_ptr input_tok_; - std::unique_ptr input_pos_; - std::unique_ptr hidden_state_; - std::unique_ptr attention_mask_; + std::unique_ptr kv_input_toks_; + std::unique_ptr kv_input_pos_; + std::unique_ptr kv_attention_mask_; std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_attn_mask_; + std::unique_ptr prefill_input_pos_; + std::unique_ptr prefill_attention_mask_; std::unique_ptr prefill_logits_; std::unordered_map< std::string, @@ -157,7 +164,10 @@ class ShiftPointerIoMgr : public IoMgrBase { v_cache_out_; std::unique_ptr kv_logits_; std::vector shard_layers_; + int32_t context_len_{0}; + int32_t kv_ar_len_{0}; int32_t kv_cache_len_{0}; + int32_t prefill_ar_len_{0}; int32_t prefill_cache_len_{0}; int32_t vocab_size_; int32_t num_layers_; @@ -167,13 +177,17 @@ class ShiftPointerIoMgr : public IoMgrBase { std::string prefill_forward_name_; std::string kv_forward_name_; const bool use_int64_token_{false}; + const bool is_bert_{false}; }; class SmartMaskIoMgr : public IoMgrBase { public: SmartMaskIoMgr( std::vector>& modules, + int32_t context_len, + int32_t prefill_ar_len, int32_t prefill_cache_len, + int32_t kv_ar_len, int32_t kv_cache_len, int32_t vocab_size, int32_t num_layers, @@ -193,7 +207,9 @@ class SmartMaskIoMgr : public IoMgrBase { const std::vector< executorch::runtime::Result>& methods_meta) override; - void fill_prefill_toks(std::vector& prompt_tokens) override; + void fill_prefill_toks( + int64_t start_pos, + std::vector& 
prompt_tokens) override; void fill_kv_tok_mask(int64_t pos, int64_t cur_token) override; void update_prefill_to_kv_io( int64_t cur_token, @@ -216,22 +232,24 @@ class SmartMaskIoMgr : public IoMgrBase { struct IO { void* shared_buffer_base; - int64_t* input_tok; - int32_t* input_pos; + int64_t* kv_input_toks; + int32_t* kv_input_pos; // layer -> head -> head_dim * seq_len std::vector> k_cache; std::vector> v_cache; // layer -> head -> head_dim std::vector> k_cache_out; std::vector> v_cache_out; - // max_seq_len + // kv_ar_len_ * context_len_ uint16_t* kv_attention_mask; - // vocab_size + // kv_ar_len_ * vocab_size uint16_t* kv_logits; + // prefill_ar_len_ int64_t* prefill_input_toks; - // prefill_cache_len_ ^ 2 - uint16_t* prefill_atten_mask; - // vocab_size * prefill_cache_len_ + int32_t* prefill_input_pos; + // prefill_ar_len_ * context_len_ + uint16_t* prefill_attention_mask; + // vocab_size * prefill_ar_len_ uint16_t* prefill_logits; size_t num_layers_; @@ -252,12 +270,12 @@ class SmartMaskIoMgr : public IoMgrBase { }; private: - std::unique_ptr input_tok_; - std::unique_ptr input_pos_; - std::unique_ptr hidden_state_; - std::unique_ptr attention_mask_; + std::unique_ptr kv_input_toks_; + std::unique_ptr kv_input_pos_; + std::unique_ptr kv_attention_mask_; std::unique_ptr prefill_input_toks_; - std::unique_ptr prefill_attn_mask_; + std::unique_ptr prefill_input_pos_; + std::unique_ptr prefill_attention_mask_; std::unique_ptr prefill_logits_; std::unordered_map< std::string, @@ -277,7 +295,10 @@ class SmartMaskIoMgr : public IoMgrBase { v_cache_out_; std::unique_ptr kv_logits_; std::vector shard_layers_; + int32_t context_len_{0}; + int32_t kv_ar_len_{0}; int32_t kv_cache_len_{0}; + int32_t prefill_ar_len_{0}; int32_t prefill_cache_len_{0}; int32_t vocab_size_; int32_t num_layers_; @@ -287,6 +308,9 @@ class SmartMaskIoMgr : public IoMgrBase { std::string prefill_forward_name_; std::string kv_forward_name_; const bool use_int64_token_{false}; + // If the cache length is zero, it indicates a BERT model, which does not use + // position ids or KV cache inputs. 
+ const bool is_bert_{false}; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 70ba25a0972..da1997a5060 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -45,7 +45,7 @@ Runner::Runner( const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updator) + const std::string& kv_updater) : n_bos_(1), n_eos_(1), tokenizer_path_(tokenizer_path), @@ -53,7 +53,7 @@ Runner::Runner( logits_offset_(logits_offset), temperature_(temperature), eval_mode_(static_cast(eval_mode)), - kv_updator_(kv_updator) { + kv_updater_(kv_updater) { for (size_t i = 0; i < models_path.size(); ++i) { modules_.push_back(std::make_shared( models_path[i], Module::LoadMode::MmapUseMlockIgnoreErrors)); @@ -77,10 +77,6 @@ Error Runner::load() { } switch (eval_mode_) { - case EvalMode::kPrefill: - prefill_forward_name_ = "forward"; - method_names_.emplace_back(prefill_forward_name_); - break; case EvalMode::kKVCached: kv_forward_name_ = "forward"; method_names_.emplace_back(kv_forward_name_); @@ -106,17 +102,22 @@ Error Runner::load() { } if (!prefill_forward_name_.empty()) { - // Use input tokens length to retrieve prefill cache len - // Cache len equals to prefill model seq_len - 1 - prefill_cache_len_ = get_methods_meta(prefill_forward_name_)[0] - ->input_tensor_meta(0) - ->sizes()[1]; + // Use attention mask length to retrieve prefill_ar_len and context length + // Prefill cache length equals to context_len - prefill_ar_len + auto atten_mask_meta = + get_methods_meta(prefill_forward_name_)[0]->input_tensor_meta(1); + prefill_ar_len_ = atten_mask_meta->sizes()[1]; + context_len_ = atten_mask_meta->sizes()[2]; + prefill_cache_len_ = context_len_ - prefill_ar_len_; } if (!kv_forward_name_.empty()) { - // Use k cache length to retirieve kv cache len - // Cache len equals to kv model seq_len - 1 - kv_cache_len_ = - get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(3)->sizes()[2]; + // Use attention mask length to retrieve kv ar len and context length + // Cache len equals to kv model context_len - kv_ar_len + auto atten_mask_meta = + get_methods_meta(kv_forward_name_)[0]->input_tensor_meta(1); + kv_ar_len_ = atten_mask_meta->sizes()[1]; + context_len_ = atten_mask_meta->sizes()[2]; + kv_cache_len_ = context_len_ - kv_ar_len_; } // retrieve any method meta, can be either prefill or kv @@ -130,10 +131,13 @@ Error Runner::load() { executorch::aten::ScalarType::Long; ET_CHECK_MSG(num_layers != -1, "Could not retrieve num layers"); - if (kv_updator_ == "SmartMask") { + if (kv_updater_ == "SmartMask") { io_mgr_ = std::make_unique( modules_, + context_len_, + prefill_ar_len_, prefill_cache_len_, + kv_ar_len_, kv_cache_len_, vocab_size_, num_layers, @@ -143,10 +147,13 @@ Error Runner::load() { prefill_forward_name_, kv_forward_name_, use_int64_token_); - } else if (kv_updator_ == "ShiftPointer") { + } else if (kv_updater_ == "ShiftPointer") { io_mgr_ = std::make_unique( modules_, + context_len_, + prefill_ar_len_, prefill_cache_len_, + kv_ar_len_, kv_cache_len_, vocab_size_, num_layers, @@ -157,16 +164,13 @@ Error Runner::load() { kv_forward_name_, use_int64_token_); } else { - ET_LOG(Error, "Using an unknown updator %s", kv_updator_.c_str()); + ET_LOG(Error, "Using an unknown updater %s", kv_updater_.c_str()); } ET_LOG(Info, "creating io_memory"); // prepare io io_mgr_->init_io(); switch (eval_mode_) { - 
case EvalMode::kPrefill: - io_mgr_->prepare_prefill_io(get_methods_meta(prefill_forward_name_)); - break; case EvalMode::kKVCached: io_mgr_->prepare_kv_io(get_methods_meta(kv_forward_name_)); break; @@ -324,8 +328,7 @@ Error Runner::generate( break; } - int max_seq_len = std::max(prefill_cache_len_, kv_cache_len_) + 1; - seq_len = (seq_len > 0 && seq_len <= max_seq_len) ? seq_len : max_seq_len; + seq_len = (seq_len > 0 && seq_len <= context_len_) ? seq_len : context_len_; Result> encode_res = tokenizer_->encode(prompt_, n_bos_, 0); ET_CHECK_OK_OR_RETURN_ERROR( @@ -333,61 +336,46 @@ Error Runner::generate( std::vector prompt_tokens = encode_res.get(); int num_prompt_tokens = prompt_tokens.size(); - ET_CHECK_MSG(num_prompt_tokens < max_seq_len, "max seq length exceeded"); ET_CHECK_MSG( num_prompt_tokens < seq_len, "sequence length exceeded - please increase the seq_len value"); - if (eval_mode_ == EvalMode::kHybrid) { - int prefill_seq_len = get_methods_meta(prefill_forward_name_)[0] - ->input_tensor_meta(0) - ->sizes()[1] + - 1; - ET_CHECK_MSG( - num_prompt_tokens < prefill_seq_len, - "For hybrid mode, please ensure prompt length(%d) is less than prefill's seq_len(%d)", - num_prompt_tokens, - prefill_seq_len); - } int64_t pos = 0, prev_token, cur_token = prompt_tokens[0]; if (token_callback) { token_callback(prompt_); } auto prefill_execute = [&](const std::string& method_name) { - io_mgr_->fill_prefill_toks(prompt_tokens); + int num_iters = 1 + ((num_prompt_tokens - 1) / prefill_ar_len_); + ET_LOG( + Info, + "Prompt Processor: total %d tokens (AR-%d * %d iters)", + num_prompt_tokens, + prefill_ar_len_, + num_iters); - pos = num_prompt_tokens - 1; - cur_token = prompt_tokens[pos]; - while (pos < seq_len - 1) { - // inference + for (int i = 0; i < num_iters; i++) { + io_mgr_->fill_prefill_toks(pos, prompt_tokens); run_model_step(method_name, inputs[method_name]); - Tensor& logits_tensor = output_tensors[method_name].back()[0]; - prev_token = cur_token; - long sample_start_time_ms = time_in_ms(); - cur_token = logitsToToken(logits_tensor, pos); - stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; - - io_mgr_->update_prefill_io(cur_token, ++pos, output_tensors[method_name]); - auto piece_res = tokenizer_->decode(prev_token, cur_token); - ET_CHECK(piece_res.ok()); - if (token_callback) { - token_callback(piece_res.get().c_str()); - } - - if (pos == num_prompt_tokens) { - stats_.first_token_ms = time_in_ms(); - stats_.prompt_eval_end_ms = time_in_ms(); - } - - if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) { - ET_LOG(Info, "\nReached to the end of generation"); - break; - } - // prefill model inferences once for prompt in the hybrid mode - if (eval_mode_ == EvalMode::kHybrid) { - break; - } + io_mgr_->update_prefill_io(cur_token, pos, output_tensors[method_name]); + pos += prefill_ar_len_; } + Tensor& logits_tensor = output_tensors[method_name].back()[0]; + prev_token = prompt_tokens[num_prompt_tokens - 1]; + long sample_start_time_ms = time_in_ms(); + cur_token = logitsToToken( + logits_tensor, + (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; + + auto piece_res = tokenizer_->decode(prev_token, cur_token); + ET_CHECK(piece_res.ok()); + if (token_callback) { + token_callback(piece_res.get().c_str()); + } + + pos = num_prompt_tokens; + stats_.first_token_ms = time_in_ms(); + stats_.prompt_eval_end_ms = time_in_ms(); }; auto kv_execute = [&](const std::string& 
method_name) { @@ -429,9 +417,6 @@ Error Runner::generate( }; switch (eval_mode_) { - case EvalMode::kPrefill: - prefill_execute(prefill_forward_name_); - break; case EvalMode::kKVCached: kv_execute(kv_forward_name_); break; diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index b6ba1360bff..e659ac55164 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -33,7 +33,7 @@ class Runner { const int32_t logits_offset, const float temperature, const int eval_mode, - const std::string& kv_updator); + const std::string& kv_updater); struct Stats { // Scaling factor for timestamps - in this case, we use ms. @@ -89,7 +89,10 @@ class Runner { std::string prompt_; // metadata + int32_t context_len_{0}; + int32_t prefill_ar_len_{0}; int32_t prefill_cache_len_{0}; + int32_t kv_ar_len_{0}; int32_t kv_cache_len_{0}; int32_t vocab_size_; int32_t bos_id_; @@ -111,7 +114,7 @@ class Runner { std::string kv_forward_name_; std::vector method_names_; LlamaVersion llama_version_; - std::string kv_updator_; + std::string kv_updater_; }; } // namespace example diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 720877f0555..dde6a397d9a 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -890,7 +890,7 @@ def _unsafe_adjust_original_program( # noqa: C901 del original_program._state_dict[input_target] elif input_spec.kind == InputKind.BUFFER: if input_spec.persistent: - del original_program._state_dict[input_target] + original_program._state_dict.pop(input_target, None) else: del original_program._constants[input_spec.target] elif input_spec.kind == InputKind.CONSTANT_TENSOR: From 7910a89cbc2148dde4d762c88297529d77a3f7e0 Mon Sep 17 00:00:00 2001 From: shewu-quic Date: Mon, 17 Feb 2025 17:37:04 +0800 Subject: [PATCH 2/4] fixed CI --- backends/qualcomm/tests/test_qnn_delegate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 9b05ad871f4..f8552e4fd4b 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -3154,9 +3154,9 @@ def test_llama3_2_1b(self): "llama3_2", "--model_mode", "hybrid", - "--prefill_seq_len", + "--prefill_ar_len", "32", - "--kv_seq_len", + "--max_seq_len", "512", "--num_sharding", "4", @@ -3234,9 +3234,9 @@ def test_llama_stories_110m(self): "stories110m", "--model_mode", "hybrid", - "--prefill_seq_len", + "--prefill_ar_len", "32", - "--kv_seq_len", + "--max_seq_len", "128", ] if self.compile_only: From 6b2b64f65fd827e1e92d2e4d2e4cd4dc3aad263f Mon Sep 17 00:00:00 2001 From: shewu-quic Date: Wed, 19 Feb 2025 09:27:33 +0800 Subject: [PATCH 3/4] Add the figure to readme and fixed unused variable --- examples/qualcomm/oss_scripts/llama/README.md | 6 ++++++ .../llama/assets/PromptProcessingWithARN.png | Bin 0 -> 48393 bytes .../oss_scripts/llama/runner/io_manager.cpp | 6 +----- 3 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png diff --git a/examples/qualcomm/oss_scripts/llama/README.md b/examples/qualcomm/oss_scripts/llama/README.md index 4532e7656f7..cd468eebb26 100644 --- a/examples/qualcomm/oss_scripts/llama/README.md +++ b/examples/qualcomm/oss_scripts/llama/README.md @@ -12,6 +12,12 @@ KV Cache Mode: In KV Cache mode, the model takes in 
a single previous token and Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache modes to optimize token generation speed. Initially, it uses AR-N model to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens. - AR-N model: The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use it to process the prompt and generate the key-value (kv) cache, which serves as a prompt processor in hybrid mode. + - Prompt processing with AR-N model: +
+<figure>
+  <img src="./assets/PromptProcessingWithARN.png" alt="Prompt Processing With AR-N Model">
+  <figcaption>Prompt processing is done using a for-loop: an N-token block is taken, and the KV cache is updated for that block. This process repeats until all tokens are consumed, with the last block padded as needed. For flexibility, the AR-N model can handle any input length less than the maximum sequence length, so the time to first token (TTFT) depends on the actual number of blocks rather than always being the same. A minimal sketch of this loop follows the figure.</figcaption>
+</figure>
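To make the block-by-block flow in the figure caption concrete, here is a minimal C++ sketch of the AR-N prompt-processing loop. It is a sketch only: `run_prefill`, the pad token, and the callback shape are illustrative assumptions rather than the runner's actual API; the real implementation also shifts the attention mask and position ids per block.

```cpp
#include <algorithm>
#include <cstdint>
#include <functional>
#include <vector>

// Consume the prompt in AR-N blocks: each iteration feeds up to `ar_len`
// tokens to the prompt processor, which emits the KV cache for that block.
// The iteration count matches the runner's 1 + ((n - 1) / prefill_ar_len_).
void process_prompt(
    const std::vector<int64_t>& prompt,
    int64_t ar_len,
    int64_t pad_token, // assumed pad id for the final partial block
    const std::function<void(const std::vector<int64_t>&)>& run_prefill) {
  const int64_t n = static_cast<int64_t>(prompt.size());
  const int64_t num_iters = 1 + (n - 1) / ar_len;
  for (int64_t i = 0; i < num_iters; ++i) {
    std::vector<int64_t> block(ar_len, pad_token);
    const int64_t start = i * ar_len;
    const int64_t len = std::min(ar_len, n - start);
    std::copy_n(prompt.begin() + start, len, block.begin());
    run_prefill(block); // one AR-N forward; KV cache advances by `len` tokens
  }
}
```

For example, a 40-token prompt with AR-32 runs two iterations: a full block of 32 tokens, then a block holding the remaining 8 tokens plus 24 padding positions that are ignored when the next token is sampled.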


## Instructions

diff --git a/examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png b/examples/qualcomm/oss_scripts/llama/assets/PromptProcessingWithARN.png
new file mode 100644
index 0000000000000000000000000000000000000000..228b846f7c386b1db5de0b6b42b21dd30eb561c6
GIT binary patch
literal 48393
[48393 bytes of binary image data for PromptProcessingWithARN.png omitted]
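As a companion to the figure added above, the smart-mask attention layout used by the AR-N prompt processor (see `prepare_prefill_io` and `fill_prefill_toks` in the io_manager.cpp diffs earlier in this series) works as follows: the prefill mask is a `prefill_ar_len_ x context_len_` grid, the current block always occupies the rightmost `prefill_ar_len_` columns (causal within the block), and the columns of already-processed blocks are flipped to 65535 before each new iteration. Below is a minimal sketch assuming the same uint16 mask encoding (65535 = attend, 0 = masked); the helper names are illustrative, not part of the runner.

```cpp
#include <cstdint>
#include <vector>

// Initial AR-N prefill mask: row i is a query position of the current block;
// the block sits at the right edge of the context and is causal, so row i
// may attend to in-block columns j <= i (mirrors prepare_prefill_io).
std::vector<uint16_t> init_prefill_mask(int ar_len, int context_len) {
  std::vector<uint16_t> mask(ar_len * context_len, 0);
  const int base = context_len - ar_len;
  for (int i = 0; i < ar_len; ++i) {
    for (int j = 0; j <= i; ++j) {
      mask[i * context_len + base + j] = 65535;
    }
  }
  return mask;
}

// Before processing the block that starts at `start_pos`, unmask the columns
// of the block that just finished so its cached KV becomes visible
// (mirrors the start_pos >= prefill_ar_len_ branch of fill_prefill_toks).
void unmask_previous_block(
    std::vector<uint16_t>& mask, int ar_len, int context_len, int start_pos) {
  const int col = start_pos - ar_len;
  for (int i = 0; i < ar_len; ++i) {
    for (int j = 0; j < ar_len; ++j) {
      mask[i * context_len + col + j] = 65535;
    }
  }
}
```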
z8>;+Mw?dyPMZFU_2eT*z)lJhQ3SCFeM8Kr%{;!7iJAv;Wg-$N3JUsbhsEN&NMhC(P z*DLgq-e(w11@*`ErSd@%r#~u~rNNNp@PixMDI$HqPJ5XiVef2Ko%7_An?=?KW<;-!*niG z zobzLQ^7~cucYq|K`i_#89I6=7I1EDCm~cE01MpULQ|iugnb6T%A3*Y3qsqZ?m6`6CB8kkf zFN;D)pAzhI(JJS|olr~0@fs0e8dSmn*qQ1R|4;rxIs}O}zbK&i=x@3d=*uXf4BTpZ zDcoQN)_|5u9|+Op6>2X80|k_9LW@4$%=RSQp#XnoX+A#Q_FVloiH!+=H2(L67x$`- z4DU;07AEI3B@sjhwPq!Lr`g0sU8rK8eZDh%4DAS=w=-WN=>b=%X1IyYZP8AAFFC+z zV}U&N6`Jcz!O;9f{arhQn)*7*Bf?*H8O~+`RvX?-=>iG3 zoaR@d)T!V?CBJo!63X4iCJUl4xTb*lfj0;~h3Q8Zv90NAA~W>0ghJ19tM!UBDo1i- z-DZ0v_9MYMdJtG|=_e?8H8dFoUPW3HGyTJs2}$7cX_WyStlq+%tXzf~)Ukm>zU@S{ zOG(F`>6(`gliPWA7qC+J90W+U*rPVd!QiO^!@%<_N{jVnFrgpk#{Q*pq~3Z}YQF-) zjpv)A#h((`a*5v_^nYfdSO%I+5PAT*Z}gwyA(Ip3-JO{tM&XPWfL5gdDr;3Z&FWQe zMTM|Iuv3n-q@67GwsR}%pKsPIkLlB_w>}&HL&uV~X@lD5PfDyT4~36(18`TONl$k& zD1f(+kOvh=IB&3!4>@#lxDU@DMt8%^=UDa-tH7|YlBS~P+h+`S%BvDh8z>Nh`hx~fseM1`c%sg|<2Qo;X=%~cO z7-EE7Q4!l{s^b+__KaA`i z$Qa}c@fhmPNdohNuP4vx^lsq{F=@>1KVKDDD!1kz=8ogtcDWz-ZIv+F8y<}nG~apy zKG_H>2C2K4bTe?jAmruZ9_qN&Yw>pg{w>VXNC9J7_}6FzNCV@tk>sbIm_1*_vyH}z zr?#O+ZFY|iqgDN24{jqP1^6>^UK{#@g&VYtso0 z>&D})bzL&;Bo-|)ucIDUG*++?d=(1?#x%+N*Lj0RMI2VpUoZxB;5tMz(r7yP2sqd7 z=@_i`S-y6^`hp|PNG~1|sN`@o{>Z@8HeL(%V%94@yB+c2O9@!T&2BLE;D{=IG$5=2 zJS7eA)UZj7Q}3^MrX_7r9&`e^_s(0Lu(nW#>b6iOvS`upzCH!DuYv)s2S-Dr4dJ3c zZoXDOhdEuISb~U#;0+9AE~gERY8lw^X8%@+T2r0VAsNB>!e(QSCL8_iZa^HVr5fwLGpTHiNeSJe8N+gMH*r!j^_2mqhb@DGh zekW3I5exL7K!9fLHt|%OBSN9O^(C&u0eI!K%1@Q|8DLmL!W@$u5LY&&z>FOpHx`3- zdFAl?f+rsyWoumygha9Rg3Iq1(@9gn7q9!Qql*71E+zD(r79aw`NsK!wxdqpjlvS9 zi&o|m8zG@L!CsF1oS+$1kPPou@>EZ{aTUKph&X#R`C#p z8e3PVJp?C+{JW=wFX?QATLsBQcD*ji*e}Kr2{B0N*E03#0`P{@-bSbKn^B+zNF|V} z%O(G8iP*YUl>G#QoACQ@F=}DtSrz1xhhbB{>2UBHuSTi+?Oa z^U#S_C~z*qRpyZxsQV7dd@44;5izFoTA^xdV(9Ep*1`t${?vkbU+L=hIo^v~E- zM+L>GB^YahPR@)2H#;|tjXnx$xh$LXtsHh2q7ADkYQ-p+v@B#fhjr>yX8`}oiLll$t0N*vy;lx zS&{RbQElivzrHSRA&x*++ZZ421-*A3J@!w7IQGCLJkbNJMv;7%c5IU@h zc1(3e#ce72?S?TS9|LGK=oAtOSchGpQ<(}5z(XXIi|%y?%kpZp|0;|)Ww#h(EZj)d z*apcS6GTyZd<)UsYNEUw$Rij+kG-7+;Ny;CLMXmA*T#%aw8(5V6ICIx*a7OK)5(^> zh7i`FuNxs-d#dg%V<*DsS4Fi4zI9*jhc}y;`h#n(u+6wY#~g@nZc33W1}tJ<9BI0Qw0_{hZ!9+sClYRs@n|Qy{qz%1HgP_fZLH7FFBcmGUMenD zg-NBa`+XWBx%0_p|7oDfZPX|v_y^W>6ECAxGSfi@^??_(twkoH!lS<@w!}7v;U2T_ zJA_N53a8$2#ZNHQ;e%#w1hs(l9S2dS1=M}GGJI@Mt-2fB|ESxA2HE>5@5X%ozd3oB zO+$PN->s5d+^Rj1NOT;*vIkT$LmIq^r)_xJ!dyX3%!bZU3s~@qIMgyFV}C?Cj05Vz zZTh!x2h%-uXk?!)azhZIjbLl&gU=%oHWvb~0Zv_?&P^MPUyz?9*6i}f#4m7Z*<7vX z{=|y|2Bz(?-GL3LU zMaKG`i;{BZR$RW+E+lq9b*xfL+((~JwQy|ro2rkI&B>J~p2x8&2R9RAiM|dS(L{bI zU%L4xzI3O~%eCvR<}L99?UPv!KBjA*qDc5}7JI(jQhJnAn#EfB<|}&}Y6e-I1=~!X zF&J&c?z`@8QjN4C=V>f+m2;y9@qDWuRES(R6yP5Bqwz0*{d>N5y5i;KZcB=c42eN4UPHpn5@CoC-{n6pu~}wFbLgedq_aZi zK*G>0WbUah3(>Zy_9#{lDu1#RzfLkH B6s7>& output_tensors) { (void)cur_token; (void)output_tensors; - IO* ptr = static_cast(data_ptr_.get()); if (!is_bert_) { // update v_cache @@ -1041,7 +1040,6 @@ void SmartMaskIoMgr::update_kv_io( int64_t pos, std::vector>& output_tensors) { IO* ptr = static_cast(data_ptr_.get()); - size_t cache_len = std::max(kv_cache_len_, prefill_cache_len_); // update input_tok *ptr->kv_input_toks = use_int64_token_ ? 
       cur_token : static_cast<int32_t>(cur_token);
@@ -1065,7 +1063,7 @@ void SmartMaskIoMgr::update_kv_io(
   for (int i = 0; i < k_cache_in.size(); ++i) {
     uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>() + pos;
     const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
-    for (size_t j = 0, offset = 0; j < head_dim_; ++j, offset += cache_len) {
+    for (size_t j = 0, offset = 0; j < head_dim_; ++j, offset += kv_cache_len_) {
       ptr_in[offset] = ptr_out[j];
     }
   }
@@ -1086,7 +1084,6 @@ void SmartMaskIoMgr::prepare_prefill_io(
   IO* ptr = static_cast<IO*>(data_ptr_.get());
   std::unordered_map<std::string, size_t> io_bytes_map = get_io_bytes();
 
-  int32_t cache_len = methods_meta[0]->input_tensor_meta(0)->sizes()[1];
   // [I]: pre_input_tokens
   Result<TensorInfo> prefill_input_toks = methods_meta[0]->input_tensor_meta(0);
   prefill_input_toks_ = std::make_unique<TensorImpl>(
@@ -1303,7 +1300,6 @@ void SmartMaskIoMgr::update_prefill_io(
     int64_t pos,
     std::vector<std::vector<Tensor>>& output_tensors) {
   (void)output_tensors;
-  IO* ptr = static_cast<IO*>(data_ptr_.get());
 
   if (!is_bert_) {
     // update v_cache

From 7d9a14ed4da910130ee1a7ad3013c42de9748c3c Mon Sep 17 00:00:00 2001
From: shewu-quic
Date: Wed, 19 Feb 2025 14:06:18 +0800
Subject: [PATCH 4/4] fixed linting

---
 examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
index 6786f132ef9..cfa3b392894 100644
--- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
+++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
@@ -1063,7 +1063,8 @@ void SmartMaskIoMgr::update_kv_io(
   for (int i = 0; i < k_cache_in.size(); ++i) {
     uint8_t* ptr_in = k_cache_in[i]->mutable_data<uint8_t>() + pos;
     const uint8_t* ptr_out = k_cache_out[i]->data<uint8_t>();
-    for (size_t j = 0, offset = 0; j < head_dim_; ++j, offset += kv_cache_len_) {
+    for (size_t j = 0, offset = 0; j < head_dim_;
+         ++j, offset += kv_cache_len_) {
       ptr_in[offset] = ptr_out[j];
     }
   }