```
conda create -n vllm python=3.12 pip
conda activate vllm
pip install vllm bitsandbytes

hf auth login

vllm serve CohereLabs/aya-expanse-8b --quantization bitsandbytes --gpu_memory_utilization=0.8 --max_num_seqs=64

vllm serve tencent/Hunyuan-MT-7B-fp8 --gpu_memory_utilization=0.8 --max_num_seqs=128 --max_model_len 4096

```

In [None]:
import asyncio
import os
import time
from concurrent.futures import ThreadPoolExecutor

import openai
from openai import OpenAI
from sacrebleu import BLEU, CHRF
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

In [None]:
# OpenAI-compatible client aimed at the locally served endpoint. The API key is
# left empty; timeout=120 is presumably in seconds (per the openai client docs).
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="",
    timeout=120,
)

# sacreBLEU scorers, both constructed with their default settings.
chrf = CHRF()
bleu = BLEU()

In [None]:
# Use the id of the first model the server lists for every request below.
model_id = client.models.list().data[0].id

In [None]:
# response = client.chat.completions.create(
#     model=model_id,
#     messages=[
#         {
#             "role": "system",
#             "content": "You will be provided with input text and your task is to translate it into English.",
#         },
#         {"role": "user", "content": "C'est la vie"},
#         {"role": "assistant", "content": "<translation>That's life</translation>"},
#         {"role": "user", "content": "Le chat est sur la chaise."},
#         {"role": "assistant", "content": "<translation>"},
#     ],
#     temperature=0.0,
#     stop=["</translation>"],
# )

In [None]:
# response.choices[0].message.content

In [None]:
# with open("/home/mark/mt/ja-en/valid.en", "rt") as myfile:
#     valid_ref = [i.strip() for i in myfile]
# with open("/home/mark/mt/ja-en/valid.ja", "rt") as myfile:
#     valid_src = [i.strip() for i in myfile]

In [None]:
# def translate(x):
#     """Sends a single async request to the OpenAI API."""
#     try:
#         response = client.chat.completions.create(
#             model=model_id,
#             messages=[
#                 {
#                     "role": "system",
#                     "content": "You will be provided with input text and your task is to translate it accurately into English without commentary or explanation.",
#                 },
#                 {"role": "user", "content": valid_src[2]},
#                 {
#                     "role": "assistant",
#                     "content": f"<translation>{valid_ref[2]}</translation>",
#                 },
#                 {"role": "user", "content": valid_src[8]},
#                 {
#                     "role": "assistant",
#                     "content": f"<translation>{valid_ref[8]}</translation>",
#                 },
#                 {"role": "user", "content": valid_src[19]},
#                 {
#                     "role": "assistant",
#                     "content": f"<translation>{valid_ref[19]}</translation>",
#                 },
#                 {"role": "user", "content": x},
#                 {"role": "assistant", "content": "<translation>"},
#             ],
#             temperature=0.1,
#             top_k=20,
#             top_p=0.6,
#             repetition_penalty=1.05,
#             stop=["</translation>"],
#         )
#         return response.choices[0].message.content
#     except Exception as e:
#         return f"Error: {e}"

In [None]:
def translate(x):
    """Translate one text into English with a single blocking chat request.

    The request goes through the module-level ``client`` and ``model_id`` and
    is synchronous; concurrency has to be provided by the caller (e.g. threads).

    Args:
        x: Source text to translate; it is appended to a fixed English
            instruction prompt.

    Returns:
        The reply text from the first choice. An empty string is returned when
        the request raises or when the reply carries no text content, so the
        caller always gets exactly one string per input.
    """
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[
                {
                    "role": "user",
                    "content": f"Translate the following into English, without commentary or explanation.\n\n{x}",
                }
            ],
            # Greedy decoding keeps the evaluation repeatable.
            temperature=0.0,
            max_tokens=2048,
        )
        # `content` may be None; normalise to "" so downstream string
        # concatenation and scoring never see a non-string.
        return response.choices[0].message.content or ""
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the output list
        # aligned with the inputs instead of aborting the whole run.
        print(f"Error: {e}")
        return ""

In [None]:
def translate_parallel(items, max_workers=256):
    """Translate every item concurrently with a thread pool.

    Args:
        items: Sized sequence of source texts (``len(items)`` is used for the
            progress bar total).
        max_workers: Upper bound on concurrent requests. Defaults to 256, the
            value that was previously hard-coded.

    Returns:
        A list with one ``translate`` result per item, in the same order as
        ``items`` (``executor.map`` yields results in input order).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(translate, items), total=len(items)))
    return results

In [None]:
from datasets import load_dataset

# Load the "default" configuration of the FLORES+ dataset from the Hugging Face Hub.
# NOTE(review): presumably this needs the `hf auth login` step from the setup notes — confirm.
ds = load_dataset("openlanguagedata/flores_plus", "default")

In [None]:
src_lang = "jpn_Jpan"
# The code has the form "<iso_639_3>_<iso_15924>"; split it once up front
# rather than twice for every row of the split being filtered.
src_iso, src_script = src_lang.split("_")
# Keep the devtest sentences whose language and script match, in dataset order.
# NOTE(review): pairing with the reference list relies on both languages
# appearing in the same sentence order in the dataset — confirm.
src = [
    row["text"]
    for row in ds["devtest"]
    if row["iso_639_3"] == src_iso and row["iso_15924"] == src_script
]
# Guard against a wrong language code or several varieties matching the filter.
assert len(src) == 1012, f"expected 1012 devtest sentences for {src_lang}, got {len(src)}"

In [None]:
tgt_lang = "eng_Latn"
# The code has the form "<iso_639_3>_<iso_15924>"; split it once up front
# rather than twice for every row of the split being filtered.
tgt_iso, tgt_script = tgt_lang.split("_")
# Keep the devtest sentences whose language and script match, in dataset order.
# NOTE(review): pairing with the source list relies on both languages
# appearing in the same sentence order in the dataset — confirm.
ref = [
    row["text"]
    for row in ds["devtest"]
    if row["iso_639_3"] == tgt_iso and row["iso_15924"] == tgt_script
]
# Guard against a wrong language code or several varieties matching the filter.
assert len(ref) == 1012, f"expected 1012 devtest sentences for {tgt_lang}, got {len(ref)}"

In [None]:
# Time the whole batch so the "Time (s)" column of the results tables can be
# filled from a measured value.
start = time.perf_counter()
results = translate_parallel(src)
elapsed_s = time.perf_counter() - start

# translate() turns failed requests into "", which would silently drag the
# scores down; report how many outputs came back empty.
n_empty = sum(1 for hyp in results if not hyp)
print(f"Translated {len(results)} segments in {elapsed_s:.0f} s ({n_empty} empty)")

In [None]:
# Spot-check the first ten outputs.
results[:10]

In [None]:
# Corpus-level BLEU of the outputs against a single reference set (hence the one-element list).
bleu.corpus_score(results, [ref])

In [None]:
# Corpus-level chrF of the outputs against a single reference set (hence the one-element list).
chrf.corpus_score(results, [ref])

In [None]:
# Write one hypothesis per line for line-based scoring. Model output can
# contain line breaks, which would shift every following segment out of
# alignment with the source/reference files, so they are flattened to spaces.
# The encoding is pinned so non-ASCII output does not depend on the locale.
with open("aya8b.txt", "wt", encoding="utf-8") as myfile:
    myfile.writelines(" ".join(hyp.splitlines()) + "\n" for hyp in results)

In [None]:
# Run comet-score from a separate conda environment on the hypothesis file; the
# IPython `!` assignment captures the command's output lines into `comet`.
# NOTE(review): flores.ja / flores.en are local files that are presumably
# line-aligned with the sentences translated above — confirm.
comet = ! ~/miniforge3/envs/comet/bin/comet-score -s /home/mark/mt/ja-en/flores.ja -r /home/mark/mt/ja-en/flores.en -t aya8b.txt --batch_size 32 --only_system

In [None]:
# Take the last space-separated token of the last captured output line as the
# system score and scale it by 100.
100 * float(comet[-1].split(" ")[-1])

## Japanese -> English

|                                             |   bleu |   chrf2 |   comet22 |   Time (s) |
|:--------------------------------------------|-------:|--------:|----------:|-----------:|
| CohereLabs/aya-expanse-8b (vllm, bnb quant) |  26.37 |   56.94 |     87.97 |       74   |


## Arabic -> English

|                                             |   bleu |   chrf2 |   comet22 |   Time (s) |
|:--------------------------------------------|-------:|--------:|----------:|-----------:|
| CohereLabs/aya-expanse-8b (vllm, bnb quant) |  39.90 |   65.57 |     89.10 |       74   |