## Extract the keywords from the sentence

In [None]:
from langchain_huggingface.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,AutoModelForSeq2SeqLM
model_name = "princeton-nlp/gemma-2-9b-it-SimPO"
cache_directory = "/data/data_user_alpha/public_models"

# Load the chat model and its tokenizer from the shared cache directory.
# The dtype and device placement are decided here, once, at load time.
# NOTE(review): the pipeline call below used to request bfloat16 while the
# weights are loaded as float16 -- confirm which precision is intended for
# this checkpoint and change `torch_dtype` here if bfloat16 was meant.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_directory,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Instantiate the LLM
from transformers import pipeline

# The pipeline wraps the already-loaded `model`, so `torch_dtype` and
# `device_map` are not repeated here: the previous values contradicted the
# float16 load above and only obscured which settings were really in effect.
# The generation arguments below are defaults; individual calls override them.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    do_sample=True,
    top_k=30,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
sentence = "Apple is looking at buying U.K. startup for $1 billion."

# Ask the chat model for the keywords of `sentence`. Sampling is disabled for
# this call, and generation stops at either the "<end_of_turn>" token or the
# tokenizer's EOS token.
outputs = pipe(
    [{"role": "user", "content":
      f"Given the sentence {sentence} I want to extract important keywords from the list. Your answer should contain only list of important keywords."}],
    do_sample=False,
    eos_token_id=[pipe.tokenizer.convert_tokens_to_ids("<end_of_turn>"), pipe.tokenizer.eos_token_id],
    max_new_tokens=200,
)
dat = outputs[0]['generated_text']

# The parsing assumes a comma-separated reply. Strip whitespace on BOTH sides
# (a trailing newline would otherwise stay attached to the last keyword and be
# sent to the translator) and drop empty fragments such as the one produced by
# a trailing comma.
reply_parts = (part.strip() for part in dat[1]['content'].split(","))
words = [keyword for keyword in reply_parts if keyword]

## Translate each word to build the COD prompt

In [None]:

# Checkpoint used for word-level translation, and the shared local cache
# directory the files are read from / downloaded into.
cache_directory = "/data/data_user_alpha/public_models"
model_name = "facebook/nllb-200-3.3B"

# Seq2seq translation model: half-precision weights, with device placement
# delegated to device_map="auto".
tran_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    cache_dir=cache_directory,
)
# Matching tokenizer for the same checkpoint.
tran_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_directory,
)


In [20]:
from transformers import AutoTokenizer, pipeline
# Auxiliary target languages (NLLB language codes) for the dictionary hints.
aux = ['fra_Latn', 'deu_Latn', 'por_Latn']
text = 'I want you.'  # NOTE(review): not referenced anywhere in this cell
source_lang = 'eng_Latn'

# Maps each keyword to its translations, one entry per language in `aux`,
# in the same order as `aux`.
tran = {}
for lang in aux:
    # The source/target languages are fixed when the pipeline is built, so a
    # new translator is created for every target language.
    translator = pipeline(
        'translation',
        model=tran_model,
        tokenizer=tran_tokenizer,
        src_lang=source_lang,
        tgt_lang=lang,
        max_length=400,
    )
    # dict.fromkeys keeps first-seen order while skipping repeated keywords;
    # without it a keyword listed twice would receive two copies of every
    # translation.
    for word in dict.fromkeys(words):
        output = translator(word)
        tran.setdefault(word, []).append(output[0]['translation_text'])

## Develop the COD prompt

In [None]:
# Build the dictionary part of the prompt: one entry per keyword of the form
#   <keyword> means "<t1>" means "<t2>" ... .
# using the translations collected in `tran`. Repeated keywords are emitted
# only once.
hint_entries = []
for word in dict.fromkeys(words):
    meanings = ''.join(f' means "{translation}"' for translation in tran[word])
    hint_entries.append(f'{word}{meanings}. ')
prompt = ''.join(hint_entries)

# Ask for the translation of the SAME sentence the keywords were extracted
# from. The request previously hard-coded an unrelated sentence, so the
# dictionary hints above did not match the text being translated.
prompt += f'\nTranslate the following text from English into Central Kurdish with Arabic script: {sentence}'
prompt

## Translate the whole sentence

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Checkpoint id and shared local cache directory for this cell's model.
cache_directory = "/data/data_user_alpha/public_models"
model_name = "bigscience/bloom-7b1"

# Rebinds the notebook-level `tokenizer` and `model` names to the BLOOM
# checkpoint (half-precision weights, device placement via device_map="auto").
# NOTE(review): the generation cell that follows still calls `pipe`, which was
# built from the previously loaded model, so this checkpoint does not appear
# to be used there -- confirm whether this load is still needed.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_directory,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    cache_dir=cache_directory,
)

In [None]:
# Send the assembled prompt to `pipe` as a single user turn. Sampling is
# disabled for this call, at most 200 new tokens are generated, and generation
# stops at either the "<end_of_turn>" token or the tokenizer's EOS token.
messages = [{"role": "user", "content": prompt}]
stop_token_ids = [
    pipe.tokenizer.convert_tokens_to_ids("<end_of_turn>"),
    pipe.tokenizer.eos_token_id,
]
outputs = pipe(
    messages,
    max_new_tokens=200,
    eos_token_id=stop_token_ids,
    do_sample=False,
)
# Keep the generated conversation of the first (only) returned sequence.
dat = outputs[0]['generated_text']

In [None]:
# Display the 'content' field of the second message in the generated
# conversation (index 1, i.e. the entry after the single user turn sent above).
dat[1]['content']