In [None]:
!pip install datasets
!pip install transformers
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from datasets import load_dataset
# Single source of truth for the checkpoint used by both objects below.
_NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# Load the pretrained NLLB-200 tokenizer and its seq2seq translation model.
tokenizer = AutoTokenizer.from_pretrained(_NLLB_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_NLLB_CHECKPOINT)

In [None]:


def is_translation_request(text):
    """Return True when *text* looks like a request for a translation.

    The check is a case-insensitive substring search for a small, fixed set
    of trigger phrases.
    """
    lowered = text.lower()
    for phrase in ("translate", "translation", "how do you say"):
        if phrase in lowered:
            return True
    return False

def generate_translation(input_text, target_language):
    """Translate *input_text* into *target_language* with the global model.

    Args:
        input_text: Text passed to the module-level ``tokenizer``.
        target_language: Language code known to the tokenizer,
            e.g. ``"tgl_Latn"``.

    Returns:
        The decoded translation of the first generated sequence.

    Raises:
        KeyError: If *target_language* is not a token the tokenizer knows.
    """
    # ``tokenizer.lang_code_to_id`` is deprecated/removed in recent
    # transformers releases. Language codes are regular special tokens, so
    # look the id up through the vocabulary instead.
    target_id = tokenizer.convert_tokens_to_ids(target_language)
    if target_id == tokenizer.unk_token_id:
        # convert_tokens_to_ids maps unknown tokens to <unk>; keep failing
        # loudly (with the same exception type as the dict lookup did)
        # rather than generating with a meaningless start token.
        raise KeyError(target_language)

    inputs = tokenizer(input_text, return_tensors="pt", padding=True)
    translated_tokens = model.generate(**inputs, forced_bos_token_id=target_id)
    # Only the first decoded sequence is returned.
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

def process_conversation(message, target_languages):
    """Translate the instruction and output of a single dataset record.

    Args:
        message: Mapping with ``"system"``, ``"instruction"`` and ``"output"``
            entries.
        target_languages: Sequence of language codes to translate into.

    Returns:
        A new dict with the original ``"system"``, ``"instruction"`` and
        ``"output"`` values, plus ``"instruction_<code>"`` and
        ``"output_<code>"`` entries for every code in *target_languages*.
        An empty *target_languages* yields no translation entries.
    """
    print(message)
    instruction_name = "instruction"
    output_name = "output"

    message_data = {
        "system": message["system"],
        instruction_name: message[instruction_name],
        output_name: message[output_name],
    }

    # Translate into every requested language; previously only the first
    # entry of target_languages was used and the rest were silently ignored.
    for language in target_languages:
        message_data[f"instruction_{language}"] = generate_translation(
            message[instruction_name], language
        )
        message_data[f"output_{language}"] = generate_translation(
            message[output_name], language
        )

    return message_data

def process_conversations(dataset, target_languages):
    """Translate every record of the dataset's ``'train'`` split.

    Returns:
        A dict mapping ``"conversation_<n>"`` (numbered from 1) to a
        one-element list containing the processed record.
    """
    translated = {}
    for number, record in enumerate(dataset['train'], start=1):
        result = process_conversation(record, target_languages)
        print(result)
        translated[f"conversation_{number}"] = [result]
    return translated

def main():
    """Translate the dataset and dump the results to JSON files.

    Writes ``mod_translations_filtered.json`` with the translated
    conversations and ``filtered_translations_requests.json`` with the list
    of filtered conversations (currently always empty, see below).
    """
    dataset = load_dataset("Crystalcareai/slimorca-dedup-alpaca-100k")
    target_languages = ["tgl_Latn"]
    # Other languages considered: Javanese, Indonesian Bahasa, Malaysian
    # Bahasa, Filipino (Tagalog), Sundanese, Thai, Vietnamese, Chinese
    # (Mandarin). They would have to be given as tokenizer language codes
    # like "tgl_Latn", not as display names.

    # process_conversations returns a single dict. Unpacking it into two
    # names iterated over its keys and raised ValueError for any dataset
    # that does not contain exactly two records.
    all_translations = process_conversations(dataset, target_languages)

    # NOTE: no filtering step is applied in this function, so this list
    # stays empty; the file is still written to keep the outputs unchanged.
    all_filtered_conversations = []

    with open('mod_translations_filtered.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_translations, json_file, ensure_ascii=False, indent=4)

    with open('filtered_translations_requests.json', 'w', encoding='utf-8') as json_file:
        json.dump(all_filtered_conversations, json_file, ensure_ascii=False, indent=4)

    print("Filtered translation process completed. Check 'mod_translations_filtered.json' and 'filtered_translations_requests.json'.")

# Run the translation pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()
    # main() is called synchronously; no threads are started here.

In [None]:
# Load the tokenizer and model. Use the GPU when one is available and fall
# back to the CPU otherwise, instead of crashing on a hard-coded .to('cuda').
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to(device)

def generate_translation(input_text, target_language):
    """Translate *input_text* into *target_language* with the global model.

    Args:
        input_text: Text passed to the module-level ``tokenizer``.
        target_language: Language code known to the tokenizer,
            e.g. ``"tgl_Latn"``.

    Returns:
        The decoded translation of the first generated sequence.

    Raises:
        KeyError: If *target_language* is not a token the tokenizer knows.
    """
    # ``tokenizer.lang_code_to_id`` is deprecated/removed in recent
    # transformers releases. Language codes are regular special tokens, so
    # look the id up through the vocabulary instead.
    target_id = tokenizer.convert_tokens_to_ids(target_language)
    if target_id == tokenizer.unk_token_id:
        # Unknown tokens map to <unk>; keep raising KeyError as the dict
        # lookup did instead of generating with a meaningless start token.
        raise KeyError(target_language)

    # Put the inputs on whatever device the model lives on rather than
    # hard-coding 'cuda', so the function also works for a CPU-resident model.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad():
        translated_tokens = model.generate(**inputs, forced_bos_token_id=target_id)
    # Only the first decoded sequence is returned.
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

def process_conversation(item, target_languages):
    """Add translated instruction/output fields to a dataset record.

    Args:
        item: Mutable mapping with ``'instruction'`` and ``'output'`` entries.
            It is updated in place.
        target_languages: Sequence of language codes to translate into.

    Returns:
        The same *item*, extended with ``"instruction_<code>"`` and
        ``"output_<code>"`` entries for every code in *target_languages*.
    """
    translations = {}
    # Translate into every requested language; previously only the first
    # entry of target_languages was used and the rest were silently ignored.
    for language in target_languages:
        translations[f"instruction_{language}"] = generate_translation(item['instruction'], language)
        translations[f"output_{language}"] = generate_translation(item['output'], language)

    item.update(translations)
    return item

def main():
    """Translate the dataset's train split and save it as JSON.

    Every record is passed through ``process_conversation`` via
    ``Dataset.map`` and the result is written to
    ``processed_translations.json``.
    """
    target_languages = ["tgl_Latn"]
    dataset = load_dataset("Crystalcareai/slimorca-dedup-alpaca-100k", split='train')

    def translate_item(item):
        # Bind the fixed language list so map() can call us with one argument.
        return process_conversation(item, target_languages)

    # Records are processed one at a time (not in batches).
    processed_dataset = dataset.map(translate_item, batched=False)

    # Persist the augmented dataset.
    processed_dataset.to_json('processed_translations.json')

    print("Filtered translation process completed. Check 'processed_translations.json'.")

# Run the translation pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()
