In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [2]:
!git clone https://github.com/stokome/Hinglish-Translation-AI-llama2.git

fatal: destination path 'Hinglish-Translation-AI-llama2' already exists and is not an empty directory.


In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from datasets import Dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [4]:
# Device placement: the empty-string key maps the entire model to GPU 0
device_map = {"": 0}

# Identifier of the base LLaMA-2 checkpoint
model_name = "meta-llama/Llama-2-7b-hf"

# Path to the fine-tuned weights inside the cloned repository
new_model = "/content/Hinglish-Translation-AI-llama2/llama-2-7b-hinglish_weights/custom_dataset_weights"

In [5]:

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so that no secret is ever written into the notebook. When the variable is
# unset, `True` asks the libraries to use the token stored by
# `huggingface-cli login`.
hf_auth_token = os.environ.get("HF_TOKEN") or True

# Load the base model in FP16, attach the fine-tuned LoRA weights and merge
# them into the base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
    use_auth_token=hf_auth_token,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Load the tokenizer of the base model. `trust_remote_code` is not passed:
# the Llama tokenizer classes ship with transformers, so no repository code
# needs to be executed.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_auth_token)

# Pad with the EOS token. No separate '[PAD]' token is added: it would be
# overridden by this assignment anyway, and it would grow the tokenizer
# vocabulary without a matching resize of the model's embeddings.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [6]:
import re
def translate_to_hinglish(query, max_length=64):
    """Translate an English sentence to Hinglish with the fine-tuned model.

    The query is wrapped in the instruction template, passed through a
    text-generation pipeline built from the module-level ``model`` and
    ``tokenizer``, and the first sentence of the completion is returned.

    Args:
        query: English text to translate.
        max_length: Forwarded to the pipeline as ``max_length``. In
            transformers this limit counts the prompt tokens as well, so a
            long query leaves less room for the translation.

    Returns:
        The text generated after the prompt, cut after the first danda
        ("।"). When the completion contains no danda (for example because
        generation was truncated), the whole completion is returned
        unchanged instead of faking a sentence end.
    """
    system_prompt = "Convert English to Hinglish"
    prompt = f"<s>[INST] <<SYS>> {system_prompt} <</SYS>>{query}[/INST]"

    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length)
    generated_text = pipe(prompt)[0]['generated_text']

    if generated_text.startswith(prompt):
        # Strip the echoed prompt by length; this stays correct even when the
        # query itself contains the "[/INST]" marker.
        completion = generated_text[len(prompt):]
    else:
        # Fallback: take everything after the first instruction-end marker,
        # or the full text when the marker is missing.
        _, marker, tail = generated_text.partition('[/INST]')
        completion = tail if marker else generated_text

    # Keep only the first sentence; the danda is kept only if it was produced.
    sentence, danda, _ = completion.partition('।')
    return sentence + danda

In [11]:
# Sample English sentences for a first translation check
queries = [
    "Definitely share your feedback in the comment section.",
    "I was waiting for my bag.",
    "So even if it's a big video, I will clearly mention all the products.",
]

In [12]:
# Translate every sample sentence and print it next to its English source
for english_text in queries:
    hinglish_text = translate_to_hinglish(english_text, max_length=100)
    print(f"English: {english_text}\n Hinglish: {hinglish_text}\n")

English: Definitely share your feedback in the comment section.
 Hinglish: बिल्कुल, comment section में अपनी feedback साझा करें।

English: I was waiting for my bag.
 Hinglish: मैं अपनी bag का इंतजार कर रहा था।

English: So even if it's a big video, I will clearly mention all the products.
 Hinglish: हे बड़े video के बारे में सभी products को क्लेअर करने के लिए बेताब हो स।



In [13]:
# Second batch of sample sentences, containing names, brands and a place
queries = [
    "My name is Yatharth Anand.",
    "We are going to unbox new Iphone 20.",
    "The sponsor of this video is Kurkure.",
    "I am vlogging in the streets of Barcelone.",
    "Please like and subscribe my youtube channel.",
]

In [14]:
# Run the translator over the current batch and show each English/Hinglish pair
for english_text in queries:
    hinglish_text = translate_to_hinglish(english_text, max_length=100)
    print(f"English: {english_text}\n Hinglish: {hinglish_text}\n")

English: My name is Yatharth Anand.
 Hinglish: मेरा नाम Yatharth Anand है।

English: We are going to unbox new Iphone 20.
 Hinglish: हम नए Iphone 20 के unboxing करने जा रहे हैं।

English: The sponsor of this video is Kurkure.
 Hinglish: यह video के sponsor Kurkure है।

English: I am vlogging in the streets of Barcelone.
 Hinglish: मैं बर्सलोन के सड़कों पर vlogging कर रहा हूँ।

English: Please like and subscribe my youtube channel.
 Hinglish: कृपया मेरे youtube channel को like और subscribe करें।

