In [2]:
!pip install transformers
!pip install nltk
!pip install sentencepiece

# All imports for the notebook, grouped in one place (stdlib first, then third-party).
import re

import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from transformers import MarianMTModel, MarianTokenizer

# English -> Hindi MarianMT checkpoint; model and tokenizer must come from the same name.
model_en_hi = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi")  # from English to Hindi
tokenizer_en_hi = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# NLTK data used below: sentence/word tokenizer, POS tagger and the WordNet lemmatizer.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Recent NLTK releases look these resources up under new names; fetching both
# keeps the notebook runnable whichever NLTK version the unpinned install pulls in.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()



Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
Col

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [41]:
def find_nouns_and_verbs(en_text):
    """Build an English -> Hindi lookup for the nouns and verbs of ``en_text``.

    The sentence is tokenized and POS-tagged; every token tagged ``NN*`` is
    kept as a noun and every token tagged ``VB*`` (except a short list of
    auxiliaries) is lemmatized as a verb. Each distinct word is then
    translated on its own with ``translate_to_hindi``.

    Args:
        en_text: English sentence.

    Returns:
        dict mapping English word -> Hindi text. It always contains a few
        hand-written entries; words found in ``en_text`` are added on top and
        overwrite a hand-written entry with the same key. Words whose
        translation comes back empty are left out, because an empty Hindi
        string would make the later ``str.replace`` calls insert the English
        word between every character.
    """
    # Compared in lower case so a sentence-initial "Is"/"Was" is filtered too.
    auxiliary_verbs = {'am', 'is', 'are', 'was', 'were', 'has', 'had'}

    tagged_words = pos_tag(word_tokenize(en_text))
    nouns = [word for word, pos in tagged_words if pos.startswith('NN')]
    verbs = [
        lemmatizer.lemmatize(word, pos='v')
        for word, pos in tagged_words
        if pos.startswith('VB') and word.lower() not in auxiliary_verbs
    ]

    # Hand-written translations that are always available.
    translation_dict = {
        'feedback': 'प्रतिक्रिया',
        'definitely': 'निश्चित रूप से',
        'section': 'खंड'
    }

    # dict.fromkeys() de-duplicates while keeping first-seen order, so a word
    # that occurs several times only goes through the model once.
    for noun in dict.fromkeys(nouns):
        hin_noun = translate_to_hindi(noun)
        if hin_noun:
            translation_dict[noun] = hin_noun

    for verb in dict.fromkeys(verbs):
        hin_verb = translate_to_hindi(verb)
        # Only the first word of the verb's translation is kept.
        modified_value = hin_verb.strip().split(' ', 1)[0]
        if modified_value:
            translation_dict[verb] = modified_value

    return translation_dict

def translate_to_hindi(en_text):
    """Translate English text to Hindi with the module-level MarianMT model.

    Args:
        en_text: English text to translate.

    Returns:
        The decoded first sequence returned by ``generate`` (beam search with
        4 beams, at most 150 tokens), with special tokens skipped and every
        zero-width joiner (U+200D) removed.
    """
    token_ids = tokenizer_en_hi.encode(en_text, return_tensors="pt")
    generated = model_en_hi.generate(
        token_ids,
        max_length=150,
        num_beams=4,
        num_return_sequences=1,
    )
    hindi = tokenizer_en_hi.decode(generated[0], skip_special_tokens=True)
    # Drop zero-width joiners from the decoded text.
    return hindi.replace('\u200d', '')

def translate_to_hinglish_custom(en_text):
    """Translate ``en_text`` to Hindi, then put its English nouns/verbs back.

    Args:
        en_text: English sentence.

    Returns:
        The Hindi translation of the whole sentence in which every Hindi
        string from the noun/verb lookup has been replaced by its English
        word.
    """
    # Entries with an empty Hindi side are dropped up front: both replacement
    # passes below would otherwise match "everywhere" (str.replace('', x)
    # inserts x between every character of the text).
    nouns_verbs = {
        eng_word: hin_word
        for eng_word, hin_word in find_nouns_and_verbs(en_text).items()
        if hin_word
    }
    hin_text = translate_to_hindi(en_text)

    # First pass: the dedicated word-level switch.
    hin_text = switch_hindi_nouns_to_english(nouns_verbs, hin_text)

    # Second pass: plain substring replacement, which also rewrites the Hindi
    # word when it is only part of a longer token.
    for eng_word, hin_word in nouns_verbs.items():
        hin_text = hin_text.replace(hin_word, eng_word)

    return hin_text

def switch_hindi_nouns_to_english(nouns, hinglish_text):
    """Replace whole Hindi words in ``hinglish_text`` by their English keys.

    Args:
        nouns: dict mapping English word -> Hindi text to look for.
        hinglish_text: text to rewrite.

    Returns:
        ``hinglish_text`` with every whole-word occurrence of each Hindi value
        replaced by its English key. Entries with an empty value are ignored.
    """
    # `\b` is not usable for Devanagari: vowel signs and other combining marks
    # are not `\w`, so `\b` sees a "boundary" inside a word and sees none
    # between a word-final vowel sign and the following space. Word edges are
    # therefore spelled out: a word character is `\w` or any code point of the
    # Devanagari block except the danda / double danda punctuation
    # (U+0964, U+0965).
    word_char = r'[\w\u0900-\u0963\u0966-\u097F]'
    not_preceded = r'(?<!' + word_char + r')'
    not_followed = r'(?!' + word_char + r')'

    for key, value in nouns.items():
        if not value:
            # An empty pattern matches at every position; skip it.
            continue
        pattern = not_preceded + re.escape(value) + not_followed
        # A callable replacement keeps `key` literal (no backslash/group expansion).
        hinglish_text = re.sub(pattern, lambda _match, _key=key: _key, hinglish_text)
    return hinglish_text

def handle_plural_forms_in_hinglish(hin_text):
    """Remove Hindi plural endings that are left glued to an English word.

    After a Hindi noun has been swapped for its English word, the Hindi
    plural ending can remain attached (e.g. "productsों का"). This strips
    such an ending, giving "products का".

    Only endings that directly follow a Latin letter are removed. Native
    Hindi words that merely contain the same characters (e.g. "दोनों") are
    left intact.

    Args:
        hin_text: Hinglish text.

    Returns:
        ``hin_text`` with the listed endings removed after English words.
    """
    plural_replacements = {
        # "productsों का": "products का",
        "ों": "",
        "याँ":"",
        "एँ":"",
        "ओं":""
    }

    for plural, singular in plural_replacements.items():
        # The look-behind restricts the match to an ending attached to an
        # English (Latin-script) word.
        pattern = r'(?<=[A-Za-z])' + re.escape(plural)
        hin_text = re.sub(pattern, lambda _match, _repl=singular: _repl, hin_text)

    return hin_text


def execute(en_text_list):
    """Translate each sentence to Hinglish and print it next to its source.

    Args:
        en_text_list: iterable of English sentences.

    For every sentence this prints an "English input" line, a
    "Hinglish output" line and an empty separator line. Nothing is returned.
    """
    for en_text in en_text_list:
        # Translate first, then clean up plural endings on the result.
        hinglish_translation = handle_plural_forms_in_hinglish(
            translate_to_hinglish_custom(en_text)
        )
        print(
            f"English input: {en_text}",
            f"Hinglish output: {hinglish_translation}",
            "",
            sep="\n",
        )




In [42]:
# Demo sentences. The leading "N. " numbering is part of each string, so it is
# passed through the translation pipeline together with the sentence.
en_text_list = [
    "1. Definitely share your feedback in the comment section.",
    "2. So even if it's a big video, I will clearly mention all the products.",
    "3. I was waiting for my bags.",
]

# Prints the English input and the Hinglish output for every sentence.
execute(en_text_list)

English input: 1. Definitely share your feedback in the comment section.
Hinglish output: 1 definitely comment खण्ड में आपकी feedback share करें.

English input: 2. So even if it's a big video, I will clearly mention all the products.
Hinglish output: 2 अगर यह एक बड़ा video है, तो भी मैं स्पष्ट रूप से सभी products का mention करेंगे।

English input: 3. I was waiting for my bags.
Hinglish output: 3 मैं अपने बैग के लिए इंतज़ार कर रहा था.



In [43]:
# Second batch of sample statements (short everyday sentences); rebinds
# `en_text_list` and runs the same pipeline on it.
en_text_list = [
    "1. Go fetch water from filter.",
    "2. he lives in the forest.",
    "3. I am studying right now.",
]
execute(en_text_list)

English input: 1. Go fetch water from filter.
Hinglish output: 1. जाओ filter से water लाने.

English input: 2. he lives in the forest.
Hinglish output: 2 वह forest में रहता है.

English input: 3. I am studying right now.
Hinglish output: 3 मैं अभी पढ़ रहा हूँ.

