In [None]:
from transformers import MarianMTModel, MarianTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import words

# Fetch the NLTK data used by this cell.
# 'punkt_tab' is requested in addition to 'punkt': NLTK 3.9+ loads the
# sentence/word tokenizer tables from 'punkt_tab', so sent_tokenize and
# word_tokenize raise LookupError on a fresh kernel if only 'punkt' is present.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('words')

# Translation setup
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# for the same language pair do not reload the weights each time.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _MARIAN_CACHE[model_name] = (tokenizer, model)
    return _MARIAN_CACHE[model_name]


def translate_text(text, src_lang, tgt_lang):
    """Translate ``text`` using the ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}`` checkpoint.

    Args:
        text: Text to translate.
        src_lang: Source language code as it appears in the checkpoint name (e.g. 'mr').
        tgt_lang: Target language code as it appears in the checkpoint name (e.g. 'en').

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed, or "" when ``text`` is an empty or whitespace-only string.
    """
    # Nothing to translate: skip the model rather than decoding whatever it
    # would generate for an empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    tokenizer, model = _load_marian(model_name)

    # Encode and translate. truncation=True caps the input at the tokenizer's
    # maximum length instead of passing an over-long sequence to the model.
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**tokens)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Simplification setup
def simplify_text(text):
    """Drop English stop words from ``text``, one sentence at a time.

    The text is split into sentences and each sentence into word tokens.
    A token is discarded when its lower-cased form is in NLTK's English
    stop-word list; the surviving tokens are lower-cased and joined with
    single spaces. The processed sentences are joined with single spaces too.

    Args:
        text: English text to process.

    Returns:
        The lower-cased, stop-word-free text as one string.
    """
    sentence_list = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))

    def strip_stop_words(sentence):
        # Lower-case first so the membership test and the emitted token agree.
        lowered = (token.lower() for token in word_tokenize(sentence))
        return ' '.join(token for token in lowered if token not in english_stops)

    return ' '.join(strip_stop_words(sentence) for sentence in sentence_list)

# Main process
def process_text(text):
    """Simplify Marathi ``text`` by pivoting through English.

    The text is translated Marathi -> English with ``translate_text``, passed
    through ``simplify_text``, and translated English -> Marathi again.

    Args:
        text: Marathi input text.

    Returns:
        The Marathi translation of the simplified English text.
    """
    source_lang, pivot_lang = 'mr', 'en'

    pivot_text = translate_text(text, source_lang, pivot_lang)
    pivot_text_simplified = simplify_text(pivot_text)

    return translate_text(pivot_text_simplified, pivot_lang, source_lang)

# Demo: send one Marathi sentence through the Marathi -> English -> Marathi
# round trip defined by process_text and print the result.
# NOTE(review): the recorded output of this cell is noticeably shorter than the
# input sentence, so the stop-word removal step appears to drop content —
# confirm that is the intended behaviour.
marathi_text = "तुम्ही किती सुंदर आहात हे एक मोठे प्रशंसा आहे."
result = process_text(marathi_text)
print(result)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


किती सुंदर आहेत.


In [None]:
from transformers import MarianMTModel, MarianTokenizer
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Translation setup
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# for the same language pair do not reload the weights each time.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _MARIAN_CACHE[model_name] = (tokenizer, model)
    return _MARIAN_CACHE[model_name]


def translate_text(text, src_lang, tgt_lang):
    """Translate ``text`` using the ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}`` checkpoint.

    Args:
        text: Text to translate.
        src_lang: Source language code as it appears in the checkpoint name (e.g. 'mr').
        tgt_lang: Target language code as it appears in the checkpoint name (e.g. 'en').

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed, or "" when ``text`` is an empty or whitespace-only string.
    """
    # Nothing to translate: skip the model rather than decoding whatever it
    # would generate for an empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    tokenizer, model = _load_marian(model_name)

    # Encode and translate. truncation=True caps the input at the tokenizer's
    # maximum length instead of passing an over-long sequence to the model.
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated_tokens = model.generate(**tokens)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Loaded (tokenizer, model) pairs keyed by checkpoint name; the Pegasus
# checkpoint is large, so it is loaded once and reused across calls.
_PEGASUS_CACHE = {}


def _load_pegasus(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _PEGASUS_CACHE:
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name)
        _PEGASUS_CACHE[model_name] = (tokenizer, model)
    return _PEGASUS_CACHE[model_name]


def simplify_text(text):
    """Rewrite English ``text`` with the ``google/pegasus-xsum`` checkpoint.

    NOTE(review): pegasus-xsum looks like a summarization checkpoint, so this
    step probably condenses the text rather than simplifying its wording —
    confirm that is what the pipeline wants.

    Args:
        text: English text to rewrite.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed, or "" when ``text`` is an empty or whitespace-only string.
    """
    # With no input there is nothing to rewrite; skip the model instead of
    # decoding whatever it would generate from an empty prompt.
    if isinstance(text, str) and not text.strip():
        return ""

    tokenizer, model = _load_pegasus("google/pegasus-xsum")

    # truncation=True keeps the encoded input within the tokenizer's maximum length.
    tokens = tokenizer(text, return_tensors="pt", truncation=True)
    simplified_tokens = model.generate(**tokens)
    return tokenizer.decode(simplified_tokens[0], skip_special_tokens=True)

def process_text(text):
    """Simplify Marathi ``text`` by pivoting through English.

    The text is translated Marathi -> English with ``translate_text``, passed
    through ``simplify_text``, and translated English -> Marathi again.

    Args:
        text: Marathi input text.

    Returns:
        The Marathi translation of the simplified English text.
    """
    source_lang, pivot_lang = 'mr', 'en'

    pivot_text = translate_text(text, source_lang, pivot_lang)
    pivot_text_simplified = simplify_text(pivot_text)

    return translate_text(pivot_text_simplified, pivot_lang, source_lang)

# Demo: run one Marathi sentence through process_text (translate to English,
# rewrite with simplify_text, translate back) and print the result.
# NOTE(review): the recorded output of this cell is a short fragment that does
# not visibly correspond to the input sentence — verify the pipeline output
# before relying on it.
marathi_text = "मुघल सम्राट औरंगजेबाच्या विरोधात छत्रपती शिवाजी महाराजांनी प्रचंड प्रतिकार केला."
result = process_text(marathi_text)
print(result)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


सा. यु. पू.


In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Load the pre-trained "google/mt5-small" model and its tokenizer.
# NOTE(review): this looks like the base pre-trained checkpoint rather than one
# fine-tuned for simplification — confirm against the model card.
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

# Input complex Marathi sentence
input_sentence = "तो नेहमी वेळेवर पोहोचतो कारण तो जबाबदार आहे."

# Prepare the input for the model: prepend the "simplify: " task prefix and
# encode the string to token ids returned as a PyTorch tensor.
input_ids = tokenizer.encode("simplify: " + input_sentence, return_tensors="pt")

# Generate with the default generation settings, then decode the first
# returned sequence with special tokens skipped.
# NOTE(review): the recorded output of this cell is "<extra_id_0>" followed by
# a single word, not a simplified sentence, which suggests the checkpoint does
# not understand the "simplify: " prefix — verify.
outputs = model.generate(input_ids)
simplified_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(simplified_sentence)


config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


<extra_id_0> नाही
