In [1]:

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch

# Load the pretrained M2M100 checkpoint and its tokenizer once, at module
# level; translate_batch() below reuses these two globals for every batch.
model_name = 'facebook/m2m100_418M'
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Number of lines process_dataset() hands to translate_batch() per call.
batch_size = 16  # Adjust this based on your hardware capabilities

# Shared VADER analyzer, created lazily on first use. Constructing a
# SentimentIntensityAnalyzer loads its lexicon from disk, so building a new
# one for every text is wasted work; scoring itself keeps no per-text state.
_vader_analyzer = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


# Function to analyze sentiment with both TextBlob and VADER
def analyze_sentiment(text):
    """Score *text* with TextBlob and VADER.

    Args:
        text: The string to analyze.

    Returns:
        A ``(polarity, vader_scores)`` tuple, where ``polarity`` is
        ``TextBlob(text).sentiment.polarity`` and ``vader_scores`` is the
        value returned by the VADER analyzer's ``polarity_scores(text)``.
    """
    polarity = TextBlob(text).sentiment.polarity
    vader_scores = _get_vader_analyzer().polarity_scores(text)
    return polarity, vader_scores

def categorize_emotion(polarity):
    """Map a polarity score to a coarse emotion label.

    Returns 'positive' when the score is above zero, 'negative' when it is
    below zero, and 'neutral' in every other case.
    """
    return (
        'positive' if polarity > 0
        else 'negative' if polarity < 0
        else 'neutral'
    )

# Batch translation function
def translate_batch(texts, target_lang="hi", source_lang="en"):
    """Translate a batch of texts with the module-level M2M100 model.

    Args:
        texts: An iterable of strings to translate; a single string is
            treated as a one-element batch.
        target_lang: Language code to translate into, as accepted by
            ``tokenizer.get_lang_id``.
        source_lang: Language code of the input texts (assigned to
            ``tokenizer.src_lang``).

    Returns:
        A list with one decoded translation per input text, in input order.
        An empty batch returns ``[]`` without invoking the model.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        # Nothing to tokenize; skip the tokenizer and model entirely.
        return []

    tokenizer.src_lang = source_lang
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on whatever device the model lives on, so the function
    # still works if the model has been moved off the CPU.
    inputs = inputs.to(model.device)

    # Select the target language per call instead of mutating the shared
    # model.config: the setting is then local to this generate() call and
    # cannot leak into, or be overridden by, other generation settings.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id(target_lang),
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Batched translate-and-score pipeline for one text file
def process_dataset(file_path, target_lang="hi", encoding="utf-8"):
    """Translate every non-blank line of a text file and score its sentiment.

    Lines are stripped of surrounding whitespace; lines that are empty after
    stripping are skipped so the translation model is never fed an empty
    string. The remaining lines are translated in chunks of the module-level
    ``batch_size``. Sentiment is computed on the original line, not on the
    translation.

    Args:
        file_path: Path of the input file, one text per line.
        target_lang: Language code forwarded to ``translate_batch``.
        encoding: Text encoding used to read the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of ``(original, translated, emotion, vader_scores)`` tuples in
        file order; ``[]`` when the file has no non-blank lines.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        stripped = (line.strip() for line in file)
        lines = [line for line in stripped if line]

    if not lines:
        return []

    results = []
    for start in range(0, len(lines), batch_size):
        batch_lines = lines[start:start + batch_size]
        translated_texts = translate_batch(batch_lines, target_lang=target_lang)

        for original, translated in zip(batch_lines, translated_texts):
            polarity, vader_scores = analyze_sentiment(original)
            emotion = categorize_emotion(polarity)
            results.append((original, translated, emotion, vader_scores))

    return results

# List of target languages with their codes
target_languages = {
    "Hindi": "hi",
    "Tamil": "ta",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Chinese": "zh"
}


def _write_results(path, results, lang_name):
    """Write one report entry per result tuple to *path*.

    Args:
        path: Output file path; an existing file is overwritten.
        results: Iterable of ``(original, translated, emotion, vader_scores)``
            tuples, as returned by ``process_dataset``.
        lang_name: Human-readable language name shown in each entry.
    """
    # UTF-8 is set explicitly because the translations contain non-ASCII
    # text; a platform-default encoding may be unable to encode them.
    with open(path, 'w', encoding='utf-8') as f:
        for original, translated, emotion, vader_scores in results:
            f.write(f"Original: {original}\n")
            f.write(f"Translated ({lang_name}): {translated}\n")
            f.write(f"Emotion: {emotion}\n")
            f.write(f"VADER Scores: {vader_scores}\n")
            f.write("-" * 50 + "\n")


# Translate the dataset into all specified languages. Each split is written
# out as soon as it is produced, so a failure in a later split does not lose
# work already done. The validation split is saved as well; it was previously
# translated and then discarded.
for lang_name, lang_code in target_languages.items():
    print(f"Translating to {lang_name}...")

    train_results = process_dataset('clean_train.txt', target_lang=lang_code)
    _write_results(f'final_train_results_{lang_code}.txt', train_results, lang_name)

    test_results = process_dataset('clean_test.txt', target_lang=lang_code)
    _write_results(f'final_test_results_{lang_code}.txt', test_results, lang_name)

    val_results = process_dataset('clean_val.txt', target_lang=lang_code)
    _write_results(f'final_val_results_{lang_code}.txt', val_results, lang_name)




Translating to Hindi...




In [None]:
import pickle
# Serialize the in-memory `model` object to the file model5.pickle
# (relative path, opened in binary write mode; an existing file is replaced).
# NOTE(review): Hugging Face models are usually persisted with
# model.save_pretrained(...); confirm that a raw pickle is what is wanted here.
with open('model5.pickle','wb') as f:
    pickle.dump(model,f)