In [None]:
import pandas as pd
import torch
from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
# The model and every tokenized batch are moved to this device.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# One checkpoint provides both the processor and the text-to-text model.
model_name = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(model_name)
tr_model = SeamlessM4Tv2ForTextToText.from_pretrained(model_name)
tr_model = tr_model.to(device)

# Human-readable language name -> code handed to the processor / model as
# `src_lang` and `tgt_lang` by the translation helpers.
lang_code_map = dict(
    chinese='cmn',
    hindi='hin',
    urdu='urd',
    german='deu',
    spanish='spa',
    english='eng',
)

In [None]:
def _translate_seamless(text, source_lang, target_lang):
    """Translate `text` between two languages named in `lang_code_map`.

    Shared implementation behind the two public helpers below, so the
    tokenize -> generate -> decode pipeline lives in exactly one place.

    Args:
        text: A single string or a list of strings (a batch).
        source_lang: Key of `lang_code_map` naming the language of `text`.
        target_lang: Key of `lang_code_map` naming the output language.

    Returns:
        Whatever `processor.batch_decode` returns for the generated ids;
        callers in this notebook index it (`[0]`) or `extend` a list with it.
        An empty list/tuple input yields `[]` without calling the model.

    Raises:
        KeyError: If either language name is not a key of `lang_code_map`.
    """
    # Resolve both names up front so a typo fails before any model work.
    try:
        src_code = lang_code_map[source_lang]
        tgt_code = lang_code_map[target_lang]
    except KeyError as err:
        raise KeyError(
            f"Unsupported language {err.args[0]!r}; "
            f"expected one of {sorted(lang_code_map)}"
        ) from None

    # Nothing to translate: skip the tokenizer and the model entirely.
    if isinstance(text, (list, tuple)) and len(text) == 0:
        return []

    inputs = processor(
        text=text, src_lang=src_code, tgt_lang=tgt_code, return_tensors="pt"
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated = tr_model.generate(**inputs, tgt_lang=tgt_code)

    return processor.batch_decode(generated, skip_special_tokens=True)


def translate_from_english_seamless(text, target_lang):
    """Translate English `text` into `target_lang`.

    Args:
        text: A single English string or a list of English strings.
        target_lang: Key of `lang_code_map` (e.g. "german").

    Returns:
        The decoded translations, one entry per input string.

    Raises:
        KeyError: If `target_lang` is not a key of `lang_code_map`.
    """
    return _translate_seamless(text, 'english', target_lang)


def translate_to_english_seamless(text, source_lang):
    """Translate `text` written in `source_lang` into English.

    Args:
        text: A single string or a list of strings in `source_lang`.
        source_lang: Key of `lang_code_map` (e.g. "german").

    Returns:
        The decoded English translations, one entry per input string.

    Raises:
        KeyError: If `source_lang` is not a key of `lang_code_map`.
    """
    return _translate_seamless(text, source_lang, 'english')

In [None]:
from tqdm import tqdm

# Load the English eMFD word list; missing cells become 0.
# NOTE(review): fillna(0) is applied to every column, including 'word', so a
# word that pandas parsed as NaN would turn into the integer 0 - confirm the
# word list contains no such entries.
emfd_df = pd.read_csv(
    "/shared/2/projects/moral-project/topic_modelling/eMFD_wordlist.csv"
).fillna(0)

# English word -> {column name: value} for every column except 'word'.
emfd_dict_english = {
    row['word']: row.drop('word').to_dict()
    for _, row in emfd_df.iterrows()
}

# One dictionary per target language, keyed by the translated word and
# holding the same values as the English entry. The individual names are
# kept because the cell that pickles the dictionaries refers to them.
emfd_dict_chinese, emfd_dict_german, emfd_dict_hindi, emfd_dict_spanish, emfd_dict_urdu = {}, {}, {}, {}, {}
translated_dicts = {
    "chinese": emfd_dict_chinese,
    "german": emfd_dict_german,
    "hindi": emfd_dict_hindi,
    "spanish": emfd_dict_spanish,
    "urdu": emfd_dict_urdu,
}

for english_word, scores in tqdm(emfd_dict_english.items()):
    for lang, lang_dict in translated_dicts.items():
        translated_word = translate_from_english_seamless(english_word, target_lang=lang)[0]
        # Gotcha: if two English words translate to the same string, the
        # later one overwrites the earlier entry in that language's dict.
        lang_dict[translated_word] = scores


In [None]:
import pickle
from pathlib import Path

# Create the output folder up front so the dump cannot fail with
# FileNotFoundError after the (long) translation run has finished.
output_dir = Path("MoralDicts")
output_dir.mkdir(parents=True, exist_ok=True)

# Language name -> dictionary to persist; files are written as
# MoralDicts/emfd_dict_<language>_seamless.pkl.
dicts_to_save = {
    "english": emfd_dict_english,
    "chinese": emfd_dict_chinese,
    "german": emfd_dict_german,
    "hindi": emfd_dict_hindi,
    "spanish": emfd_dict_spanish,
    "urdu": emfd_dict_urdu,
}

for lang, lang_dict in dicts_to_save.items():
    with open(output_dir / f"emfd_dict_{lang}_seamless.pkl", "wb") as f:
        pickle.dump(lang_dict, f)

### Verifying the translations

In [None]:
import torch
from tqdm import tqdm

# Reload the English eMFD word list so this section can run on its own;
# missing cells become 0.
emfd_df = pd.read_csv(
    "/shared/2/projects/moral-project/topic_modelling/eMFD_wordlist.csv"
).fillna(0)

# English word -> {column name: value} for every column except 'word'.
emfd_dict_english = {
    row['word']: row.drop('word').to_dict()
    for _, row in emfd_df.iterrows()
}

eng_keys = list(emfd_dict_english.keys())
batch_size = 8

# Translate the English keys into each target language, `batch_size` words
# per model call. Results stay in the same order as `eng_keys`.
forward_keys = {}
for lang in ("chinese", "german", "hindi", "spanish", "urdu"):
    translated = []
    for start in tqdm(range(0, len(eng_keys), batch_size), desc=lang):
        batch = eng_keys[start:start + batch_size]
        translated.extend(translate_from_english_seamless(batch, target_lang=lang))
    # Release cached GPU memory before moving on to the next language.
    torch.cuda.empty_cache()
    forward_keys[lang] = translated

# Per-language names used by the back-translation and scoring cells.
chinese_keys = forward_keys["chinese"]
german_keys = forward_keys["german"]
hindi_keys = forward_keys["hindi"]
spanish_keys = forward_keys["spanish"]
urdu_keys = forward_keys["urdu"]

In [None]:
batch_size = 8

# Translated keys produced by the forward pass, by source language.
keys_to_back_translate = {
    "chinese": chinese_keys,
    "german": german_keys,
    "hindi": hindi_keys,
    "spanish": spanish_keys,
    "urdu": urdu_keys,
}

# Translate every language's keys back into English, `batch_size` words per
# model call. Each result list keeps the order of its input list.
back_keys = {}
for lang, lang_keys in keys_to_back_translate.items():
    back_translated = []
    for start in tqdm(range(0, len(lang_keys), batch_size), desc=lang):
        batch = lang_keys[start:start + batch_size]
        back_translated.extend(translate_to_english_seamless(batch, source_lang=lang))
    # Release cached GPU memory before moving on to the next language.
    torch.cuda.empty_cache()
    back_keys[lang] = back_translated

# Per-language names used by the scoring cell.
chinese_keys_back = back_keys["chinese"]
german_keys_back = back_keys["german"]
hindi_keys_back = back_keys["hindi"]
spanish_keys_back = back_keys["spanish"]
urdu_keys_back = back_keys["urdu"]

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# The English originals are the same for every language, so embed them once
# (as one batch) instead of once per word per language.
eng_embeddings = model.encode(eng_keys, convert_to_tensor=True)

# Display name -> (translated keys, back-translated English keys).
round_trips = {
    "Chinese": (chinese_keys, chinese_keys_back),
    "German": (german_keys, german_keys_back),
    "Hindi": (hindi_keys, hindi_keys_back),
    "Spanish": (spanish_keys, spanish_keys_back),
    "Urdu": (urdu_keys, urdu_keys_back),
}

for language, (lang_keys, back_translated) in round_trips.items():
    # The three lists are compared position by position; a length mismatch
    # would otherwise be silently truncated by zip().
    assert len(eng_keys) == len(lang_keys) == len(back_translated), (
        f"{language}: key lists have different lengths"
    )

    back_embeddings = model.encode(back_translated, convert_to_tensor=True)
    # Row-wise cosine similarity between each original word and its
    # round-trip translation.
    similarities = torch.nn.functional.cosine_similarity(
        eng_embeddings, back_embeddings, dim=1
    ).cpu().tolist()

    results = list(zip(eng_keys, lang_keys, back_translated, similarities))
    df = pd.DataFrame(results, columns=[
        'Original English',
        f'{language} Translation',
        'Back-translated English',
        'Cosine Similarity'
    ])

    # Show the five worst round trips first, then the overall average.
    df_sorted = df.sort_values(by='Cosine Similarity')
    print(df_sorted.head(5))
    print(f"{language} average: ", df['Cosine Similarity'].mean())