In [1]:
# !pip install googletrans==4.0.0-rc1
# !python -m spacy download en_core_web_sm
# !python -m spacy download fr_core_news_sm
# !python -m spacy download en_core_web_md

In [2]:
import spacy
import nltk
from googletrans import Translator
from nltk.corpus import wordnet

# Fetch the WordNet corpus that the NLTK similarity lookups rely on
nltk.download("wordnet")

# Load the `_sm` spaCy pipelines: English first, then French
nlp_en, nlp_fr = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "fr_core_news_sm")
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Sentence pair under test: the English source and a candidate French rendering.
en_sentence = 'The cat sat on the mat.'
# A matching French rendering would be: "Le chat s'est assis sur le tapis."

# Mismatched translation used for this run (it means "The dog plays with the bone")
fr_sentence = "Le chien joue avec l'os"

In [4]:
# Run each sentence through the spaCy pipeline of its own language
en_doc, fr_doc = nlp_en(en_sentence), nlp_fr(fr_sentence)

In [5]:
# Cross every English token with every French token and keep the combinations
# whose part-of-speech tags agree. Determiners and punctuation are skipped.
neglect = ["DET", "PUNCT"]

matched_tokens = [
    (en_token.text, fr_token.text)
    for en_token in en_doc
    for fr_token in fr_doc
    if en_token.pos_ == fr_token.pos_ and en_token.pos_ not in neglect
]

# Two index-aligned lists: en_pairs[i] was matched with fr_pairs[i]
en_pairs = [en_text for en_text, _ in matched_tokens]
fr_pairs = [fr_text for _, fr_text in matched_tokens]

In [6]:
def _show_pos_tags(doc):
    """Print one `token - POS` line for every token in `doc`."""
    for token in doc:
        print(f"{token} - {token.pos_}")


# English tags, a blank separator line, then French tags
_show_pos_tags(en_doc)
print()
_show_pos_tags(fr_doc)

The - DET
cat - NOUN
sat - VERB
on - ADP
the - DET
mat - NOUN
. - PUNCT

Le - DET
chien - NOUN
joue - VERB
avec - ADP
l' - DET
os - NOUN


In [7]:
# Show the aligned English / French word lists, one list per line
print(en_pairs, fr_pairs, sep="\n")

['cat', 'cat', 'sat', 'on', 'mat', 'mat']
['chien', 'os', 'joue', 'avec', 'chien', 'os']


In [8]:
# Translate each matched French word back to English using Googletrans.
# Words are sent one at a time, without their sentence, so an ambiguous word
# may come back with an unrelated sense.
translator = Translator(service_urls=['translate.google.com'])

# fr_pairs repeats a word once per English token it was matched with, so
# remember each translation and only hit the network once per distinct word.
translation_cache = {}
for fr_pair in fr_pairs:
    if fr_pair not in translation_cache:
        translation_cache[fr_pair] = translator.translate(fr_pair, src='fr', dest='en').text

# Same length and order as fr_pairs, so it stays index-aligned with en_pairs
en_translations = [translation_cache[fr_word] for fr_word in fr_pairs]

In [9]:
from nltk.corpus import wordnet as wn
import numpy as np

# NOTE(review): `nlp` is not referenced by any of the visible cells — confirm
# whether this medium English model is still needed before keeping the load.
nlp = spacy.load("en_core_web_md")

def word_similarity(word1, word2):
    """Return the best Wu-Palmer similarity between two words.

    Every WordNet synset of `word1` is compared with every synset of `word2`
    and the highest score found is returned.

    Args:
        word1: First word to look up in WordNet.
        word2: Second word to look up in WordNet.

    Returns:
        1.0 when both words are the same non-empty string (ignoring case),
        otherwise the largest `wup_similarity` over all synset combinations,
        or 0.0 when no combination yields a score.
    """
    # Identical words are a perfect match even when WordNet cannot score them:
    # function words such as "with" have no synsets, so the loop below would
    # otherwise report 0.0 for a correct translation.
    if word1 and word1.lower() == word2.lower():
        return 1.0

    # Get synsets for both words using WordNet
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)

    max_sim = 0.0

    # Iterate over all synsets of both words and keep the maximum similarity
    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.wup_similarity(synset2)
            # A None score means the two synsets could not be compared; skip it
            if sim is not None and sim > max_sim:
                max_sim = sim

    return max_sim

In [10]:
# Score every (English word, back-translated word) combination and keep, for
# each lower-cased English word, the best score seen across its candidates.
similarity_scores_dict = {}
for i, en_text in enumerate(en_pairs):
    en_word = en_text.lower()
    en_translation = en_translations[i].lower()

    score = word_similarity(en_word, en_translation)
    if en_word in similarity_scores_dict:
        # Keep the existing entry unless this candidate beats it
        score = max(similarity_scores_dict[en_word], score)
    similarity_scores_dict[en_word] = score

In [11]:
# Print the results: the aligned word lists, the back-translations, the best
# score per English word, and the mean of those per-word scores.
print("English word pairs with the same parts of speech:")
print(en_pairs)
print()
print("French word pairs with the same parts of speech:")
print(fr_pairs)
print()
print("English translations of French word pairs:")
print(en_translations)
print()
print("Semantic similarity scores:")
print(similarity_scores_dict)
print()
print("Combined semantic score:")
# When no tokens shared a part of speech the dict is empty; report 0.0 instead
# of failing with a ZeroDivisionError.
score_values = list(similarity_scores_dict.values())
print(sum(score_values) / len(score_values) if score_values else 0.0)

English word pairs with the same parts of speech:
['cat', 'cat', 'sat', 'on', 'mat', 'mat']

French word pairs with the same parts of speech:
['chien', 'os', 'joue', 'avec', 'chien', 'os']

English translations of French word pairs:
['dog', 'bone', 'cheek', 'with', 'dog', 'bone']

Semantic similarity scores:
{'cat': 0.8571428571428571, 'sat': 0.4, 'on': 0.0, 'mat': 0.6666666666666666}

Combined semantic score:
0.4809523809523809
