In [21]:
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Load multilingual BERT tokenizer and model
# Both are read from local checkpoint directories under model/.
# NOTE: get_token_embedding reads the module-level names `tokenizer` and `model`
# at call time, and later cells rebind `model`, `tokenizer` and `translator`.
# Re-run this cell before calling get_token_embedding again after those cells.

tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
model = BertModel.from_pretrained("model/tuned-bert")

# Load German-to-English translation pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en") 

def get_token_embedding(sentence, target_word):
    """
    Get the contextual embedding for a specific word in a sentence.

    The target word is split into word pieces and located in the tokenized
    sentence as a *contiguous* run of tokens; the last-layer hidden states of
    that run are averaged into one vector. Only the first occurrence is used.

    Uses the module-level `tokenizer` and `model`.

    Args:
        sentence: The sentence that provides the context.
        target_word: The word whose embedding is wanted.

    Returns:
        A 1-D numpy array holding the mean hidden state of the word's pieces.

    Raises:
        ValueError: If the word produces no word pieces, or its pieces do not
            occur as a contiguous run in the tokenized sentence.
    """
    tokens = tokenizer.tokenize(sentence)
    word_pieces = tokenizer.tokenize(target_word)
    span = len(word_pieces)

    # Look for the word's pieces as one contiguous run. Testing each sentence
    # token for membership in `word_pieces` would also pick up unrelated
    # positions that merely share a piece with the target word.
    start = None
    if span:
        for i in range(len(tokens) - span + 1):
            if tokens[i:i + span] == word_pieces:
                start = i
                break

    # Fail before running the model: no point paying for a forward pass
    # when the word is not in the sentence.
    if start is None:
        raise ValueError(f"Word '{target_word}' not found in tokenized sentence: {tokens}")

    token_ids = tokenizer.encode(sentence, return_tensors="pt")  # Includes [CLS] and [SEP]

    with torch.no_grad():
        outputs = model(token_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0)  # Shape: [seq_len, hidden_size]

    # `tokens` has no special tokens, while the encoded ids start with [CLS],
    # so positions in the hidden states are shifted by one.
    first = start + 1
    embedding = last_hidden_state[first:first + span].mean(dim=0)
    return embedding.detach().numpy()

def compare_embeddings(embedding1, embedding2):
    """
    Return the cosine similarity between two embedding vectors.

    Args:
        embedding1: First embedding vector.
        embedding2: Second embedding vector.

    Returns:
        The single similarity value for the pair.
    """
    # cosine_similarity works on 2-D inputs, so each vector is wrapped as a
    # one-row batch and the only entry of the resulting matrix is returned.
    pairwise = cosine_similarity([embedding1], [embedding2])
    return pairwise[0][0]

# Demo: decide whether the German word "Handy" is closer in embedding space to
# "phone" (taken from the machine translation) or to the English look-alike "handy".
german_sentence = "Ich habe mein Handy verloren."
translated_sentence = translator(german_sentence)[0]['translation_text']
print("Translation:", translated_sentence)

try:
    # Embedding of the German word in its own sentence.
    embedding_de = get_token_embedding(german_sentence, "Handy")
    # Embedding of "phone" inside the machine translation. The word is hard-coded:
    # if the translation does not contain it, get_token_embedding raises
    # ValueError, which is reported by the except clause below.
    embedding_en_mobile = get_token_embedding(translated_sentence, "phone")
    # Embedding of the literal look-alike in a hand-written English sentence.
    embedding_en_handy = get_token_embedding("I lost my handy.", "handy")

    
    sim_to_mobile = compare_embeddings(embedding_de, embedding_en_mobile)
    sim_to_handy = compare_embeddings(embedding_de, embedding_en_handy)

    print(f"Similarity with 'phone': {sim_to_mobile:.4f}")
    print(f"Similarity with 'handy': {sim_to_handy:.4f}")

    # Higher similarity to "phone" is treated as evidence of a false loanword.
    if sim_to_mobile > sim_to_handy:
        print("'Handy' likely means 'phone' — false loanword detected.")
    else:
        print("'Handy' could mean 'handy' — no issue detected.")
except ValueError as e:
    print(f"Error: {e}")

Device set to use cpu


Translation: I lost my phone.
Similarity with 'phone': 0.4556
Similarity with 'handy': 0.6671
'Handy' could mean 'handy' — no issue detected.


In [75]:
import pandas as pd
from tqdm import tqdm
from langchain_groq import ChatGroq

from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

from dotenv import load_dotenv
from sacrebleu import sentence_bleu
from nltk.translate.meteor_score import meteor_score  # For METEOR score
from transformers import pipeline

import nltk
nltk.download('wordnet')

load_dotenv()

# model = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

# llm = HuggingFaceEndpoint(
#     repo_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     max_new_tokens=512,
#     do_sample=False,
#     repetition_penalty=1.03,
# )

# model = ChatHuggingFace(llm=llm)


# Chat LLM used to generate German context sentences and English translations.
# NOTE: this rebinds the module-level name `model`, which the first cell bound to
# a BertModel; get_token_embedding will not work after this cell has run.
# NOTE(review): credentials are presumably picked up from the environment
# populated by load_dotenv() above — confirm.
model = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)

# Load the training split and keep only the first 10 rows for a quick run.
df = pd.read_csv('data/production_train_test/English-German/balanced/English-German-train_production_balanced.csv')
df = df.head(10)  

def generate_gpt_context(word):
    """
    Ask the chat model for a short German sentence that uses `word`.

    Uses the module-level `model`.

    Args:
        word: The word to place in context.

    Returns:
        The `content` of the model's reply.
    """
    request = f"Write a natural German very small sentence using the word '{word}' in context."
    return model.invoke(request).content

# German-to-English machine translation pipeline, used below to translate both
# the generated sentences and the individual words.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en")

def calculate_bleu(reference, hypothesis):
    """
    Sentence-level BLEU of `hypothesis` against a single `reference`.

    Args:
        reference: The reference sentence.
        hypothesis: The candidate sentence being scored.

    Returns:
        The sacrebleu score divided by 100.
    """
    result = sentence_bleu(hypothesis, [reference])
    # Normalize to [0, 1]
    return result.score / 100

def calculate_meteor(reference, hypothesis):
    """
    METEOR score of `hypothesis` against a single `reference`.

    Both sentences are split on whitespace before scoring.

    Args:
        reference: The reference sentence.
        hypothesis: The candidate sentence being scored.

    Returns:
        The value returned by nltk's meteor_score.
    """
    # meteor_score takes a list of tokenized references and one tokenized hypothesis.
    references = [reference.split()]
    candidate = hypothesis.split()
    return meteor_score(references, candidate)

def generate_reference_sentence(sentence):
    """
    Ask the chat model to translate a German sentence into English.

    Uses the module-level `model`.

    Args:
        sentence: The German sentence to translate.

    Returns:
        The `content` of the model's reply.
    """
    request = f"Translate this German sentence: '{sentence}' to English. Write only the translated sentence, nothing else."
    return model.invoke(request).content

# Step 1: generate one German context sentence per row (one LLM call per row).
# NOTE(review): there is no retry or backoff here. An API error (the recorded
# run hit a 429 rate limit) aborts the cell, and the sentences collected so far
# are never written to `df` because the column is assigned only after the loop.
context_sentences = []
print("Generating context sentences...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    loan_word = row['original_word']
    context = generate_gpt_context(loan_word)
    context_sentences.append(context)

df['generated_context'] = context_sentences

# Step 2: translate each generated sentence twice and compare the two results.
# NOTE: `translated_sentence` (and `german_sentence` inside the loop) reuse names
# that the first cell bound to plain strings.
bleu_scores = []
meteor_scores = []
translated_sentence = []
ref = []
ref_word = []

print("Translating and evaluating sentences...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    german_sentence = row['generated_context']
    loan_word = row['original_word']
    
    # LLM translation. Despite the helper's name, this is the text that is
    # scored as the hypothesis below.
    english_translation = generate_reference_sentence(german_sentence)
    translated_sentence.append(english_translation)

    # Machine translation from the pipeline; used as the reference below.
    reference_sentence = translator(german_sentence)[0]['translation_text']
    ref.append(reference_sentence)

    # Machine translation of the word on its own, without sentence context.
    reference_word = translator(loan_word)[0]['translation_text']
    ref_word.append(reference_word)
    

    # Both metrics take (reference, hypothesis) in that order.
    bleu_score = calculate_bleu(reference_sentence, english_translation)
    bleu_scores.append(bleu_score)
    
    meteor_score_value = calculate_meteor(reference_sentence, english_translation)
    meteor_scores.append(meteor_score_value)
    
    # print(f"Loanword: {loan_word}")
    # print(f"German Sentence: {german_sentence}")
    # print(f"Translated Sentence: {english_translation}")
    # print(f"Reference Sentence: {reference_sentence}")
    # print(f"BLEU Score: {bleu_score:.4f}, METEOR Score: {meteor_score_value:.4f}")


# Attach the per-row results as new columns.
df['reference_sentence'] = ref
df["translated_sentence"] = translated_sentence
df['bleu_score'] = bleu_scores
df['meteor_score'] = meteor_scores
df['reference_word'] = ref_word

# 'loan_word' and 'label' are not created in this cell; they are expected to
# come from the CSV that was loaded into `df`.
important_columns = ['loan_word', 'original_word', 'generated_context', 'reference_sentence', 'translated_sentence',"reference_word",'label' ,'bleu_score', 'meteor_score']
df_important = df[important_columns]

# Persist the selected columns without the index.
output_path = 'loanwords_with_context_and_scores.csv'
df_important.to_csv(output_path, index=False)
print(f"Saved with context and scores to: {output_path}")

[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Device set to use cpu


Generating context sentences...


 50%|█████     | 5/10 [00:02<00:02,  2.12it/s]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.3-70b-versatile` in organization `org_01jn1j78k0ex7sr2vk46c8jfse` service tier `on_demand` on requests per day (RPD): Limit 1000, Used 1000, Requested 1. Please try again in 1m26.196s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'requests', 'code': 'rate_limit_exceeded'}}

In [78]:
# Display the selected columns with the generated sentences and their scores.
df_important

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score
0,Mirth,Fröhlichkeit,Die Weihnachtsfeier strahlte vor Fröhlichkeit.,The Christmas party shone with joy.,The Christmas party radiated joyfulness.,Cheerfulness,synonym,0.274825,0.499058
1,Schnorr,Chromatogramm,Das Chromatogramm zeigt die verschiedenen Best...,The chromatogram shows the various components ...,The chromatogram shows the various components ...,Chromatogram,random,1.0,0.999314
2,Zettelkasten,Zettelkasten,Ich habe endlich meinen Zettelkasten aufgeräumt.,I finally cleaned up my paperbox.,I have finally tidied up my slip box.,Paper box,loan,0.119901,0.509073
3,Meiring,Meiring,Meiring ist ein bekannter deutscher Familienname.,Meiring is a well-known German surname.,Meiring is a well-known German surname.,Meiring,loan,1.0,0.997685
4,Speth,Speth,Herr Speth ist unser neuer Nachbar.,Mr. Speth is our new neighbor.,Mr. Speth is our new neighbor.,Speth,loan,1.0,0.997685
5,Important,Gefeiert,Das Jubiläum wurde gestern Abend groß gefeiert.,The anniversary was celebrated very much last ...,The anniversary was celebrated in a big way la...,Celebrated,synonym,0.339325,0.718157
6,Breisgau,Ennen,"Ich war schon ennen hier, als das Restaurant n...",I was already here when the restaurant didn't ...,I was already here before the restaurant existed.,Eureka,random,0.295387,0.600907
7,Abstruse,Unklar,Die Bedingungen des Vertrags sind mir unklar.,The terms of the contract are unclear to me.,The conditions of the contract are unclear to me.,Unclear,synonym,0.782542,0.881944
8,Uncommon,Fantastisch,"Das Konzert war fantastisch, ich habe mich seh...","The concert was fantastic, I had a very good c...","The concert was fantastic, I had a really grea...",Fantastic,synonym,0.631555,0.69898
9,Wemhoff,Gut,Das ist gut.,That's good.,That is good.,Good,random,0.319472,0.238095


In [83]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    """
    Classifier over a concatenation of four feature groups for one word:
    mean-pooled BERT hidden states, a mean-pooled phonetic embedding, a
    fixed-width unicode feature vector, and a small vector of other features.

    The BERT encoder is frozen; only the phonetic embedding table and the
    two-layer classifier head have trainable parameters.

    Args:
        num_phonetic_embeddings: Number of rows in the phonetic embedding table.
        num_labels: Number of output classes.
        phonetic_dim: Width of each phonetic embedding vector.
        unicode_size: Width of the flattened unicode feature vector per example.
        other_size: Width of the flattened extra feature vector per example.
        bert_path: Checkpoint passed to BertModel.from_pretrained.

    The defaults reproduce the original hard-coded architecture, so checkpoints
    saved from it load unchanged.
    """

    def __init__(self, num_phonetic_embeddings, num_labels=2, phonetic_dim=64,
                 unicode_size=25, other_size=1, bert_path="model/tuned-bert"):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_path)
        # Freeze the encoder so it receives no gradient updates.
        for param in self.bert.parameters():
            param.requires_grad = False

        # `phonetic_dim` sizes both the embedding table and the classifier
        # input, so the two cannot drift apart.
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=phonetic_dim)

        bert_hidden_size = self.bert.config.hidden_size
        total_input_size = bert_hidden_size + phonetic_dim + unicode_size + other_size

        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(total_input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        """
        Compute class logits for a batch.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            phonetic_seq: Integer indices into the phonetic embedding table;
                the embeddings are averaged over dim 1.
            unicode_feature: Tensor flattened to (batch, -1); its flattened
                width must equal `unicode_size`.
            other_feature: Tensor flattened to (batch, -1); its flattened
                width must equal `other_size`.

        Returns:
            Logits produced by the classifier head, one row per example.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state
        # Plain mean over the sequence axis. The attention mask is not applied
        # here, so padded positions (if any) contribute to the average.
        pooled_output = last_hidden_states.mean(dim=1)
        phonetic_emb = self.phonetic_embedder(phonetic_seq).mean(dim=1)

        # Flatten the hand-crafted features to one row per example.
        unicode_feature = unicode_feature.view(unicode_feature.size(0), -1)
        other_feature = other_feature.view(other_feature.size(0), -1)

        combined = torch.cat([
            pooled_output,
            phonetic_emb,
            unicode_feature,
            other_feature
        ], dim=1)

        logits = self.classifier(combined)
        return logits

import torch
from transformers import BertTokenizer
import epitran

# Pick the inference device first so the model can be placed on it; the
# inference code moves its input tensors to this same `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
# 111024 is the size of the phonetic embedding table; it has to match the
# checkpoint loaded below.
model = LoanWordClassifier(111024)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict needs, and avoids executing arbitrary pickled code.
model.load_state_dict(
    torch.load("model/loan_word_model.pth", map_location=torch.device('cpu'), weights_only=True)
)
# Without this the model stays on the CPU while inputs go to `device`, which
# fails with a device mismatch whenever CUDA is available.
model.to(device)
model.eval()
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
# NOTE(review): this is the French transliterator, while the data path and the
# following cell point to German ("deu-Latn") — confirm which one is intended.
epi = epitran.Epitran("fra-Latn")

def normalize(unicode_values):
    """
    Center a list of numbers by subtracting their mean.

    Args:
        unicode_values: List of numeric values (here, character code points).

    Returns:
        A new list with the mean subtracted from every value. An empty input
        returns an empty list instead of raising ZeroDivisionError.
    """
    if not unicode_values:
        return []
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    """
    Build the hand-crafted input features for one word.

    Uses the module-level `epi` transliterator and `normalize`.

    Args:
        word: The word to featurize.
        max_len: Fixed length of the unicode feature vector.

    Returns:
        A tuple (phonetic_seq, unicode_features, other):
        - phonetic_seq: code points of the transliteration, never empty
          ([0] when transliteration fails or yields nothing).
        - unicode_features: mean-centered code points of `word`, zero-padded
          or truncated to exactly `max_len` entries.
        - other: a one-element list holding len(word).
    """
    try:
        phonetic_seq = [ord(c) for c in epi.transliterate(word)]
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = []

    # The classifier averages the phonetic embeddings over the sequence axis;
    # a zero-length sequence would turn that average into NaN, so fall back
    # to a single placeholder index.
    if not phonetic_seq:
        phonetic_seq = [0]

    code_points = [ord(c) for c in word]
    # Guard the empty word here so this function does not depend on how
    # normalize treats an empty list.
    unicode_features = normalize(code_points) if code_points else []

    # Zero-pad, then cut to exactly max_len entries (pads short, truncates long).
    unicode_features = (unicode_features + [0] * max_len)[:max_len]

    return phonetic_seq, unicode_features, [len(word)]

  model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))


In [85]:
import torch
from transformers import BertTokenizer
import epitran

# Load model and tokenizer
# NOTE: the model is not loaded here; `model` must already hold the
# LoanWordClassifier created in the previous cell.
model.eval()
# NOTE(review): this replaces the tuned tokenizer ("model/tuned-bert-tokenizer")
# with the stock multilingual one — confirm that both share the vocabulary the
# classifier's BERT expects.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Rebinds `epi` to the German transliterator (the previous cell used "fra-Latn").
epi = epitran.Epitran("deu-Latn")

def normalize(unicode_values):
    """
    Center a list of numbers by subtracting their mean.

    Args:
        unicode_values: List of numeric values (here, character code points).

    Returns:
        A new list with the mean subtracted from every value. An empty input
        returns an empty list instead of raising ZeroDivisionError.
    """
    if not unicode_values:
        return []
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    """
    Build the hand-crafted input features for one word.

    Uses the module-level `epi` transliterator and `normalize`.

    Args:
        word: The word to featurize.
        max_len: Fixed length of the unicode feature vector.

    Returns:
        A tuple (phonetic_seq, unicode_features, other):
        - phonetic_seq: code points of the transliteration, never empty
          ([0] when transliteration fails or yields nothing).
        - unicode_features: mean-centered code points of `word`, zero-padded
          or truncated to exactly `max_len` entries.
        - other: a one-element list holding len(word).
    """
    try:
        phonetic_seq = [ord(c) for c in epi.transliterate(word)]
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = []

    # The classifier averages the phonetic embeddings over the sequence axis;
    # a zero-length sequence would turn that average into NaN, so fall back
    # to a single placeholder index.
    if not phonetic_seq:
        phonetic_seq = [0]

    code_points = [ord(c) for c in word]
    # Guard the empty word here so this function does not depend on how
    # normalize treats an empty list.
    unicode_features = normalize(code_points) if code_points else []

    # Zero-pad, then cut to exactly max_len entries (pads short, truncates long).
    unicode_features = (unicode_features + [0] * max_len)[:max_len]

    return phonetic_seq, unicode_features, [len(word)]



# Classify every whitespace-separated word of the sentence on its own and
# collect the words whose predicted class is 1.
sentence = "The Christmas party radiated joyfulness"
words = sentence.split()

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    # NOTE(review): `device` comes from the previous cell. All inputs are moved
    # to it, so the model has to live on the same device.
    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            # Each feature is wrapped in a list to add a batch dimension of 1.
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)
        
        # argmax runs over the flattened tensor, which equals the class index
        # only because the batch holds a single example.
        # NOTE(review): class 1 is treated as "false loan word" — confirm
        # against the training labels.
        if torch.argmax(probs) == 1:  
            false_loans.append(word)

print("False loan words:", false_loans)

The tensor([[0.7009, 0.2991]])
Christmas tensor([[0.7576, 0.2424]])
party tensor([[0.5908, 0.4092]])
radiated tensor([[0.5134, 0.4866]])
joyfulness tensor([[0.6997, 0.3003]])
False loan words: []
