In [2]:
import pandas as pd
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score  # For METEOR score
from transformers import pipeline
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Tokenizer and encoder loaded from local checkpoints under model/.
# NOTE: get_token_embedding reads `tokenizer` and `model` as globals at call time,
# and a later cell rebinds `model` to a LoanWordClassifier; rerun this cell before
# calling get_token_embedding again after that cell has executed.
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
model = BertModel.from_pretrained("model/tuned-bert")

# Load German-to-English translation pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en") 

def get_token_embedding(sentence, target_word):
    """
    Get the embedding for a specific word in a sentence.

    The target word is tokenized on its own and then located in the tokenized
    sentence as a *contiguous* run of word pieces. The last-layer hidden states
    of all matched pieces (over every occurrence of the word) are averaged.

    Args:
        sentence: Text passed through the module-level ``tokenizer`` and ``model``.
        target_word: Word to look up in ``sentence``.

    Returns:
        numpy array holding the mean hidden state of the matched word pieces.

    Raises:
        ValueError: If ``target_word`` produces no word pieces or its pieces do
            not occur consecutively in the tokenized sentence.
    """
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.encode(sentence, return_tensors="pt")  # Includes [CLS] and [SEP]

    with torch.no_grad():
        outputs = model(token_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0)  # Shape: [seq_len, hidden_size]

    word_pieces = tokenizer.tokenize(target_word)
    span = len(word_pieces)

    # Match the whole piece sequence, not individual pieces: a membership test
    # would also pick up unrelated tokens that merely share one sub-word.
    matched = set()
    if span:
        for start in range(len(tokens) - span + 1):
            if tokens[start:start + span] == word_pieces:
                matched.update(range(start, start + span))
    indices = sorted(matched)

    if not indices:
        raise ValueError(f"Word '{target_word}' not found in tokenized sentence: {tokens}")

    # `tokens` has no special tokens, so shift by one to step over [CLS].
    embedding = torch.stack([last_hidden_state[i + 1] for i in indices], dim=0).mean(dim=0)
    return embedding.detach().numpy()

def compare_embeddings(embedding1, embedding2):
    """
    Return the cosine similarity of two embedding vectors as a single value.

    Each vector is wrapped as a one-row batch for ``cosine_similarity``; the
    only entry of the resulting 1x1 matrix is returned.
    """
    left_batch = [embedding1]
    right_batch = [embedding2]
    similarity_matrix = cosine_similarity(left_batch, right_batch)
    first_row = similarity_matrix[0]
    return first_row[0]

# Demo: is the German word "Handy" closer in embedding space to "phone"
# (from the machine translation) or to the English word "handy"?
german_sentence = "Ich habe mein Handy verloren."
translated_sentence = translator(german_sentence)[0]['translation_text']
print("Translation:", translated_sentence)

try:
    # Contextual vectors for the German word and the two English candidates.
    embedding_de = get_token_embedding(german_sentence, "Handy")
    embedding_en_mobile = get_token_embedding(translated_sentence, "phone")
    embedding_en_handy = get_token_embedding("I lost my handy.", "handy")

    sim_to_mobile = compare_embeddings(embedding_de, embedding_en_mobile)
    sim_to_handy = compare_embeddings(embedding_de, embedding_en_handy)

    print(f"Similarity with 'phone': {sim_to_mobile:.4f}")
    print(f"Similarity with 'handy': {sim_to_handy:.4f}")

    # The candidate with the strictly higher similarity decides the verdict.
    print(
        "'Handy' likely means 'phone' — false loanword detected."
        if sim_to_mobile > sim_to_handy
        else "'Handy' could mean 'handy' — no issue detected."
    )
except ValueError as e:
    # Raised when a target word cannot be located in its sentence.
    print(f"Error: {e}")

Device set to use cpu


Translation: I lost my phone.
Similarity with 'phone': 0.4556
Similarity with 'handy': 0.6671
'Handy' could mean 'handy' — no issue detected.


In [4]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    """
    Word-level classifier built on a frozen BERT encoder plus extra features.

    The classifier head receives the concatenation of:
      * the BERT last hidden state averaged over the sequence dimension,
      * a learned 64-dim embedding of the phonetic sequence, averaged over it,
      * a flattened unicode feature block (25 values per example),
      * a flattened "other" feature block (1 value per example).
    """

    def __init__(self, num_phonetic_embeddings, num_labels=2):
        """
        Args:
            num_phonetic_embeddings: Number of rows in the phonetic embedding table.
            num_labels: Number of output logits.
        """
        super().__init__()

        # The encoder acts as a fixed feature extractor: gradients are disabled.
        self.bert = BertModel.from_pretrained("model/tuned-bert")
        for frozen_param in self.bert.parameters():
            frozen_param.requires_grad = False

        phonetic_dim = 64
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=phonetic_dim)

        # Widths of the pieces concatenated in forward(), in the same order.
        feature_widths = (
            self.bert.config.hidden_size,  # pooled encoder output
            phonetic_dim,                  # pooled phonetic embedding
            25,                            # unicode features
            1,                             # other features
        )

        hidden_units = 256
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(sum(feature_widths), hidden_units),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_units, num_labels),
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        """Return the classifier logits for a batch of examples."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Plain mean over the sequence dimension (every position is included).
        sentence_vec = encoded.last_hidden_state.mean(dim=1)
        phonetic_vec = self.phonetic_embedder(phonetic_seq).mean(dim=1)

        # Collapse everything after the batch dimension so the parts can be concatenated.
        flat_unicode = unicode_feature.view(unicode_feature.size(0), -1)
        flat_other = other_feature.view(other_feature.size(0), -1)

        features = torch.cat((sentence_vec, phonetic_vec, flat_unicode, flat_other), dim=1)
        return self.classifier(features)

import torch
from transformers import BertTokenizer
import epitran

# Choose the device first so the weights end up on the same device that the
# rest of the notebook uses for its input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer
# 111024 is the number of rows in the phonetic embedding table; presumably the
# value used when the checkpoint was trained — TODO confirm.
model = LoanWordClassifier(111024)
# weights_only=True restricts unpickling to tensors/containers, which is all a
# state dict needs, and avoids executing arbitrary pickled code.
model.load_state_dict(
    torch.load("model/loan_word_model.pth", map_location=device, weights_only=True)
)
model.to(device)  # without this the model stays on CPU while inputs go to `device`
model.eval()
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
epi = epitran.Epitran("fra-Latn")

def normalize(unicode_values):
    """
    Center a sequence of numbers on zero by subtracting their mean.

    Args:
        unicode_values: Sequence of numeric values (code points in this notebook).

    Returns:
        A new list with the mean removed from every value; an empty list for
        empty input (instead of dividing by zero).
    """
    if not unicode_values:
        return []
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    """
    Build the hand-crafted features for one word.

    Args:
        word: The word to featurize.
        max_len: Fixed length of the unicode feature vector (zero-padded or truncated).

    Returns:
        A tuple ``(phonetic_seq, unicode_features, [len(word)])`` where
        ``phonetic_seq`` holds the code points of the transliteration produced
        by the module-level ``epi`` (``[0]`` when that fails or is empty) and
        ``unicode_features`` holds the mean-centered code points of ``word``,
        padded/truncated to ``max_len``.
    """
    try:
        phonetic_seq = [ord(c) for c in epi.transliterate(word)]
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = []

    # Never hand back an empty sequence: averaging zero embeddings downstream
    # would produce NaN. Index 0 is the same placeholder used on failure.
    if not phonetic_seq:
        phonetic_seq = [0]

    code_points = [ord(c) for c in word]
    # Guard the empty word here so the mean is never taken over zero values.
    unicode_features = normalize(code_points) if code_points else []

    # Truncate if longer, then zero-pad up to max_len.
    unicode_features = unicode_features[:max_len]
    unicode_features = unicode_features + [0] * (max_len - len(unicode_features))

    return phonetic_seq, unicode_features, [len(word)]

  model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))


In [5]:
import torch
from transformers import BertTokenizer
import epitran

# Nothing is loaded for the model here: `model` is whatever an earlier cell
# bound to that name, and it is only (re)switched to inference mode.
model.eval()
# NOTE(review): this rebinds `tokenizer` from the "model/tuned-bert-tokenizer" used
# earlier to the stock multilingual one — confirm its vocabulary matches the
# encoder inside `model`.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Rebinds `epi` to the "deu-Latn" mapping (the previous cell used "fra-Latn");
# extract_features reads this global at call time.
epi = epitran.Epitran("deu-Latn")

def normalize(unicode_values):
    """
    Center a sequence of numbers on zero by subtracting their mean.

    A function with this name is also defined in an earlier cell; this later
    definition is the one in effect from this cell onward.

    Args:
        unicode_values: Sequence of numeric values (code points in this notebook).

    Returns:
        A new list with the mean removed from every value; an empty list for
        empty input (instead of dividing by zero).
    """
    if not unicode_values:
        return []
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    """
    Build the hand-crafted features for one word.

    A function with this name is also defined in an earlier cell; this later
    definition is the one in effect from this cell onward.

    Args:
        word: The word to featurize.
        max_len: Fixed length of the unicode feature vector (zero-padded or truncated).

    Returns:
        A tuple ``(phonetic_seq, unicode_features, [len(word)])`` where
        ``phonetic_seq`` holds the code points of the transliteration produced
        by the module-level ``epi`` (``[0]`` when that fails or is empty) and
        ``unicode_features`` holds the mean-centered code points of ``word``,
        padded/truncated to ``max_len``.
    """
    try:
        phonetic_seq = [ord(c) for c in epi.transliterate(word)]
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = []

    # Never hand back an empty sequence: averaging zero embeddings downstream
    # would produce NaN. Index 0 is the same placeholder used on failure.
    if not phonetic_seq:
        phonetic_seq = [0]

    code_points = [ord(c) for c in word]
    # Guard the empty word here so the mean is never taken over zero values.
    unicode_features = normalize(code_points) if code_points else []

    # Truncate if longer, then zero-pad up to max_len.
    unicode_features = unicode_features[:max_len]
    unicode_features = unicode_features + [0] * (max_len - len(unicode_features))

    return phonetic_seq, unicode_features, [len(word)]



# Classify each whitespace-separated word of the sentence on its own and
# collect the ones whose predicted class is index 1.
sentence = "The Christmas party radiated joyfulness"
words = sentence.split()

# Send inputs to wherever the model's weights actually live; relying on a
# separately defined device breaks as soon as the two disagree (e.g. a model
# loaded on CPU while CUDA is available).
model_device = next(model.parameters()).device

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(model_device),
            attention_mask=inputs["attention_mask"].to(model_device),
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(model_device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(model_device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(model_device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)

        # One example per forward pass, so the row-wise argmax has a single entry.
        if probs.argmax(dim=1).item() == 1:
            false_loans.append(word)

print("False loan words:", false_loans)

The tensor([[0.7009, 0.2991]])
Christmas tensor([[0.7576, 0.2424]])
party tensor([[0.5908, 0.4092]])
radiated tensor([[0.5134, 0.4866]])
joyfulness tensor([[0.6997, 0.3003]])
False loan words: []
