In [123]:
import pandas as pd
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score  # For METEOR score
from transformers import pipeline
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [124]:
df = pd.read_csv("score.csv")

In [125]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    def __init__(self, num_phonetic_embeddings, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("model/tuned-bert")
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=64)
        
        bert_hidden_size = self.bert.config.hidden_size 
        phonetic_size = 64
        unicode_size = 25 
        other_size = 1
        
        total_input_size = bert_hidden_size + phonetic_size + unicode_size + other_size
        
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(total_input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state 
        pooled_output = last_hidden_states.mean(dim=1) 
        phonetic_emb = self.phonetic_embedder(phonetic_seq).mean(dim=1) 

    
        unicode_feature = unicode_feature.view(unicode_feature.size(0), -1)  
        other_feature = other_feature.view(other_feature.size(0), -1)      

    
        combined = torch.cat([
            pooled_output,       
            phonetic_emb,        
            unicode_feature,     
            other_feature        
        ], dim=1)               
        
        logits = self.classifier(combined)
        return logits

In [126]:
import torch
from transformers import BertTokenizer
import epitran

# Load model and tokenizer
model = LoanWordClassifier(111024)
model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))
model.eval()
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
epi = epitran.Epitran("deu-Latn")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    try:
        loan_epitran = epi.transliterate(word)
        phonetic_seq = [ord(c) for c in loan_epitran] 
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = [0] 

    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)

    if len(unicode_features) < max_len:
        unicode_features = unicode_features + [0] * (max_len - len(unicode_features))
    else:
        unicode_features = unicode_features[:max_len]  # Truncate if longer

    return phonetic_seq, unicode_features, [len(word)]



sentence = "The government governed a new abordage policy."
words = sentence.split()

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)
        
        if torch.argmax(probs) == 1:  
            false_loans.append(word)

print("False loan words:", false_loans)

  model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))


The tensor([[0.7009, 0.2991]])
government tensor([[0.5804, 0.4196]])
governed tensor([[0.7286, 0.2714]])
a tensor([[0.7255, 0.2745]])
new tensor([[0.4282, 0.5718]])
abordage tensor([[0.5413, 0.4587]])
policy. tensor([[0.7183, 0.2817]])
False loan words: ['new']


In [127]:
new_df = df.head(10)
def false_loan(sentence):
    words = sentence.split()
    false_loans = ''
    for word in words:
        phonetic_seq, unicode_feature, other_feature = extract_features(word)
        inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            logits = model(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
                unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
                other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
            )
            probs = torch.softmax(logits, dim=1)
            
            if torch.argmax(probs) == 1:  
                false_loans += "," + word
    return false_loans
    
false_loanwords = []
for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence']
    result = false_loan(translated_sentence)
    false_loanwords.append(result)
new_df['false_loanword_model'] = false_loanwords
new_df

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:03<00:00,  3.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['false_loanword_model'] = false_loanwords


Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword_model
0,Mirth,Fröhlichkeit,"Die Frau lächelte plötzlich und sagte: ""Ich fü...","The woman smiled suddenly and said, ""I feel ve...","The woman smiled suddenly and said: ""I feel ve...",Cheerfulness,synonym,0.80032,0.905455,",woman"
1,Schnorr,Chromatogramm,Der Chemiker studierte die Chromatogramme der ...,The chemist studied the chromatograms of the v...,The chemist studied the chromatograms of diffe...,Chromatogram,random,0.582823,0.77156,",chromatograms,substances"
2,Zettelkasten,Zettelkasten,"Ich habe meine Zettelkasten vollgefüllt, um al...",I filled up my paperbox to write down all the ...,I have filled my notebook with all ideas and t...,Paper box,loan,0.431177,0.627753,",notebook,ideas"
3,Meiring,Meiring,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,The little dog ran through the woods in search...,Meiring,loan,0.423118,0.72196,",little,dog,ran,dog,Meiring."
4,Speth,Speth,"Der Speth fuhr durch die Felder, um frische Ge...",The Speth drove through the fields to buy fres...,The farmer drove through the fields to buy fre...,Speth,loan,0.630807,0.750388,",drove"
5,Important,Gefeiert,"""Mein Vater feierte seinen Geburtstag mit eine...","""My father celebrated his birthday with a spec...","""My father celebrated his birthday with a spec...",Celebrated,synonym,0.769161,0.888021,
6,Breisgau,Ennen,"Ich gehe Ennen zum Café, um ein Kaffee zu trin...",I'm going to the café to have a coffee.,I go to the café in front of me to drink a cof...,Eureka,random,0.184769,0.715371,",me"
7,Abstruse,Unklar,"Der Lehrer sagte, dass das Buch sehr unklar wa...",Teacher said that the book was very unclear an...,The teacher said that the book was very unclea...,Unclear,synonym,0.861174,0.992727,",teacher,book"
8,Uncommon,Fantastisch,"Der Film war fantastisch, er erzählte eine Ges...","The movie was fantastic, he told a story that ...","The film was fantastic, it told a story that m...",Fantastic,synonym,0.686507,0.850446,",it,told,story,me"
9,Wemhoff,Gut,"""Das Essen war sehr gut.""","""The food was very good.""","""the food was very good.""",Good,random,0.707107,0.996,


In [128]:
from langchain_ollama import OllamaLLM
from tqdm import tqdm

llm = OllamaLLM(model="translator")

new_df = df.head(10)

def false_loanword(sentence):
    prompt = f"""
        You are an expert linguistic model. 
        Given a sentence, identify if there is a **false loanword** (a word that seems borrowed but is wrongly used or misinterpreted).
        - If a false loanword is present, output only the false loanword (one word, no explanation).
        - If no false loanword is found, output exactly: no
        Sentence: {sentence}
        Output Response should must contain one single word only
        Output Format: Output_word
        Do don't include any other thing just give one single word output!
    """
    answer = llm.invoke(prompt)
    return answer.strip()

sentence = "The government governed a new abordage policy."
result = false_loanword(sentence)

false_loanwords = []
for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence']
    result = false_loanword(translated_sentence)
    false_loanwords.append(result)
new_df['false_loanword'] = false_loanwords
new_df

100%|██████████| 10/10 [00:06<00:00,  1.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['false_loanword'] = false_loanwords


Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword
0,Mirth,Fröhlichkeit,"Die Frau lächelte plötzlich und sagte: ""Ich fü...","The woman smiled suddenly and said, ""I feel ve...","The woman smiled suddenly and said: ""I feel ve...",Cheerfulness,synonym,0.80032,0.905455,false
1,Schnorr,Chromatogramm,Der Chemiker studierte die Chromatogramme der ...,The chemist studied the chromatograms of the v...,The chemist studied the chromatograms of diffe...,Chromatogram,random,0.582823,0.77156,false
2,Zettelkasten,Zettelkasten,"Ich habe meine Zettelkasten vollgefüllt, um al...",I filled up my paperbox to write down all the ...,I have filled my notebook with all ideas and t...,Paper box,loan,0.431177,0.627753,false
3,Meiring,Meiring,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,The little dog ran through the woods in search...,Meiring,loan,0.423118,0.72196,Meiring
4,Speth,Speth,"Der Speth fuhr durch die Felder, um frische Ge...",The Speth drove through the fields to buy fres...,The farmer drove through the fields to buy fre...,Speth,loan,0.630807,0.750388,false
5,Important,Gefeiert,"""Mein Vater feierte seinen Geburtstag mit eine...","""My father celebrated his birthday with a spec...","""My father celebrated his birthday with a spec...",Celebrated,synonym,0.769161,0.888021,false
6,Breisgau,Ennen,"Ich gehe Ennen zum Café, um ein Kaffee zu trin...",I'm going to the café to have a coffee.,I go to the café in front of me to drink a cof...,Eureka,random,0.184769,0.715371,false
7,Abstruse,Unklar,"Der Lehrer sagte, dass das Buch sehr unklar wa...",Teacher said that the book was very unclear an...,The teacher said that the book was very unclea...,Unclear,synonym,0.861174,0.992727,false
8,Uncommon,Fantastisch,"Der Film war fantastisch, er erzählte eine Ges...","The movie was fantastic, he told a story that ...","The film was fantastic, it told a story that m...",Fantastic,synonym,0.686507,0.850446,false
9,Wemhoff,Gut,"""Das Essen war sehr gut.""","""The food was very good.""","""the food was very good.""",Good,random,0.707107,0.996,false


In [130]:
new_df

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword
0,Mirth,Fröhlichkeit,"Die Frau lächelte plötzlich und sagte: ""Ich fü...","The woman smiled suddenly and said, ""I feel ve...","The woman smiled suddenly and said: ""I feel ve...",Cheerfulness,synonym,0.80032,0.905455,false
1,Schnorr,Chromatogramm,Der Chemiker studierte die Chromatogramme der ...,The chemist studied the chromatograms of the v...,The chemist studied the chromatograms of diffe...,Chromatogram,random,0.582823,0.77156,false
2,Zettelkasten,Zettelkasten,"Ich habe meine Zettelkasten vollgefüllt, um al...",I filled up my paperbox to write down all the ...,I have filled my notebook with all ideas and t...,Paper box,loan,0.431177,0.627753,false
3,Meiring,Meiring,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,The little dog ran through the woods in search...,Meiring,loan,0.423118,0.72196,Meiring
4,Speth,Speth,"Der Speth fuhr durch die Felder, um frische Ge...",The Speth drove through the fields to buy fres...,The farmer drove through the fields to buy fre...,Speth,loan,0.630807,0.750388,false
5,Important,Gefeiert,"""Mein Vater feierte seinen Geburtstag mit eine...","""My father celebrated his birthday with a spec...","""My father celebrated his birthday with a spec...",Celebrated,synonym,0.769161,0.888021,false
6,Breisgau,Ennen,"Ich gehe Ennen zum Café, um ein Kaffee zu trin...",I'm going to the café to have a coffee.,I go to the café in front of me to drink a cof...,Eureka,random,0.184769,0.715371,false
7,Abstruse,Unklar,"Der Lehrer sagte, dass das Buch sehr unklar wa...",Teacher said that the book was very unclear an...,The teacher said that the book was very unclea...,Unclear,synonym,0.861174,0.992727,false
8,Uncommon,Fantastisch,"Der Film war fantastisch, er erzählte eine Ges...","The movie was fantastic, he told a story that ...","The film was fantastic, it told a story that m...",Fantastic,synonym,0.686507,0.850446,false
9,Wemhoff,Gut,"""Das Essen war sehr gut.""","""The food was very good.""","""the food was very good.""",Good,random,0.707107,0.996,false


In [131]:
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
model = BertModel.from_pretrained("model/tuned-bert")

# Load German-to-English translation pipeline
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-de-en") 

def get_token_embedding(sentence, target_word):
    """
    Get the embedding for a specific word in a sentence.
    Handles subword tokenization by averaging embeddings of subword tokens.
    """
    
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.encode(sentence, return_tensors="pt")  # Includes [CLS] and [SEP]
    
    with torch.no_grad():
        outputs = model(token_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0)  # Shape: [seq_len, hidden_size]

    word_pieces = tokenizer.tokenize(target_word)
    indices = [i for i, tok in enumerate(tokens) if tok in word_pieces]

    if not indices:
        raise ValueError(f"Word '{target_word}' not found in tokenized sentence: {tokens}")

    embedding = torch.stack([last_hidden_state[i + 1] for i in indices], dim=0).mean(dim=0)
    return embedding.detach().numpy()

def compare_embeddings(embedding1, embedding2):
    """
    Compute cosine similarity between two embeddings.
    """
    return cosine_similarity([embedding1], [embedding2])[0][0]

german_sentence = "Ich habe mein Handy verloren."
translated_sentence = translator(german_sentence)[0]['translation_text']
print("Translation:", translated_sentence)

try:
    embedding_de = get_token_embedding(german_sentence, "Handy")
    embedding_en_mobile = get_token_embedding(translated_sentence, "phone")
    embedding_en_handy = get_token_embedding("I lost my handy.", "handy")

    
    sim_to_mobile = compare_embeddings(embedding_de, embedding_en_mobile)
    sim_to_handy = compare_embeddings(embedding_de, embedding_en_handy)

    print(f"Similarity with 'phone': {sim_to_mobile:.4f}")
    print(f"Similarity with 'handy': {sim_to_handy:.4f}")

    if sim_to_mobile > sim_to_handy:
        print("'Handy' likely means 'phone' — false loanword detected.")
    else:
        print("'Handy' could mean 'handy' — no issue detected.")
except ValueError as e:
    print(f"Error: {e}")

Device set to use cpu


Translation: I lost my phone.
Similarity with 'phone': 0.4556
Similarity with 'handy': 0.6671
'Handy' could mean 'handy' — no issue detected.
