In [1]:
import pandas as pd
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score  # For METEOR score
from transformers import pipeline
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("score.csv")

In [3]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    def __init__(self, num_phonetic_embeddings, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("model/tuned-bert")
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=64)
        
        bert_hidden_size = self.bert.config.hidden_size 
        phonetic_size = 64
        unicode_size = 25 
        other_size = 1
        
        total_input_size = bert_hidden_size + phonetic_size + unicode_size + other_size
        
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(total_input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state 
        pooled_output = last_hidden_states.mean(dim=1) 
        phonetic_emb = self.phonetic_embedder(phonetic_seq).mean(dim=1) 

    
        unicode_feature = unicode_feature.view(unicode_feature.size(0), -1)  
        other_feature = other_feature.view(other_feature.size(0), -1)      

    
        combined = torch.cat([
            pooled_output,       
            phonetic_emb,        
            unicode_feature,     
            other_feature        
        ], dim=1)               
        
        logits = self.classifier(combined)
        return logits

In [4]:
import torch
from transformers import BertTokenizer
import epitran

# Load model and tokenizer
model = LoanWordClassifier(111212)
model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))
model.eval()
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
epi = epitran.Epitran("deu-Latn")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    try:
        loan_epitran = epi.transliterate(word)
        phonetic_seq = [ord(c) for c in loan_epitran] 
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = [0] 

    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)

    if len(unicode_features) < max_len:
        unicode_features = unicode_features + [0] * (max_len - len(unicode_features))
    else:
        unicode_features = unicode_features[:max_len]  # Truncate if longer

    return phonetic_seq, unicode_features, [len(word)]



sentence = "The government governed a new abordage policy."
words = sentence.split()

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)
        
        if torch.argmax(probs) == 1:  
            false_loans.append(word)

print("False loan words:", false_loans)

  model.load_state_dict(torch.load("model/loan_word_model.pth", map_location=torch.device('cpu')))


The tensor([[0.9558, 0.0442]])
government tensor([[0.4960, 0.5040]])
governed tensor([[0.8994, 0.1006]])
a tensor([[0.7115, 0.2885]])
new tensor([[0.6860, 0.3140]])
abordage tensor([[0.2496, 0.7504]])
policy. tensor([[0.6157, 0.3843]])
False loan words: ['government', 'abordage']


In [9]:
from langchain_ollama import OllamaLLM
from tqdm import tqdm


new_df = df[(df['label'] == 'hard_negative') | (df['label'] == 'loan')]

def false_loan(sentence):
    words = sentence.split()
    false_loans = []  
    for word in words:
        phonetic_seq, unicode_feature, other_feature = extract_features(word)
        inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            logits = model(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
                unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
                other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
            )
            probs = torch.softmax(logits, dim=1)
            
            if torch.argmax(probs) == 1: 
                false_loans.append(word.lower()) 
    return false_loans

false_loanwords = []

for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence']
    result = false_loan(translated_sentence) 
    false_loanwords.append(result) 


new_df['false_loanword_model'] = false_loanwords

llm = OllamaLLM(model="translator")

def false_loanword(sentence):
    prompt = f"""
        You are an expert linguistic model. 
        Given a sentence, identify if there is a **false loanword** (a word that seems borrowed but is wrongly used or misinterpreted).
        - If a false loanword is present, output only the false loanword (one word, no explanation).
        - If no false loanword is found, output exactly: no
        Sentence: {sentence}
        Output Response should must contain one single word only
        Output Format: Output_word
        Do don't include any other thing just give one single word output!
    """
    answer = llm.invoke(prompt)
    return answer.strip()

false_loanwords = []
for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence']
    result = false_loanword(translated_sentence)
    false_loanwords.append(result)
new_df['false_loanword'] = false_loanwords

100%|██████████| 2941/2941 [15:55<00:00,  3.08it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['false_loanword_model'] = false_loanwords
100%|██████████| 2941/2941 [32:53<00:00,  1.49it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['false_loanword'] = false_loanwords


In [10]:
df

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score
0,Mirth,Fröhlichkeit,"Die Frau lächelte plötzlich und sagte: ""Ich fü...","The woman smiled suddenly and said, ""I feel ve...","The woman smiled suddenly and said: ""I feel ve...",Cheerfulness,synonym,0.800320,0.905455
1,Schnorr,Chromatogramm,Der Chemiker studierte die Chromatogramme der ...,The chemist studied the chromatograms of the v...,The chemist studied the chromatograms of diffe...,Chromatogram,random,0.582823,0.771560
2,Zettelkasten,Zettelkasten,"Ich habe meine Zettelkasten vollgefüllt, um al...",I filled up my paperbox to write down all the ...,I have filled my notebook with all ideas and t...,Paper box,loan,0.431177,0.627753
3,Meiring,Meiring,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,The little dog ran through the woods in search...,Meiring,loan,0.423118,0.721960
4,Speth,Speth,"Der Speth fuhr durch die Felder, um frische Ge...",The Speth drove through the fields to buy fres...,The farmer drove through the fields to buy fre...,Speth,loan,0.630807,0.750388
...,...,...,...,...,...,...,...,...,...
5289,Meisinger,Meisinger,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,"Der kleine Hund rannte durch den Wald, um nach...",Meisinger,loan,0.024456,0.000000
5290,Frankenberger,bleiben lassen,"Der Hund bleibt lassen, wenn man ihn nicht füt...",Keep the dog if you don't feed it.,The dog will not leave if you do not feed him.,Keep,hard_negative,0.085166,0.537349
5291,esteemed,geehrt,Mein Vater gehert mich immer noch.,My father's still insinuating me.,My father still loves me.,Honored,synonym,0.193049,0.300000
5292,Meier,Kauffmann,Der kleine Kaufmann kaufte ein Stück Brot auf ...,The little merchant bought a piece of bread in...,The small merchant bought a loaf of bread on t...,Kauffmann,random,0.178275,0.576994


In [11]:
tokenizer = BertTokenizer.from_pretrained("model/tuned-bert-tokenizer")
similarity_model = BertModel.from_pretrained("model/tuned-bert")

def get_token_embedding(sentence, target_word):
    tokens = tokenizer.tokenize(sentence.lower())
    token_ids = tokenizer.encode(sentence.lower(), return_tensors="pt")  # Includes [CLS] and [SEP]
    
    with torch.no_grad():
        outputs = similarity_model(token_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0)  # Shape: [seq_len, hidden_size]

    word_pieces = tokenizer.tokenize(target_word.lower())
    indices = [i for i, tok in enumerate(tokens) if tok in word_pieces]

    if not indices:
        raise ValueError(f"Word '{target_word}' not found in tokenized sentence: {tokens}")

    embedding = torch.stack([last_hidden_state[i + 1] for i in indices], dim=0).mean(dim=0)
    return embedding.detach().numpy()

def compare_embeddings(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

In [12]:
similarity = []
for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence'].lower()
    generated_context = row['generated_context'].lower()
    false_loanword = row["false_loanword"].lower()
    original_word = row["original_word"].lower()
    if false_loanword != 'false':
        try:
            embedding_orignal = get_token_embedding(generated_context, original_word)
            embedding_eng = get_token_embedding(translated_sentence, false_loanword)
            embedding_ger = get_token_embedding(translated_sentence.replace(false_loanword,original_word), original_word)
            sim_to_eng = compare_embeddings(embedding_orignal, embedding_eng)
            sim_to_ger = compare_embeddings(embedding_orignal, embedding_ger)
            similarity.append([sim_to_eng,sim_to_ger])
        except ValueError as e:
            # print(f"Error: {e}")
            similarity.append([0,0])
        
    else:
        similarity.append('')
new_df['sim_eng_ger'] = similarity

100%|██████████| 2941/2941 [03:32<00:00, 13.83it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['sim_eng_ger'] = similarity


In [13]:
new_df

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword_model,false_loanword,sim_eng_ger
2,Zettelkasten,Zettelkasten,"Ich habe meine Zettelkasten vollgefüllt, um al...",I filled up my paperbox to write down all the ...,I have filled my notebook with all ideas and t...,Paper box,loan,0.431177,0.627753,[notebook],false,
3,Meiring,Meiring,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,The little dog ran through the woods in search...,Meiring,loan,0.423118,0.721960,[],Meiring,"[0.42811403, 0.42811403]"
4,Speth,Speth,"Der Speth fuhr durch die Felder, um frische Ge...",The Speth drove through the fields to buy fres...,The farmer drove through the fields to buy fre...,Speth,loan,0.630807,0.750388,[vegetables],false,
10,Kaus,cool,Der neue Kühlschrank ist sehr cool.,The new fridge is very cool.,The new refrigerator is very cool.,cool,hard_negative,0.488923,0.806667,[refrigerator],false,
13,nekton,Nekton,Der Nekton schwamm durch die dunklen Gewässer ...,The Nekton swam through the dark waters of the...,The shark swam through the dark waters of the ...,Necton,loan,0.826517,0.905455,"[shark, swam]",false,
...,...,...,...,...,...,...,...,...,...,...,...,...
5286,Zimmermann,Zimmermann,Der ehemalige Bundeskanzler Otto von Bismarck ...,The former Chancellor Otto von Bismarck was or...,Der ehemalige Bundeskanzler Otto von Bismarck ...,Carpenter,loan,0.083717,0.150000,"[bundeskanzler, otto, bismarck, deutsche, regi...",Zimmermann,"[1.0, 1.0]"
5288,Hoeffner,Höffner,Der Höffner von der Firma kaufte ein neues Fah...,The Höffner from the company bought a new bicy...,The Höffner from the company bought a new bike...,Höffner,loan,0.500909,0.744141,"[it, basket.]",false loanword,"[0, 0]"
5289,Meisinger,Meisinger,"Der kleine Hund rannte durch den Wald, um nach...",The little dog ran through the forest to look ...,"Der kleine Hund rannte durch den Wald, um nach...",Meisinger,loan,0.024456,0.000000,"[rannte, meisinger-schokoladenladen]",Der,"[0.37442356, 0.8250093]"
5290,Frankenberger,bleiben lassen,"Der Hund bleibt lassen, wenn man ihn nicht füt...",Keep the dog if you don't feed it.,The dog will not leave if you do not feed him.,Keep,hard_negative,0.085166,0.537349,[],false,


In [14]:
similarity = []
fl_ger = []

for _, row in tqdm(new_df.iterrows(), total=len(new_df)):
    translated_sentence = row['translated_sentence'].lower()
    generated_context = row['generated_context'].lower()     
    false_loanwords = row["false_loanword_model"]           
    original_word = row["original_word"].lower()             

    if isinstance(false_loanwords, str): 
        false_loanwords = eval(false_loanwords)

    if false_loanwords:  
        word_similarities = []
        higher_german_similarity_words = []
        
        for false_loanword in false_loanwords:
            try:
                embedding_original = get_token_embedding(generated_context, original_word)
                embedding_eng = get_token_embedding(translated_sentence, false_loanword)
                embedding_ger = get_token_embedding(translated_sentence.replace(false_loanword, original_word), original_word)

                sim_to_eng = compare_embeddings(embedding_original, embedding_eng)
                sim_to_ger = compare_embeddings(embedding_original, embedding_ger)
            
            except ValueError as e:
                # print(f"Error: {e}")
                word_similarities.append([0, 0])
                continue
            
            word_similarities.append([sim_to_eng, sim_to_ger])
            

            if sim_to_ger > sim_to_eng:
                higher_german_similarity_words.append(false_loanword)
        
        similarity.append(word_similarities)
        fl_ger.append(higher_german_similarity_words)
    else:
        similarity.append([])
        fl_ger.append([])


new_df['similarity_word_eng_ger'] = similarity
new_df['fl_ger'] = fl_ger


100%|██████████| 2941/2941 [11:29<00:00,  4.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['similarity_word_eng_ger'] = similarity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['fl_ger'] = fl_ger


In [15]:
filtered_rows = []
for _, row in new_df.iterrows():
    bleu_score = row['bleu_score']
    meteor_score = row['meteor_score']
    sim_eng_ger = row['sim_eng_ger']
    false_loanword = row['false_loanword']
    # Condition 1: BLEU score < 0.5
    cond1 = bleu_score < 0.5

    # Condition 2: METEOR score < 0.5
    cond2 = meteor_score < 0.5

    # Condition 3: German embedding similarity is higher ([eng, ger] -> ger > eng)
    cond3 = False
    if sim_eng_ger and isinstance(sim_eng_ger, list) and len(sim_eng_ger) == 2:
        cond3 = sim_eng_ger[1] > sim_eng_ger[0]

    # Condition 4: False loanword is an actual word (not "false") and matches false_loanword_model
    cond4 = false_loanword != 'false' and false_loanword != 'false_loanword'

    # Count how many conditions are true
    conditions_met = sum([cond1, cond2, cond3, cond4])

    if conditions_met >= 3:
        filtered_rows.append(row)

filtered_df = pd.DataFrame(filtered_rows)
filtered_df.head()

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword_model,false_loanword,sim_eng_ger,similarity_word_eng_ger,fl_ger
14,enzyme,Enzym,"Der Enzym, der die Zucker zu Sauerstoff umwand...",The enzyme that converts sugar into oxygen is ...,"Der Enzym, der die Zucker zu Sauerstoff umwand...",Enzyme,loan,0.02287,0.0,[zucker],Der,"[0.7002429, 0.83268857]","[[0.3908607, 0.9491485]]",[zucker]
32,Klees,Klees,Der Kleesmann kümmerte sich um die Reinigung d...,The Kleesmann took care of the cleaning of the...,The translator of the given German sentence is...,Klees,loan,0.04815,0.292245,[kleesmann],Kleesmann,"[0.76906705, 0.6750045]","[[0.76906705, 0.6750045]]",[]
46,Pischke,Pischke,Der Pischke im Garten ist sehr groß und hat vi...,The pishke in the garden is very large and has...,"The pichk is a type of bird, not an animal tha...",Pishke,loan,0.058166,0.122951,"[pichk, an]",pichk,"[0.594713, 0.65164655]","[[0.594713, 0.65164655], [0.24290466, 0.692956]]","[pichk, an]"
54,Wermuth,Wermuth,Der kleine Hund ran schnell um den Wermuth herum.,The little dog quickly ran around the wormwood.,The little dog ran quickly around the worm.,Wermuth,loan,0.250986,0.793367,[],worm,"[0.2375343, 0.714456]",[],[]
58,Reifsteck,Reifsteck,"Der Reifstein war sehr trocken, daher musste i...","The maturation stone was very dry, so I had to...","Der Reifstein war sehr trocken, daher musste i...",Mature plug,loan,0.024178,0.0,[],Der,"[0.586233, 0.77951574]",[],[]


In [16]:
filtered_rows = []
for _, row in filtered_df.iterrows():
    sim_eng_ger = row['sim_eng_ger']
    
    # Ensure sim_eng_ger is a list-like structure (e.g., "[0.2, 0.8]")
    if isinstance(sim_eng_ger, str):
        try:
            sim_eng_ger = eval(sim_eng_ger)  # Convert string representation of list to actual list
        except Exception as e:
            print(f"Error parsing sim_eng_ger: {e}")
            continue
    
    # Check if German embedding similarity is higher
    if isinstance(sim_eng_ger, list) and len(sim_eng_ger) == 2:
        eng_similarity, ger_similarity = sim_eng_ger
        if ger_similarity > eng_similarity:
            filtered_rows.append(row)
            
loan_df = pd.DataFrame(filtered_rows)

In [17]:
print(len(loan_df)/len(filtered_df))

0.6579476861167002


In [18]:
loan_df

Unnamed: 0,loan_word,original_word,generated_context,reference_sentence,translated_sentence,reference_word,label,bleu_score,meteor_score,false_loanword_model,false_loanword,sim_eng_ger,similarity_word_eng_ger,fl_ger
14,enzyme,Enzym,"Der Enzym, der die Zucker zu Sauerstoff umwand...",The enzyme that converts sugar into oxygen is ...,"Der Enzym, der die Zucker zu Sauerstoff umwand...",Enzyme,loan,0.022870,0.000000,[zucker],Der,"[0.7002429, 0.83268857]","[[0.3908607, 0.9491485]]",[zucker]
46,Pischke,Pischke,Der Pischke im Garten ist sehr groß und hat vi...,The pishke in the garden is very large and has...,"The pichk is a type of bird, not an animal tha...",Pishke,loan,0.058166,0.122951,"[pichk, an]",pichk,"[0.594713, 0.65164655]","[[0.594713, 0.65164655], [0.24290466, 0.692956]]","[pichk, an]"
54,Wermuth,Wermuth,Der kleine Hund ran schnell um den Wermuth herum.,The little dog quickly ran around the wormwood.,The little dog ran quickly around the worm.,Wermuth,loan,0.250986,0.793367,[],worm,"[0.2375343, 0.714456]",[],[]
58,Reifsteck,Reifsteck,"Der Reifstein war sehr trocken, daher musste i...","The maturation stone was very dry, so I had to...","Der Reifstein war sehr trocken, daher musste i...",Mature plug,loan,0.024178,0.000000,[],Der,"[0.586233, 0.77951574]",[],[]
72,Blumenkopf,Blumenkopf,Der Blumenkopf taucht langsam aus dem Wald.,The flower head slowly dives out of the forest.,The Blumenkopf is slowly emerging from the for...,Flower head,loan,0.180444,0.354635,[blumenkopf],The,"[0.37544906, 0.39622292]","[[0.76703185, 0.76703185]]",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5256,flehman,flehmen,"Der Hund flöhmen konnte nicht mehr, weil er mü...",The dog couldn't flea anymore because he was t...,The dog could no longer cry because he was tired.,Floats,loan,0.403528,0.647131,[],no,"[0.27304706, 0.3717686]",[],[]
5261,Bechtel,Bechtel,"Der Politiker, der für die Landesregierung kan...",The politician who ran for the state governmen...,The politician who ran for state government wa...,Weight,loan,0.431949,0.663511,[government],Bechtle,"[0.62812185, 0.7227488]","[[0.30271804, 0.6824347]]",[government]
5276,Roberg,Roberg,Der Tourist fand den Roberg am Strand sehr schön.,The tourist found the Roberg on the beach very...,The tourist found the Robertsgate at the beach...,Roberg,loan,0.350844,0.691837,[beach],Robertsgate,"[0.6308725, 0.7515869]","[[0.2992113, 0.72408533]]",[beach]
5282,Schwaller,Schwaller,Der Schwaller von der Stadt führte mich zum kl...,The swarm from the city led me to the little c...,The Schwaller von der Stadt is likely a misspe...,Swallows,loan,0.011525,0.031646,"[stadt, district, stadt, mich, café,]",Schwabing,"[0.59586823, 0.67951965]","[[0.37616584, 0.6403628], [0.16528827, 0.64619...","[stadt, district, stadt, mich, café,]"


In [19]:
filtered_df.to_csv('Modelfiles/wrong.csv',index=False)
loan_df.to_csv('Modelfiles/misclassified.csv',index=False)

In [20]:
new_df.to_csv('results.csv',index=False)