In [2]:
import pandas as pd
import numpy as np
import phonetics
import torch
import epitran
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
import pandas as pd
import os

# List of language pairs
language_pairs = [
    ("Azerbaijani", "Arabic"),
    ("Catalan", "Arabic"),
    ("Chinese", "English"),
    ("English", "French"),
    ("English", "German"),
    ("Finnish", "Swedish"),
    ("German", "French"),
    ("German", "Italian"),
    ("Hindi", "Persian"),
    ("Hungarian", "German"),
    ("Indonesian", "Dutch"),
    ("Kazakh", "Russian"),
    ("Persian", "Arabic"),
    ("Polish", "French"),
    ("Romanian", "French"),
    ("Romanian", "Hungarian"),
]

def read_language(lang1, lang2):
    file_path = f"data/production_train_test/{lang1}-{lang2}/alldata/{lang1}-{lang2}-train_production_alldata.csv"
    
    if os.path.exists(file_path):  # Check if file exists
        df = pd.read_csv(file_path)
        df = df.drop(columns=[col for col in ['Unnamed: 0.1', 'Unnamed: 0'] if col in df.columns], errors="ignore")
        df["language_pair"] = f"{lang1}-{lang2}"  # Add language pair column
        return df
    else:
        print(f"File not found: {file_path}")
        return None

dfs = [read_language(lang1, lang2) for lang1, lang2 in language_pairs]
dfs = [df for df in dfs if df is not None]  # Remove None values

final_df = pd.concat(dfs, ignore_index=True)
print(final_df.info())
final_df.to_csv("all_languages_combined.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210684 entries, 0 to 210683
Data columns (total 21 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   loan_word                                    210684 non-null  object 
 1   original_word                                210684 non-null  object 
 2   loan_word_epitran                            210684 non-null  object 
 3   original_word_epitran                        210684 non-null  object 
 4   loan_english                                 209624 non-null  object 
 5   original_english                             209631 non-null  object 
 6   Fast Levenshtein Distance Div Maxlen         210684 non-null  float64
 7   Dolgo Prime Distance Div Maxlen              210684 non-null  float64
 8   Feature Edit Distance Div Maxlen             210684 non-null  float64
 9   Hamming Feature Distance Div Maxlen          210684 non-nul

In [4]:
# need to install - sudo apt-get install flite

epi = epitran.Epitran('fra-Latn')

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    try:
        loan_epitran = epi.transliterate(word)
        print(f"Transliterated text: {loan_epitran}")
    except IndexError as e:
        print(f"Transliteration failed: {e}")
        loan_epitran = "N/A"
    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)
    return {
        "word": word,
        "phonetic": loan_epitran,
        "unicode": unicode_features,
        "length": len(word)
    }

extract_features("khana")

Transliterated text: kana


{'word': 'khana',
 'phonetic': 'kana',
 'unicode': [4.0, 1.0, -6.0, 7.0, -6.0],
 'length': 5}

In [5]:
epi = epitran.Epitran('fra-Latn')

def normalize(unicode_values):
    if not unicode_values:  # Handle empty case
        return [0] * 25  # Return a zero vector of fixed size

    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    try:
        loan_epitran = epi.transliterate(word)  
    except Exception as e:  # Catch all exceptions
        print(f"Transliteration failed for '{word}': {e}")
        loan_epitran = ""  # Default value to avoid NoneType errors

    unicode_features = [ord(c) for c in word] if word else [0]  # Ensure it's never empty
    unicode_features = normalize(unicode_features)  

    return {
        "phonetic": loan_epitran,
        "unicode": unicode_features,
        "length": len(word)
    }


df_features = final_df["loan_word"].apply(extract_features).apply(pd.Series)
df_features
df = pd.concat([final_df, df_features], axis=1)

In [6]:
df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'],[0,1,0,0])
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_and_encode(text, tokenizer, max_length=120):
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0)

df['phonetic_'], df['attention_masks'] = zip(*df['phonetic'].apply(
    lambda x: tokenize_and_encode(x, tokenizer)
))

def pad_sequence(seq, maxlen, pad_value=0):
    if len(seq) < maxlen:
        return seq + [pad_value] * (maxlen - len(seq))
    else:
        return seq[:maxlen] 

max_unicode_len = 25

df['unicode_padded'] = df['unicode'].apply(lambda x: pad_sequence(x, max_unicode_len))


  df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'],[0,1,0,0])


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [7]:
df.head()

Unnamed: 0,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,...,DNN_logits,MBERT_cos_sim,XLM_cos_sim,language_pair,phonetic,unicode,length,phonetic_,attention_masks,unicode_padded
0,Qorxulu,سيى,ɡɔrxulu,siːa,It's scary,bad,1.0,0.714286,0.58631,0.654762,...,-14.315687,0.879653,0.584688,Azerbaijani-Arabic,kɔʀɡzyly,"[-28.714285714285708, 1.2857142857142918, 4.28...",7,"[tensor(101), tensor(100), tensor(102), tensor...","[tensor(1), tensor(1), tensor(1), tensor(0), t...","[-28.714285714285708, 1.2857142857142918, 4.28..."
1,hamilə,أمثل,hɑmɪlæ,aٔmθl,pregnant,optimum,0.666667,0.5,0.361111,0.409722,...,-6.240551,0.892106,0.546299,Azerbaijani-Arabic,amil,"[-83.33333333333334, -90.33333333333334, -78.3...",6,"[tensor(101), tensor(16992), tensor(10161), te...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[-83.33333333333334, -90.33333333333334, -78.3..."
2,təvəkkül,تآكل,tævækkyl,tʔaːkl,put your trust,Eat,0.625,0.5,0.375,0.416667,...,-3.535368,0.875178,0.528974,Azerbaijani-Arabic,təvəkkÿl,"[-135.25, 349.75, -133.25, 349.75, -144.25, -1...",8,"[tensor(101), tensor(188), tensor(11562), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[-135.25, 349.75, -133.25, 349.75, -144.25, -1..."
3,Sınaq,لوم,sɯnɑɡ,luːm,Experiment,blame,1.0,0.8,0.433333,0.483333,...,-6.91746,0.885054,0.673596,Azerbaijani-Arabic,sınak,"[-58.599999999999994, 163.4, -31.5999999999999...",5,"[tensor(101), tensor(187), tensor(18932), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[-58.599999999999994, 163.4, -31.5999999999999..."
4,elan et,يخبر,elɑn et,iːxbr,momentum and,Tells,1.0,0.833333,0.427083,0.486111,...,-10.755716,0.822872,0.708838,Azerbaijani-Arabic,əlan ɛ,"[6.0, 13.0, 2.0, 15.0, -63.0, 6.0, 21.0]",7,"[tensor(101), tensor(87449), tensor(10206), te...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[6.0, 13.0, 2.0, 15.0, -63.0, 6.0, 21.0, 0, 0,..."


In [8]:
class LoanWordDataset(Dataset):
    def __init__(self, loan_words, phonetic_seqs, unicode_features, other_features, labels, tokenizer, max_len=128):
        self.loan_words = loan_words
        self.phonetic_seqs = phonetic_seqs
        self.unicode_features = unicode_features
        self.other_features = other_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.loan_words)

    def __getitem__(self, idx):
        loan_word = self.loan_words[idx]
        phonetic_seq = self.phonetic_seqs[idx]
        unicode_feature = self.unicode_features[idx]
        other_feature = self.other_features[idx]
        label = self.labels[idx]

        # Tokenize loan word
        encoding = self.tokenizer.encode_plus(
            loan_word,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # 'loan_word': loan_word,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'phonetic_seq': torch.tensor(phonetic_seq, dtype=torch.long),
            'unicode_feature': torch.tensor(unicode_feature, dtype=torch.float),
            'other_feature': torch.tensor(other_feature, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [9]:
loan_words = df['loan_word'].tolist()  # Convert to list
phonetic_seqs = df['phonetic_'].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else list(x)).tolist()
unicode_features = df['unicode_padded'].tolist()
other_features = df['length'].apply(lambda x: [x]).tolist()  # Convert to list
labels = df['label'].tolist()  # Convert to list

dataset = LoanWordDataset(loan_words, phonetic_seqs, unicode_features, other_features, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [10]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    def __init__(self, num_phonetic_embeddings, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("/kaggle/input/false-loan-model/output/tuned-bert")
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=64)
        
        bert_hidden_size = self.bert.config.hidden_size 
        phonetic_size = 64
        unicode_size = 25 
        other_size = 1
        
        total_input_size = bert_hidden_size + phonetic_size + unicode_size + other_size
        
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(total_input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state 
        pooled_output = last_hidden_states.mean(dim=1) 
        phonetic_emb = self.phonetic_embedder(phonetic_seq).mean(dim=1) 

    
        unicode_feature = unicode_feature.view(unicode_feature.size(0), -1)  
        other_feature = other_feature.view(other_feature.size(0), -1)      

    
        combined = torch.cat([
            pooled_output,       
            phonetic_emb,        
            unicode_feature,     
            other_feature        
        ], dim=1)               
        
        logits = self.classifier(combined)
        return logits

In [11]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

max_phonetic_index = max([max(seq) for seq in phonetic_seqs])

model = LoanWordClassifier(num_phonetic_embeddings=max_phonetic_index + 1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()


dataset = dataloader.dataset  # Get the dataset from DataLoader
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=256, shuffle=False)


for epoch in range(50):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Training]")

    for batch in progress_bar:
        optimizer.zero_grad()
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            phonetic_seq=batch["phonetic_seq"].to(device),
            unicode_feature=batch["unicode_feature"].to(device),
            other_feature=batch["other_feature"].to(device)
        )
        loss = loss_fn(logits, batch["label"].to(device))
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        progress_bar.set_postfix(loss=total_train_loss / (progress_bar.n + 1))
    
    print(f"Epoch {epoch + 1}, Training Loss: {total_train_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Validation]"):
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                phonetic_seq=batch["phonetic_seq"].to(device),
                unicode_feature=batch["unicode_feature"].to(device),
                other_feature=batch["other_feature"].to(device)
            )
            loss = loss_fn(logits, batch["label"].to(device))
            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = batch["label"].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(all_labels, all_preds)
    val_f1 = f1_score(all_labels, all_preds, average="weighted")

    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")

# for epoch in range(1):
#     model.train()
#     total_loss = 0
#     progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

#     for batch in progress_bar:
#         optimizer.zero_grad()
#         logits = model(
#             input_ids=batch["input_ids"].to(device),
#             attention_mask=batch["attention_mask"].to(device),
#             phonetic_seq=batch["phonetic_seq"].to(device),
#             unicode_feature=batch["unicode_feature"].to(device),
#             other_feature=batch["other_feature"].to(device)
#         )
#         loss = loss_fn(logits, batch["label"].to(device))
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#         progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))
#     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

torch.save(model.state_dict(), "loan_word_model.pth")

Epoch 1 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.866]


Epoch 1, Training Loss: 0.8658


Epoch 1 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 1, Validation Loss: 0.6046, Accuracy: 0.6830, F1 Score: 0.6815


Epoch 2 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.593]


Epoch 2, Training Loss: 0.5931


Epoch 2 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 2, Validation Loss: 0.5635, Accuracy: 0.6908, F1 Score: 0.6896


Epoch 3 [Training]: 100%|██████████| 659/659 [10:55<00:00,  1.01it/s, loss=0.569]


Epoch 3, Training Loss: 0.5687


Epoch 3 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 3, Validation Loss: 0.5492, Accuracy: 0.6950, F1 Score: 0.6967


Epoch 4 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.561]


Epoch 4, Training Loss: 0.5605


Epoch 4 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 4, Validation Loss: 0.5461, Accuracy: 0.6957, F1 Score: 0.6972


Epoch 5 [Training]: 100%|██████████| 659/659 [10:55<00:00,  1.00it/s, loss=0.553]


Epoch 5, Training Loss: 0.5532


Epoch 5 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 5, Validation Loss: 0.5433, Accuracy: 0.6975, F1 Score: 0.6994


Epoch 6 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.549]


Epoch 6, Training Loss: 0.5495


Epoch 6 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 6, Validation Loss: 0.5364, Accuracy: 0.6999, F1 Score: 0.7023


Epoch 7 [Training]: 100%|██████████| 659/659 [10:55<00:00,  1.00it/s, loss=0.546]


Epoch 7, Training Loss: 0.5463


Epoch 7 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 7, Validation Loss: 0.5328, Accuracy: 0.7013, F1 Score: 0.7044


Epoch 8 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.543]


Epoch 8, Training Loss: 0.5431


Epoch 8 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 8, Validation Loss: 0.5332, Accuracy: 0.6998, F1 Score: 0.7023


Epoch 9 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.542]


Epoch 9, Training Loss: 0.5415


Epoch 9 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 9, Validation Loss: 0.5316, Accuracy: 0.7023, F1 Score: 0.7056


Epoch 10 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.54]


Epoch 10, Training Loss: 0.5401


Epoch 10 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 10, Validation Loss: 0.5298, Accuracy: 0.7018, F1 Score: 0.7044


Epoch 11 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.539]


Epoch 11, Training Loss: 0.5394


Epoch 11 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 11, Validation Loss: 0.5322, Accuracy: 0.7040, F1 Score: 0.7080


Epoch 12 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.537]


Epoch 12, Training Loss: 0.5369


Epoch 12 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 12, Validation Loss: 0.5260, Accuracy: 0.7036, F1 Score: 0.7069


Epoch 13 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.536]


Epoch 13, Training Loss: 0.5360


Epoch 13 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 13, Validation Loss: 0.5270, Accuracy: 0.7041, F1 Score: 0.7078


Epoch 14 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.534]


Epoch 14, Training Loss: 0.5344


Epoch 14 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 14, Validation Loss: 0.5248, Accuracy: 0.7048, F1 Score: 0.7087


Epoch 15 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.534]


Epoch 15, Training Loss: 0.5339


Epoch 15 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 15, Validation Loss: 0.5255, Accuracy: 0.7061, F1 Score: 0.7102


Epoch 16 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.534]


Epoch 16, Training Loss: 0.5340


Epoch 16 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 16, Validation Loss: 0.5252, Accuracy: 0.7045, F1 Score: 0.7078


Epoch 17 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.534]


Epoch 17, Training Loss: 0.5338


Epoch 17 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 17, Validation Loss: 0.5254, Accuracy: 0.7055, F1 Score: 0.7095


Epoch 18 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.532]


Epoch 18, Training Loss: 0.5324


Epoch 18 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 18, Validation Loss: 0.5219, Accuracy: 0.7056, F1 Score: 0.7092


Epoch 19 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.532]


Epoch 19, Training Loss: 0.5316


Epoch 19 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 19, Validation Loss: 0.5193, Accuracy: 0.7054, F1 Score: 0.7088


Epoch 20 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.531]


Epoch 20, Training Loss: 0.5311


Epoch 20 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 20, Validation Loss: 0.5226, Accuracy: 0.7068, F1 Score: 0.7109


Epoch 21 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.53]


Epoch 21, Training Loss: 0.5301


Epoch 21 [Validation]: 100%|██████████| 165/165 [02:40<00:00,  1.03it/s]


Epoch 21, Validation Loss: 0.5211, Accuracy: 0.7078, F1 Score: 0.7121


Epoch 22 [Training]: 100%|██████████| 659/659 [11:00<00:00,  1.00s/it, loss=0.53]


Epoch 22, Training Loss: 0.5297


Epoch 22 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 22, Validation Loss: 0.5251, Accuracy: 0.7065, F1 Score: 0.7108


Epoch 23 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.528]


Epoch 23, Training Loss: 0.5285


Epoch 23 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 23, Validation Loss: 0.5208, Accuracy: 0.7074, F1 Score: 0.7118


Epoch 24 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.528]


Epoch 24, Training Loss: 0.5281


Epoch 24 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 24, Validation Loss: 0.5199, Accuracy: 0.7075, F1 Score: 0.7118


Epoch 25 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.529]


Epoch 25, Training Loss: 0.5288


Epoch 25 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 25, Validation Loss: 0.5241, Accuracy: 0.7066, F1 Score: 0.7100


Epoch 26 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.527]


Epoch 26, Training Loss: 0.5275


Epoch 26 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 26, Validation Loss: 0.5198, Accuracy: 0.7067, F1 Score: 0.7102


Epoch 27 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.527]


Epoch 27, Training Loss: 0.5268


Epoch 27 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 27, Validation Loss: 0.5206, Accuracy: 0.7058, F1 Score: 0.7094


Epoch 28 [Training]: 100%|██████████| 659/659 [10:59<00:00,  1.00s/it, loss=0.526]


Epoch 28, Training Loss: 0.5264


Epoch 28 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 28, Validation Loss: 0.5216, Accuracy: 0.7068, F1 Score: 0.7106


Epoch 29 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.527]


Epoch 29, Training Loss: 0.5273


Epoch 29 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 29, Validation Loss: 0.5192, Accuracy: 0.7075, F1 Score: 0.7115


Epoch 30 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.525]


Epoch 30, Training Loss: 0.5250


Epoch 30 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 30, Validation Loss: 0.5275, Accuracy: 0.7043, F1 Score: 0.7081


Epoch 31 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.526]


Epoch 31, Training Loss: 0.5261


Epoch 31 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 31, Validation Loss: 0.5174, Accuracy: 0.7088, F1 Score: 0.7133


Epoch 32 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.525]


Epoch 32, Training Loss: 0.5249


Epoch 32 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 32, Validation Loss: 0.5205, Accuracy: 0.7078, F1 Score: 0.7117


Epoch 33 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.525]


Epoch 33, Training Loss: 0.5245


Epoch 33 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 33, Validation Loss: 0.5176, Accuracy: 0.7052, F1 Score: 0.7087


Epoch 34 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.524]


Epoch 34, Training Loss: 0.5243


Epoch 34 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 34, Validation Loss: 0.5213, Accuracy: 0.7085, F1 Score: 0.7128


Epoch 35 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.523]


Epoch 35, Training Loss: 0.5230


Epoch 35 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 35, Validation Loss: 0.5225, Accuracy: 0.7082, F1 Score: 0.7127


Epoch 36 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.523]


Epoch 36, Training Loss: 0.5233


Epoch 36 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 36, Validation Loss: 0.5155, Accuracy: 0.7100, F1 Score: 0.7144


Epoch 37 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.524]


Epoch 37, Training Loss: 0.5235


Epoch 37 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 37, Validation Loss: 0.5195, Accuracy: 0.7098, F1 Score: 0.7145


Epoch 38 [Training]: 100%|██████████| 659/659 [10:57<00:00,  1.00it/s, loss=0.523]


Epoch 38, Training Loss: 0.5234


Epoch 38 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.04it/s]


Epoch 38, Validation Loss: 0.5177, Accuracy: 0.7083, F1 Score: 0.7122


Epoch 39 [Training]: 100%|██████████| 659/659 [10:56<00:00,  1.00it/s, loss=0.523]


Epoch 39, Training Loss: 0.5234


Epoch 39 [Validation]: 100%|██████████| 165/165 [02:39<00:00,  1.03it/s]


Epoch 39, Validation Loss: 0.5168, Accuracy: 0.7083, F1 Score: 0.7121


Epoch 40 [Training]: 100%|██████████| 659/659 [10:58<00:00,  1.00it/s, loss=0.522]


Epoch 40, Training Loss: 0.5224


Epoch 40 [Validation]: 100%|██████████| 165/165 [02:40<00:00,  1.03it/s]


Epoch 40, Validation Loss: 0.5204, Accuracy: 0.7095, F1 Score: 0.7139


Epoch 41 [Training]: 100%|██████████| 659/659 [11:11<00:00,  1.02s/it, loss=0.522]


Epoch 41, Training Loss: 0.5215


Epoch 41 [Validation]: 100%|██████████| 165/165 [02:43<00:00,  1.01it/s]


Epoch 41, Validation Loss: 0.5150, Accuracy: 0.7081, F1 Score: 0.7121


Epoch 42 [Training]: 100%|██████████| 659/659 [11:13<00:00,  1.02s/it, loss=0.522]


Epoch 42, Training Loss: 0.5215


Epoch 42 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.01it/s]


Epoch 42, Validation Loss: 0.5141, Accuracy: 0.7092, F1 Score: 0.7132


Epoch 43 [Training]: 100%|██████████| 659/659 [11:09<00:00,  1.02s/it, loss=0.522]


Epoch 43, Training Loss: 0.5220


Epoch 43 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.02it/s]


Epoch 43, Validation Loss: 0.5138, Accuracy: 0.7078, F1 Score: 0.7118


Epoch 44 [Training]: 100%|██████████| 659/659 [11:08<00:00,  1.01s/it, loss=0.521]


Epoch 44, Training Loss: 0.5212


Epoch 44 [Validation]: 100%|██████████| 165/165 [02:41<00:00,  1.02it/s]


Epoch 44, Validation Loss: 0.5252, Accuracy: 0.7080, F1 Score: 0.7120


Epoch 45 [Training]: 100%|██████████| 659/659 [11:05<00:00,  1.01s/it, loss=0.521]


Epoch 45, Training Loss: 0.5211


Epoch 45 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.02it/s]


Epoch 45, Validation Loss: 0.5147, Accuracy: 0.7090, F1 Score: 0.7130


Epoch 46 [Training]: 100%|██████████| 659/659 [11:09<00:00,  1.02s/it, loss=0.52]


Epoch 46, Training Loss: 0.5196


Epoch 46 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.02it/s]


Epoch 46, Validation Loss: 0.5163, Accuracy: 0.7081, F1 Score: 0.7120


Epoch 47 [Training]: 100%|██████████| 659/659 [11:11<00:00,  1.02s/it, loss=0.52]


Epoch 47, Training Loss: 0.5203


Epoch 47 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.02it/s]


Epoch 47, Validation Loss: 0.5164, Accuracy: 0.7105, F1 Score: 0.7151


Epoch 48 [Training]: 100%|██████████| 659/659 [11:13<00:00,  1.02s/it, loss=0.52]


Epoch 48, Training Loss: 0.5200


Epoch 48 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.01it/s]


Epoch 48, Validation Loss: 0.5201, Accuracy: 0.7117, F1 Score: 0.7162


Epoch 49 [Training]: 100%|██████████| 659/659 [11:11<00:00,  1.02s/it, loss=0.519]


Epoch 49, Training Loss: 0.5195


Epoch 49 [Validation]: 100%|██████████| 165/165 [02:43<00:00,  1.01it/s]


Epoch 49, Validation Loss: 0.5157, Accuracy: 0.7109, F1 Score: 0.7155


Epoch 50 [Training]: 100%|██████████| 659/659 [11:14<00:00,  1.02s/it, loss=0.52]


Epoch 50, Training Loss: 0.5195


Epoch 50 [Validation]: 100%|██████████| 165/165 [02:42<00:00,  1.01it/s]


Epoch 50, Validation Loss: 0.5147, Accuracy: 0.7120, F1 Score: 0.7165


In [12]:
import torch
from transformers import BertTokenizer
import epitran

# Load model and tokenizer
model.load_state_dict(torch.load("loan_word_model.pth"))
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
epi = epitran.Epitran("fra-Latn")

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    try:
        loan_epitran = epi.transliterate(word)
        phonetic_seq = [ord(c) for c in loan_epitran] 
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = [0] 

    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)

    if len(unicode_features) < max_len:
        unicode_features = unicode_features + [0] * (max_len - len(unicode_features))
    else:
        unicode_features = unicode_features[:max_len]  # Truncate if longer

    return phonetic_seq, unicode_features, [len(word)]



sentence = "The government governed a new abordage policy."
words = sentence.split()

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)
        
        if torch.argmax(probs) == 1:  
            false_loans.append(word)

print("False loan words:", false_loans)

  model.load_state_dict(torch.load("loan_word_model.pth"))


The tensor([[0.9558, 0.0442]], device='cuda:0')
government tensor([[0.4837, 0.5163]], device='cuda:0')
governed tensor([[0.9047, 0.0953]], device='cuda:0')
a tensor([[0.6857, 0.3143]], device='cuda:0')
new tensor([[0.6918, 0.3082]], device='cuda:0')
abordage tensor([[0.2388, 0.7612]], device='cuda:0')
policy. tensor([[0.6006, 0.3994]], device='cuda:0')
False loan words: ['government', 'abordage']


In [13]:
!zip -r output.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/__notebook__.ipynb (deflated 97%)
  adding: kaggle/working/loan_word_model.pth (deflated 7%)
  adding: kaggle/working/all_languages_combined.csv (deflated 67%)
