In [1]:
!pip install phonetics
!pip install epitran

Collecting phonetics
  Downloading phonetics-1.0.5.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: phonetics
  Building wheel for phonetics (setup.py) ... [?25l[?25hdone
  Created wheel for phonetics: filename=phonetics-1.0.5-py2.py3-none-any.whl size=8696 sha256=d880558d6a0a9e2eab523524e53a084bed37b999c0db855a86d078b67d7f8fab
  Stored in directory: /root/.cache/pip/wheels/b7/1e/82/80a78c7d1ad7fc6e0af1b4d9009360b251c0e50fe59f046edb
Successfully built phonetics
Installing collected packages: phonetics
Successfully installed phonetics-1.0.5
Collecting epitran
  Downloading epitran-1.26.0-py2.py3-none-any.whl.metadata (34 kB)
Collecting panphon>=0.20 (from epitran)
  Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting jamo (from epitran)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting unicodecsv (from panphon>=0.20->epitran)
  Downloading unicodecsv-0.14.1.tar.gz 

In [2]:
import pandas as pd
import numpy as np
import phonetics
import torch
import epitran
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
import pandas as pd
import os

# List of language pairs
language_pairs = [
    ("Azerbaijani", "Arabic"),
    ("Catalan", "Arabic"),
    ("Chinese", "English"),
    ("English", "French"),
    ("English", "German"),
    ("Finnish", "Swedish"),
    ("German", "French"),
    ("German", "Italian"),
    ("Hindi", "Persian"),
    ("Hungarian", "German"),
    ("Indonesian", "Dutch"),
    ("Kazakh", "Russian"),
    ("Persian", "Arabic"),
    ("Polish", "French"),
    ("Romanian", "French"),
    ("Romanian", "Hungarian"),
]

def read_language(lang1, lang2):
    file_path = f"/kaggle/input/dataset/Datasets/production_train_test/{lang1}-{lang2}/balanced/{lang1}-{lang2}-train_production_balanced.csv"
    
    if os.path.exists(file_path):  # Check if file exists
        df = pd.read_csv(file_path)
        df = df.drop(columns=[col for col in ['Unnamed: 0.1', 'Unnamed: 0'] if col in df.columns], errors="ignore")
        df["language_pair"] = f"{lang1}-{lang2}"  # Add language pair column
        return df
    else:
        print(f"File not found: {file_path}")
        return None

dfs = [read_language(lang1, lang2) for lang1, lang2 in language_pairs]
dfs = [df for df in dfs if df is not None]  # Remove None values

final_df = pd.concat(dfs, ignore_index=True)
print(final_df.info())
final_df.to_csv("all_languages_combined.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47384 entries, 0 to 47383
Data columns (total 21 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   loan_word                                    47384 non-null  object 
 1   original_word                                47384 non-null  object 
 2   loan_word_epitran                            47384 non-null  object 
 3   original_word_epitran                        47384 non-null  object 
 4   loan_english                                 47159 non-null  object 
 5   original_english                             47175 non-null  object 
 6   Fast Levenshtein Distance Div Maxlen         47384 non-null  float64
 7   Dolgo Prime Distance Div Maxlen              47384 non-null  float64
 8   Feature Edit Distance Div Maxlen             47384 non-null  float64
 9   Hamming Feature Distance Div Maxlen          47384 non-null  float64
 10

In [4]:
# need to install - sudo apt-get install flite

epi = epitran.Epitran('fra-Latn')

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    try:
        loan_epitran = epi.transliterate(word)
        print(f"Transliterated text: {loan_epitran}")
    except IndexError as e:
        print(f"Transliteration failed: {e}")
        loan_epitran = "N/A"
    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)
    return {
        "word": word,
        "phonetic": loan_epitran,
        "unicode": unicode_features,
        "length": len(word)
    }

extract_features("khana")

Transliterated text: kana


{'word': 'khana',
 'phonetic': 'kana',
 'unicode': [4.0, 1.0, -6.0, 7.0, -6.0],
 'length': 5}

In [5]:
final_df.head()

Unnamed: 0,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,...,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,loan_unicode,original_unicode,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,language_pair
0,Möhkəmlik,البطولة,mœhkæmlɪk,albtˤuːlt,Durability,Championship,1.0,0.666667,0.354167,0.393519,...,0.377315,9,Latin,Arabic,synonym,0,-18.639088,0.805898,0.519797,Azerbaijani-Arabic
1,mehriban,مرن,mehrɪbɑn,mrn,kind,flexible,0.625,0.625,0.557292,0.625,...,0.625,8,Latin,Arabic,synonym,0,-9.437688,0.833718,0.753124,Azerbaijani-Arabic
2,qəhbə,قَحْبَة,ɡæhbæ,qَħْbَt,bastard,whore,0.857143,0.6,0.291667,0.325,...,0.3125,7,Latin,Arabic,loan,1,4.729791,0.672404,0.51703,Azerbaijani-Arabic
3,təslim olmaq,التنازل عن العرش,tæslɪm ɔlmɑɡ,altnaːzl ʕn aːlʕrʃ,surrender,abdication,0.777778,0.571429,0.299107,0.333333,...,0.318452,15,Latin,Arabic,synonym,0,-49.171078,0.902082,0.500199,Azerbaijani-Arabic
4,tələsmək,مضطرب,tælæsmæk,mdˤtˤrb,hurry up,مضطرب,1.0,0.875,0.466146,0.520833,...,0.505208,8,Latin,Arabic,synonym,0,-7.328197,0.871732,0.551427,Azerbaijani-Arabic


In [6]:
def normalize(unicode_values):
    if not unicode_values:  # Handle empty case
        return [0] * 25  # Return a zero vector of fixed size

    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    unicode_features = [ord(c) for c in word] if word else [0]  # Ensure it's never empty
    unicode_features = normalize(unicode_features)  

    return {
        "unicode": unicode_features,
        "length": len(word)
    }


df_features = final_df["loan_word"].apply(extract_features).apply(pd.Series)
df_features
df = pd.concat([final_df, df_features], axis=1)

In [7]:
df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'], [0, 1, 0, 0])

# Step 2: Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_and_encode(text, tokenizer, max_length=120):
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return encoding["input_ids"].squeeze(0).tolist(), encoding["attention_mask"].squeeze(0).tolist()

df['phonetic_'], df['attention_masks'] = zip(*df['loan_word_epitran'].apply(
    lambda x: tokenize_and_encode(x, tokenizer)
))

# Step 3: Padding the Unicode features (assumes 'unicode' column contains lists of ints or similar)
def pad_sequence(seq, maxlen, pad_value=0):
    try:
        if isinstance(seq, str):
            seq = eval(seq)  # convert stringified list to actual list
    except:
        seq = []
    if len(seq) < maxlen:
        return seq + [pad_value] * (maxlen - len(seq))
    else:
        return seq[:maxlen] 

max_unicode_len = 25
df['unicode_padded'] = df['unicode'].apply(lambda x: pad_sequence(x, max_unicode_len))

positive_df = df[df['label'] == 1]
negative_df = df[df['label'] == 0]

min_count = min(len(positive_df), len(negative_df))

# Sample equal amount from both
balanced_df = pd.concat([
    positive_df.sample(n=min_count, random_state=42),
    negative_df.sample(n=min_count, random_state=42)
]).sample(frac=1).reset_index(drop=True)  # shuffle the final set


  df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'], [0, 1, 0, 0])


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [8]:
print(f"Balanced dataset created: {len(balanced_df)} samples (50% label 0, 50% label 1)")

Balanced dataset created: 6162 samples (50% label 0, 50% label 1)


In [9]:
class LoanWordDataset(Dataset):
    def __init__(self, loan_words, phonetic_seqs, unicode_features, other_features, labels, tokenizer, max_len=128):
        self.loan_words = loan_words
        self.phonetic_seqs = phonetic_seqs
        self.unicode_features = unicode_features
        self.other_features = other_features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.loan_words)

    def __getitem__(self, idx):
        loan_word = self.loan_words[idx]
        phonetic_seq = self.phonetic_seqs[idx]
        unicode_feature = self.unicode_features[idx]
        other_feature = self.other_features[idx]
        label = self.labels[idx]

        # Tokenize loan word
        encoding = self.tokenizer.encode_plus(
            loan_word,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # 'loan_word': loan_word,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'phonetic_seq': torch.tensor(phonetic_seq, dtype=torch.long),
            'unicode_feature': torch.tensor(unicode_feature, dtype=torch.float),
            'other_feature': torch.tensor(other_feature, dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [10]:
df = balanced_df
loan_words = df['loan_word'].tolist()  # Convert to list
phonetic_seqs = df['phonetic_'].apply(lambda x: x.tolist() if isinstance(x, torch.Tensor) else list(x)).tolist()
unicode_features = df['unicode_padded'].tolist()
other_features = df['length'].apply(lambda x: [x]).tolist()  # Convert to list
labels = df['label'].tolist()  # Convert to list

dataset = LoanWordDataset(loan_words, phonetic_seqs, unicode_features, other_features, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [11]:
import torch
from transformers import BertModel


class LoanWordClassifier(torch.nn.Module):
    def __init__(self, num_phonetic_embeddings, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained("/kaggle/input/false-loan-model/output/tuned-bert")
        for param in self.bert.parameters():
            param.requires_grad = False
            
        self.phonetic_embedder = torch.nn.Embedding(num_phonetic_embeddings, embedding_dim=64)
        
        bert_hidden_size = self.bert.config.hidden_size 
        phonetic_size = 64
        unicode_size = 25 
        other_size = 1
        
        total_input_size = bert_hidden_size + phonetic_size + unicode_size + other_size
        
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(total_input_size, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask, phonetic_seq, unicode_feature, other_feature):
        
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state 
        pooled_output = last_hidden_states.mean(dim=1) 
        phonetic_emb = self.phonetic_embedder(phonetic_seq).mean(dim=1) 

    
        unicode_feature = unicode_feature.view(unicode_feature.size(0), -1)  
        other_feature = other_feature.view(other_feature.size(0), -1)      

    
        combined = torch.cat([
            pooled_output,       
            phonetic_emb,        
            unicode_feature,     
            other_feature        
        ], dim=1)               
        
        logits = self.classifier(combined)
        return logits

In [12]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

max_phonetic_index = max([max(seq) for seq in phonetic_seqs])

model = LoanWordClassifier(num_phonetic_embeddings=max_phonetic_index + 1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()


dataset = dataloader.dataset  # Get the dataset from DataLoader
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=256, shuffle=False)


for epoch in range(50):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Training]")

    for batch in progress_bar:
        optimizer.zero_grad()
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            phonetic_seq=batch["phonetic_seq"].to(device),
            unicode_feature=batch["unicode_feature"].to(device),
            other_feature=batch["other_feature"].to(device)
        )
        loss = loss_fn(logits, batch["label"].to(device))
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        progress_bar.set_postfix(loss=total_train_loss / (progress_bar.n + 1))
    
    print(f"Epoch {epoch + 1}, Training Loss: {total_train_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    total_val_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} [Validation]"):
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                phonetic_seq=batch["phonetic_seq"].to(device),
                unicode_feature=batch["unicode_feature"].to(device),
                other_feature=batch["other_feature"].to(device)
            )
            loss = loss_fn(logits, batch["label"].to(device))
            total_val_loss += loss.item()

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            labels = batch["label"].cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    val_loss = total_val_loss / len(val_loader)
    val_acc = accuracy_score(all_labels, all_preds)
    val_f1 = f1_score(all_labels, all_preds, average="weighted")

    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")

# for epoch in range(1):
#     model.train()
#     total_loss = 0
#     progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}")

#     for batch in progress_bar:
#         optimizer.zero_grad()
#         logits = model(
#             input_ids=batch["input_ids"].to(device),
#             attention_mask=batch["attention_mask"].to(device),
#             phonetic_seq=batch["phonetic_seq"].to(device),
#             unicode_feature=batch["unicode_feature"].to(device),
#             other_feature=batch["other_feature"].to(device)
#         )
#         loss = loss_fn(logits, batch["label"].to(device))
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#         progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))
#     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(dataloader)}")

torch.save(model.state_dict(), "loan_word_model.pth")

Epoch 1 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=1.2]


Epoch 1, Training Loss: 1.1966


Epoch 1 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Epoch 1, Validation Loss: 1.0680, Accuracy: 0.5523, F1 Score: 0.5523


Epoch 2 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.04it/s, loss=1.08]


Epoch 2, Training Loss: 1.0796


Epoch 2 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 2, Validation Loss: 0.9781, Accuracy: 0.5677, F1 Score: 0.5678


Epoch 3 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.969]


Epoch 3, Training Loss: 0.9692


Epoch 3 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 3, Validation Loss: 0.8917, Accuracy: 0.5888, F1 Score: 0.5883


Epoch 4 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.899]


Epoch 4, Training Loss: 0.8993


Epoch 4 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 4, Validation Loss: 0.8403, Accuracy: 0.5953, F1 Score: 0.5946


Epoch 5 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.846]


Epoch 5, Training Loss: 0.8462


Epoch 5 [Validation]: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]


Epoch 5, Validation Loss: 0.7995, Accuracy: 0.6010, F1 Score: 0.5990


Epoch 6 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.814]


Epoch 6, Training Loss: 0.8142


Epoch 6 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 6, Validation Loss: 0.7683, Accuracy: 0.6026, F1 Score: 0.6021


Epoch 7 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.754]


Epoch 7, Training Loss: 0.7537


Epoch 7 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 7, Validation Loss: 0.7376, Accuracy: 0.6026, F1 Score: 0.6011


Epoch 8 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.724]


Epoch 8, Training Loss: 0.7242


Epoch 8 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 8, Validation Loss: 0.7269, Accuracy: 0.6075, F1 Score: 0.6058


Epoch 9 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.715]


Epoch 9, Training Loss: 0.7146


Epoch 9 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 9, Validation Loss: 0.7177, Accuracy: 0.6115, F1 Score: 0.6088


Epoch 10 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.709]


Epoch 10, Training Loss: 0.7092


Epoch 10 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 10, Validation Loss: 0.7187, Accuracy: 0.6180, F1 Score: 0.6149


Epoch 11 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.698]


Epoch 11, Training Loss: 0.6980


Epoch 11 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 11, Validation Loss: 0.7161, Accuracy: 0.6156, F1 Score: 0.6133


Epoch 12 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.691]


Epoch 12, Training Loss: 0.6908


Epoch 12 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 12, Validation Loss: 0.7119, Accuracy: 0.6139, F1 Score: 0.6110


Epoch 13 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.691]


Epoch 13, Training Loss: 0.6913


Epoch 13 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 13, Validation Loss: 0.7069, Accuracy: 0.6164, F1 Score: 0.6129


Epoch 14 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.687]


Epoch 14, Training Loss: 0.6869


Epoch 14 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 14, Validation Loss: 0.7063, Accuracy: 0.6172, F1 Score: 0.6144


Epoch 15 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.678]


Epoch 15, Training Loss: 0.6776


Epoch 15 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 15, Validation Loss: 0.7007, Accuracy: 0.6172, F1 Score: 0.6136


Epoch 16 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.672]


Epoch 16, Training Loss: 0.6721


Epoch 16 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 16, Validation Loss: 0.6985, Accuracy: 0.6196, F1 Score: 0.6171


Epoch 17 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.672]


Epoch 17, Training Loss: 0.6725


Epoch 17 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 17, Validation Loss: 0.6984, Accuracy: 0.6253, F1 Score: 0.6215


Epoch 18 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.693]


Epoch 18, Training Loss: 0.6926


Epoch 18 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 18, Validation Loss: 0.6988, Accuracy: 0.6212, F1 Score: 0.6181


Epoch 19 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.662]


Epoch 19, Training Loss: 0.6620


Epoch 19 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 19, Validation Loss: 0.6829, Accuracy: 0.6221, F1 Score: 0.6180


Epoch 20 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.656]


Epoch 20, Training Loss: 0.6556


Epoch 20 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 20, Validation Loss: 0.6956, Accuracy: 0.6188, F1 Score: 0.6144


Epoch 21 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.65]


Epoch 21, Training Loss: 0.6503


Epoch 21 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 21, Validation Loss: 0.6880, Accuracy: 0.6212, F1 Score: 0.6177


Epoch 22 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.656]


Epoch 22, Training Loss: 0.6560


Epoch 22 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 22, Validation Loss: 0.6832, Accuracy: 0.6204, F1 Score: 0.6163


Epoch 23 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.646]


Epoch 23, Training Loss: 0.6458


Epoch 23 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 23, Validation Loss: 0.6860, Accuracy: 0.6196, F1 Score: 0.6178


Epoch 24 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.648]


Epoch 24, Training Loss: 0.6484


Epoch 24 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 24, Validation Loss: 0.6848, Accuracy: 0.6237, F1 Score: 0.6200


Epoch 25 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.638]


Epoch 25, Training Loss: 0.6377


Epoch 25 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 25, Validation Loss: 0.6817, Accuracy: 0.6221, F1 Score: 0.6174


Epoch 26 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.642]


Epoch 26, Training Loss: 0.6424


Epoch 26 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 26, Validation Loss: 0.6748, Accuracy: 0.6196, F1 Score: 0.6166


Epoch 27 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.642]


Epoch 27, Training Loss: 0.6415


Epoch 27 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 27, Validation Loss: 0.6797, Accuracy: 0.6229, F1 Score: 0.6180


Epoch 28 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.634]


Epoch 28, Training Loss: 0.6338


Epoch 28 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 28, Validation Loss: 0.6770, Accuracy: 0.6245, F1 Score: 0.6204


Epoch 29 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.633]


Epoch 29, Training Loss: 0.6334


Epoch 29 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 29, Validation Loss: 0.6739, Accuracy: 0.6269, F1 Score: 0.6239


Epoch 30 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.634]


Epoch 30, Training Loss: 0.6338


Epoch 30 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 30, Validation Loss: 0.6812, Accuracy: 0.6261, F1 Score: 0.6219


Epoch 31 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.633]


Epoch 31, Training Loss: 0.6329


Epoch 31 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 31, Validation Loss: 0.6747, Accuracy: 0.6245, F1 Score: 0.6216


Epoch 32 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.63]


Epoch 32, Training Loss: 0.6304


Epoch 32 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 32, Validation Loss: 0.6777, Accuracy: 0.6245, F1 Score: 0.6200


Epoch 33 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.625]


Epoch 33, Training Loss: 0.6247


Epoch 33 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 33, Validation Loss: 0.6723, Accuracy: 0.6245, F1 Score: 0.6228


Epoch 34 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.00it/s, loss=0.619]


Epoch 34, Training Loss: 0.6190


Epoch 34 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 34, Validation Loss: 0.6691, Accuracy: 0.6253, F1 Score: 0.6227


Epoch 35 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.627]


Epoch 35, Training Loss: 0.6267


Epoch 35 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 35, Validation Loss: 0.6725, Accuracy: 0.6277, F1 Score: 0.6228


Epoch 36 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.625]


Epoch 36, Training Loss: 0.6251


Epoch 36 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 36, Validation Loss: 0.6674, Accuracy: 0.6245, F1 Score: 0.6219


Epoch 37 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.617]


Epoch 37, Training Loss: 0.6175


Epoch 37 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 37, Validation Loss: 0.6725, Accuracy: 0.6269, F1 Score: 0.6228


Epoch 38 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.621]


Epoch 38, Training Loss: 0.6205


Epoch 38 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 38, Validation Loss: 0.6653, Accuracy: 0.6229, F1 Score: 0.6209


Epoch 39 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.618]


Epoch 39, Training Loss: 0.6179


Epoch 39 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 39, Validation Loss: 0.6658, Accuracy: 0.6245, F1 Score: 0.6206


Epoch 40 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.618]


Epoch 40, Training Loss: 0.6176


Epoch 40 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 40, Validation Loss: 0.6664, Accuracy: 0.6277, F1 Score: 0.6253


Epoch 41 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.612]


Epoch 41, Training Loss: 0.6119


Epoch 41 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 41, Validation Loss: 0.6639, Accuracy: 0.6237, F1 Score: 0.6212


Epoch 42 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.613]


Epoch 42, Training Loss: 0.6132


Epoch 42 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 42, Validation Loss: 0.6667, Accuracy: 0.6245, F1 Score: 0.6213


Epoch 43 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.613]


Epoch 43, Training Loss: 0.6135


Epoch 43 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 43, Validation Loss: 0.6649, Accuracy: 0.6253, F1 Score: 0.6225


Epoch 44 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.616]


Epoch 44, Training Loss: 0.6156


Epoch 44 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 44, Validation Loss: 0.6653, Accuracy: 0.6294, F1 Score: 0.6270


Epoch 45 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.615]


Epoch 45, Training Loss: 0.6155


Epoch 45 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 45, Validation Loss: 0.6657, Accuracy: 0.6334, F1 Score: 0.6294


Epoch 46 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.607]


Epoch 46, Training Loss: 0.6066


Epoch 46 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 46, Validation Loss: 0.6633, Accuracy: 0.6302, F1 Score: 0.6275


Epoch 47 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.61]


Epoch 47, Training Loss: 0.6101


Epoch 47 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 47, Validation Loss: 0.6607, Accuracy: 0.6318, F1 Score: 0.6283


Epoch 48 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.614]


Epoch 48, Training Loss: 0.6142


Epoch 48 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 48, Validation Loss: 0.6609, Accuracy: 0.6326, F1 Score: 0.6294


Epoch 49 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.02it/s, loss=0.605]


Epoch 49, Training Loss: 0.6052


Epoch 49 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Epoch 49, Validation Loss: 0.6614, Accuracy: 0.6318, F1 Score: 0.6284


Epoch 50 [Training]: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s, loss=0.609]


Epoch 50, Training Loss: 0.6087


Epoch 50 [Validation]: 100%|██████████| 5/5 [00:04<00:00,  1.05it/s]


Epoch 50, Validation Loss: 0.6604, Accuracy: 0.6342, F1 Score: 0.6313


In [13]:
import torch
from transformers import BertTokenizer
import epitran

# Load model and tokenizer
model.load_state_dict(torch.load("loan_word_model.pth"))
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
epi = epitran.Epitran("fra-Latn")

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word, max_len=25):
    try:
        loan_epitran = epi.transliterate(word)
        phonetic_seq = [ord(c) for c in loan_epitran] 
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")
        phonetic_seq = [0] 

    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)

    if len(unicode_features) < max_len:
        unicode_features = unicode_features + [0] * (max_len - len(unicode_features))
    else:
        unicode_features = unicode_features[:max_len]  # Truncate if longer

    return phonetic_seq, unicode_features, [len(word)]



sentence = "The government governed a new abordage policy."
words = sentence.split()

false_loans = []
for word in words:
    phonetic_seq, unicode_feature, other_feature = extract_features(word)
    inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        logits = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            phonetic_seq=torch.tensor([phonetic_seq], dtype=torch.long).to(device),
            unicode_feature=torch.tensor([unicode_feature], dtype=torch.float).to(device),
            other_feature=torch.tensor([other_feature], dtype=torch.float).to(device)
        )
        probs = torch.softmax(logits, dim=1)
        print(word , probs)
        
        if torch.argmax(probs) == 1:  
            false_loans.append(word)

print("False loan words:", false_loans)

  model.load_state_dict(torch.load("loan_word_model.pth"))


The tensor([[0.6999, 0.3001]], device='cuda:0')
government tensor([[0.6612, 0.3388]], device='cuda:0')
governed tensor([[0.8090, 0.1910]], device='cuda:0')
a tensor([[0.7958, 0.2042]], device='cuda:0')
new tensor([[0.5417, 0.4583]], device='cuda:0')
abordage tensor([[0.4953, 0.5047]], device='cuda:0')
policy. tensor([[0.7249, 0.2751]], device='cuda:0')
False loan words: ['abordage']


In [14]:
!zip -r output.zip /kaggle/working/

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/all_languages_combined.csv (deflated 68%)
  adding: kaggle/working/loan_word_model.pth (deflated 7%)
  adding: kaggle/working/__notebook__.ipynb (deflated 95%)
