In [1]:
!pip install phonetics
!pip install epitran

Collecting phonetics
  Downloading phonetics-1.0.5.tar.gz (8.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: phonetics
  Building wheel for phonetics (setup.py) ... [?25l[?25hdone
  Created wheel for phonetics: filename=phonetics-1.0.5-py2.py3-none-any.whl size=8696 sha256=3159d0a7a398f00dbd24291af51167f84367ba6a7d91ca76e32f314be6fabf2c
  Stored in directory: /root/.cache/pip/wheels/b7/1e/82/80a78c7d1ad7fc6e0af1b4d9009360b251c0e50fe59f046edb
Successfully built phonetics
Installing collected packages: phonetics
Successfully installed phonetics-1.0.5
Collecting epitran
  Downloading epitran-1.26.0-py2.py3-none-any.whl.metadata (34 kB)
Collecting panphon>=0.20 (from epitran)
  Downloading panphon-0.21.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting jamo (from epitran)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting unicodecsv (from panphon>=0.20->epitran)
  Downloading unicodecsv-0.14.1.tar.gz 

In [2]:
import pandas as pd
import numpy as np
import phonetics
import torch
import epitran
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from scipy.spatial.distance import cosine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
import pandas as pd
import os

# List of language pairs
language_pairs = [
    ("Azerbaijani", "Arabic"),
    ("Catalan", "Arabic"),
    ("Chinese", "English"),
    ("English", "French"),
    ("English", "German"),
    ("Finnish", "Swedish"),
    ("German", "French"),
    ("German", "Italian"),
    ("Hindi", "Persian"),
    ("Hungarian", "German"),
    ("Indonesian", "Dutch"),
    ("Kazakh", "Russian"),
    ("Persian", "Arabic"),
    ("Polish", "French"),
    ("Romanian", "French"),
    ("Romanian", "Hungarian"),
]

def read_language(lang1, lang2):
    file_path = f"/kaggle/input/dataset/Datasets/production_train_test/{lang1}-{lang2}/balanced/{lang1}-{lang2}-train_production_balanced.csv"    
    if os.path.exists(file_path):  
        df = pd.read_csv(file_path)
        df = df.drop(columns=[col for col in ['Unnamed: 0.1', 'Unnamed: 0'] if col in df.columns], errors="ignore")
        df["language_pair"] = f"{lang1}-{lang2}"
        return df
    else:
        print(f"File not found: {file_path}")
        return None

dfs = [read_language(lang1, lang2) for lang1, lang2 in language_pairs]
dfs = [df for df in dfs if df is not None]

final_df = pd.concat(dfs, ignore_index=True)

print(final_df.info())

final_df.to_csv("all_languages_combined.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47384 entries, 0 to 47383
Data columns (total 21 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   loan_word                                    47384 non-null  object 
 1   original_word                                47384 non-null  object 
 2   loan_word_epitran                            47384 non-null  object 
 3   original_word_epitran                        47384 non-null  object 
 4   loan_english                                 47159 non-null  object 
 5   original_english                             47175 non-null  object 
 6   Fast Levenshtein Distance Div Maxlen         47384 non-null  float64
 7   Dolgo Prime Distance Div Maxlen              47384 non-null  float64
 8   Feature Edit Distance Div Maxlen             47384 non-null  float64
 9   Hamming Feature Distance Div Maxlen          47384 non-null  float64
 10

In [4]:
# need to install - sudo apt-get install flite

epi = epitran.Epitran('fra-Latn')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def normalize(unicode_values):
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    try:
        loan_epitran = epi.transliterate(word)
        print(f"Transliterated text: {loan_epitran}")
    except IndexError as e:
        print(f"Transliteration failed: {e}")
        loan_epitran = "N/A"
    unicode_features = [ord(c) for c in word]
    unicode_features = normalize(unicode_features)
    return {
        "word": word,
        "phonetic": loan_epitran,
        "unicode": unicode_features,
        "length": len(word)
    }

extract_features("hello")

Transliterated text: ɛlo


{'word': 'hello',
 'phonetic': 'ɛlo',
 'unicode': [-2.4000000000000057,
  -5.400000000000006,
  1.5999999999999943,
  1.5999999999999943,
  4.599999999999994],
 'length': 5}

In [5]:
epi = epitran.Epitran('fra-Latn')

def normalize(unicode_values):
    """Normalize Unicode values by subtracting the mean."""
    mean_value = sum(unicode_values) / len(unicode_values)
    return [val - mean_value for val in unicode_values]

def extract_features(word):
    """Extract phonetic and Unicode-based features for a word."""
    try:
        loan_epitran = epi.transliterate(word)  # Phonetic transliteration
    except IndexError as e:
        print(f"Transliteration failed for '{word}': {e}")

    unicode_features = [ord(c) for c in word]  # Convert to Unicode
    unicode_features = normalize(unicode_features)  # Normalize Unicode values

    return {
        "phonetic": loan_epitran,
        "unicode": unicode_features,
        "length": len(word)
    }


df_features = final_df["loan_word"].apply(extract_features).apply(pd.Series)
df_features
df = pd.concat([final_df, df_features], axis=1)

In [6]:
df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'],[0,1,0,0])
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_and_encode(text, tokenizer, max_length=120):
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return encoding["input_ids"].squeeze(0), encoding["attention_mask"].squeeze(0)

df['phonetic_'], df['attention_masks'] = zip(*df['phonetic'].apply(
    lambda x: tokenize_and_encode(x, tokenizer)
))

def pad_sequence(seq, maxlen, pad_value=0):
    """Pads sequences with pad_value up to maxlen."""
    return seq + [pad_value] * (maxlen - len(seq))

max_unicode_len = 30

df['unicode_padded'] = df['unicode'].apply(lambda x: pad_sequence(x, max_unicode_len))


  df['label'] = df["label"].replace(['random', 'hard_negative', 'loan', 'synonym'],[0,1,0,0])


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

text_pairs = list(zip(df["original_word"], df["loan_word"]))
labels = torch.tensor(df["label"].tolist())

inputs = tokenizer.batch_encode_plus(
    [f"{loan} [SEP] {original}" for loan, original in text_pairs],
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=128,
)

batch_size = 128
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], labels)
dataloader2 = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

for name, param in model.bert.named_parameters():
    if not any(layer in name for layer in ["encoder.layer.9","encoder.layer.10", "encoder.layer.11"]):
        param.requires_grad = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

for epoch in range(50):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader2, desc=f"Epoch {epoch+1}")

    for batch in progress_bar:
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=total_loss / (progress_bar.n + 1))

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader2)}")

model.save_pretrained("tuned-bert")
tokenizer.save_pretrained("tuned-bert-tokenizer")


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 371/371 [01:02<00:00,  5.97it/s, loss=0.228]


Epoch 1, Loss: 0.22834462734727526


Epoch 2: 100%|██████████| 371/371 [01:01<00:00,  5.99it/s, loss=0.193]


Epoch 2, Loss: 0.1933460118615563


Epoch 3: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.181]


Epoch 3, Loss: 0.18081258730385502


Epoch 4: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.173]


Epoch 4, Loss: 0.17288236368216595


Epoch 5: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.165]


Epoch 5, Loss: 0.1652151090557363


Epoch 6: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.155]


Epoch 6, Loss: 0.15544505006661313


Epoch 7: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.149]


Epoch 7, Loss: 0.149126036117742


Epoch 8: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.142]


Epoch 8, Loss: 0.14168508782621342


Epoch 9: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.134]


Epoch 9, Loss: 0.13424277203182647


Epoch 10: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.125]


Epoch 10, Loss: 0.12475995043659147


Epoch 11: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.119]


Epoch 11, Loss: 0.11850718328493304


Epoch 12: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.11]


Epoch 12, Loss: 0.11013716926692792


Epoch 13: 100%|██████████| 371/371 [01:01<00:00,  5.99it/s, loss=0.104]


Epoch 13, Loss: 0.10352582091872262


Epoch 14: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0993]


Epoch 14, Loss: 0.09933326911492489


Epoch 15: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0922]


Epoch 15, Loss: 0.09220582221175301


Epoch 16: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0862]


Epoch 16, Loss: 0.08619845535033796


Epoch 17: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0803]


Epoch 17, Loss: 0.08026497254795784


Epoch 18: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0773]


Epoch 18, Loss: 0.07727399695729913


Epoch 19: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0719]


Epoch 19, Loss: 0.07189182280700884


Epoch 20: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0662]


Epoch 20, Loss: 0.06615425176627311


Epoch 21: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0631]


Epoch 21, Loss: 0.0630623845714885


Epoch 22: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0595]


Epoch 22, Loss: 0.05946778968438466


Epoch 23: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0569]


Epoch 23, Loss: 0.05685866390407648


Epoch 24: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0548]


Epoch 24, Loss: 0.054798679062318445


Epoch 25: 100%|██████████| 371/371 [01:01<00:00,  5.99it/s, loss=0.0481]


Epoch 25, Loss: 0.04808460109970078


Epoch 26: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0464]


Epoch 26, Loss: 0.046422282940484685


Epoch 27: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0462]


Epoch 27, Loss: 0.04624701582266876


Epoch 28: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0427]


Epoch 28, Loss: 0.04268178650657483


Epoch 29: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0409]


Epoch 29, Loss: 0.040875322867004134


Epoch 30: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.0378]


Epoch 30, Loss: 0.03784740433935327


Epoch 31: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0397]


Epoch 31, Loss: 0.03970676245035433


Epoch 32: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0355]


Epoch 32, Loss: 0.03549488787187861


Epoch 33: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0342]


Epoch 33, Loss: 0.03419149527514499


Epoch 34: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0321]


Epoch 34, Loss: 0.032144862184780486


Epoch 35: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0318]


Epoch 35, Loss: 0.03182166152146728


Epoch 36: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0323]


Epoch 36, Loss: 0.032276869962378496


Epoch 37: 100%|██████████| 371/371 [01:01<00:00,  6.00it/s, loss=0.0287]


Epoch 37, Loss: 0.0286706754011466


Epoch 38: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0291]


Epoch 38, Loss: 0.02914399319489027


Epoch 39: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0274]


Epoch 39, Loss: 0.027422290307286414


Epoch 40: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0271]


Epoch 40, Loss: 0.02708465071059914


Epoch 41: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.026]


Epoch 41, Loss: 0.025985350888040458


Epoch 42: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0256]


Epoch 42, Loss: 0.02561909328321512


Epoch 43: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0251]


Epoch 43, Loss: 0.02510017377335071


Epoch 44: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.0221]


Epoch 44, Loss: 0.022117305025640334


Epoch 45: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.0231]


Epoch 45, Loss: 0.023111556881958383


Epoch 46: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.0235]


Epoch 46, Loss: 0.023486615945597623


Epoch 47: 100%|██████████| 371/371 [01:01<00:00,  6.03it/s, loss=0.0209]


Epoch 47, Loss: 0.020939104386725818


Epoch 48: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0205]


Epoch 48, Loss: 0.02047517222066465


Epoch 49: 100%|██████████| 371/371 [01:01<00:00,  5.99it/s, loss=0.0196]


Epoch 49, Loss: 0.019584939286802806


Epoch 50: 100%|██████████| 371/371 [01:01<00:00,  6.04it/s, loss=0.0195]


Epoch 50, Loss: 0.01951124999879902


('tuned-bert-tokenizer/tokenizer_config.json',
 'tuned-bert-tokenizer/special_tokens_map.json',
 'tuned-bert-tokenizer/vocab.txt',
 'tuned-bert-tokenizer/added_tokens.json')

In [9]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model_path = "tuned-bert"
tokenizer_path = "tuned-bert-tokenizer"

tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
model = BertForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Set model to evaluation mode

def predict(loan_word, original_word):
    """Predicts if a loanword belongs to the target language."""
    
    encoded_input = tokenizer(
        f"{loan_word} [SEP] {original_word}",
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    input_ids = encoded_input["input_ids"].to(device)
    attention_mask = encoded_input["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()  # Convert to class label
    print(logits)
    return predicted_label

loan_word = "qəhbə"
original_word = "قَحْبَة"

predicted_class = predict(loan_word, original_word)
print(f"Prediction: {predicted_class}")


tensor([[ 6.3890, -6.8924]], device='cuda:0')
Prediction: 0


In [10]:
for name, param in model.named_parameters():
    print(f"Layer: {name} | Shape: {param.shape}")


Layer: bert.embeddings.word_embeddings.weight | Shape: torch.Size([119547, 768])
Layer: bert.embeddings.position_embeddings.weight | Shape: torch.Size([512, 768])
Layer: bert.embeddings.token_type_embeddings.weight | Shape: torch.Size([2, 768])
Layer: bert.embeddings.LayerNorm.weight | Shape: torch.Size([768])
Layer: bert.embeddings.LayerNorm.bias | Shape: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.query.weight | Shape: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.query.bias | Shape: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.key.weight | Shape: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.key.bias | Shape: torch.Size([768])
Layer: bert.encoder.layer.0.attention.self.value.weight | Shape: torch.Size([768, 768])
Layer: bert.encoder.layer.0.attention.self.value.bias | Shape: torch.Size([768])
Layer: bert.encoder.layer.0.attention.output.dense.weight | Shape: torch.Size([768, 768])
Layer: bert.encoder.layer.0.at

In [11]:
final_df

Unnamed: 0,loan_word,original_word,loan_word_epitran,original_word_epitran,loan_english,original_english,Fast Levenshtein Distance Div Maxlen,Dolgo Prime Distance Div Maxlen,Feature Edit Distance Div Maxlen,Hamming Feature Distance Div Maxlen,...,Partial Hamming Feature Distance Div Maxlen,plain Levenshtein,loan_unicode,original_unicode,label,label_bin,DNN_logits,MBERT_cos_sim,XLM_cos_sim,language_pair
0,Möhkəmlik,البطولة,mœhkæmlɪk,albtˤuːlt,Durability,Championship,1.000000,0.666667,0.354167,0.393519,...,0.377315,9,Latin,Arabic,synonym,0,-18.639088,0.805898,0.519797,Azerbaijani-Arabic
1,mehriban,مرن,mehrɪbɑn,mrn,kind,flexible,0.625000,0.625000,0.557292,0.625000,...,0.625000,8,Latin,Arabic,synonym,0,-9.437688,0.833718,0.753124,Azerbaijani-Arabic
2,qəhbə,قَحْبَة,ɡæhbæ,qَħْbَt,bastard,whore,0.857143,0.600000,0.291667,0.325000,...,0.312500,7,Latin,Arabic,loan,1,4.729791,0.672404,0.517030,Azerbaijani-Arabic
3,təslim olmaq,التنازل عن العرش,tæslɪm ɔlmɑɡ,altnaːzl ʕn aːlʕrʃ,surrender,abdication,0.777778,0.571429,0.299107,0.333333,...,0.318452,15,Latin,Arabic,synonym,0,-49.171078,0.902082,0.500199,Azerbaijani-Arabic
4,tələsmək,مضطرب,tælæsmæk,mdˤtˤrb,hurry up,مضطرب,1.000000,0.875000,0.466146,0.520833,...,0.505208,8,Latin,Arabic,synonym,0,-7.328197,0.871732,0.551427,Azerbaijani-Arabic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47379,Barabas,Fellak,barabas,fɛlːɒk,Barabas,Fellak,1.000000,0.428571,0.351190,0.392857,...,0.377976,6,Latin,Latin,random,0,-10.796869,0.893563,0.532925,Romanian-Hungarian
47380,Agrișteu,Egrestő,aɡriʃteu,ɛɡrɛʃtøː,Agristeu,Egrestő,0.500000,0.125000,0.143229,0.156250,...,0.156250,5,Latin,Latin,loan,1,3.594073,0.913684,0.794811,Romanian-Hungarian
47381,Hădărău,Hadaró,hədərəu,hɒdɒroː,Hădărău,Hadaró,0.571429,0.142857,0.181548,0.196429,...,0.196429,4,Latin,Latin,loan,1,0.199825,0.855007,0.712788,Romanian-Hungarian
47382,Mateiaș,Mátyás,mateiaʃ,maːcaːʃ,Mateiaș,Matthias,0.571429,0.428571,0.288690,0.321429,...,0.318452,5,Latin,Latin,loan,1,-0.000949,0.877873,0.582590,Romanian-Hungarian
