# Lost in Translation: Computational Approach to Linear A Decryption with LSTM and Transformer Models
### *Team: Steven Lu, Georgiy Sekretaryuk, Oluwafemi*

## OUTLINE

Part 1 Goals:
- replicate NeuroDecipher LSTM model with Linear B
- apply NeuroDecipher NLP approaches in a transformer model
- test different pre-training techniques and parameters to see how it influences the result

Part 2 Goals:

...TBD after Nov 13
- Work with Linear A here

## IMPORTS

Import the necessary libraries for the project and define any additional configurations.

In [49]:
# IMPORT THE LIBRARIES HERE
!pip install transformers
!pip install torch
!pip install transliterate
!pip install sentencepiece
import os
import shutil
import sys
import pandas as pd
from transformers import BertTokenizer, BertModel, BertConfig
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import Adam
import torch.nn.functional as F
from transliterate import translit, get_available_language_codes
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch



In [2]:
#setup for GDrive
# #@title SELECT USER to mount the data drive according to its path in your drive
# USER = 'Georgiy' #@param ['Georgiy', 'Steven', 'Oluwafemi']

# #@title Mount GDrive
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)
# #remove cache
# !rm -rf "/content/drive/MyDrive/NLP_266/__pycache__"

# #@title Set PATH to /data/ folder
# PATHS = {}
# PATHS['Georgiy'] = "/content/drive/MyDrive/NLP_266"
# PATHS['Steven'] = "/content/drive/Shareddrives/PathForSteven"  # Replace with the actual path
# PATHS['Oluwafemi'] = "/content/drive/Shareddrives/PathForOluwafemi"  # Replace with the actual path
# PATH = PATHS[USER]

# if PATH == "":
#     raise ValueError("Enter your path to the shared data folder.\nIt should start with 'content/drive/...' and end with '.../281 Final Project/data/")


In [3]:
# # Import Lin B from NeuroDecipher https://github.com/j-luo93/NeuroDecipher
#only run this if the NeuroDecipher folder is empty
# folder_path = 'NeuroDecipher'

# if os.path.exists(folder_path):
#    shutil.rmtree(folder_path)
#    print(f"The folder '{folder_path}' has been removed.")
# else:
#    print(f"The folder '{folder_path}' does not exist.")

# !git clone https://github.com/j-luo93/NeuroDecipher
# !git submodule init && git submodule update
# !pip install torch torchvision torchaudio
# !cd NeuroDecipher && pip install -r requirements.txt
# !cd NeuroDecipher && pip install .
# !cd NeuroDecipher/arglib && ls
# !cd NeuroDecipher/editdistance && pip install .
# !cd NeuroDecipher/arglib && pip install .
# !cd NeuroDecipher/dev_misc && pip install -r requirements.txt
# !cd NeuroDecipher/dev_misc && pip install .

## LOAD THE DATA

Load the data from https://github.com/j-luo93/NeuroDecipher.

Each .cog file is essentially a tsv file, where each column corresponds to the words in one language. Words in the same row are considered cognates. If for one word, there is no corresponding cognate in another language, _ is used to fill the cell. If multiple cognates are available for the same word, '|' is used to separate them.


In [4]:
# Load the data into a pandas DataFrame
file_path = 'NeuroDecipher/data/linear_b-greek.cog'
file_path_names = 'NeuroDecipher/data/linear_b-greek.names.cog'
data_linearb = pd.read_csv(file_path, sep='\t', header=0)
data_linearb_names = pd.read_csv(file_path_names, sep='\t', header=0)

# Print data for testing
print('Loaded Linear B Cognates before modifications:\n', data_linearb)
print('Loaded Linear B Names before modifications:\n', data_linearb_names)

Loaded Linear B Cognates before modifications:
     linear_b              greek
0      𐀀𐀁𐀪𐀦𐀲          αελιποτας
1       𐀀𐀁𐀴𐀵     αεθιστος|εθιζω
2       𐀀𐀅𐀔𐀃      αδαμαο|αδαμας
3       𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς
4      𐀀𐀅𐀨𐀴𐀍          αδραστιος
..       ...                ...
914     𐁆𐀯𐀊𐀒          φυσιαρχος
915       𐁆𐀳              φυτερ
916     𐁆𐀳𐀪𐀊            φυτερια
917   𐁆𐁈𐀀𐀐𐀩𐀄       φυλιαςαγρευς
918       𐁇𐀜             φτενοι

[919 rows x 2 columns]
Loaded Linear B Names before modifications:
     linear_b              greek
0      𐀀𐀁𐀪𐀦𐀲          αελιποτας
1       𐀀𐀁𐀴𐀵                  _
2       𐀀𐀅𐀔𐀃      αδαμαο|αδαμας
3       𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς
4      𐀀𐀅𐀨𐀴𐀍          αδραστιος
..       ...                ...
914     𐁆𐀯𐀊𐀒          φυσιαρχος
915       𐁆𐀳                  _
916     𐁆𐀳𐀪𐀊                  _
917   𐁆𐁈𐀀𐀐𐀩𐀄       φυλιαςαγρευς
918       𐁇𐀜                  _

[919 rows x 2 columns]


## DATA MODIFICATION

- Do we split the data into individual letters?

- INstead of columns for cog 1 / cog 2, turn it into rows -> increases dataset size
- turn empty rows into test/train



In [5]:
# @title Modify the Data

# LINEAR B COGNATES

# Renaming the original greek column to track the original
data_linearb.rename(columns={'greek': 'greek_original'}, inplace=True)
# Split the 'Greek' col into 2
split_columns = data_linearb['greek_original'].str.split('|', expand=True)

# Assigning split cols
data_linearb['greek_cog_1'] = split_columns[0]
data_linearb['greek_cog_2'] = split_columns[1].fillna('')
data_linearb['greek_cog_3'] = split_columns[2].fillna('')
data_linearb['greek_cog_4'] = split_columns[3].fillna('')
data_linearb['greek_cog_5'] = split_columns[4].fillna('')

# LINEAR B NAMES

data_linearb_names.rename(columns={'greek': 'greek_original'}, inplace=True)
# Split the 'Greek' col into 2
split_columns = data_linearb_names['greek_original'].str.split('|', expand=True)
print(split_columns.head()) #max: 5
# Assigning split cols
data_linearb_names['greek_cog_1'] = split_columns[0]
data_linearb_names['greek_cog_2'] = split_columns[1].fillna('')
data_linearb_names['greek_cog_3'] = split_columns[2].fillna('')
data_linearb_names['greek_cog_4'] = split_columns[3].fillna('')
data_linearb_names['greek_cog_5'] = split_columns[4].fillna('')
# Replace all _ with blank space
data_linearb_names.replace('_', '', inplace=True)



           0        1     2     3     4
0  αελιποτας     None  None  None  None
1          _     None  None  None  None
2     αδαμαο   αδαμας  None  None  None
3  αδαμεfεις  αδαμευς  None  None  None
4  αδραστιος     None  None  None  None


In [6]:

assert(len(data_linearb)==len(data_linearb_names))
data_linearb_split=[]
data_linearb_names_split=[]
for i in range(len(data_linearb)):
    #fill linear B
    temp=[data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_1"].iloc[i]]
    data_linearb_split.append(temp)
    if data_linearb["greek_cog_2"].iloc[i]!="":
        data_linearb_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_2"].iloc[i]])
    if data_linearb["greek_cog_3"].iloc[i]!="":
        data_linearb_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_3"].iloc[i]])
    if data_linearb["greek_cog_4"].iloc[i]!="":
        data_linearb_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_4"].iloc[i]])
    if data_linearb["greek_cog_5"].iloc[i]!="":
        data_linearb_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_5"].iloc[i]])
    #fill linear B names
    temp=[data_linearb_names["linear_b"].iloc[i],data_linearb_names["greek_cog_1"].iloc[i]]
    data_linearb_names_split.append(temp)
    if data_linearb_names["greek_cog_2"].iloc[i]!="":
        data_linearb_names_split.append([data_linearb_names["linear_b"].iloc[i],data_linearb_names["greek_cog_2"].iloc[i]])
    if data_linearb_names["greek_cog_3"].iloc[i]!="":
        data_linearb_names_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_3"].iloc[i]])
    if data_linearb_names["greek_cog_4"].iloc[i]!="":
        data_linearb_names_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_4"].iloc[i]])
    if data_linearb_names["greek_cog_5"].iloc[i]!="":
        data_linearb_names_split.append([data_linearb["linear_b"].iloc[i],data_linearb["greek_cog_5"].iloc[i]])
data_linearb_split=pd.DataFrame(data_linearb_split,columns=["linear_b","greek"])
data_linearb_names_split=pd.DataFrame(data_linearb_names_split,columns=["linear_b","greek"])



In [7]:
# Counting Linear B original dataset and split dataset.

print("Original:",len(data_linearb),"lines")
print(data_linearb.head(),'\n')
print("Split:",len(data_linearb_split),"lines")
print(data_linearb_split.head(),'\n')

##########################
###### SANITY CHECK ######
##########################

data_linearb_split_count = data_linearb_split["linear_b"].value_counts().reset_index()
data_linearb_split_count.columns = ["linear_b", "count"]
# print(data_linearb_split_count)

# Count how many greek definitions each linear b value has in the original table, separeted by '|'
data_linearb_count = data_linearb["greek_original"].apply(lambda x: 0 if pd.isna(x) else (1 if '|' not in x else x.count('|') + 1))
data_linearb_count = pd.DataFrame({"linear_b": data_linearb["linear_b"], "count": data_linearb_count})
# print(data_linearb_count)

# The values should match. If they don't, print out the rows that don't match. Otherwise, print out the number of matches.
match_count = 0
for index, row in data_linearb_count.iterrows():
    split_count = data_linearb_split_count[data_linearb_split_count['linear_b'] == row['linear_b']]['count'].values
    original_count = row['count']
    if split_count != original_count:
        print("Value:", row['linear_b'], "Split Count:", split_count, "Original Count:", original_count)
    else:
        match_count += 1
print(f"\nWe have {match_count} matches.")

Original: 919 lines
  linear_b     greek_original greek_cog_1 greek_cog_2 greek_cog_3 greek_cog_4   
0    𐀀𐀁𐀪𐀦𐀲          αελιποτας   αελιποτας                                      \
1     𐀀𐀁𐀴𐀵     αεθιστος|εθιζω    αεθιστος       εθιζω                           
2     𐀀𐀅𐀔𐀃      αδαμαο|αδαμας      αδαμαο      αδαμας                           
3     𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς   αδαμεfεις     αδαμευς                           
4    𐀀𐀅𐀨𐀴𐀍          αδραστιος   αδραστιος                                       

  greek_cog_5  
0              
1              
2              
3              
4               

Split: 1429 lines
  linear_b      greek
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας
1     𐀀𐀁𐀴𐀵   αεθιστος
2     𐀀𐀁𐀴𐀵      εθιζω
3     𐀀𐀅𐀔𐀃     αδαμαο
4     𐀀𐀅𐀔𐀃     αδαμας 

Value: 𐀩𐀺 Split Count: [5] Original Count: 7
Value: 𐀵𐀥𐀆 Split Count: [5] Original Count: 6

We have 917 matches.


In [8]:
#Counting Linear B Names and split names.

# print('\n ------ LINEAR B NAMES -----\n')
print("Original:",len(data_linearb_names),"lines")
print(data_linearb_names.head())
print("Split:",len(data_linearb_names_split),"lines")
print(data_linearb_names_split.head())

##########################
###### SANITY CHECK ######
##########################

# Count how many times each linear b value appears int he split.
data_linearb_names_split_count = data_linearb_names_split["linear_b"].value_counts().reset_index()
data_linearb_names_split_count.columns = ["linear_b", "count"]
# print(data_linearb_names_split_count)

# Count how many greek definitions each linear b value has in the original table, separeted by '|'
data_linearb_names_count = data_linearb_names["greek_original"].apply(lambda x: 0 if pd.isna(x) else (1 if '|' not in x else x.count('|') + 1))
data_linearb_names_count = pd.DataFrame({"linear_b": data_linearb_names["linear_b"], "count": data_linearb_names_count})
# print(data_linearb_names_count)

# The values should match. If they don't, print out the rows that don't match. Otherwise, print out the number of matches.
match_count = 0
for index, row in data_linearb_names_count.iterrows():
    split_count = data_linearb_names_split_count[data_linearb_names_split_count['linear_b'] == row['linear_b']]['count'].values
    original_count = row['count']
    if split_count != original_count:
        print("Value:", row['linear_b'], "Split Count:", split_count, "Original Count:", original_count)
    else:
        match_count += 1
print(f"\nWe have {match_count} matches.")



Original: 919 lines
  linear_b     greek_original greek_cog_1 greek_cog_2 greek_cog_3 greek_cog_4   
0    𐀀𐀁𐀪𐀦𐀲          αελιποτας   αελιποτας                                      \
1     𐀀𐀁𐀴𐀵                                                                      
2     𐀀𐀅𐀔𐀃      αδαμαο|αδαμας      αδαμαο      αδαμας                           
3     𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς   αδαμεfεις     αδαμευς                           
4    𐀀𐀅𐀨𐀴𐀍          αδραστιος   αδραστιος                                       

  greek_cog_5  
0              
1              
2              
3              
4              
Split: 1069 lines
  linear_b      greek
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας
1     𐀀𐀁𐀴𐀵           
2     𐀀𐀅𐀔𐀃     αδαμαο
3     𐀀𐀅𐀔𐀃     αδαμας
4     𐀀𐀅𐀕𐀸  αδαμεfεις

We have 919 matches.


In [9]:
print("Checking unique values in each column:\n")

print("data_linearb:\n")
for col in data_linearb.columns:
    if not isinstance(data_linearb[col].iloc[0], list):
        print(f"{col} Unique Values:", data_linearb[col].nunique())

print("\n")

print("data_linearb_names:\n")
for col in data_linearb_names.columns:
    print(f"{col} Unique Values:", data_linearb_names[col].nunique())

Checking unique values in each column:

data_linearb:

linear_b Unique Values: 919
greek_original Unique Values: 918
greek_cog_1 Unique Values: 918
greek_cog_2 Unique Values: 388
greek_cog_3 Unique Values: 87
greek_cog_4 Unique Values: 28
greek_cog_5 Unique Values: 7


data_linearb_names:

linear_b Unique Values: 919
greek_original Unique Values: 456
greek_cog_1 Unique Values: 456
greek_cog_2 Unique Values: 131
greek_cog_3 Unique Values: 16
greek_cog_4 Unique Values: 5
greek_cog_5 Unique Values: 2


In [11]:
"""
TODO by Sunday:

[DONE]- Cycle through greek translations of linear B with more than 2 translations (separeted by '|' separator), create n columns where n is max count of translations in the largest row

[DONE]- INstead of columns for cog 1 - cog n, turn it into rows -> increases dataset size 
[DONE]- TWEAK - see above 
[DONE]- Determine which are names and not names
#done- see below
    - For each language, create an object. In the 
      object store data for the alphabet and the universal syllabic translation.


#OLD:
# - Create a mapping dict for universal character embeddings for linear b and for greek
#     - Create a unersal syllable matrix
# - Map linear b to universal syllables (matrix)
# - Map greek to universal syllables (matrix)

#NEW:
[DONE]- Transliterate Linear B - done
[DONE]- Transliterate Modern Greek - done

- Cycle through each greek word. Find word with highest "syllabic matching" to linear B and use that word for the model.
𐀀𐀇𐀪𐀊𐀠	ανδριαντει|ανδριαφι|ανδριος|ανερ

- turn empty rows into train/test
#Steven - done: see above
- Create a train/test split of 20/80 (50/50 distribution of names/not name cognates?)


- Identify separators for transliterated Linear B
  - Separators between characters; separators between words

- UNKNOWN: separators for transliterated Greek
  - Q: How do we set up the model to predict this...

- Model BART, T5

"""

'\nTODO by Sunday:\n\n[DONE]- Cycle through greek translations of linear B with more than 2 translations (separeted by \'|\' separator), create n columns where n is max count of translations in the largest row\n\n[DONE]- INstead of columns for cog 1 - cog n, turn it into rows -> increases dataset size \n[DONE]- TWEAK - see above \n[DONE]- Determine which are names and not names\n#done- see below\n    - For each language, create an object. In the \n      object store data for the alphabet and the universal syllabic translation.\n\n\n#OLD:\n# - Create a mapping dict for universal character embeddings for linear b and for greek\n#     - Create a unersal syllable matrix\n# - Map linear b to universal syllables (matrix)\n# - Map greek to universal syllables (matrix)\n\n#NEW:\n[DONE]- Transliterate Linear B - done\n[DONE]- Transliterate Modern Greek - done\n\n- Cycle through each greek word. Find word with highest "syllabic matching" to linear B and use that word for the model.\n𐀀𐀇𐀪𐀊𐀠\tανδρι

In [46]:
# linear b syllabic mapping from NeuroDecipher MIT paper

linb2syl = {
    u'𐀀': 'a', u'𐀁': 'e', u'𐀂': 'i', u'𐀃': 'o', u'𐀄': 'u', u'𐀅': 'da', u'𐀆': 'de', 
    u'𐀇': 'di', u'𐀈': 'do', u'𐀉': 'du', u'𐀊': 'ja', u'𐀋': 'je', u'𐀍': 'jo', 
    u'𐀎': 'ju', u'𐀏': 'ka', u'𐀐': 'ke', u'𐀑': 'ki', u'𐀒': 'ko', u'𐀓': 'ku', 
    u'𐀔': 'ma', u'𐀕': 'me', u'𐀖': 'mi', u'𐀗': 'mo', u'𐀘': 'mu', u'𐀙': 'na', 
    u'𐀚': 'ne', u'𐀛': 'ni', u'𐀜': 'no', u'𐀝': 'nu', u'𐀞': 'pa', u'𐀟': 'pe', 
    u'𐀠': 'pi', u'𐀡': 'po', u'𐀢': 'pu', u'𐀣': 'qa', u'𐀤': 'qe', u'𐀥': 'qi', 
    u'𐀦': 'qo', u'𐀨': 'ra', u'𐀩': 're', u'𐀪': 'ri', u'𐀫': 'ro', u'𐀬': 'ru',
    u'𐀭': 'sa', u'𐀮': 'se', u'𐀯': 'si', u'𐀰': 'so', u'𐀱': 'su', u'𐀲': 'ta', 
    u'𐀳': 'te', u'𐀴': 'ti', u'𐀵': 'to', u'𐀶': 'tu', u'𐀷': 'wa', u'𐀸': 'we', 
    u'𐀹': 'wi', u'𐀺': 'wo', u'𐀼': 'za', u'𐀽': 'ze', u'𐀿': 'zo', u'𐁀': 'a2', 
    u'𐁁': 'a3', u'𐁂': 'au', u'𐁃': 'dwe', u'𐁄': 'dwo', u'𐁅': 'nwa', u'𐁆': 'pu2', 
    u'𐁇': 'pte', u'𐁈': 'ra2', u'𐁉': 'ra3', u'𐁊': 'ro2', u'𐁋': 'ta2', u'𐁌': 'twe', u'𐁍': 'two'
}

def transliterate_linb(word,dic):
    """Transliterate Linear B characters into syllables."""
    
    res="" #result

    # Cycle through each character in the Linear B word
    for ch in word:

        # translation is the value for that character's key in the mapping dictionary
        trans=dic[ch]
        res+=trans

    return res

#test
print(transliterate_linb("𐀀𐀁𐀪𐀦𐀲",linb2syl))

aeriqota


In [32]:
#put binary inside the split name data of whether the linear b value is a name or not. 

def name(row):
    if row["greek"]=="":
        return 0
    return 1
data_linearb_names_split["Name"]=data_linearb_names_split.apply(name,axis=1)
print("Current names dataset:", len(data_linearb_names_split))
print(data_linearb_names_split.head())

#apply to the original, non-name dataset as well
name_binary=[]
for i in range(len(data_linearb_split)):
    cur_linearb=data_linearb_split["linear_b"].iloc[i]
    corresponding_name=data_linearb_names_split[data_linearb_names_split["linear_b"]==cur_linearb]["greek"].iloc[0]
    if corresponding_name=="":
        name_binary.append(0)
    else: 
        name_binary.append(1)
print("\nCurrent non-names dataset:", len(data_linearb_split))
data_linearb_split["Name"]=name_binary
print(data_linearb_split.head())
        

Current names dataset: 1069
  linear_b      greek  Name greek_transliterate linear_b_transliterate
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota
1     𐀀𐀁𐀴𐀵                0                                     aetito
2     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao
3     𐀀𐀅𐀔𐀃     αδαμας     1              adamas                 adamao
4     𐀀𐀅𐀕𐀸  αδαμεfεις     1           adamefeis                adamewe

Current non-names dataset: 1429
  linear_b      greek  Name greek_transliterate linear_b_transliterate
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota
1     𐀀𐀁𐀴𐀵   αεθιστος     0           aethistos                 aetito
2     𐀀𐀁𐀴𐀵      εθιζω     0              ethizo                 aetito
3     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao
4     𐀀𐀅𐀔𐀃     αδαμας     1              adamas                 adamao


In [None]:
#TRANSLITERATION OF GREEK

!pip install transliterate


In [35]:
#Transliterate Linear B Names 

# Create clean lists to store data
greek_transliterate=[]
greek_transliterate_names=[]
from transliterate import translit, get_available_language_codes

# Cycle through the dataset with the names
for i in range(len(data_linearb_names_split)):
    
    #if blank, leave blank
    if data_linearb_names_split["greek"].iloc[i]=="":greek_transliterate_names.append("")
    
    #if not blank, transliterate
    else:greek_transliterate_names.append(translit(data_linearb_names_split["greek"].iloc[i], reversed=True))

# Cycle through the dataset with the non-names
for i in range(len(data_linearb_split)):
    
    #if blank, leave blank
    if data_linearb_split["greek"].iloc[i]=="":greek_transliterate.append("")
    
    #if not blank, transliterate
    else:greek_transliterate.append(translit(data_linearb_split["greek"].iloc[i], reversed=True))


In [36]:
#Transliterate Greek names

# Create clean lists to store data
linearb_transliterate=[]
linearb_transliterate_names=[]

# Cycle through the dataset with the names
for i in range(len(data_linearb_names_split)):
    
    #if blank, leave blank
    if data_linearb_names_split["linear_b"].iloc[i]=="":linearb_transliterate_names.append("")

    #if not blank, transliterate
    else:linearb_transliterate_names.append(transliterate_linb(data_linearb_names_split["linear_b"].iloc[i], linb2syl))

# Cycle through the dataset with the non-names
for i in range(len(data_linearb_split)):
    
    #if blank, leave blank
    if data_linearb_split["linear_b"].iloc[i]=="":linearb_transliterate.append("")
    
    #if not blank, transliterate
    else:linearb_transliterate.append(transliterate_linb(data_linearb_split["linear_b"].iloc[i], linb2syl))



In [37]:
assert(len(greek_transliterate)==len(data_linearb_split))
assert(len(greek_transliterate_names)==len(data_linearb_names_split))
data_linearb_split["greek_transliterate"]=greek_transliterate
data_linearb_names_split["greek_transliterate"]=greek_transliterate_names
data_linearb_split["linear_b_transliterate"]=linearb_transliterate
data_linearb_names_split["linear_b_transliterate"]=linearb_transliterate_names
print(data_linearb_split.head())
print(data_linearb_names_split.head())

  linear_b      greek  Name greek_transliterate linear_b_transliterate
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota
1     𐀀𐀁𐀴𐀵   αεθιστος     0           aethistos                 aetito
2     𐀀𐀁𐀴𐀵      εθιζω     0              ethizo                 aetito
3     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao
4     𐀀𐀅𐀔𐀃     αδαμας     1              adamas                 adamao
  linear_b      greek  Name greek_transliterate linear_b_transliterate
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota
1     𐀀𐀁𐀴𐀵                0                                     aetito
2     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao
3     𐀀𐀅𐀔𐀃     αδαμας     1              adamas                 adamao
4     𐀀𐀅𐀕𐀸  αδαμεfεις     1           adamefeis                adamewe


## EXPLORATORY DATA ANALYSIS

Analyze the dataset features.


In [38]:
# Basic statistics and exploration

print('\n----- DESCRIBING THE NON-NAME DATA: -----\n')
print(data_linearb_split.describe())

print('\n----- INFO: -----\n')
print(data_linearb_split.info())

# Check for missing values
print('\n----- CHECKING FOR MISSING VALUES: -----\n')
print(data_linearb_split.isnull().sum())

# Explore unique values and frequency distribution
print('\n----- CHECKING UNIQUE VALUES: -----\n')
# print(data_linearb_split['linear_b'].value_counts())
# print(data_linearb['greek_original'].value_counts())
# print(data_linearb['greek_cog_1'].value_counts())
# print(data_linearb['greek_cog_2'].value_counts())



----- DESCRIBING THE COGNATE DATA: -----

              Name
count  1429.000000
mean      0.433170
std       0.495687
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000

----- INFO: -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429 entries, 0 to 1428
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   linear_b                1429 non-null   object
 1   greek                   1429 non-null   object
 2   Name                    1429 non-null   int64 
 3   greek_transliterate     1429 non-null   object
 4   linear_b_transliterate  1429 non-null   object
dtypes: int64(1), object(4)
memory usage: 55.9+ KB
None

----- CHECKING FOR MISSING VALUES: -----

linear_b                  0
greek                     0
Name                      0
greek_transliterate       0
linear_b_transliterate    0
dtype: int64

----- CHECKING UNIQUE VALUES: -----



In [40]:
print('\n----- DESCRIBING THE NAMES DATA: -----\n')
print(data_linearb_names_split.describe())

print('\n----- INFO: -----\n')
print(data_linearb_names_split.info())

# Check for missing values
print('\n----- CHECKING FOR MISSING VALUES: -----\n')
print(data_linearb_names_split.isnull().sum())

# Explore unique values and frequency distribution
print('\n----- CHECKING UNIQUE VALUES: -----\n')
# print(data_linearb_names_split['linear_b'].value_counts())
# print(data_linearb_names_split['greek_original'].value_counts())
# print(data_linearb_names_split['greek_cog_1'].value_counts())
# print(data_linearb_names_split['greek_cog_2'].value_counts())


----- DESCRIBING THE NAMES DATA: -----

              Name
count  1069.000000
mean      0.565949
std       0.495864
min       0.000000
25%       0.000000
50%       1.000000
75%       1.000000
max       1.000000

----- INFO: -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1069 entries, 0 to 1068
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   linear_b                1069 non-null   object
 1   greek                   1069 non-null   object
 2   Name                    1069 non-null   int64 
 3   greek_transliterate     1069 non-null   object
 4   linear_b_transliterate  1069 non-null   object
dtypes: int64(1), object(4)
memory usage: 41.9+ KB
None

----- CHECKING FOR MISSING VALUES: -----

linear_b                  0
greek                     0
Name                      0
greek_transliterate       0
linear_b_transliterate    0
dtype: int64

----- CHECKING UNIQUE VALUES: -----



## SPLITTING & TOKENIZATION

- Breakdown the words into characters
- ???
- Split the data into test train

In [42]:
#Creating the train/test split

#only need to split names into train and test for now,
#since the names has several hundred blanks while there are no blanks in the ovr data

data_linearb_names_train=data_linearb_names_split[data_linearb_names_split["greek"]!=""]
data_linearb_names_test=data_linearb_names_split[data_linearb_names_split["greek"]==""]
print("linearb_names_train:", len(data_linearb_names_train))
print(data_linearb_names_train.head(2))
print("linearb_names_test:", len(data_linearb_names_test))
print(data_linearb_names_test.head(2))

linearb_names_train: 605
  linear_b      greek  Name greek_transliterate linear_b_transliterate
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota
2     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao
linearb_names_test: 464
  linear_b greek  Name greek_transliterate linear_b_transliterate
1     𐀀𐀁𐀴𐀵           0                                     aetito
7      𐀀𐀆𐀳           0                                      adete


In [43]:
# @title: Splitting & tokenizing the data


# IS THIS CORRECT???

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
data_linearb_names_train['linear_b_tokens'] = data_linearb_names_train['linear_b'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
data_linearb_names_train['greek_tokens'] = data_linearb_names_train['greek'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
data_linearb_names_test['linear_b_tokens'] = data_linearb_names_test['linear_b'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# these 2 columns don't exist. Why was this added?
# data_linearb['greek_cog_1_tokens'] = data_linearb['greek_cog_1'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
# data_linearb['greek_cog_2_tokens'] = data_linearb['greek_cog_2'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True) if x else [])

# NEED TO TOKENIZE NAMES AND OTHER DATASETS THAT ARE LOADED HERE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linearb_names_train['linear_b_tokens'] = data_linearb_names_train['linear_b'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_linearb_names_train['greek_tokens'] = data_linearb_names_train['greek'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.or

In [45]:
print("linearb_names_train:", len(data_linearb_names_train))
print(data_linearb_names_train.head(2))
print("\nlinearb_names_test:", len(data_linearb_names_test))
print(data_linearb_names_test.head(2))

linearb_names_train: 605
  linear_b      greek  Name greek_transliterate linear_b_transliterate   
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας     1           aelipotas               aeriqota  \
2     𐀀𐀅𐀔𐀃     αδαμαο     1              adamao                 adamao   

   linear_b_tokens                                       greek_tokens  
0  [101, 100, 102]  [101, 1155, 29723, 29727, 18199, 29731, 29730,...  
2  [101, 100, 102]  [101, 1155, 29722, 14608, 29728, 14608, 29730,...  

linearb_names_test: 464
  linear_b greek  Name greek_transliterate linear_b_transliterate   
1     𐀀𐀁𐀴𐀵           0                                     aetito  \
7      𐀀𐀆𐀳           0                                      adete   

   linear_b_tokens  
1  [101, 100, 102]  
7  [101, 100, 102]  


## MODEL ARCHITECTURE

- Identify baseline model
- Test other Seq2seq models
  - Transformer model - our own?
  - Or can we modify BERT/another model and train it too?

In [None]:
## THE BART MODEL 1

In [None]:
# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# article_A = "Input A"
# article_B = "Expected Output B"

# model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# # translate Linear B to Greek
# tokenizer.src_lang = "ar_AR"
# encoded_ar = tokenizer(article_ar, return_tensors="pt")
# generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
# tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [51]:
import sentencepiece

train_df = data_linearb_names_train
test_df = data_linearb_names_test

# Prepare the data
train_df['input_text'] = 'translate Linear B to Greek: ' + train_df['linear_b_transliterate']
train_df['target_text'] = train_df['greek_transliterate']

# Define a custom dataset
class LinearBDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.input_texts = data['input_text']
        self.target_texts = data['target_text']
        self.max_length = max_length

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        source = self.tokenizer(self.input_texts[idx], padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt')
        target = self.tokenizer(self.target_texts[idx], padding='max_length', max_length=self.max_length, truncation=True, return_tensors='pt')
        return source, target

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

train_dataset = LinearBDataset(tokenizer, train_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Training loop (simplified)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(epochs):
    for batch in train_loader:
        input_ids = batch[0]['input_ids'].to(device)
        attention_mask = batch[0]['attention_mask'].to(device)
        labels = batch[1]['input_ids'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['input_text'] = 'translate Linear B to Greek: ' + train_df['linear_b_transliterate']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['target_text'] = train_df['greek_transliterate']


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
linearb_names_test['input_text'] = 'translate Linear B to Greek: ' + linearb_names_test['linear_b_transliterate']
test_dataset = LinearBDataset(tokenizer, linearb_names_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Function to generate predictions
def generate_predictions(model, tokenizer, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            # Prepare batch data
            input_ids = batch[0]['input_ids'].to(device)
            attention_mask = batch[0]['attention_mask'].to(device)

            # Generate prediction
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
            predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            predictions.append(predicted_text)
    return predictions

# Generate predictions
predicted_transliterations = generate_predictions(model, tokenizer, test_loader)

# Print results
for i, row in linearb_names_test.iterrows():
    print(f"Linear B Transliterate: {row['linear_b_transliterate']}")
    print(f"Predicted Greek Transliterate: {predicted_transliterations[i]}")
    print(f"Actual Greek Transliterate: {row['greek_transliterate'] if 'greek_transliterate' in row else 'N/A'}")
    print("--------------------------------------------------")


In [None]:
## T5 MODEL

In [None]:
t5_model = TFT5ForConditionalGeneration.from_pretrained('t5-large')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')
t5_model.summary()

In [None]:
ARTICLE = data_linearb_names_split['linear_b']

In [None]:
t5_input_texts = ["translate LinearB to Greek: " + str(entry) for entry in top_10_articles]
t5_inputs = t5_tokenizer(t5_input_texts, return_tensors='tf', padding=True, truncation=True)

In [None]:
t5_summary_ids = t5_model.generate(t5_inputs['input_ids'],
                                   num_beams=3,
                                   no_repeat_ngram_size=3,
                                   min_length=10,
                                   max_length=40)

print([t5_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False) for g in t5_summary_ids])

In [None]:
## BART MODEL

In [None]:
# Remove any existing file/directory with the same name
!rm -rf bart.large.tar.gz

# Download the BART model
!wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz

# Extract the tar file
!tar -xzvf bart.large.tar.gz

In [None]:
model_directory = './bart.large.tar.gz'
bart = BARTModel.from_pretrained(model_directory, checkpoint_file='model.pt')
bart.eval()

In [None]:
tokens = bart.encode('αελιποτας')
print("Encoded tokens:", tokens.tolist())
decoded_text = bart.decode(tokens)
print("Decoded text:", decoded_text)

In [None]:
last_layer_features = bart.extract_features(tokens)

assert last_layer_features.size() == torch.Size([1, len(tokens), bart.model.encoder.embed_tokens.embedding_dim])

all_layers = bart.extract_features(tokens, return_all_hiddens=True)

assert len(all_layers) == bart.model.encoder.layers.__len__() + 1  # +1 for the embedding layer
assert torch.all(all_layers[-1] == last_layer_features)

### Loading the Model

In [None]:
# Loading BERT
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True)
bert_model = BertModel(config)

### Building the Model

In [None]:
# Building the COgnate model (sample skeleton)

class CognatePredictionModel(nn.Module):
    def __init__(self, bert_model):
        super(CognatePredictionModel, self).__init__()
        self.bert = bert_model

        # BERT outputs a 768-d vector
        bert_output_size = 768

        # Additional fully connected layers
        self.fc1 = nn.Linear(bert_output_size * 2, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        # Output layer for binary classification
        self.fc3 = nn.Linear(256, 1)

    def forward(self, linear_b_tokens, greek_tokens):
        # Pass input through BERT, take pooled output
        outputs_linear_b = self.bert(linear_b_tokens)[1]
        outputs_greek = self.bert(greek_tokens)[1]

        # Concatenate the outputs
        combined = torch.cat((outputs_linear_b, outputs_greek), 1)

        # Pass through additional layers; placeholders
        x = self.fc1(combined)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)

        #print x
        # Should be tensor with logits

        return x


## TRAINING

- Train the model

In [None]:
unique_greek_tokens = set()

for tokens in data_linearb_names_train['greek']:
    unique_greek_tokens.update(tokens.split('|'))

for tokens in data_linearb_names_test['greek']:
    unique_greek_tokens.update(tokens.split('|'))

token_to_id = {token: idx for idx, token in enumerate(unique_greek_tokens)}

In [None]:
class CognateDataset(Dataset):
    def __init__(self, linear_b_tokens, greek_tokens, token_to_id, default_id=0):
        self.linear_b_tokens = linear_b_tokens
        self.greek_tokens = greek_tokens
        self.token_to_id = token_to_id
        self.default_id = default_id

    def __len__(self):
        return len(self.linear_b_tokens)

    def __getitem__(self, idx):
        linear_b_token_tensor = torch.tensor(self.linear_b_tokens[idx], dtype=torch.long)
        greek_token_tensor = torch.tensor(self.greek_tokens[idx], dtype=torch.long)

        return {
            'linear_b_tokens': linear_b_token_tensor,
            'greek_tokens': greek_token_tensor
        }

train_dataset = CognateDataset(
    data_linearb_names_train['linear_b_tokens'].tolist(),
    data_linearb_names_train['greek_tokens'].tolist(),
    token_to_id,
    default_id=0
)

test_dataset = CognateDataset(
    data_linearb_names_test['linear_b_tokens'].tolist(),
    # For test data, you might not have labels or might handle them differently
    [0] * len(data_linearb_names_test),  # Placeholder if you don't have labels
    token_to_id,
    default_id=0
)

def collate_fn(batch):
    linear_b_tokens = [item['linear_b_tokens'] for item in batch]
    greek_tokens = [item['greek_tokens'] for item in batch]

    # Pad sequences
    linear_b_tokens_padded = pad_sequence(linear_b_tokens, batch_first=True, padding_value=tokenizer.pad_token_id)
    greek_tokens_padded = pad_sequence(greek_tokens, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {
        'linear_b_tokens': linear_b_tokens_padded,
        'greek_tokens': greek_tokens_padded
    }

data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [None]:
# Model

model = CognatePredictionModel(bert_model)
loss_function = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=0.001)


num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in data_loader:
        linear_b_tokens = batch['linear_b_tokens']
        greek_tokens = batch['greek_tokens']
        optimizer.zero_grad()

        outputs = model(linear_b_tokens, greek_tokens)
        outputs = outputs.squeeze()

        loss = loss_function(outputs, greek_tokens.float())
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        predicted_labels = (outputs > 0).float()
        correct_predictions += (predicted_labels == greek_tokens).sum().item()
        total_predictions += greek_tokens.numel()

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

## EVALUATION

- The primary goal metric is accuracy as compared to NeuroDecipher

In [None]:
# Evaluation code

