# Lost in Translation: Computational Approach to Linear A Decryption with LSTM and Transformer Models
### *Team: Steven Lu, Georgiy Sekretaryuk, Oluwafemi*

## OUTLINE

Part 1 Goals:
- replicate NeuroDecipher LSTM model with Linear B
- apply NeuroDecipher NLP approaches in a transformer model
- test different pre-training techniques and parameters to see how they influence the results

Part 2 Goals:

...TBD after Nov 13
- Work with Linear A here

## IMPORTS

Import the necessary libraries for the project and define any additional configurations.

In [None]:
# IMPORT THE LIBRARIES HERE
!pip install transformers
!pip install torch
import os
import shutil
import sys

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertConfig


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#setup for GDrive
# #@title SELECT USER to mount the data drive according to its path in your drive
# USER = 'Georgiy' #@param ['Georgiy', 'Steven', 'Oluwafemi']

# #@title Mount GDrive
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)
# #remove cache
# !rm -rf "/content/drive/MyDrive/NLP_266/__pycache__"

# #@title Set PATH to /data/ folder
# PATHS = {}
# PATHS['Georgiy'] = "/content/drive/MyDrive/NLP_266"
# PATHS['Steven'] = "/content/drive/Shareddrives/PathForSteven"  # Replace with the actual path
# PATHS['Oluwafemi'] = "/content/drive/Shareddrives/PathForOluwafemi"  # Replace with the actual path
# PATH = PATHS[USER]

# if PATH == "":
#     raise ValueError("Enter your path to the shared data folder.\nIt should start with 'content/drive/...' and end with '.../281 Final Project/data/")


In [None]:
# # Import Lin B from NeuroDecipher https://github.com/j-luo93/NeuroDecipher
#only run this if the NeuroDecipher folder is empty
# folder_path = 'NeuroDecipher'

# if os.path.exists(folder_path):
#    shutil.rmtree(folder_path)
#    print(f"The folder '{folder_path}' has been removed.")
# else:
#    print(f"The folder '{folder_path}' does not exist.")

# !git clone https://github.com/j-luo93/NeuroDecipher
# !git submodule init && git submodule update
# !pip install torch torchvision torchaudio
# !cd NeuroDecipher && pip install -r requirements.txt
# !cd NeuroDecipher && pip install .
# !cd NeuroDecipher/arglib && ls
# !cd NeuroDecipher/editdistance && pip install .
# !cd NeuroDecipher/arglib && pip install .
# !cd NeuroDecipher/dev_misc && pip install -r requirements.txt
# !cd NeuroDecipher/dev_misc && pip install .

## LOAD THE DATA

Load the data from https://github.com/j-luo93/NeuroDecipher.

Each .cog file is essentially a tsv file, where each column corresponds to the words in one language. Words in the same row are considered cognates. If for one word, there is no corresponding cognate in another language, _ is used to fill the cell. If multiple cognates are available for the same word, '|' is used to separate them.


In [None]:
# Locations of the two tab-separated cognate tables inside the NeuroDecipher checkout
file_path = 'NeuroDecipher/data/linear_b-greek.cog'
file_path_names = 'NeuroDecipher/data/linear_b-greek.names.cog'

# Both files are parsed the same way: tab-delimited, column names in the first row
data_linearb, data_linearb_names = (
    pd.read_csv(cog_file, sep='\t', header=0)
    for cog_file in (file_path, file_path_names)
)

# Show the raw tables before any column is renamed or split
print(
    'Loaded Linear B Cognates before modifications:\n',
    data_linearb,
)
print(
    'Loaded Linear B Names before modifications:\n',
    data_linearb_names,
)

Loaded Linear B Cognates before modifications:
     linear_b              greek
0      𐀀𐀁𐀪𐀦𐀲          αελιποτας
1       𐀀𐀁𐀴𐀵     αεθιστος|εθιζω
2       𐀀𐀅𐀔𐀃      αδαμαο|αδαμας
3       𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς
4      𐀀𐀅𐀨𐀴𐀍          αδραστιος
..       ...                ...
914     𐁆𐀯𐀊𐀒          φυσιαρχος
915       𐁆𐀳              φυτερ
916     𐁆𐀳𐀪𐀊            φυτερια
917   𐁆𐁈𐀀𐀐𐀩𐀄       φυλιαςαγρευς
918       𐁇𐀜             φτενοι

[919 rows x 2 columns]
Loaded Linear B Names before modifications:
     linear_b              greek
0      𐀀𐀁𐀪𐀦𐀲          αελιποτας
1       𐀀𐀁𐀴𐀵                  _
2       𐀀𐀅𐀔𐀃      αδαμαο|αδαμας
3       𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς
4      𐀀𐀅𐀨𐀴𐀍          αδραστιος
..       ...                ...
914     𐁆𐀯𐀊𐀒          φυσιαρχος
915       𐁆𐀳                  _
916     𐁆𐀳𐀪𐀊                  _
917   𐁆𐁈𐀀𐀐𐀩𐀄       φυλιαςαγρευς
918       𐁇𐀜                  _

[919 rows x 2 columns]


## DATA MODIFICATION

- Do we split the data into individual letters?

- Instead of separate columns for cognate 1 / cognate 2, turn each cognate into its own row -> increases dataset size
- Use the rows with no Greek cognate as the test set and the remaining rows as the training set



In [None]:
# @title Modify the Data


def _add_cognate_columns(frame):
    """Split the '|'-separated Greek column of ``frame`` into two columns, in place.

    The ``greek`` column is renamed to ``greek_original`` so the raw value stays
    available, then ``greek_cog_1`` and ``greek_cog_2`` are added. Rows with a
    single cognate get an empty string in ``greek_cog_2``.
    """
    frame.rename(columns={'greek': 'greek_original'}, inplace=True)
    parts = frame['greek_original'].str.split('|', expand=True)
    # Guarantee that column 1 exists even when no row contains '|'
    # (otherwise parts[1] raises KeyError); only the first two parts are used.
    parts = parts.reindex(columns=[0, 1])
    frame['greek_cog_1'] = parts[0]
    frame['greek_cog_2'] = parts[1].fillna('')


def _one_row_per_cognate(frame):
    """Return a new ``linear_b``/``greek`` frame with one row per cognate.

    Every input row yields a row for ``greek_cog_1`` (even when it is empty)
    followed by a row for ``greek_cog_2`` when that one is non-empty.
    """
    rows = []
    for word, first, second in zip(
        frame['linear_b'], frame['greek_cog_1'], frame['greek_cog_2']
    ):
        rows.append([word, first])
        if second != '':
            rows.append([word, second])
    return pd.DataFrame(rows, columns=['linear_b', 'greek'])


# LINEAR B COGNATES
_add_cognate_columns(data_linearb)

# LINEAR B NAMES
_add_cognate_columns(data_linearb_names)
# '_' marks a missing cognate; turn those cells into empty strings
data_linearb_names.replace('_', '', inplace=True)

# The two tables are expected to describe the same Linear B words row by row
if len(data_linearb) != len(data_linearb_names):
    raise ValueError(
        'Cognate and names tables differ in length: '
        f'{len(data_linearb)} vs {len(data_linearb_names)}'
    )

data_linearb_split = _one_row_per_cognate(data_linearb)
data_linearb_names_split = _one_row_per_cognate(data_linearb_names)

# Display first few rows
print(data_linearb.head(),'\n')
print(data_linearb_split.head(),'\n')
# print('\n ------ LINEAR B NAMES -----\n')
print(data_linearb_names.head())
print(data_linearb_names_split.head())

  linear_b     greek_original greek_cog_1 greek_cog_2
0    𐀀𐀁𐀪𐀦𐀲          αελιποτας   αελιποτας            
1     𐀀𐀁𐀴𐀵     αεθιστος|εθιζω    αεθιστος       εθιζω
2     𐀀𐀅𐀔𐀃      αδαμαο|αδαμας      αδαμαο      αδαμας
3     𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς   αδαμεfεις     αδαμευς
4    𐀀𐀅𐀨𐀴𐀍          αδραστιος   αδραστιος             

  linear_b      greek
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας
1     𐀀𐀁𐀴𐀵   αεθιστος
2     𐀀𐀁𐀴𐀵      εθιζω
3     𐀀𐀅𐀔𐀃     αδαμαο
4     𐀀𐀅𐀔𐀃     αδαμας 

  linear_b     greek_original greek_cog_1 greek_cog_2
0    𐀀𐀁𐀪𐀦𐀲          αελιποτας   αελιποτας            
1     𐀀𐀁𐀴𐀵                                           
2     𐀀𐀅𐀔𐀃      αδαμαο|αδαμας      αδαμαο      αδαμας
3     𐀀𐀅𐀕𐀸  αδαμεfεις|αδαμευς   αδαμεfεις     αδαμευς
4    𐀀𐀅𐀨𐀴𐀍          αδραστιος   αδραστιος            
  linear_b      greek
0    𐀀𐀁𐀪𐀦𐀲  αελιποτας
1     𐀀𐀁𐀴𐀵           
2     𐀀𐀅𐀔𐀃     αδαμαο
3     𐀀𐀅𐀔𐀃     αδαμας
4     𐀀𐀅𐀕𐀸  αδαμεfεις


In [None]:
# Only the names table needs a train/test split for now: it has several hundred
# rows without a Greek cognate, while the overall cognate table has none.
has_greek = data_linearb_names_split["greek"] != ""

# .copy() makes both splits independent frames. Without it they are slices of
# data_linearb_names_split, and adding columns to them later raises pandas'
# SettingWithCopyWarning.
data_linearb_names_train = data_linearb_names_split[has_greek].copy()
data_linearb_names_test = data_linearb_names_split[~has_greek].copy()
print(len(data_linearb_names_train))
print(len(data_linearb_names_test))

585
464


## EXPLORATORY DATA ANALYSIS

Analyze the dataset features.


In [None]:
# Basic statistics and exploration of the cognate table

# Summary statistics for every column
print('\n----- DESCRIBING THE COGNATE DATA: -----\n', data_linearb.describe(), sep='\n')

# Column dtypes and non-null counts.
# info() writes its report itself and returns None, so the wrapping print
# also emits a trailing "None" line.
print('\n----- INFO: -----\n')
print(data_linearb.info())

# Number of missing values per column
print('\n----- CHECKING FOR MISSING VALUES: -----\n', data_linearb.isnull().sum(), sep='\n')

# Frequency distribution of the values in each column
print('\n----- CHECKING UNIQUE VALUES: -----\n')
print(
    *(
        data_linearb[column].value_counts()
        for column in ('linear_b', 'greek_original', 'greek_cog_1', 'greek_cog_2')
    ),
    sep='\n',
)



----- DESCRIBING THE COGNATE DATA: -----

       linear_b greek_original greek_cog_1 greek_cog_2
count       919            919         919         919
unique      919            918         918         388
top       𐀀𐀁𐀪𐀦𐀲        επι|οπι         επι            
freq          1              2           2         528

----- INFO: -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   linear_b        919 non-null    object
 1   greek_original  919 non-null    object
 2   greek_cog_1     919 non-null    object
 3   greek_cog_2     919 non-null    object
dtypes: object(4)
memory usage: 28.8+ KB
None

----- CHECKING FOR MISSING VALUES: -----

linear_b          0
greek_original    0
greek_cog_1       0
greek_cog_2       0
dtype: int64

----- CHECKING UNIQUE VALUES: -----

linear_b
𐀀𐀁𐀪𐀦𐀲     1
𐀟𐀩𐀄𐀫𐀙𐀆    1
𐀟𐀩𐀐𐀄      1
𐀟𐀩𐀦𐀲      1
𐀟𐀪𐀕𐀆      1
     

In [None]:
# Same exploration as for the cognate table, applied to the names table

# Summary statistics for every column
print('\n----- DESCRIBING THE NAMES DATA: -----\n', data_linearb_names.describe(), sep='\n')

# Column dtypes and non-null counts.
# info() writes its report itself and returns None, so the wrapping print
# also emits a trailing "None" line.
print('\n----- INFO: -----\n')
print(data_linearb_names.info())

# Number of missing values per column
print('\n----- CHECKING FOR MISSING VALUES: -----\n', data_linearb_names.isnull().sum(), sep='\n')

# Frequency distribution of the values in each column
print('\n----- CHECKING UNIQUE VALUES: -----\n')
print(
    *(
        data_linearb_names[column].value_counts()
        for column in ('linear_b', 'greek_original', 'greek_cog_1', 'greek_cog_2')
    ),
    sep='\n',
)


----- DESCRIBING THE NAMES DATA: -----

       linear_b greek_original greek_cog_1 greek_cog_2
count       919            919         919         919
unique      919            456         456         131
top       𐀀𐀁𐀪𐀦𐀲                                       
freq          1            464         464         789

----- INFO: -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   linear_b        919 non-null    object
 1   greek_original  919 non-null    object
 2   greek_cog_1     919 non-null    object
 3   greek_cog_2     919 non-null    object
dtypes: object(4)
memory usage: 28.8+ KB
None

----- CHECKING FOR MISSING VALUES: -----

linear_b          0
greek_original    0
greek_cog_1       0
greek_cog_2       0
dtype: int64

----- CHECKING UNIQUE VALUES: -----

linear_b
𐀀𐀁𐀪𐀦𐀲     1
𐀟𐀩𐀄𐀫𐀙𐀆    1
𐀟𐀩𐀐𐀄      1
𐀟𐀩𐀦𐀲      1
𐀟𐀪𐀕𐀆      1
       

## SPLITTING & TOKENIZATION

- Break down the words into characters
- ???
- Split the data into test train

In [None]:
# @title: Splitting & tokenizing the data

# TODO: decide how the data should be split here, and what the labels are.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): this is an English WordPiece vocabulary; Linear B signs and
# Greek letters may largely map to the unknown token - confirm before training.


def _encode_text(text):
    """Return the tokenizer's id list for ``text`` (add_special_tokens=True)."""
    return tokenizer.encode(text, add_special_tokens=True)


def _encode_text_or_empty(text):
    """Like ``_encode_text`` but return an empty list for a falsy ``text``."""
    if not text:
        return []
    return _encode_text(text)


# Names split: both sides for training, only the Linear B side for testing
data_linearb_names_train['linear_b_tokens'] = data_linearb_names_train['linear_b'].apply(_encode_text)
data_linearb_names_train['greek_tokens'] = data_linearb_names_train['greek'].apply(_encode_text)
data_linearb_names_test['linear_b_tokens'] = data_linearb_names_test['linear_b'].apply(_encode_text)

# Full cognate table: the second cognate is optional, so it may be empty
data_linearb['greek_cog_1_tokens'] = data_linearb['greek_cog_1'].apply(_encode_text)
data_linearb['greek_cog_2_tokens'] = data_linearb['greek_cog_2'].apply(_encode_text_or_empty)


# TODO: tokenize the names table and the other datasets loaded above as well.


## MODEL ARCHITECTURE

- Identify baseline model
- Test other Seq2seq models
  - Transformer model - our own?
  - Or can we modify BERT/another model and train it too?

### Loading the Model

In [None]:
# Loading BERT with its pretrained weights.
# BertModel(config) only builds the architecture with randomly initialised
# parameters; from_pretrained() additionally loads the checkpoint weights.
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True)
bert_model = BertModel.from_pretrained('bert-base-uncased', config=config)

### Building the Model

In [None]:
# Building the cognate model (sample skeleton)

class CognatePredictionModel(nn.Module):
    """Binary classifier that scores a (Linear B, Greek) pair of id sequences.

    Both sequences are passed through the same encoder. The second element of
    each encoder output (the pooled vector) is concatenated and fed to a small
    MLP that produces one logit per pair.

    Args:
        bert_model: Encoder, called as ``bert_model(ids)`` or
            ``bert_model(ids, attention_mask=mask)`` and indexed with ``[1]``.
            When it exposes ``config.hidden_size`` / ``config.pad_token_id``
            these are used; otherwise the pooled width defaults to 768 and no
            attention mask is derived.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model

        encoder_config = getattr(bert_model, 'config', None)
        # Width of the pooled vector: read it from the encoder when available
        # instead of assuming the 768 of BERT-base.
        bert_output_size = getattr(encoder_config, 'hidden_size', 768)
        # Padding id used to derive attention masks when the caller gives none
        self._pad_token_id = getattr(encoder_config, 'pad_token_id', None)

        # Additional fully connected layers
        self.fc1 = nn.Linear(bert_output_size * 2, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 256)
        # Output layer for binary classification
        self.fc3 = nn.Linear(256, 1)

    def _pooled(self, tokens, attention_mask=None):
        """Return the encoder's pooled output for ``tokens``."""
        if attention_mask is None and self._pad_token_id is not None:
            # Padded positions must not be attended to, otherwise the pooled
            # vector depends on how much padding the batch happened to need.
            attention_mask = (tokens != self._pad_token_id).long()
        if attention_mask is None:
            return self.bert(tokens)[1]
        return self.bert(tokens, attention_mask=attention_mask)[1]

    def forward(self, linear_b_tokens, greek_tokens,
                linear_b_attention_mask=None, greek_attention_mask=None):
        """Return one logit per pair as a tensor with a trailing dimension of 1.

        Args:
            linear_b_tokens: Batch of id sequences for the Linear B side.
            greek_tokens: Batch of id sequences for the Greek side.
            linear_b_attention_mask: Optional mask for ``linear_b_tokens``;
                derived from the encoder's pad id when omitted.
            greek_attention_mask: Optional mask for ``greek_tokens``; derived
                the same way when omitted.
        """
        outputs_linear_b = self._pooled(linear_b_tokens, linear_b_attention_mask)
        outputs_greek = self._pooled(greek_tokens, greek_attention_mask)

        # Concatenate the two pooled vectors along the feature dimension
        combined = torch.cat((outputs_linear_b, outputs_greek), 1)

        # Pass through additional layers; placeholders
        x = self.fc1(combined)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)

        # x holds raw logits (no sigmoid applied)
        return x


## TRAINING

- Train the model

In [None]:
# Every distinct Greek string in the names train/test splits (cells are split
# on '|' in case several cognates share one cell).
unique_greek_tokens = {
    cognate
    for frame in (data_linearb_names_train, data_linearb_names_test)
    for cell in frame['greek']
    for cognate in cell.split('|')
}

# Number the strings in sorted order: iterating a set of strings directly gives
# a different order on every interpreter run, so the ids would not be reproducible.
token_to_id = {token: idx for idx, token in enumerate(sorted(unique_greek_tokens))}

In [None]:
class CognateDataset(Dataset):
    """Dataset pairing Linear B and Greek token-id sequences by position.

    ``token_to_id`` and ``default_id`` are stored on the instance but are not
    read by any method of this class.
    """

    def __init__(self, linear_b_tokens, greek_tokens, token_to_id, default_id=0):
        self.linear_b_tokens, self.greek_tokens = linear_b_tokens, greek_tokens
        self.token_to_id, self.default_id = token_to_id, default_id

    def __len__(self):
        # The Linear B side defines how many samples there are
        return len(self.linear_b_tokens)

    def __getitem__(self, idx):
        """Return both id sequences of sample ``idx`` as int64 tensors."""
        fields = (
            ('linear_b_tokens', self.linear_b_tokens),
            ('greek_tokens', self.greek_tokens),
        )
        return {
            name: torch.tensor(sequences[idx], dtype=torch.long)
            for name, sequences in fields
        }

# Training pairs: rows of the names split that have a Greek cognate
train_dataset = CognateDataset(
    data_linearb_names_train['linear_b_tokens'].tolist(),
    data_linearb_names_train['greek_tokens'].tolist(),
    token_to_id,
    default_id=0
)

# Test rows have no Greek side. Give each row an empty id list so that
# __getitem__ yields a 1-D (length 0) tensor; a scalar placeholder such as 0
# becomes a 0-dim tensor, which is not a sequence pad_sequence can pad.
test_dataset = CognateDataset(
    data_linearb_names_test['linear_b_tokens'].tolist(),
    [[] for _ in range(len(data_linearb_names_test))],
    token_to_id,
    default_id=0
)

def collate_fn(batch):
    """Pad each field of the samples in ``batch`` to a common length.

    Returns a dict with the same two keys as the samples; each value is the
    batch-first padded tensor, filled with the tokenizer's pad token id.
    """
    pad_id = tokenizer.pad_token_id
    return {
        field: pad_sequence(
            [sample[field] for sample in batch],
            batch_first=True,
            padding_value=pad_id,
        )
        for field in ('linear_b_tokens', 'greek_tokens')
    }

# Shuffled mini-batches of 32 training pairs, padded per batch by collate_fn
data_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Model: binary classifier over (Linear B, Greek) pairs

model = CognatePredictionModel(bert_model)
loss_function = nn.BCEWithLogitsLoss()
optimizer = Adam(model.parameters(), lr=0.001)


num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    for batch in data_loader:
        linear_b_tokens = batch['linear_b_tokens']
        greek_tokens = batch['greek_tokens']
        batch_size = linear_b_tokens.size(0)

        # BCEWithLogitsLoss needs one 0/1 target per logit. Every pair coming
        # out of the loader is a true cognate pair (label 1); the Greek token
        # ids themselves are inputs, not targets.
        labels = torch.ones(batch_size, device=linear_b_tokens.device)
        if batch_size > 1:
            # In-batch negatives (label 0): pair each Linear B word with the
            # Greek word of the previous sample. Training on positives only
            # would let the model always predict 1.
            # NOTE(review): a rolled pair can occasionally still be a true
            # cognate (same word with two Greek forms in one batch).
            linear_b_tokens = torch.cat([linear_b_tokens, linear_b_tokens], dim=0)
            greek_tokens = torch.cat(
                [greek_tokens, greek_tokens.roll(shifts=1, dims=0)], dim=0
            )
            labels = torch.cat([labels, torch.zeros_like(labels)], dim=0)

        optimizer.zero_grad()

        outputs = model(linear_b_tokens, greek_tokens)
        # Drop only the trailing logit dimension; a bare squeeze() would also
        # remove the batch dimension for a batch of one.
        outputs = outputs.squeeze(-1)

        loss = loss_function(outputs, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # A logit above 0 corresponds to a sigmoid probability above 0.5
        predicted_labels = (outputs > 0).float()
        correct_predictions += (predicted_labels == labels).sum().item()
        total_predictions += labels.numel()

    # max(..., 1) keeps the report from dividing by zero on an empty loader
    avg_loss = total_loss / max(len(data_loader), 1)
    accuracy = correct_predictions / max(total_predictions, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

## EVALUATION

- The primary goal metric is accuracy as compared to NeuroDecipher

In [None]:
# Evaluation code

