# Data Preprocessing
We do the preprocessing in Python because it has much better support for tools such as the multilingual BERT models from Hugging Face.

In [7]:
import csv
import re
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import json
import gc


## Load the dataset

In [8]:
# Path of the CSV file that the loading cell opens.
input_filename = "settles.acl16.learning_traces.13m.csv"  # Change to the name of your input CSV file


In [9]:
# Collect the distinct (lexeme string, language, lexeme id) triples in the CSV.
# Column positions are hard-coded: later cells treat element 0 (column 7) as the
# lexeme string, element 1 (column 4) as the language and element 2 (column 6)
# as the lexeme id. NOTE(review): verify these indices if the input file changes.
#
# The file is opened with newline='' as the csv module requires, and with an
# explicit encoding so decoding does not depend on the platform's locale
# default (assumes the file is UTF-8 -- TODO confirm).
with open(input_filename, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    next(reader, None)  # Skip the header row; tolerate a completely empty file
    vocabulary_lex = {(row[7], row[4], row[6]) for row in reader}
    del reader
del csv_file
# Release the memory held by the parsed rows before the next cells run.
gc.collect()


512

In [10]:
# Tally every tag that appears in the lexeme strings.
# A tag is whatever sits between "<" and ">" after the last "/" of the string.
all_lex_tags = set()
all_lex_tags_dict = {}

for entry in vocabulary_lex:
    tag_section = entry[0].split("/")[-1]
    for raw_tag in tag_section.split("<")[1:]:
        tag = raw_tag.replace(">", "")
        all_lex_tags.add(tag)
        all_lex_tags_dict[tag] = all_lex_tags_dict.get(tag, 0) + 1


print(all_lex_tags)

# Keep only the tags that occur more than 1000 times.
most_all_lex_tags_dict = {
    name: count
    for name, count in all_lex_tags_dict.items()
    if count > 1000
}

print(most_all_lex_tags_dict)
most_all_lex_tags_dict.keys()

{'@prn:celui_la', '@future_perfect', '@common_phrases:de_rien', '@pr:pres_de', 'adv', '@ij:bis_bald', '@common_phrases:a_plus_tard', '@modal', 'dat', 'def', '*numb', '@itg:est_ce_que', 'det', 'acr', 'p1', 'comp', 'vaux', '@cnj:autant_que', '@ij:buenas_noches', 'pro', 'pr+il', '*gndr', 'pst', '@adv:a_part', '@cnj:parce_que', '@common_phrases:a_demain', '@cnj:du_fait_que', '@neg:plus_de', 'vblex', 'n+versicherung', 'inf+ci', '@neg:pas_du_tout', 'sp', '@pluperfect', 'past', '@pr:a_travers', '@ij:thank_you', 'predet', 'num', 'pl+lo', 'vbmod', '@ij:au_revoir', '@pr:plus_de', '@subjunctive_perfect', '@pr:afin_de', 'loc', '@pr:a_cote_de', '@future', '@prn:n_importe_quoi', '@pos', 'n+essen', 'pr+ele', '@prn:le_notre', '*case', '@pr:un_peu_de', 'pprep', '@cnj:bien_que', 'adj+haltung', 'pri', 'pii', '@adv:tout_a_fait', '@adv:en_fait', 'vblex+bad', 'pron', 'sg+mi', 'pres', '@prn:ce_dont', 'sg', 'ord', '@present_perfect', 'nom', '@det:de_le', 'pr+esse', 'pl', '@future_phrasal', 'n+sandwich', 'n+me

dict_keys(['mf', 'n', 'm', '*numb', 'vblex', 'pri', 'p3', 'pl', 'inf', 'f', '*gndr', 'adj', 'sg', '*pers', 'nom'])

## Compute the BERT embeddings

In [11]:
# Extract the words that precede the first tag of every lexeme string.
# The pattern captures "<text>/<word>" directly followed by "<"; lexeme strings
# that do not start that way are skipped.
pattern = r'([^<]+/\w+)<'

vocabulary = set()
for sample in vocabulary_lex:
    lexeme_match = re.match(pattern, sample[0])  # evaluated once per lexeme string
    if lexeme_match is None:
        continue
    words = lexeme_match.group(1).split("/")
    vocabulary.add(words[0])
    vocabulary.add(words[1])
# Sort instead of list(set): set iteration order of strings differs between
# interpreter sessions, while positions in this list are later used as row
# indices into the embedding matrix (also cached in 'embeddings.npy') and as
# the exported "custom_id".
vocabulary = sorted(vocabulary)
len(vocabulary)

11198

In [12]:
# Load the pre-trained BERT tokenizer and encoder.
# NOTE(review): the active checkpoint is "bert-base-cased", not the
# multilingual one kept below as an alternative -- confirm this is intended
# for a vocabulary that spans several languages.
model_name = "google-bert/bert-base-cased"
# model_name = "bert-base-multilingual-cased"

# The checkpoint id lives in its own variable so that `model` only ever refers
# to the loaded model object, never to a string.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def get_embeddings_iteratively(phrases, batch_size=2, output_file='embeddings.npy'):
    """Embed phrases with the notebook-level ``tokenizer`` and ``model``.

    Every phrase is tokenized, passed through the model and reduced to a single
    vector by averaging the last hidden states of its non-padding tokens.
    Padding positions are excluded via the attention mask, so a phrase's vector
    does not depend on which other phrases share its batch.

    Args:
        phrases: List of strings to embed.
        batch_size: Number of phrases per forward pass.
        output_file: Path the stacked array is written to with ``np.save``.
            Pass ``None`` to skip writing.

    Returns:
        NumPy array with one row per phrase, in the order of ``phrases``.
        An array with zero rows is returned when ``phrases`` is empty.
    """
    all_embeddings = []
    # Run the inputs on whatever device the model reports, if it reports one.
    device = getattr(model, "device", None)
    # Inference only: disabling autograd avoids keeping activations alive.
    with torch.no_grad():
        for start in range(0, len(phrases), batch_size):
            batch = phrases[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
            if device is not None:
                inputs = inputs.to(device)
            outputs = model(**inputs)
            hidden = outputs.last_hidden_state
            # Masked mean pooling: zero out padding positions, then divide by
            # the number of real tokens (clamped to avoid division by zero).
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            embeddings = (hidden * mask).sum(dim=1) / token_counts
            all_embeddings.append(embeddings.cpu().numpy())
            del inputs, outputs, hidden, mask, token_counts, embeddings
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # Clear GPU memory if using GPU
    if all_embeddings:
        all_embeddings = np.vstack(all_embeddings)
    else:
        # np.vstack rejects an empty list; return an empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        all_embeddings = np.empty((0, hidden_size), dtype=np.float32)
    if output_file is not None:
        np.save(output_file, all_embeddings)
    return all_embeddings


In [13]:
# Embed every vocabulary entry in batches of 20; the function also writes the
# resulting array to 'embeddings.npy'.
embeddings = get_embeddings_iteratively(vocabulary, batch_size=20, output_file='embeddings.npy')


In [14]:
# Read the embedding matrix back from 'embeddings.npy' and display its shape.
embeddings = np.load('embeddings.npy')
embeddings.shape

(11198, 768)

In [15]:
# Reduce dimensionality of embeddings to two components per word.
# random_state is pinned so the projection is reproducible across runs when
# scikit-learn selects a randomized SVD solver for this input size.
pca = PCA(n_components=2, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)

# Optional scatter plot of every word at its 2-D position (disabled):
# plt.figure(figsize=(15, 15))
# for i, phrase in enumerate(vocabulary):
#     plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1], label=phrase, s=3)
#     plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], phrase, fontsize=7, alpha=0.4, ha='right', va='center')
# plt.title("BERT Embeddings in 2D")
# plt.xlabel("PCA Component 1")
# plt.ylabel("PCA Component 2")
# plt.show()



## Export the calculated PCA embeddings

In [16]:
# Build one JSON-ready record per vocabulary word.
#
# Each word is resolved by exact match against the two words parsed from the
# front of every lexeme string ("<first>/<second>" directly followed by "<").
# A substring test would also hit unrelated lexemes and tag text, and a word
# without any hit would silently inherit the previous word's values.
# When several lexemes contain the same word, the first one in sorted order is
# used, and a match on the word before the slash is preferred over a match on
# the word after it, so the result is the same on every run.
lexeme_prefix = re.compile(r'([^<]+/\w+)<')
by_first_word = {}
by_second_word = {}
for sample in sorted(vocabulary_lex):
    prefix_match = lexeme_prefix.match(sample[0])
    if prefix_match is None:
        continue
    parts = prefix_match.group(1).split("/")
    by_first_word.setdefault(parts[0], sample)
    by_second_word.setdefault(parts[1], sample)

vocabulary_dataset = []
for i, word in enumerate(vocabulary):
    sample = by_first_word.get(word) or by_second_word.get(word)
    if sample is None:
        # No lexeme string contains this word; export explicit nulls.
        lexeme = language = idx = None
    else:
        lexeme, language, idx = sample

    vocabulary_dataset.append({
        "custom_id" : i,
        "word": word,
        # tolist() yields built-in floats, which json can always serialize.
        "position": reduced_embeddings[i].tolist(),
        "language": language,
        "lexeme_id": idx,
        "lexeme": lexeme
    })


def convert_to_serializable(obj):
    """``default`` hook for ``json.dump``: map NumPy values to built-in types.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        ``float`` for NumPy floating scalars, ``int`` for NumPy integer
        scalars, ``bool`` for ``np.bool_`` and a (nested) list for arrays.

    Raises:
        TypeError: If ``obj`` is none of the supported NumPy types.
    """
    if isinstance(obj, np.floating):
        return float(obj)  # Convert to Python float
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Type not serializable: {type(obj).__name__}")



In [17]:

# Persist the records as JSON indented by four spaces; values the encoder
# cannot handle itself are passed through convert_to_serializable.
with open("../src/data/vocabulary_dataset.json", "w") as json_file:
    json.dump(
        vocabulary_dataset,
        json_file,
        default=convert_to_serializable,
        indent=4,
    )
