# Data Preprocessing
We do it in python, because there is a much better support for stuff like multilingual bert models from huggingface.

In [1]:
import csv
import re
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import json
import gc


  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [2]:
input_filename = "settles.acl16.learning_traces.13m.csv"  # Change to the name of your input CSV file


In [3]:
vocabulary_lex = ""

with open(input_filename, 'r') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)  # Read the header
    vocabulary_lex = set([(column[7], column[4], column[6]) for column in reader])
    del header, reader
del csv_file
gc.collect()


20

In [4]:
all_lex_tags = set()

for lex in vocabulary_lex:
    for l in lex[0].split("/")[-1].split("<")[1:]:
        all_lex_tags.add(l.replace(">", ""))
    
print(all_lex_tags)


{'@compound_past', '@n:petit_ami', '@itg:est_ce_que', 'pr+isso', 'pis', '@cond_perfect', '*gndr', 'acc', 'qnt', 'n+sandwich', 'pp', 'inf+lo', 'num', '@passive', '@cnj:avant_que', 'f', 'p3', '@prn:le_notre', 'pres', 'n+stier', '@future', 'attr', '@subjunctive_perfect', '@pluperfect', 'ij', 'ind', '@pr:au_dela_de', 'imp', 'pl+ci', 'nom', '@neg:il_ne_y_a', 'subj', 'loc', '@adv:au_moins', '@cond', 'adj+haltung', '@common_phrases:a_demain', 'ord', 'predet', '@prn:celui_que', 'pr+esse', '@cnj:parce_que', '@adv:s_il_vous_plait', '@ij:au_revoir', 'aa', '@common_phrases:de_rien', 'cnjcoo', '@adv:por_supuesto', 'inf+ci', 'mix', '@adv:au_dela', 'n+welt', 'itg', '*pers', 'nn', '@common_phrases:ca_va_bien', 'p2', 'def', 'vaux', '@modal', 'sg+mi', 'pprs', '@common_phrases:il_y_a', 'n', 'inf', '@cnj:bien_que', 'n+nummer', '@formal', 'pri', '*numb', '@adv:peut_etre', 'vbser', 'ref', 'n+meister', 'cnjsub', '@pos', '@ij:merci_beaucoup', '@neg:pas_du_tout', 'rel', 'pprep', 'pr+das', 'fti', 'suff', 'dem',

## Let's do the Bert Embeddings

In [5]:
pattern = r'([^<]+/\w+)<'

vocabulary = set()
for i in [re.match(pattern, s[0]).group(1) for s in vocabulary_lex if re.match(pattern, s[0])]:
    words = i.split("/")
    vocabulary.add(words[0])
    vocabulary.add(words[1])
vocabulary = list(vocabulary)
len(vocabulary)

11198

In [6]:
# Load pre-trained multilingual BERT model
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model = AutoModel.from_pretrained('bert-base-multilingual-cased')


def get_embeddings_iteratively(phrases, batch_size=2, output_file='embeddings.npy'):
    all_embeddings = []
    for i in range(0, len(phrases), batch_size):
        batch = phrases[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        embeddings = embeddings.detach().numpy()
        all_embeddings.append(embeddings)
        del inputs, outputs, embeddings
        torch.cuda.empty_cache()  # Clear GPU memory if using GPU
    all_embeddings = np.vstack(all_embeddings)
    np.save(output_file, all_embeddings)
    return all_embeddings


In [7]:
embeddings = get_embeddings_iteratively(vocabulary, batch_size=1000, output_file='embeddings.npy')


In [8]:
embeddings = np.load('embeddings.npy')
embeddings.shape

(11198, 768)

In [9]:
# Reduce dimensionality of embeddings
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# plt.figure(figsize=(15, 15))
# for i, phrase in enumerate(vocabulary):
#     plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1], label=phrase, s=3)
#     plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], phrase, fontsize=7, alpha=0.4, ha='right', va='center')
# plt.title("BERT Embeddings in 2D")
# plt.xlabel("PCA Component 1")
# plt.ylabel("PCA Component 2")
# plt.show()



## Export the calculated PCA embeddings

In [10]:
vocabulary_dataset = []
for i, word in enumerate(vocabulary):
    for sample in vocabulary_lex:
        if word in sample[0]:
            lexeme = sample[0]
            language = sample[1]
            idx = sample[2]

    vocabulary_dataset.append({
        "word": word,
        "position": list(reduced_embeddings[i]),
        "language": language,
        "lexeme_id": idx,
        "lexeme": lexeme
    })


def convert_to_serializable(obj):
    if isinstance(obj, np.float32):
        return float(obj)  # Convert to Python float
    else:
        raise TypeError("Type not serializable")



In [11]:
vocabulary_dataset

[{'word': 'ablauf',
  'position': [1.997693, -2.9537148],
  'language': 'de',
  'lexeme_id': 'a94dffb2f69ef74f7fa0881205435e9c',
  'lexeme': 'ablauf/ablauf<n><m><sg><nom>'},
 {'word': 'habitants',
  'position': [1.9940443, -1.9414475],
  'language': 'fr',
  'lexeme_id': 'edee4fbd9d5bfef1b021fdd07005f68d',
  'lexeme': 'habitants/habitant<n><m><pl>'},
 {'word': 'restent',
  'position': [2.417926, -1.4302847],
  'language': 'fr',
  'lexeme_id': 'e0230a168c3999b469b72721b9f701d3',
  'lexeme': 'restent/rester<vblex><pri><p3><pl>'},
 {'word': 'dimanche',
  'position': [-0.014947407, -1.8969159],
  'language': 'fr',
  'lexeme_id': '8a7560425440f2f5c8227c1ff587b1ed',
  'lexeme': 'dimanche/dimanche<n><m><sg>'},
 {'word': 'and',
  'position': [4.532035, 1.7014924],
  'language': 'de',
  'lexeme_id': '6e10a0433dcc5cf4843294d2223498dd',
  'lexeme': 'deutschland/deutschland<np><nt><sg><dat>'},
 {'word': 'punkten',
  'position': [0.030555785, -0.927675],
  'language': 'de',
  'lexeme_id': 'd0a1c2ce9

In [12]:

with open("../src/data/vocabulary_dataset.json", "w") as f:
    json.dump(vocabulary_dataset, f, default=convert_to_serializable, indent=4)
