# Data Preprocessing
We do it in python, because there is a much better support for stuff like multilingual bert models from huggingface.

In [1]:
import csv
import re
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import json
import gc
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [2]:
input_filename = "settles.acl16.learning_traces.13m.csv"  # Change to the name of your input CSV file


In [3]:
vocabulary_lex = ""

with open(input_filename, 'r') as csv_file:
    reader = csv.reader(csv_file)
    header = next(reader)  # Read the header
    vocabulary_lex = set([(column[7], column[4], column[6]) for column in reader])
    del header, reader
del csv_file
gc.collect()


0

In [4]:
all_lex_tags = set()
all_lex_tags_dict = {}

for lex in vocabulary_lex:
    for l in lex[0].split("/")[-1].split("<")[1:]:
        lex_str = l.replace(">", "")
        all_lex_tags.add(lex_str)
        if lex_str in all_lex_tags_dict.keys():
            all_lex_tags_dict[lex_str] += 1
        else: all_lex_tags_dict[lex_str] = 1


print(all_lex_tags)

most_all_lex_tags_dict = {}

most_all_lex_tags_dict = sorted(all_lex_tags_dict.items(), reverse=True, key=lambda x: x[1])
most_all_lex_tags_dict = [item[0] for item in most_all_lex_tags_dict]
most_all_lex_tags_dict

{'ger', 'pprs', 'sw', '@prn:quelque_chose', '@cnj:parce_que', 'n+sandwich', 'pr+o', '@ger_past', '@neg:plus_de', '@pr:au_dela_de', 'np', 'comp', 'sg', 'pr+ele', 'preadv', '@adv:por_favor', 'nt', 'cnjadv', '@adv:a_part', 'pst', '@present_perfect', '@adv:en_fait', '@past_cond', '@prn:le_mien', '@pr:a_cause_de', '@adv:a_posteriori', 'pp', '@ref', 'pres+not', '@cnj:pour_que', 'pron', 'ord', '@adv:s_il_te_plait', '@pluperfect', '@common_phrases:a_demain', 'acr', 'pro', '@cnj:depuis_que', 'subj', 'dim', '@common_phrases:a_plus_tard', 'sg+mi', 'pr+das', '@pr:a_cote_de', '@pos', 'pr', '@ij:thank_you', 'vblex+bad', '@pr:pres_de', 'det', 'vbmod', 'pr+el', 'n+wehr', 'sint', 'an', 'dat', '@prn:l_un', 'pl', 'f', '@prn:quelque_un', 'pos', '@neg:il_ne_y_a', '@det:de_le', '@prn:le_tien', 'sup', '@past', '@adv:peut_etre', '@future', 'adv', 'suff', 'pri', '@passive', 'n', '*case', 'prn', 'rel', 'pl+lo', '@prn:le_meme', 'nn', 'pr+il', 'nom', '@compound_past', 'n+nummer', 'pred', '@ij:au_revoir', '@cond_p

['n',
 'sg',
 '*numb',
 'vblex',
 'm',
 'f',
 'pl',
 'adj',
 'pri',
 'inf',
 'p3',
 'mf',
 '*pers',
 '*gndr',
 'nom',
 'pp',
 'adv',
 'nt',
 'p1',
 'acc',
 '*case',
 'prn',
 'det',
 'pst',
 'sp',
 'pii',
 '@present_perfect',
 'p2',
 'dat',
 'pr',
 '@future',
 '@future_phrasal',
 '@compound_past',
 'ifi',
 'tn',
 'pos',
 'vbmod',
 'ind',
 'vbser',
 'num',
 '@past_perfect',
 'pres',
 'fti',
 '@cond',
 'ger',
 'pred',
 'vbhaver',
 'imp',
 'mix',
 'cni',
 'np',
 'prs',
 'def',
 '@cond_perfect',
 'past',
 '@future_perfect',
 'itg',
 'pis',
 '@modal',
 'st',
 'sint',
 'sw',
 'dem',
 'pro',
 '@ref',
 '@pluperfect',
 'attr',
 'vaux',
 'cnjadv',
 'preadv',
 'gen',
 'cnjcoo',
 'rel',
 'loc',
 'pprs',
 '@past',
 'ord',
 'pr+il',
 'ij',
 'comp',
 '@passive',
 '@past_cond',
 'ref',
 'qnt',
 'cnjsub',
 'predet',
 '@subjunctive_pluperfect',
 'enc',
 'sup',
 '@past_subjunctive',
 '@past_inf',
 'an',
 'obj',
 'subj',
 'pr+der',
 'pprep',
 '@formal',
 'pr+o',
 'pr+le',
 'sg+mi',
 'n+sandwich',
 'nn',
 '

In [5]:
for word in most_all_lex_tags_dict:
    print(f'<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="{word}"> {word}</label></div>')

<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="n"> n</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="sg"> sg</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="*numb"> *numb</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="vblex"> vblex</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="m"> m</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="f"> f</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="pl"> pl</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-filter" value="adj"> adj</label></div>
<div style="margin-bottom:5px"><label><input type="checkbox" class="category-fil

## Let's do the Bert Embeddings

In [6]:
pattern = r'([^<]+/\w+)<'

vocabulary = set()
for i in [re.match(pattern, s[0]).group(1) for s in vocabulary_lex if re.match(pattern, s[0])]:
    words = i.split("/")
    vocabulary.add(words[0])
    vocabulary.add(words[1])
vocabulary = list(vocabulary)
len(vocabulary)

11198

In [7]:
# Load pre-trained multilingual BERT model
model = "google-bert/bert-base-cased"
# model = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModel.from_pretrained(model)


def get_embeddings_iteratively(phrases, batch_size=2, output_file='embeddings.npy'):
    all_embeddings = []
    for i in range(0, len(phrases), batch_size):
        batch = phrases[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        embeddings = embeddings.detach().numpy()
        all_embeddings.append(embeddings)
        del inputs, outputs, embeddings
        torch.cuda.empty_cache()  # Clear GPU memory if using GPU
    all_embeddings = np.vstack(all_embeddings)
    np.save(output_file, all_embeddings)
    return all_embeddings


In [8]:
embeddings = get_embeddings_iteratively(vocabulary, batch_size=20, output_file='embeddings.npy')


In [9]:
embeddings = np.load('embeddings.npy')
embeddings.shape

(11198, 768)

In [10]:
# Reduce dimensionality of embeddings
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# plt.figure(figsize=(15, 15))
# for i, phrase in enumerate(vocabulary):
#     plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1], label=phrase, s=3)
#     plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], phrase, fontsize=7, alpha=0.4, ha='right', va='center')
# plt.title("BERT Embeddings in 2D")
# plt.xlabel("PCA Component 1")
# plt.ylabel("PCA Component 2")
# plt.show()



## Export the calculated PCA embeddings

In [11]:
vocabulary_dataset = []
for i, word in enumerate(vocabulary):
    for sample in vocabulary_lex:
        if word in sample[0]:
            lexeme = sample[0]
            language = sample[1]
            idx = sample[2]

    vocabulary_dataset.append({
        "custom_id" : i,
        "word": word,
        "position": list(reduced_embeddings[i]),
        "language": language,
        "lexeme_id": idx,
        "lexeme": lexeme
    })


def convert_to_serializable(obj):
    if isinstance(obj, np.float32):
        return float(obj)  # Convert to Python float
    else:
        raise TypeError("Type not serializable")



In [12]:
vocabulary_dataset

[{'custom_id': 0,
  'word': 'untersuchungen',
  'position': [0.42969906, 3.579577],
  'language': 'de',
  'lexeme_id': '885bca84f817f7abbaf97fe10459145e',
  'lexeme': 'untersuchungen/untersuchung<n><f><pl><acc>'},
 {'custom_id': 1,
  'word': 'internetseite',
  'position': [0.10994764, -0.10234981],
  'language': 'de',
  'lexeme_id': '94c903ed62b5ab7a124c50b64cfa0f7e',
  'lexeme': 'internetseiten/internet<n>'},
 {'custom_id': 2,
  'word': 'sitio',
  'position': [1.0625157, -0.46112448],
  'language': 'en',
  'lexeme_id': '1e402749849ea2602ad632b47f3383a1',
  'lexeme': 'opposition/opposition<n><sg>'},
 {'custom_id': 3,
  'word': 'convocatoria',
  'position': [-2.1752582, -0.60248387],
  'language': 'es',
  'lexeme_id': '06f10527fa6de49394332b42c59d854b',
  'lexeme': 'convocatoria/convocatoria<n><f><sg>'},
 {'custom_id': 4,
  'word': 'feel',
  'position': [3.2406018, 0.346129],
  'language': 'en',
  'lexeme_id': '2420ec15293a72b407e278fdf0e17117',
  'lexeme': 'feel/feel<vblex><inf>'},
 {'

## Add spaced repetiton data

In [13]:
with open(input_filename, 'r') as csv_file:
    df = pd.read_csv(csv_file)
    for i, word in enumerate(vocabulary_dataset):
        lexeme_id = word["lexeme_id"]
        filtered_df = df[df["lexeme_id"] == lexeme_id]
        vocabulary_dataset[i]["average_recall"] = filtered_df["p_recall"].mean()
        vocabulary_dataset[i]["users_seen"] = filtered_df["user_id"].nunique()
        vocabulary_dataset[i]["average_performance"] = ((filtered_df["history_correct"] + filtered_df["session_correct"]) / (filtered_df["history_seen"] + filtered_df["session_seen"])).mean()
        tags = re.findall(r'<[^>]+>', word["lexeme"])
        vocabulary_dataset[i]["tags"] = tags.copy()
        filtered_word = word["lexeme"]
        for tag in tags:
            filtered_word = filtered_word.replace(tag, "")
        vocabulary_dataset[i]["full_word"] = filtered_word
        if i % 25 == 0:
            print(i)
            print(vocabulary_dataset[i])
        df = df.drop(filtered_df.index)
        del lexeme_id, filtered_df, filtered_word, tags
    del df
del csv_file
gc.collect()


0
{'custom_id': 0, 'word': 'untersuchungen', 'position': [0.42969906, 3.579577], 'language': 'de', 'lexeme_id': '885bca84f817f7abbaf97fe10459145e', 'lexeme': 'untersuchungen/untersuchung<n><f><pl><acc>', 'average_recall': 0.5, 'users_seen': 2, 'average_performance': 0.625, 'tags': ['<n>', '<f>', '<pl>', '<acc>'], 'full_word': 'untersuchungen/untersuchung'}
25
{'custom_id': 25, 'word': 'sedere', 'position': [-1.0402796, 0.9759734], 'language': 'it', 'lexeme_id': '4bc64b4bdfa58b187c0868415021d2cf', 'lexeme': '<*sf>/sedere<vblex><pri><*pers><*numb>', 'average_recall': 0.7843137254901961, 'users_seen': 20, 'average_performance': 0.8279319808731574, 'tags': ['<*sf>', '<vblex>', '<pri>', '<*pers>', '<*numb>'], 'full_word': '/sedere'}
50
{'custom_id': 50, 'word': 'kapitel', 'position': [-0.3854238, 0.28042382], 'language': 'de', 'lexeme_id': 'd25a0aa29aa4f657936bf299ea7137e8', 'lexeme': 'kapitel/kapitel<n><nt><sg><*case>', 'average_recall': 0.0, 'users_seen': 1, 'average_performance': 0.5, 't

40

In [16]:
len(vocabulary_dataset)

11198

In [17]:

with open("../src/data/vocabulary_dataset.json", "w") as f:
    json.dump(vocabulary_dataset, f, default=convert_to_serializable, indent=4)
