# Data Preprocessing
We do it in python, because there is a much better support for stuff like multilingual bert models from huggingface.

In [1]:
import csv
import re
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import json
import gc
import pandas as pd
from collections import defaultdict


  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [2]:
input_filename = "learning_traces.13m.csv"  # Change to the name of your input CSV file


In [3]:
vocabulary_id_lex = {} # {"oiw832ufr98e39" : (lernt/lernen<><> , de)}
lexeme_id_mapping = defaultdict(list) # {"9823jf93298fr892u39": [2,3,69]}

with open(input_filename, 'r') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # Skip the header line
    for i, row in enumerate(reader):
        lexeme_id_mapping[row[6]].append(i)
        vocabulary_id_lex[row[6]] = (row[7], row[4])
    del reader
del csv_file
gc.collect()

lexeme_id_mapping = dict(lexeme_id_mapping)


In [4]:
vocabulary = set() # ("lernt", "lernen")
vocabulary_dict = {} # {"2903ru239ru293ru0923ru" : "lernt/lernen"}
vocabulary_lex_dict = {} # {"lernt" : "2903ru239ru293ru0923ru"}
lex_vocabulary_dict = {} # {"lksdjldkfjldskf" : "lernt"}
word_tags_dict = {} # {"lernt" : ["<n>", "<f>", "<i>"]}
all_lex_tags = set() # ("<n>", "<f>", "<i>")
all_lex_tags_dict = defaultdict(int) # {"lernt" : 3}

for lex_id, lexeme in vocabulary_id_lex.items():
    tags = re.findall(r'<[^>]+>', lexeme[0])
    filtered_word = lexeme[0]
    
    #delete the tags out of the filtered words
    for tag in tags:
        filtered_word = filtered_word.replace(tag, "")

 
    all_lex_tags.add(filtered_word)

    for lex_str in filtered_word.split("/"):
        if lexeme[1] == "de" and "<n>" in tags:
            lex_str = lex_str.capitalize()
        vocabulary.add(lex_str)
        word_tags_dict[lex_id] = tags
        vocabulary_dict[lex_id] = filtered_word
        vocabulary_lex_dict[lex_str] = lex_id
        lex_vocabulary_dict[lex_id] = lex_str
        all_lex_tags_dict[lex_str] += 1

vocabulary.discard("")
print(all_lex_tags)

# most_all_lex_tags_dict = {}
# most_all_lex_tags_dict = sorted(all_lex_tags_dict.items(), reverse=True, key=lambda x: x[1])
# most_all_lex_tags_dict = [item[0] for item in most_all_lex_tags_dict]



{'/eröffnen', 'ablauf/ablauf', '/réel', '/objet', 'rua/rua', 'pobres/pobre', 'rojas/rojo', '/apprécier', 'général/général', 'alumnos/alumno', '/aparecer', 'glauben/glauben', 'contar/contar', 'werkzeug/werkzeug', 'europa/europa', 'para/para', 'usa/usar', '/miroir', 'delivery/delivery', 'griglia/griglia', 'contam/contar', 'holz/holz', 'leisten/leisten', 'meurt/mourir', '/besoin', 'zeit/zeit', '/rentrer', 'decidere/decidere', 'la/prpers', 'besitzen/besitzen', '/genre', 'renne/rennen', 'dito/dito', '/choc', 'estudar/estudar', '/músculo', 'mange/manger', 'hersteller/hersteller', '/unico', 'jugador/jugador', '/costruire', 'writer/writer', 'saber/saber', 'pepe/pepe', 'strom/strom', 'quella/quello', 'appareil/appareil', 'month/month', 'livre/livre', 'départ/départ', 'skirt/skirt', '/week', '/finir', '/hochschule', 'a/a', 'anfordern/anfordern', 'center/centre', '/enveloppe', 'buvons/boire', 'mismas/mismo', 'nouveaux/nouveau', '/ospedale', 'cambiar/cambiar', 'dijiste/decir', '/interprétation', '

## Let's do the Bert Embeddings

In [5]:
def get_embeddings_iteratively(model, phrases, batch_size=2, output_file='embeddings.npy'):
    tokenizer = AutoTokenizer.from_pretrained(model)
    model = AutoModel.from_pretrained(model)
    all_embeddings = []
    for i in range(0, len(phrases), batch_size):
        batch = phrases[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling
        embeddings = embeddings.detach().numpy()
        all_embeddings.append(embeddings)
        del inputs, outputs, embeddings
        torch.cuda.empty_cache()  # Clear GPU memory if using GPU
    all_embeddings = np.vstack(all_embeddings)
    np.save(output_file, all_embeddings)
    return all_embeddings


In [6]:
models = ["google-bert/bert-base-cased", "google-bert/bert-base-uncased", "google-bert/bert-large-uncased", "bert-base-multilingual-cased"]
reduced_embeddings = []
for model in models: 
    embeddings = get_embeddings_iteratively(model, list(lex_vocabulary_dict.values()), batch_size=20, output_file='embeddings.npy')
    embeddings = np.load('embeddings.npy')

    pca = PCA(n_components=2)
    reduced_embeddings.append(pca.fit_transform(embeddings))


  return dynamo.is_compiling()
  or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
  return dynamo.is_compiling()
  or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
  return dynamo.is_compiling()
  or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
  return dynamo.is_compiling()
  or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())


## Export

In [7]:
spaced_repetition = {}

with open(input_filename, 'r') as csv_file:
    df = pd.read_csv(csv_file)
    for lexeme_id, word  in lex_vocabulary_dict.items():
        filtered_df = df.iloc[lexeme_id_mapping[lexeme_id]]
        
        spaced_repetition[lexeme_id] = {
            "average_recall" : filtered_df["p_recall"].mean(),
            "users_seen" : filtered_df["user_id"].nunique(),
            "average_performance" : ((filtered_df["history_correct"] + filtered_df["session_correct"]) / (filtered_df["history_seen"] + filtered_df["session_seen"])).mean()
        }
        
        del filtered_df
    del df
del csv_file
gc.collect()


0

In [8]:
vocabulary_dataset = []
for i, lex_id in enumerate(lex_vocabulary_dict.keys()):
    vocabulary_dataset.append({
        "custom_id" : i,
        "word": lex_vocabulary_dict[lex_id],
        "full_word" : vocabulary_dict[lex_id],
        "position_1": list(reduced_embeddings[0][i]),
        "position_2": list(reduced_embeddings[1][i]),
        "position_3": list(reduced_embeddings[2][i]),
        "position_4": list(reduced_embeddings[3][i]),
        "language": vocabulary_id_lex[lex_id][1],
        "lexeme_id": lex_id,
        "lexeme": vocabulary_id_lex[lex_id][0],
        "tags" : word_tags_dict[lex_id],
        "average_recall": spaced_repetition[lex_id]["average_recall"],
        "users_seen": spaced_repetition[lex_id]["users_seen"],
        "average_performance": spaced_repetition[lex_id]["average_performance"],
    })

def convert_to_serializable(obj):
    if isinstance(obj, np.float32):
        return float(obj)  # Convert to Python float
    else:
        raise TypeError("Type not serializable")



In [9]:
vocabulary_dataset[0]

{'custom_id': 0,
 'word': 'lernen',
 'full_word': 'lernt/lernen',
 'position_1': [-1.4908264, -1.3035642],
 'position_2': [0.8741715, 0.89908075],
 'position_3': [2.9780068, 2.5120828],
 'position_4': [1.1240356, 0.17822075],
 'language': 'de',
 'lexeme_id': '76390c1350a8dac31186187e2fe1e178',
 'lexeme': 'lernt/lernen<vblex><pri><p3><sg>',
 'tags': ['<vblex>', '<pri>', '<p3>', '<sg>'],
 'average_recall': 0.9033849490324192,
 'users_seen': 7234,
 'average_performance': 0.9122756892184505}

In [10]:

with open("../src/data/vocabulary_dataset.json", "w") as f:
    json.dump(vocabulary_dataset, f, default=convert_to_serializable, indent=4)
