In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Free-text columns that are selected, merged and embedded further down.
text_columns = [
    "special_interest",
    "diary_entry",
    "selfdefining_memory",
    "empty_sheet",
]

# Sentence encoder used for every embedding computed in this notebook.
model = SentenceTransformer(
    "PORTULAN/serafim-900m-portuguese-pt-sentence-encoder"
)

In [None]:
# Load the dataset from a pickle file (relative path).
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from a trusted source - confirm its provenance.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-anonymized.pkl"
df = pd.read_pickle(file_path)

In [None]:
# Build the working frame from the free-text columns.
#   1. fillna("")        - missing values become empty strings (done first so
#                          they never turn into the literal "nan").
#   2. astype(str)       - guarantees " ".join below cannot fail with a
#                          TypeError on a non-string cell.
#   3. replace("nan","") - clears cells that stored the literal string "nan".
text_df = df[text_columns].fillna("").astype(str).replace("nan", "")

# One combined text per row; strip() removes the padding left by empty columns
# at either end.
text_df["merged_text"] = text_df.agg(" ".join, axis=1).str.strip()

# Keep only rows that contain some text. The explicit .copy() makes
# processed_df an independent frame, so the column assignments in later cells
# do not trigger SettingWithCopyWarning or write into a temporary slice.
processed_df = text_df[text_df["merged_text"] != ""].copy()
processed_df

In [None]:
# Snapshot the current column names of processed_df; the cells below iterate
# over this list, so it must be taken before any "*_embedding" column exists.
cols = list(processed_df.columns)
print("Columns in processed_df:", cols)

In [None]:
# Coerce every column listed in cols to str in one non-mutating call and
# rebind the result. Assigning column-by-column into the filtered frame would
# raise SettingWithCopyWarning; astype returns a fresh frame instead, which
# also keeps this cell idempotent on re-run.
processed_df = processed_df.astype({col: str for col in cols})

In [None]:
# Preview the first rows of the processed frame.
processed_df.head()

In [None]:
def get_user_embedding(text, fallback_dim=1536):
    """Encode one text into a normalized sentence embedding.

    Input that carries no text is mapped to an all-zero float32 vector so
    every row receives a vector of the same length as a real embedding.

    Args:
        text: The text to encode. ``None``, NaN, ``pd.NA``, the literal string
            ``"nan"`` and empty or whitespace-only strings count as "no text".
        fallback_dim: Length of the zero vector when the module-level
            ``model`` cannot report its embedding dimension.

    Returns:
        The result of ``model.encode(text, normalize_embeddings=True)``, or a
        1-D ``np.float32`` array of zeros when there is no text.
    """
    if isinstance(text, str):
        # Whitespace-only strings carry no content, so treat them like "".
        missing = text == "nan" or not text.strip()
    elif text is None or text is pd.NA:
        # Must be handled before any truthiness test: bool(pd.NA) raises
        # TypeError.
        missing = True
    else:
        missing = not text or pd.isna(text)

    if missing:
        # Size the placeholder from the loaded model rather than a magic
        # number, so it stays consistent if the encoder is swapped.
        get_dim = getattr(model, "get_sentence_embedding_dimension", None)
        dim = get_dim() if callable(get_dim) else None
        return np.zeros((dim or fallback_dim,), dtype=np.float32)

    return model.encode(text, normalize_embeddings=True)

In [None]:
# Add one "<column>_embedding" column per entry in cols. get_user_embedding is
# applied element-wise, so the encoder is invoked once per row and column.
for col in cols:
    embedded_series = processed_df[col].apply(get_user_embedding)
    processed_df[f"{col}_embedding"] = embedded_series

In [None]:
# Preview the first rows again, now including the "*_embedding" columns.
processed_df.head()

In [None]:
# Columns carried over to the output: merged_text plus every "*_embedding"
# column produced above.
embedding_cols = [
    name
    for name in processed_df.columns
    if name == "merged_text" or name.endswith("_embedding")
]

# Restrict the original frame to the rows kept in processed_df, attach the
# selected columns by index, then persist the combined frame as a pickle.
result_df = df.loc[processed_df.index]
result_df = result_df.join(processed_df[embedding_cols], how="left")
result_df.to_pickle("../../data/adhd-beliefs-pt/adhd-beliefs-pt-embeddings-serafim.pkl")
result_df