# Costruzione degli embedding del dataset matematico

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [None]:
# Load the math dataset from the Kaggle input directory into a DataFrame
# and print a summary of it with DataFrame.info().
ds = pd.read_parquet("/kaggle/input/dataset-math/dataset_math.parquet")
ds.info()

In [None]:
# Instantiate the all-MiniLM-L6-v2 sentence-embedding model; the cells below
# pass this object to the embedding helpers.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Split a text into overlapping windows of words (256 words, 50 shared, by default)
def sliding_window(text, max_length=256, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``max_length - overlap`` words apart, so each
    chunk shares ``overlap`` words with the previous one. Chunking stops as
    soon as a window reaches the end of the text, which avoids emitting a
    trailing chunk that is entirely contained in the previous one.

    Args:
        text: The string to split.
        max_length: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must be
            smaller than ``max_length``.

    Returns:
        A list of chunk strings, each made of words joined by single spaces.
        The list is empty when *text* contains no words.

    Raises:
        ValueError: If ``max_length`` is not positive or ``overlap`` is not
            smaller than ``max_length`` (the window could never advance).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of words")
    step = max_length - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_length")

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_length]))
        # This window already covers the last word: any further window would
        # only repeat a tail that is fully contained in this chunk.
        if start + max_length >= len(words):
            break
    return chunks

# Compute one aggregated embedding per text by averaging its window embeddings
def get_aggregated_embeddings(text, model):
    """Return the mean embedding of the sliding-window chunks of *text*.

    The text is split with ``sliding_window`` (default window settings), all
    chunks are encoded with ``model.encode`` in a single batched call, and
    the resulting vectors are averaged element-wise.

    Args:
        text: The string to embed.
        model: An object exposing ``encode``; in this notebook it is a
            ``SentenceTransformer`` instance.

    Returns:
        The element-wise mean of the chunk embeddings, or ``None`` when
        *text* contains no words (there is nothing to average).
    """
    chunks = sliding_window(text)
    if not chunks:
        # Averaging zero vectors would produce a scalar NaN and a
        # RuntimeWarning; return None, like the caller does for non-strings.
        return None
    # Encode all chunks in one call so the model can batch them.
    embeddings = model.encode(chunks)
    return np.mean(embeddings, axis=0)

In [None]:
# NOTE: plain assignment, not a copy -- `df` and `ds` refer to the same
# DataFrame, so the column added below is added to `ds` as well.
df = ds

# Embed every entry of the 'problem' column with the sliding-window strategy;
# entries that are not strings get None instead of an embedding.
df['embedding'] = df['problem'].apply(lambda x: get_aggregated_embeddings(x, model) if isinstance(x, str) else None)

# Save the dataset together with its embeddings as a parquet file.
df.to_parquet('file_con_embeddings_con_sliding_windows.parquet')

In [None]:
# Instantiate the all-MiniLM-L6-v2 sentence-embedding model again and rebind
# `model` (same model name as the earlier cell).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
def split_text_into_segments(text, max_length=256):
    """Split *text* into consecutive, non-overlapping segments of words.

    Args:
        text: The string to split on whitespace.
        max_length: Maximum number of words per segment.

    Returns:
        A list of strings, each holding up to ``max_length`` words joined by
        single spaces. The list is empty when *text* contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_length):
        window = tokens[start:start + max_length]
        pieces.append(' '.join(window))
    return pieces

def get_combined_embedding(text, model, max_length=256):
    """Return the mean embedding of the non-overlapping segments of *text*.

    The text is split with ``split_text_into_segments``, all segments are
    encoded with ``model.encode`` in a single batched call, and the resulting
    vectors are averaged element-wise.

    Args:
        text: The string to embed.
        model: An object exposing ``encode``; in this notebook it is a
            ``SentenceTransformer`` instance.
        max_length: Maximum number of words per segment.

    Returns:
        The element-wise mean of the segment embeddings, or ``None`` when
        *text* contains no words (there is nothing to average).
    """
    segments = split_text_into_segments(text, max_length)
    if not segments:
        # Averaging zero vectors would produce a scalar NaN and a
        # RuntimeWarning; return None, like the caller does for non-strings.
        return None
    # Encode all segments in one call so the model can batch them.
    embeddings = model.encode(segments)
    return np.mean(embeddings, axis=0)

# Reload the original dataset from the Kaggle input directory.
ds = pd.read_parquet("/kaggle/input/dataset-math/dataset_math.parquet")

# NOTE: plain assignment, not a copy -- `df` and `ds` refer to the same
# DataFrame, so the column added below is added to `ds` as well.
df = ds

# Embed every entry of the 'problem' column using non-overlapping segments;
# entries that are not strings get None instead of an embedding.
df['embedding'] = df['problem'].apply(lambda x: get_combined_embedding(x, model) if isinstance(x, str) else None)

# Save the dataset together with its embeddings as a parquet file.
df.to_parquet('file_con_embeddings_senza_sliding_windows.parquet')