In [None]:
import altair as alt
import ast
import base64
import numpy as np
import os
import pandas as pd
import umap

from PIL import Image
from io import BytesIO
from sklearn.preprocessing import StandardScaler

# Passing None as the max column width turns off truncation of long cell
# values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Identifiers of the document being processed. file_path() joins
# project_folder / sub_category / work_bucket / base_name (in that order)
# beneath the DATA_PATH root to locate the document's 'mathpix' directory.
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"
# NOTE(review): external_id is not referenced by any other cell visible in
# this notebook — presumably an identifier from an upstream step; confirm.
external_id = "2023_05_22_92dc0613b4493d7b5847g"

In [None]:
data_path = os.getenv('DATA_PATH')
if data_path is None:
    # Fail here with a clear message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the first file_path() call.
    raise EnvironmentError("The DATA_PATH environment variable is not set")


def file_path(*args):
    """Build a path inside this document's 'mathpix' directory.

    Args:
        *args: Additional path components appended after 'mathpix'.

    Returns:
        str: data_path / project_folder / sub_category / work_bucket /
        base_name / 'mathpix' / *args, joined with os.path.join.
    """
    return os.path.join(data_path, project_folder, sub_category, work_bucket,
                        base_name, 'mathpix', *args)


index_col = 'uuid'

# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(file_path('math_embedding_charts'), exist_ok=True)

# Shared scaler instance; create_embedding_viz refits it on every call.
scaler = StandardScaler()

In [None]:
# Load the annotated math table, using the column named by index_col as the index.
annotated_csv = file_path('extracted_annotated_math.csv')
df = pd.read_csv(annotated_csv, index_col=index_col)

In [None]:
# Convert the embedding cells read from the CSV (strings) back into Python
# objects. ast.literal_eval only accepts literals, so unlike eval it cannot
# execute code that happens to be stored in the file.
# NOTE(review): assumes each cell is a plain list literal of numbers — confirm.
for column in ('openai_math_embeddings',
               'openai_sympy_embeddings',
               'openai_clean_math_embeddings'):
    df[column] = df[column].apply(ast.literal_eval)

In [None]:
# Convert the embedding cells read from the CSV (strings) back into Python
# objects. ast.literal_eval only accepts literals, so unlike eval it cannot
# execute code that happens to be stored in the file.
# NOTE(review): assumes each cell is a plain list literal of numbers — confirm.
for column in ('codesearch_math_embeddings',
               'codesearch_sympy_embeddings',
               'codesearch_clean_math_embeddings'):
    df[column] = df[column].apply(ast.literal_eval)

In [None]:
# Convert the embedding cells read from the CSV (strings) back into Python
# objects. ast.literal_eval only accepts literals, so unlike eval it cannot
# execute code that happens to be stored in the file.
# NOTE(review): assumes each cell is a plain list literal of numbers — confirm.
for column in ('multi_qa_math_embeddings',
               'multi_qa_sympy_embeddings',
               'multi_qa_clean_math_embeddings'):
    df[column] = df[column].apply(ast.literal_eval)

In [None]:
# Convert the embedding cells read from the CSV (strings) back into Python
# objects. ast.literal_eval only accepts literals, so unlike eval it cannot
# execute code that happens to be stored in the file.
# NOTE(review): assumes each cell is a plain list literal of numbers — confirm.
for column in ('mathbert_math_embeddings',
               'mathbert_sympy_embeddings',
               'mathbert_clean_math_embeddings'):
    df[column] = df[column].apply(ast.literal_eval)

In [None]:
def format_image(im):
    """Serialize an image to a PNG data URI.

    Args:
        im: Object exposing ``save(fp, format)`` (e.g. a PIL image); it is
            asked to write itself as 'png' into an in-memory buffer.

    Returns:
        str: ``data:image/png;base64,<payload>`` where the payload is a
        single unbroken base64 string.
    """
    with BytesIO() as buffer:
        im.save(buffer, 'png')
        # b64encode emits one continuous string. encodebytes would insert a
        # newline after every 76 characters and at the end, which puts line
        # breaks inside the data URI.
        data = base64.b64encode(buffer.getvalue()).decode('utf-8')

    return f'data:image/png;base64,{data}'

def load_local_image(image_name):
    """Load an image from the 'math_images' directory as a thumbnail data URI.

    Args:
        image_name: File name of the image inside the 'math_images'
            directory resolved by ``file_path``.

    Returns:
        str: The value returned by ``format_image`` for the image after it
        has been shrunk in place to fit within 250x250 pixels.
    """
    # Image.open leaves the underlying file open; the context manager closes
    # it as soon as the thumbnail has been encoded, so applying this function
    # over a whole column does not accumulate open file handles.
    with Image.open(file_path('math_images', image_name)) as img:
        img.thumbnail((250, 250))
        return format_image(img)

# Build one thumbnail data URI per row from the image file named in 'display_math_image'.
df['embedded_math_image'] = [load_local_image(image_name)
                             for image_name in df['display_math_image']]

In [None]:
def create_embedding_viz(column, output_file):
    """Project one embedding column to two dimensions and save it as a chart.

    The vectors in ``df[column]`` are standardized with the module-level
    ``scaler``, reduced with ``umap.UMAP(random_state=42)``, and drawn as an
    interactive Altair scatter plot whose tooltip carries the matching entry
    of ``df['embedded_math_image']``.

    Args:
        column: Name of the ``df`` column holding the embedding vectors.
        output_file: File name of the chart, written inside the
            'math_embedding_charts' directory resolved by ``file_path``.
    """
    vectors = df[column].to_list()
    standardized = scaler.fit_transform(vectors)

    projection = umap.UMAP(random_state=42).fit_transform(standardized)

    # Drop df's index so the images line up positionally with the UMAP rows.
    thumbnails = df['embedded_math_image'].reset_index(drop=True)
    points = pd.DataFrame(projection, columns=['x', 'y']).assign(image=thumbnails)

    scatter = (
        alt.Chart(points)
        .mark_circle()
        .encode(x='x', y='y', tooltip=['image'])
        .interactive()
    )

    destination = file_path('math_embedding_charts', output_file)
    scatter.save(destination)

In [None]:
# One chart per OpenAI embedding column; each file is named '<column>.html'.
for column in ('openai_math_embeddings',
               'openai_sympy_embeddings',
               'openai_clean_math_embeddings'):
    create_embedding_viz(column, f'{column}.html')

In [None]:
# One chart per multi_qa embedding column; each file is named '<column>.html'.
for column in ('multi_qa_math_embeddings',
               'multi_qa_sympy_embeddings',
               'multi_qa_clean_math_embeddings'):
    create_embedding_viz(column, f'{column}.html')

In [None]:
# One chart per codesearch embedding column; each file is named '<column>.html'.
for column in ('codesearch_math_embeddings',
               'codesearch_sympy_embeddings',
               'codesearch_clean_math_embeddings'):
    create_embedding_viz(column, f'{column}.html')

In [None]:
# One chart per mathbert embedding column; each file is named '<column>.html'.
for column in ('mathbert_math_embeddings',
               'mathbert_sympy_embeddings',
               'mathbert_clean_math_embeddings'):
    create_embedding_viz(column, f'{column}.html')