In [None]:
import altair as alt
import ast
import base64
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import umap
import uuid

from PIL import Image
from io import BytesIO
from sklearn.preprocessing import StandardScaler

# Lift the column-width limit so long cell values are shown untruncated.
pd.options.display.max_colwidth = None

In [None]:
# Identifiers of the paper being processed. `project_folder`, `sub_category`,
# `work_bucket` and `base_name` are joined in that order (under DATA_PATH) by
# `file_path` to locate this paper's `mathpix` working directory.
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably the id of the upstream extraction job -- confirm.
external_id = "2023_05_22_92dc0613b4493d7b5847g"

In [None]:
# Root directory of all project data, supplied through the environment.
data_path = os.getenv('DATA_PATH')
if data_path is None:
    # Fail here with a clear message instead of later with an opaque
    # TypeError raised from os.path.join(None, ...).
    raise RuntimeError("The DATA_PATH environment variable is not set; "
                       "it must point to the data root directory.")


def file_path(*args):
    """Return a path inside this paper's ``mathpix`` working directory.

    Args:
        *args: Optional further path components appended below ``mathpix``,
            e.g. ``file_path('math_images', 'a.png')``.

    Returns:
        str: ``data_path/project_folder/sub_category/work_bucket/base_name/mathpix/...``.
    """
    return os.path.join(data_path, project_folder, sub_category, work_bucket,
                        base_name, 'mathpix', *args)


# Name of the CSV column used as the DataFrame index.
index_col = 'uuid'

# Output folders for the cached 2-D embeddings and the rendered charts.
# exist_ok=True keeps the cell re-runnable without a check-then-create race.
os.makedirs(file_path('math_embeddings'), exist_ok=True)
os.makedirs(file_path('math_embedding_charts'), exist_ok=True)

# Shared scaler instance; it is re-fitted for every embedding column.
scaler = StandardScaler()

In [None]:
# Load the annotated math table from the paper's mathpix folder, using the
# `index_col` column ('uuid') as the DataFrame index.
df = pd.read_csv(file_path('extracted_annotated_math.csv'), index_col=index_col)

In [None]:
# The embedding cells come back from the CSV as strings (presumably stringified
# lists of floats -- confirm). Parse them with ast.literal_eval instead of eval:
# it only accepts Python literals, so a corrupted or crafted cell cannot run code.
# df['openai_math_embeddings'] = df['openai_math_embeddings'].apply(ast.literal_eval)
df['openai_sympy_embeddings'] = df['openai_sympy_embeddings'].apply(ast.literal_eval)
df['openai_clean_math_embeddings'] = df['openai_clean_math_embeddings'].apply(ast.literal_eval)

In [None]:
# The embedding cells come back from the CSV as strings (presumably stringified
# lists of floats -- confirm). Parse them with ast.literal_eval instead of eval:
# it only accepts Python literals, so a corrupted or crafted cell cannot run code.
# df['codesearch_math_embeddings'] = df['codesearch_math_embeddings'].apply(ast.literal_eval)
df['codesearch_sympy_embeddings'] = df['codesearch_sympy_embeddings'].apply(ast.literal_eval)
df['codesearch_clean_math_embeddings'] = df['codesearch_clean_math_embeddings'].apply(ast.literal_eval)

In [None]:
# The embedding cells come back from the CSV as strings (presumably stringified
# lists of floats -- confirm). Parse them with ast.literal_eval instead of eval:
# it only accepts Python literals, so a corrupted or crafted cell cannot run code.
# df['multi_qa_math_embeddings'] = df['multi_qa_math_embeddings'].apply(ast.literal_eval)
df['multi_qa_sympy_embeddings'] = df['multi_qa_sympy_embeddings'].apply(ast.literal_eval)
df['multi_qa_clean_math_embeddings'] = df['multi_qa_clean_math_embeddings'].apply(ast.literal_eval)

In [None]:
# The embedding cells come back from the CSV as strings (presumably stringified
# lists of floats -- confirm). Parse them with ast.literal_eval instead of eval:
# it only accepts Python literals, so a corrupted or crafted cell cannot run code.
# df['mathbert_math_embeddings'] = df['mathbert_math_embeddings'].apply(ast.literal_eval)
df['mathbert_sympy_embeddings'] = df['mathbert_sympy_embeddings'].apply(ast.literal_eval)
df['mathbert_clean_math_embeddings'] = df['mathbert_clean_math_embeddings'].apply(ast.literal_eval)

In [None]:
def scale_image(image):
    """Return a resized copy of ``image`` for use as a chart tooltip.

    Images wider than tall get a target width of a quarter of the original
    width, but never less than 250 px (so narrow originals are enlarged).
    All other images get a target height of a quarter of the original height.
    The remaining dimension follows from the aspect ratio.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (e.g. a PIL image).

    Returns:
        Whatever ``image.resize`` returns for the computed size.
    """
    width, height = image.size
    aspect_ratio = width / height

    if width > height:
        new_width = max(int(width / 4), 250)
        new_height = int(new_width / aspect_ratio)
    else:
        new_height = int(height / 4)
        new_width = int(new_height * aspect_ratio)

    # Integer truncation can produce 0 for very elongated or very small
    # images, and a zero-sized target is rejected by resize; keep >= 1 px.
    new_size = (max(new_width, 1), max(new_height, 1))

    return image.resize(new_size)

In [None]:
def format_image(im):
    """Serialise an image as a PNG ``data:`` URI.

    Args:
        im: Object with a ``save(file_obj, format)`` method (e.g. a PIL image).

    Returns:
        str: ``data:image/png;base64,<payload>`` with the payload on a single line.
    """
    with BytesIO() as buffer:
        im.save(buffer, 'png')
        # b64encode yields one unbroken string; encodebytes would insert a
        # newline every 76 characters (plus a trailing one) into the URI.
        data = base64.b64encode(buffer.getvalue()).decode('ascii')

    return f'data:image/png;base64,{data}'

def load_local_image(image_name):
    """Load an image from the ``math_images`` folder as a scaled PNG data URI.

    Args:
        image_name: File name inside ``file_path('math_images')``.

    Returns:
        str: The data URI produced by ``format_image`` for the scaled image.
    """
    # The context manager closes the underlying file once the scaled copy has
    # been encoded, so applying this to a whole column does not leak handles.
    with Image.open(file_path('math_images', image_name)) as img:
        return format_image(scale_image(img))

# Turn every referenced image file into an inline data URI, one per row.
df['embedded_math_image'] = df['display_math_image'].map(load_local_image)

In [None]:
def create_or_load_embedding(column):
    """Return the 2-D UMAP projection of an embedding column, cached as CSV.

    If ``math_embeddings/<column>_reduced_embeddings.csv`` already exists it is
    loaded as-is; delete the file to force a recomputation. Otherwise the
    column is standardised, reduced with UMAP (``random_state=42``) and the
    result is written to that file.

    Args:
        column: Name of a ``df`` column whose cells are embedding vectors.

    Returns:
        pandas.DataFrame: Indexed by ``index_col`` with columns ``x``, ``y``,
        ``paper_annotation`` and ``image``.
    """
    math_embeddings_file = file_path('math_embeddings', f'{column}_reduced_embeddings.csv')

    if os.path.exists(math_embeddings_file):
        return pd.read_csv(math_embeddings_file, index_col=index_col)

    scaled_data = scaler.fit_transform(df[column].to_list())

    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(scaled_data)

    embedding_df = pd.DataFrame(embedding, columns=['x', 'y'])
    # Rows of `embedding` are in the same order as `df`, so copy the metadata
    # positionally rather than by index alignment.
    embedding_df['paper_annotation'] = df['paper_annotation'].to_numpy()
    embedding_df['image'] = df['embedded_math_image'].to_numpy()
    # Name the index after `index_col` so the file written here is always
    # readable by the cached branch above, whatever `index_col` is set to.
    embedding_df.index = df.index.rename(index_col)

    embedding_df.to_csv(math_embeddings_file)

    return embedding_df

def create_embedding_viz(column):
    """Write two interactive HTML scatter charts for one embedding column.

    ``<column>_circle.html`` plots each point as a circle and
    ``<column>_text.html`` plots each point as its ``paper_annotation`` text.
    Both use the ``image`` field as tooltip and are saved in the
    ``math_embedding_charts`` folder. Returns ``None``.
    """
    points = create_or_load_embedding(column).reset_index(drop=True)
    typeface = 'Times New Roman'

    def styled(chart):
        # Same typeface for titles, axes, legends and text marks.
        return (
            chart
            .configure_title(font=typeface)
            .configure_axis(labelFont=typeface, titleFont=typeface)
            .configure_legend(labelFont=typeface, titleFont=typeface)
            .configure_text(font=typeface)
        )

    def export(chart, suffix):
        # Both variants share the column name as their download file name.
        target = file_path('math_embedding_charts', f'{column}_{suffix}.html')
        chart.save(target, embed_options={"downloadFileName": column})

    circles = alt.Chart(points).mark_circle().encode(
        x='x',
        y='y',
        tooltip=['image'],
    ).interactive()
    export(styled(circles), 'circle')

    labels = alt.Chart(points).mark_text().encode(
        x='x',
        y='y',
        text='paper_annotation',
        tooltip=['image'],
    ).interactive()
    export(styled(labels), 'text')

In [None]:
# Write the circle and text charts for the `openai_*` embedding columns.
# The raw-math call below is disabled; its second argument is stale, since
# create_embedding_viz accepts only the column name.
# create_embedding_viz('openai_math_embeddings', 'openai_math_embeddings.html')
create_embedding_viz('openai_sympy_embeddings')
create_embedding_viz('openai_clean_math_embeddings')

In [None]:
# Write the circle and text charts for the `multi_qa_*` embedding columns.
# The raw-math call below is disabled; its second argument is stale, since
# create_embedding_viz accepts only the column name.
# create_embedding_viz('multi_qa_math_embeddings', 'multi_qa_math_embeddings.html')
create_embedding_viz('multi_qa_sympy_embeddings')
create_embedding_viz('multi_qa_clean_math_embeddings')

In [None]:
# Write the circle and text charts for the `codesearch_*` embedding columns.
# The raw-math call below is disabled; its second argument is stale, since
# create_embedding_viz accepts only the column name.
# create_embedding_viz('codesearch_math_embeddings', 'codesearch_math_embeddings.html')
create_embedding_viz('codesearch_sympy_embeddings')
create_embedding_viz('codesearch_clean_math_embeddings')

In [None]:
# Write the circle and text charts for the `mathbert_*` embedding columns.
# The raw-math call below is disabled; its second argument is stale, since
# create_embedding_viz accepts only the column name.
# create_embedding_viz('mathbert_math_embeddings', 'mathbert_math_embeddings.html')
create_embedding_viz('mathbert_sympy_embeddings')
create_embedding_viz('mathbert_clean_math_embeddings')

In [None]:
def get_circle_chart(column):
    """Build (without saving) a titled, interactive circle chart for ``column``.

    The reduced embedding is obtained from ``create_or_load_embedding``; the
    chart plots ``x`` against ``y``, uses the ``image`` field as tooltip and
    carries the column name as its title.
    """
    points = create_or_load_embedding(column).reset_index(drop=True)

    scatter = alt.Chart(points).mark_circle()
    scatter = scatter.encode(x='x', y='y', tooltip=['image'])
    scatter = scatter.properties(title=column)

    return scatter.interactive()

def create_group_embedding_viz(column_a, column_b, column_c, column_d,
                               output_name='combined_circle'):
    """Save a 2x2 grid of circle charts comparing four embedding columns.

    Layout: ``column_a`` above ``column_b`` in the left column, ``column_c``
    above ``column_d`` in the right column.

    Args:
        column_a, column_b, column_c, column_d: Embedding column names, each
            rendered through ``get_circle_chart``.
        output_name: Base name (without extension) of the HTML file written to
            ``math_embedding_charts``; also used as the chart's download file
            name. The default keeps the previous fixed name, so pass a
            different value to avoid overwriting an earlier comparison.
    """
    chart_a, chart_b, chart_c, chart_d = (
        get_circle_chart(name) for name in (column_a, column_b, column_c, column_d)
    )

    combined_chart = alt.hconcat(
        alt.vconcat(chart_a, chart_b),
        alt.vconcat(chart_c, chart_d)
    )

    combined_chart = combined_chart.configure_title(
        font='Times New Roman'
    ).configure_axis(
        labelFont='Times New Roman',
        titleFont='Times New Roman'
    ).configure_legend(
        labelFont='Times New Roman',
        titleFont='Times New Roman'
    ).configure_text(
        font='Times New Roman'
    )

    combined_chart.save(file_path('math_embedding_charts', f'{output_name}.html'),
                        embed_options={"downloadFileName": output_name})

In [None]:
# Side-by-side comparison of the four `*_clean_math_embeddings` columns in one
# 2x2 chart, saved as combined_circle.html in the math_embedding_charts folder.
create_group_embedding_viz('openai_clean_math_embeddings', 'multi_qa_clean_math_embeddings', 
                           'codesearch_clean_math_embeddings', 'mathbert_clean_math_embeddings')

In [None]:
# img = Image.open(file_path('math_images', '1_6998d986-fca2-4650-a4bd-4656e657fa72.png'))
# width, height = img.size
# print(f'{width} {height}')
# new_width = 500  # or whatever value you want
# new_height = int(new_width * height / width)

# max_size = 50
# img.thumbnail((250))

# plt.imshow(resized_image)
# plt.axis('off')  # Remove the axis labels
# plt.show()