In [1]:
import altair as alt
import ast
import base64
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scrapbook as sb
import umap
import uuid

from PIL import Image
from io import BytesIO
from sklearn.preprocessing import StandardScaler

# Lift pandas' column-width limit so long text cells are displayed in full
# rather than truncated.
pd.set_option('display.max_colwidth', None)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Directory components that are joined (in this order) beneath the data root
# to locate the working files.
project_folder, sub_category, work_bucket = "diygenomics-projects", "DATA", "RSIDs"

# Version suffix embedded in the input file name; the derived output folder
# names reuse it as well.
version = "_061823"
input_file = "truth_alzheimers_RSIDs_consolidated" + version + ".csv"

In [3]:
# Root directory of all data, read from the environment so the notebook
# carries no machine-specific absolute path.
data_path = os.getenv('DATA_PATH')
if data_path is None:
    # Fail here with a clear message instead of a confusing TypeError from
    # os.path.join the first time a path is built.
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "point it at the directory that contains the project data."
    )


def file_path(*args):
    """Return a path inside the working bucket.

    Joins ``data_path``, ``project_folder``, ``sub_category`` and
    ``work_bucket`` and then appends every component given in ``args``.
    Called with no arguments it returns the working bucket directory itself.
    """
    return os.path.join(data_path, project_folder, sub_category, work_bucket, *args)


# Column used as the DataFrame index when reading the input CSV and the
# cached reduced-embedding CSVs.
index_col = 'uuid'

# Output folders are suffixed with the data version.
embeddings_folder = f'reduced_embeddings{version}'
charts_folder = f'embedding_charts{version}'

# exist_ok=True creates missing folders and is a no-op for existing ones, so
# the cell can be re-run without a separate existence check.
for _folder in (embeddings_folder, charts_folder):
    os.makedirs(file_path(_folder), exist_ok=True)

# Shared scaler instance; it is re-fitted on every fit_transform call.
scaler = StandardScaler()

In [4]:
# Load the input CSV from the working bucket, using the `index_col` column
# as the DataFrame index.
df = pd.read_csv(file_path(input_file), index_col=index_col)

In [5]:
def _parse_embedding(value):
    """Convert one stored embedding cell back into a Python object.

    The embedding columns come out of the CSV as text. ``ast.literal_eval``
    only accepts Python literals, so unlike ``eval`` it cannot execute code
    that might be embedded in the file.

    Args:
        value: The cell content. Presumably the string form of a list of
            floats -- confirm against the code that wrote the CSV.

    Returns:
        The parsed literal, or ``value`` unchanged when it is already a list
        (which makes re-running this cell harmless).

    Raises:
        ValueError, SyntaxError: If ``value`` is not a valid Python literal
            (this includes missing values such as NaN).
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# Every embedding column is decoded the same way.
for _column in (
    'openai_combined_data_embeddings',
    'codesearch_combined_data_embeddings',
    'multi_qa_combined_data_embeddings',
    'mathbert_combined_data_embeddings',
):
    df[_column] = df[_column].apply(_parse_embedding)

In [6]:
def create_or_load_embedding(column):
    """Return the 2-D UMAP projection of one embedding column, cached on disk.

    If ``<column>_reduced_embeddings.csv`` already exists in the embeddings
    folder it is loaded and returned as-is. Otherwise the vectors in
    ``df[column]`` are standardised with the shared ``scaler``, reduced with
    UMAP (``random_state=42``), written to that CSV and returned.

    Args:
        column: Name of the column in ``df`` that holds the embedding vectors.

    Returns:
        A DataFrame indexed by ``index_col`` with the columns ``x``, ``y``,
        ``combined_data`` and ``tooltip``.
    """
    embeddings_file = file_path(embeddings_folder, f'{column}_reduced_embeddings.csv')

    # Cache hit: skip the projection entirely.
    if os.path.exists(embeddings_file):
        return pd.read_csv(embeddings_file, index_col=index_col)

    scaled_data = scaler.fit_transform(df[column].to_list())

    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(scaled_data)

    # Build the frame directly on the source index and name it with
    # `index_col`, so the file written here is read back by the cache branch
    # above with the very same index column.
    embedding_df = pd.DataFrame(
        embedding,
        columns=['x', 'y'],
        index=pd.Index(df.index, name=index_col),
    )
    # to_numpy() copies the values positionally (row i of df -> row i here)
    # without relying on index alignment.
    embedding_df['combined_data'] = df['combined_data'].to_numpy()
    embedding_df['tooltip'] = df['tooltip'].to_numpy()

    embedding_df.to_csv(embeddings_file)

    return embedding_df

def create_embedding_viz(column):
    """Save an interactive scatter chart of one embedding column as HTML.

    The reduced (x, y) points come from ``create_or_load_embedding``. Each
    point shows its ``tooltip`` text on hover. The chart is written to
    ``<column>_circle.html`` in the charts folder; nothing is returned.

    Args:
        column: Name of the embedding column to visualise.
    """
    points = create_or_load_embedding(column).reset_index(drop=True)

    scatter = (
        alt.Chart(points)
        .mark_circle()
        .encode(x='x', y='y', tooltip=['tooltip'])
        .interactive()
    )

    # One typeface for every text element of the chart.
    typeface = 'Times New Roman'
    label_and_title = {'labelFont': typeface, 'titleFont': typeface}

    styled = scatter.configure_title(font=typeface)
    styled = styled.configure_axis(**label_and_title)
    styled = styled.configure_legend(**label_and_title)
    styled = styled.configure_text(font=typeface)

    target = file_path(charts_folder, f'{column}_circle.html')
    styled.save(target, embed_options={"downloadFileName": column})

In [7]:
# Build (or load from cache) the 2-D projection of the
# `openai_combined_data_embeddings` column and save its scatter chart as HTML.
create_embedding_viz('openai_combined_data_embeddings')

In [8]:
# Build (or load from cache) the 2-D projection of the
# `multi_qa_combined_data_embeddings` column and save its scatter chart as HTML.
create_embedding_viz('multi_qa_combined_data_embeddings')

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [9]:
# Build (or load from cache) the 2-D projection of the
# `codesearch_combined_data_embeddings` column and save its scatter chart as HTML.
create_embedding_viz('codesearch_combined_data_embeddings')

In [10]:
# Build (or load from cache) the 2-D projection of the
# `mathbert_combined_data_embeddings` column and save its scatter chart as HTML.
create_embedding_viz('mathbert_combined_data_embeddings')

In [11]:
def get_circle_chart(column):
    """Build an interactive scatter chart, titled with the column name.

    Args:
        column: Name of the embedding column; passed to
            ``create_or_load_embedding`` and also used as the chart title.

    Returns:
        The Altair chart object. Nothing is saved here and no font
        configuration is applied.
    """
    points = create_or_load_embedding(column).reset_index(drop=True)

    marks = alt.Chart(points).mark_circle()
    encoded = marks.encode(x='x', y='y', tooltip=['tooltip'])
    titled = encoded.properties(title=column)
    return titled.interactive()

def create_group_embedding_viz(column_a, column_b, column_c, column_d):
    """Save a 2x2 grid of scatter charts for four embedding columns as HTML.

    Layout: ``column_a`` top-left, ``column_b`` bottom-left, ``column_c``
    top-right, ``column_d`` bottom-right. The grid is written to
    ``combined_circle.html`` in the charts folder; nothing is returned.

    Args:
        column_a, column_b, column_c, column_d: Names of the embedding
            columns, each charted by ``get_circle_chart``.
    """
    top_left, bottom_left, top_right, bottom_right = [
        get_circle_chart(name)
        for name in (column_a, column_b, column_c, column_d)
    ]

    # Two vertical stacks placed side by side form the grid.
    left_stack = alt.vconcat(top_left, bottom_left)
    right_stack = alt.vconcat(top_right, bottom_right)
    grid = alt.hconcat(left_stack, right_stack)

    # One typeface for every text element of the combined chart.
    typeface = 'Times New Roman'
    label_and_title = {'labelFont': typeface, 'titleFont': typeface}

    grid = grid.configure_title(font=typeface)
    grid = grid.configure_axis(**label_and_title)
    grid = grid.configure_legend(**label_and_title)
    grid = grid.configure_text(font=typeface)

    target = file_path(charts_folder, f'combined_circle.html')
    grid.save(target, embed_options={"downloadFileName": 'combined_circle'})

In [12]:
# Save all four embedding scatter charts together in one combined HTML file.
create_group_embedding_viz('openai_combined_data_embeddings', 'multi_qa_combined_data_embeddings', 
                           'codesearch_combined_data_embeddings', 'mathbert_combined_data_embeddings')

In [None]:
# sb.glue('status', 'completed')