# Visualize MMLU and Training Dataset Question Embeddings Using PCA and t-SNE

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import altair as alt
import pandas as pd 
import ast
import numpy as np

In [3]:
# Both metadata files are read with the same column subset; every other
# column in the CSVs is skipped at load time.
metadata_columns = ['question', 'source', 'subject', 'embedding']

mmlu_df = pd.read_csv(
    "../training/mmlu_test_metadata.csv",
    usecols=metadata_columns,
)
combine_df = pd.read_csv(
    "../training/train_metadata.csv",
    usecols=metadata_columns,
)

In [4]:
# Draw a reproducible random sample from each dataset: shuffle every row with
# a fixed seed, renumber the index from 0, then keep the first 1500 rows.
mmlu_df = (
    mmlu_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
mmlu_df_subset = mmlu_df.head(1500)

combine_df = (
    combine_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
combine_df_subset = combine_df.head(1500)

In [5]:
# Stack the MMLU sample on top of the training sample and renumber the rows
# so the combined frame has a single contiguous index.
concat_df = pd.concat(
    objs=[mmlu_df_subset, combine_df_subset],
    axis=0,
    ignore_index=True,
)
concat_df.shape

(3000, 4)

In [6]:
# Convert embeddings to lists
# Each embedding is read from the CSV as the string form of a Python list, so
# it is parsed back into a real list here. Values that are no longer strings
# (i.e. this cell has already been run once) are passed through untouched;
# ast.literal_eval raises ValueError when handed a list, which previously made
# this cell fail on re-run.
concat_df['embedding'] = concat_df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

## PCA Visualization

In [7]:
# Project the embeddings onto their first two principal components.
# random_state is pinned because, with svd_solver left at 'auto', scikit-learn
# may choose the randomized SVD solver for a matrix of this size; without a
# seed the PCA1/PCA2 coordinates could then differ slightly from run to run.
pca = PCA(n_components=2, random_state=42)
concat_df[['PCA1', 'PCA2']] = pca.fit_transform(concat_df['embedding'].tolist())
concat_df.head()

Unnamed: 0,question,subject,source,embedding,PCA1,PCA2
0,If a psychologist acts as both a fact witness ...,professional_psychology,MMLU,"[-0.0007376694702543318, -0.021310580894351006...",0.104445,0.172764
1,In what ways does the perspective on terrorism...,security_studies,MMLU,"[0.023837367072701454, 0.015235863626003265, -...",-0.152022,0.040882
2,George was charged with the murder of a well- ...,professional_law,MMLU,"[0.04566001519560814, 0.023941989988088608, -0...",0.143341,0.171975
3,Can countries rely on their domestic law as an...,international_law,MMLU,"[0.004619034007191658, 0.019672941416502, -0.0...",-0.062229,0.109203
4,A 30-year-old nulliparous female presents to t...,professional_medicine,MMLU,"[0.016010042279958725, 0.01382381934672594, -0...",0.113146,0.136764


In [8]:
# Legend-bound selection: clicking a legend entry selects that dataset, and
# points whose source is not selected are drawn with opacity 0 (hidden).
selection = alt.selection_point(fields=['source'], bind='legend')

# Altair serialises every column of the DataFrame it receives into the chart
# specification. Passing the full frame would embed the list-valued
# 'embedding' column for every row, bloating the rendered output, so only the
# columns that are encoded or shown in the tooltip are handed to the chart.
pca_chart_data = concat_df[['PCA1', 'PCA2', 'question', 'source', 'subject']]

scatter = alt.Chart(pca_chart_data).mark_circle(size=50).encode(
    x = "PCA1:Q",
    y = "PCA2:Q",
    color = alt.Color("source:N", legend=alt.Legend(title='Dataset'), scale=alt.Scale(scheme='tableau10')),
    tooltip=['question', 'source', 'subject'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0))
).add_params(
    selection
).properties(
    width=800,
    height=500,
    title="PCA of Dataset Question Embeddings"
).interactive()

# Chart-wide font sizes for axes, title and legend.
scatter = scatter.configure_axis(
    labelFontSize=14,
    titleFontSize=16
).configure_title(
    fontSize=18
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

In [None]:
# Show the interactive PCA chart inline (bare last expression -> rich display).
# Note: the resulting graph is too large for GitHub upload.
scatter

In [10]:
# Export the PCA chart as a static PNG next to the notebook (relative path);
# ppi=200 is passed through to the image export.
scatter.save('dataset_pca.png', ppi=200)

## t-SNE Visualization

In [15]:
# Sanity check before t-SNE: the embedding column is a 1-D Series with one
# (list-valued) entry per row, so it must be stacked into a 2-D array first.
concat_df['embedding'].shape

(3000,)

In [18]:
# Stack the per-row embedding lists into a single 2-D float matrix.
# The embedding width is taken from the data itself rather than forced with a
# hard-coded reshape(-1, 1024): for equal-length lists the array is already
# (n_rows, embedding_dim), and for any other width a forced reshape would
# either re-chunk values across rows or fail later with an unrelated
# length-mismatch error.
embeddings = np.asarray(concat_df['embedding'].tolist(), dtype=float)
if embeddings.ndim != 2 or embeddings.shape[0] != len(concat_df):
    raise ValueError(
        "Expected one fixed-length embedding per row, "
        f"got an array of shape {embeddings.shape}"
    )

# random_state fixes the seed so the 2-D layout is repeatable between runs.
tsne = TSNE(n_components=2, perplexity=10, random_state=42)
concat_df[['t-SNE_x', 't-SNE_y']] = tsne.fit_transform(embeddings)
concat_df.head()

Unnamed: 0,question,subject,source,embedding,PCA1,PCA2,t-SNE_x,t-SNE_y
0,If a psychologist acts as both a fact witness ...,professional_psychology,MMLU,"[-0.0007376694702543318, -0.021310580894351006...",0.104445,0.172764,55.748711,50.932869
1,In what ways does the perspective on terrorism...,security_studies,MMLU,"[0.023837367072701454, 0.015235863626003265, -...",-0.152022,0.040882,-45.001488,36.72179
2,George was charged with the murder of a well- ...,professional_law,MMLU,"[0.04566001519560814, 0.023941989988088608, -0...",0.143341,0.171975,59.075195,50.884491
3,Can countries rely on their domestic law as an...,international_law,MMLU,"[0.004619034007191658, 0.019672941416502, -0.0...",-0.062229,0.109203,-37.101765,38.614872
4,A 30-year-old nulliparous female presents to t...,professional_medicine,MMLU,"[0.016010042279958725, 0.01382381934672594, -0...",0.113146,0.136764,33.748497,40.225315


In [None]:
# Legend-bound selection: clicking a legend entry selects that dataset, and
# points whose source is not selected are drawn with opacity 0 (hidden).
selection = alt.selection_point(fields=['source'], bind='legend')

# Altair serialises every column of the DataFrame it receives into the chart
# specification. Passing the full frame would embed the list-valued
# 'embedding' column for every row, bloating the rendered output, so only the
# columns that are encoded or shown in the tooltip are handed to the chart.
tsne_chart_data = concat_df[['t-SNE_x', 't-SNE_y', 'question', 'source', 'subject']]

# NOTE: from here on `tsne` refers to the Altair chart, replacing the TSNE
# estimator bound to the same name earlier; the name is kept because the
# following save cell calls `tsne.save(...)`.
tsne = alt.Chart(tsne_chart_data).mark_circle(size=50).encode(
    x = "t-SNE_x:Q",
    y = "t-SNE_y:Q",
    color = alt.Color("source:N", legend=alt.Legend(title='Dataset'), scale=alt.Scale(scheme='tableau10')),
    tooltip=['question', 'source', 'subject'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0))
).add_params(
    selection
).properties(
    width=800,
    height=500,
    title="t-SNE of Dataset Question Embeddings"
).interactive()

# Chart-wide font sizes for axes, title and legend.
tsne = tsne.configure_axis(
    labelFontSize=14,
    titleFontSize=16
).configure_title(
    fontSize=18
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)

tsne

In [20]:
# Export the t-SNE chart as a static PNG next to the notebook (relative path);
# ppi=200 is passed through to the image export.
tsne.save('dataset_tsne.png', ppi=200)