# Visualize MMLU and Training Dataset Question Embeddings Using PCA

In [3]:
!pip install -q torch
!pip install -q transformers
!pip install -q datasets
!pip install -q pandas
!pip install matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Get Embeddings For MMLU Validation Set

In [4]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMultipleChoice, AutoModel
from datasets import load_dataset, get_dataset_config_names
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

In [5]:
# Checkpoint used to embed every question in this notebook.
MODEL_NAME = "intfloat/multilingual-e5-large-instruct"

# Run on the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the sentence-transformers model, then move it onto the chosen device.
embed_model = SentenceTransformer(MODEL_NAME)
embed_model = embed_model.to(device)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

sentence_xlm-roberta_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [6]:
def get_embedding(question):
    """Encode ``question`` with the notebook-level ``embed_model``.

    The model is asked for a tensor on ``device``; that tensor is then copied
    to the CPU, converted to a NumPy array, and returned via ``tolist()`` so
    the result is built from plain Python objects.
    """
    encoded = embed_model.encode(
        question,
        convert_to_tensor=True,
        device=device,
    )
    host_array = encoded.cpu().numpy()
    return host_array.tolist()

In [18]:
# Validation split of the "all" configuration of cais/mmlu.
mmlu = load_dataset("cais/mmlu", "all", split="validation")

# One record per question: the text, its subject, a constant "MMLU" source
# label, and the embedding produced by get_embedding().
mmlu_data = [
    {
        "question": row["question"],
        "subject": row["subject"],
        "source": "MMLU",
        "embedding": get_embedding(row["question"]),
    }
    for row in tqdm(mmlu, desc="Processing MMLU questions: ")
]

# Tabulate the records and preview the first rows.
embedding_df = pd.DataFrame(mmlu_data)
embedding_df.head()

Processing MMLU questions: 100%|██████████| 1531/1531 [01:01<00:00, 24.99it/s]


Unnamed: 0,question,subject,source,embedding
0,The cyclic subgroup of Z_24 generated by 18 ha...,abstract_algebra,MMLU,"[0.04232871159911156, 0.03739118203520775, -0...."
1,Find the order of the factor group Z_6/<3>.,abstract_algebra,MMLU,"[0.039315007627010345, 0.04634949192404747, -0..."
2,Statement 1 | A permutation that is a product ...,abstract_algebra,MMLU,"[0.025039682164788246, 0.010647515766322613, -..."
3,Find the order of the factor group (Z_4 x Z_12...,abstract_algebra,MMLU,"[0.03546808287501335, 0.03807569295167923, -0...."
4,Find the maximum possible order for some eleme...,abstract_algebra,MMLU,"[0.046966664493083954, 0.03703867644071579, -0..."


In [30]:
# Mount Google Drive so the training Arrow file can be read (Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

# Read the Arrow file, shuffle it with a fixed seed, and keep the first 1500 rows.
training_full = load_dataset(
    "arrow",
    data_files="/content/drive/MyDrive/DATASCI_266/Project/data-00000-of-00001.arrow",
)["train"]
training = training_full.shuffle(seed=42).select(range(1500))

# One record per sampled question. For training rows the "source" label is the
# row's own "subject" value, so both columns carry the same string.
train_data = [
    {
        "question": row["question"],
        "subject": row["subject"],
        "source": row["subject"],
        "embedding": get_embedding(row["question"]),
    }
    for row in tqdm(training, desc="Processing training questions: ")
]

# Tabulate the records and preview the first rows.
embedding_df2 = pd.DataFrame(train_data)
embedding_df2.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Generating train split: 0 examples [00:00, ? examples/s]

Processing training questions: 100%|██████████| 1500/1500 [00:41<00:00, 36.54it/s]


Unnamed: 0,question,subject,source,embedding
0,the cash realised on selling a 14 % stock is r...,MathQA,MathQA,"[0.033772680908441544, 0.029479950666427612, -..."
1,It is vital to your health to have a,OpenBook-Additional,OpenBook-Additional,"[0.05224911868572235, 0.03316878527402878, -0...."
2,What is the difference between a causal and no...,ComSciQA,ComSciQA,"[0.0028976735193282366, 0.002486339071765542, ..."
3,What might a saw be part?,Commonsense,Commonsense,"[0.02964751049876213, 0.024703163653612137, -0..."
4,Which of the following is a characteristic of ...,ComSciQA,ComSciQA,"[0.005619276314973831, 0.014857414178550243, -..."


In [31]:
# Stack the MMLU and training frames into one table with a fresh 0..n-1 index.
concat_df = pd.concat([embedding_df, embedding_df2], ignore_index=True)

# Sanity check shown as the cell output: length of the first row's embedding.
# (A bare `concat_df.head()` placed before this line was never displayed,
# because only the last expression of a cell is rendered, so it was dropped.)
len(concat_df.iloc[0]["embedding"])

1024

## PCA Visualization

In [32]:
from sklearn.decomposition import PCA
import altair as alt

In [33]:
# Project each embedding onto the first two principal components.
# random_state is fixed (same seed as the dataset shuffle) because scikit-learn
# may choose a randomized SVD solver for an input of this size, in which case
# the projection would otherwise vary from run to run.
pca = PCA(n_components=2, random_state=42)

# PCA expects a 2-D array-like, so expand the column of lists into a list of rows.
embedding_matrix = concat_df["embedding"].tolist()
concat_df[["PCA1", "PCA2"]] = pca.fit_transform(embedding_matrix)
concat_df.head()

Unnamed: 0,question,subject,source,embedding,PCA1,PCA2
0,The cyclic subgroup of Z_24 generated by 18 ha...,abstract_algebra,MMLU,"[0.04232871159911156, 0.03739118203520775, -0....",0.093593,-0.036517
1,Find the order of the factor group Z_6/<3>.,abstract_algebra,MMLU,"[0.039315007627010345, 0.04634949192404747, -0...",0.026524,-0.100886
2,Statement 1 | A permutation that is a product ...,abstract_algebra,MMLU,"[0.025039682164788246, 0.010647515766322613, -...",0.101165,-0.009662
3,Find the order of the factor group (Z_4 x Z_12...,abstract_algebra,MMLU,"[0.03546808287501335, 0.03807569295167923, -0....",0.067061,-0.124502
4,Find the maximum possible order for some eleme...,abstract_algebra,MMLU,"[0.046966664493083954, 0.03703867644071579, -0...",0.021541,-0.121424


In [34]:
# Write the combined table (including the PCA1/PCA2 columns) to a CSV file in
# the current working directory, without the DataFrame index.
embeddings_csv_path = "embed_multilingual-e5-large-instruct_embeddings.csv"
concat_df.to_csv(embeddings_csv_path, index=False)

In [None]:
# Hand Altair only the columns the chart encodes. The chart carries its data
# with it, so passing the full frame would also serialize the long "embedding"
# list of every row into the chart.
chart_df = concat_df[["PCA1", "PCA2", "source", "subject"]]

# Legend-bound selection: clicking a legend entry selects that source.
# Altair 5 renamed selection_multi/add_selection to selection_point/add_params;
# use the new names when they exist and fall back to the old ones otherwise.
if hasattr(alt, "selection_point"):
    selection = alt.selection_point(fields=["source"], bind="legend")
else:
    selection = alt.selection_multi(fields=["source"], bind="legend")

scatter = alt.Chart(chart_df).mark_circle(size=80).encode(
    x="PCA1:Q",
    y="PCA2:Q",
    color=alt.Color("source:N", legend=alt.Legend(title="Dataset")),
    tooltip=["source:N", "subject:N"],  # Show source and subject on hover
    # Selected points are fully opaque; unselected points get opacity 0 (hidden).
    opacity=alt.condition(selection, alt.value(1), alt.value(0.00))
)

# Register the selection on the chart under whichever method name is available.
if hasattr(scatter, "add_params"):
    scatter = scatter.add_params(selection)
else:
    scatter = scatter.add_selection(selection)

scatter = scatter.properties(
    width=800,
    height=500,
    title="MMLU and Training Question Embeddings (PCA Projection)"
).interactive()

# Increase font sizes
scatter = scatter.configure_axis(
    labelFontSize=14,
    titleFontSize=16
).configure_title(
    fontSize=18
).configure_legend(
    titleFontSize=14,  # Set legend title font size
    labelFontSize=12   # Set legend label font size
)

In [None]:
# The rendered chart is too large for GitHub upload, so no output is stored
# for this cell.
# Evaluating `scatter` as the last expression of the cell displays the chart.
scatter