## Dimension reduction of GPT-2 SOTU embeddings

Run this notebook in the root directory (where the SOTU files can be accessed) to replicate the dimension reduction experiments (UMAP, TriMAP, PaCMAP) applied to the GPT-2 embeddings of the SOTU presidential addresses.

A second part of the notebook covers authorship attribution using nearest neighbors in the original GPT-2 embedding.

This notebook can be run on a CPU instance. 

# Initializing

In [None]:
!pip -q install tensorflow keras-nlp faiss-cpu umap-learn trimap pacmap matplotlib pandas seaborn

# Imports

In [None]:
import tensorflow as tf
from keras_nlp.models import GPT2Tokenizer, GPT2Preprocessor, GPT2Backbone
import matplotlib.pyplot as plt
import numpy as np
import faiss
import umap, trimap, pacmap
import os
import pickle
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

# Model and dataset

Chunk length and shift for the sliding window

In [None]:
# Length of each sliding window and the shift between consecutive windows
WINDOW_LEN = 512
WINDOW_SHIFT = 256

Using GPT-2

In [None]:
#
# Load GPT-2 model and tokenizer
#
# Tokenizer from the "gpt2_base_en" preset
gpt2_tokenizer = GPT2Tokenizer.from_preset("gpt2_base_en")
# Preprocessor built on that tokenizer, with sequence_length set to WINDOW_LEN
gpt2_preprocessor = GPT2Preprocessor(gpt2_tokenizer, sequence_length=WINDOW_LEN)
# Backbone model from the same preset
gpt2_model = GPT2Backbone.from_preset("gpt2_base_en")
#
# Define a function to generate embeddings (normalized to unit norm)
#
def generate_embedding(text, window_len, window_shift, preprocessor, model):
    """Return a unit-norm embedding of *text* built from sliding word windows.

    The text is split on whitespace. Every full window of ``window_len``
    words (taken every ``window_shift`` words) and one trailing window of
    ``len(words) % window_len`` words are each passed through
    ``preprocessor`` and ``model``, mean-pooled over axis 0 of the squeezed
    model output, and summed. The sum is divided by its Euclidean norm.

    Args:
        text: The document to embed.
        window_len: Number of words per full window.
        window_shift: Number of words between the starts of full windows.
        preprocessor: Callable applied to a one-element list holding a
            string tensor.
        model: Callable applied to the preprocessor output.

    Returns:
        The summed, pooled window embeddings scaled to unit L2 norm.

    Raises:
        ValueError: If *text* contains no words, since there is nothing to
            embed and the normalisation would be undefined.
    """
    # Split the text into words
    words = text.split()
    if not words:
        raise ValueError("Cannot embed a text that contains no words")

    def embed_words(chunk):
        # Embed one window of words and mean-pool over axis 0 of the
        # squeezed model output (presumably the token axis -- TODO confirm).
        string = ' '.join(chunk)
        inputs = tf.constant([string], dtype=tf.string)
        preprocessed = preprocessor([inputs])
        embedding = tf.squeeze(model(preprocessed))
        return tf.reduce_mean(embedding, axis=0)

    # Running sum of the pooled window embeddings
    integrator = 0
    #
    # Iterate through full windows with the specified shift
    for i in range(0, len(words) - window_len + 1, window_shift):
        integrator = integrator + embed_words(words[i:i + window_len])
    #
    # Trailing window of len(words) % window_len words.
    # NOTE(review): this rule is kept unchanged so embeddings stay comparable
    # with previously stored indexes. When window_shift does not divide
    # window_len it can leave trailing words uncovered (e.g. 20 words with
    # window_len=10, window_shift=3 never embeds the last word).
    tail_len = len(words) % window_len
    if tail_len > 0:
        integrator = integrator + embed_words(words[-tail_len:])
    #
    l2_norm = tf.norm(integrator, ord='euclidean')
    #
    return integrator / l2_norm

In [None]:
#
# Function to read and store text files from a directory
# that are label='train', label='test', or both kinds if label=None
#
def store_text_files(directory, index, filenames, window_len, window_shift, preprocessor, model, label=None):
    """Embed the matching text files of *directory* and add them to *index*.

    Files are selected by suffix: ``.train.txt`` for ``label='train'``,
    ``.test.txt`` for ``label='test'``, and ``.txt`` for ``label=None``.
    For every selected file the embedding is added to ``index`` and the
    file name is appended to ``filenames``, so both grow in the same order.
    """
    suffix_by_label = {'train': ".train.txt", 'test': ".test.txt", None: ".txt"}
    #
    assert label in ['train', 'test', None]
    #
    wanted_suffix = suffix_by_label[label]
    #
    for name in os.listdir(directory):
        #
        # Skip anything that does not carry the requested suffix
        if not name.endswith(wanted_suffix):
            continue
        #
        with open(os.path.join(directory, name), "r", encoding="utf-8") as handle:
            print(f'Acquiring file {name}')
            contents = handle.read()
            vector = generate_embedding(contents, window_len, window_shift, preprocessor, model)
            # The index receives a single-row batch
            index.add(np.expand_dims(vector, axis=0))
            filenames.append(name)

In [None]:
#
# Function to read and store text files from directories starting with "sotu_"
#
def store_text_files_in_sotu_directories(root_directory, index, filenames, window_len, window_shift, preprocessor, model, label=None):
    """Run ``store_text_files`` on every ``sotu_*`` sub-directory of *root_directory*.

    All remaining arguments are forwarded unchanged to ``store_text_files``.
    """
    for name in os.listdir(root_directory):
        candidate = os.path.join(root_directory, name)
        #
        # Only directories are of interest ...
        if not os.path.isdir(candidate):
            continue
        # ... and only those whose name starts with "sotu_"
        if not name.startswith("sotu_"):
            continue
        #
        store_text_files(candidate, index, filenames, window_len, window_shift, preprocessor, model, label)

In [None]:
#
# Function to look for the nearest vector in Faiss index
#
def compare_embedding_to_index(file_path, index, filenames, window_len, window_shift, preprocessor, model, top_k=5):
    """Find the indexed texts closest to the text stored in *file_path*.

    Args:
        file_path: Path of the UTF-8 text file used as the query.
        index: Index object exposing ``search(queries, k=...)``.
        filenames: File names in the same order as the vectors in ``index``.
        window_len, window_shift, preprocessor, model: Forwarded to
            ``generate_embedding``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A pair ``(closest_texts, scores)``: the file names of the neighbours
        and the matching scores returned by ``index.search``. Fewer than
        ``top_k`` entries are returned when the index reports fewer hits.
    """
    # Read the query and release the file before the (slow) embedding step
    with open(file_path, "r", encoding="utf-8") as file:
        query_text = file.read()
    #
    query_embedding = generate_embedding(query_text, window_len, window_shift, preprocessor, model)
    #
    # Convert query embedding to a one-row NumPy batch
    query_embedding_np = np.array([query_embedding])
    #
    # Search for the closest vectors in the Faiss index
    similarity, indices = index.search(query_embedding_np, k=top_k)
    #
    # FAISS fills missing results with label -1 (e.g. when the index holds
    # fewer than top_k vectors); drop them so they are not silently mapped
    # to filenames[-1].
    found = indices[0] >= 0
    #
    # Retrieve the closest texts from the filenames list
    closest_texts = [filenames[int(x)] for x in indices[0][found]]
    #
    return closest_texts, similarity[0][found]

In [None]:
def test_text_files(directory, index, filenames, top_k=5):
    """Collect the nearest neighbours of every ``*.test.txt`` file in *directory*.

    Each test file must already be listed in ``filenames``; its stored vector
    is reconstructed from ``index`` and used as the query.

    Args:
        directory: Directory scanned for files ending in ``.test.txt``.
        index: Index object exposing ``reconstruct(i)`` and ``search(q, k=...)``.
        filenames: File names in the same order as the vectors in ``index``.
        top_k: Maximum number of neighbours to keep per test file.

    Returns:
        A list with one ``[closest_texts, scores]`` entry per test file,
        where the query itself is excluded from ``closest_texts``.
    """
    top_k_files = []
    #
    for filename in os.listdir(directory):
        #
        if not filename.endswith('.test.txt'):
            continue
        #
        assert filename in filenames, f'{filename} is not in the index file list'
        #
        i = filenames.index(filename)
        query = np.array([index.reconstruct(i)])
        # Ask for one extra hit because the query itself is stored in the index
        similarity, indices = index.search(query, k=top_k + 1)
        #
        # Drop the query by its own position rather than assuming it is ranked
        # first (ties can put another vector ahead of it), and drop the -1
        # labels FAISS uses when fewer than k results exist.
        keep = [pos for pos, x in enumerate(indices[0])
                if int(x) >= 0 and int(x) != i][:top_k]
        closest_texts = [filenames[int(indices[0][pos])] for pos in keep]
        top_k_files.append([closest_texts, similarity[0][keep]])
    #
    return top_k_files


# The name starts with "test_" but this is a helper, not a unit test;
# opt out of pytest collection.
test_text_files.__test__ = False

In [None]:
#
# Function to read and test the FAISS index on test files from directories starting with "sotu_"
#
def test_text_files_in_sotu_directories(root_directory, index, filenames, top_k=5):
    """Run ``test_text_files`` on every ``sotu_*`` sub-directory of *root_directory*.

    Returns a dict that maps each matching directory name to the list
    produced by ``test_text_files`` for that directory.
    """
    return {
        name: test_text_files(os.path.join(root_directory, name), index, filenames, top_k)
        for name in os.listdir(root_directory)
        if os.path.isdir(os.path.join(root_directory, name)) and name.startswith("sotu_")
    }

# FAISS index

In [None]:
# GPT-2 embedding dimension
index_dimension = 768
#
# Create FAISS index (flat inner-product index). generate_embedding returns
# unit-norm vectors, so the inner product presumably acts as cosine similarity.
index = faiss.IndexFlatIP(index_dimension)
#
# File names, appended in the same order as vectors are added to the index
filenames  = []
#
# Sanity check: should be 768, 0, 0, and the index is trained (by default)
#
print(f'Index dimension {index_dimension}, number of entries {index.ntotal}, number of files {len(filenames)}')
print(f'Is trained? {index.is_trained}')

In [None]:
# Short aliases for the preprocessor and model handed to the embedding helpers
PREP  = gpt2_preprocessor
MODEL = gpt2_model

Computing the GPT-2 embedding and storing it in the index (uncomment to replicate)

In [None]:
# The code below is wrapped in a string literal, so it does not execute.
# Remove the surrounding triple quotes to recompute the embeddings and
# rewrite the index and filename files.
"""
#
# Loading files into the FAISS index with GPT2 embedding
#
store_text_files_in_sotu_directories(".", index, filenames, WINDOW_LEN, WINDOW_SHIFT, PREP, MODEL, label=None)
faiss.write_index(index, "sotu_gpt2_complete.faiss")
with open("sotu_gpt2_filenames_complete.pickle", 'wb') as file:
    pickle.dump(filenames, file)
    file.close()
""";

Loading the index and filenames from the stored dataset

In [None]:
#
# Reading the index from file; reading the filenames from file
#
index = faiss.read_index("sotu_gpt2_complete.faiss")
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# was produced by this notebook or comes from a trusted source.
# The `with` block closes the file, so no explicit close() is needed.
with open("sotu_gpt2_filenames_complete.pickle", 'rb') as file:
    filenames = pickle.load(file)

In [None]:
#
# Sanity check: should be 768, 229, 229, and the index is trained (by default).
# The number of index entries and the number of file names must agree.
#
print(f'Index dimension {index_dimension}, number of entries {index.ntotal}, number of files {len(filenames)}')
print(f'Is trained? {index.is_trained}')

# Dimension reduction

Dimension reduction and clustering: GPT-2 mapped into dimensions 2 and 3 with UMAP, TriMAP, PaCMAP

In [None]:
#
# Recover GPT2 embedding: one stored vector per index entry, in index order
#
gpt2_embedding = [index.reconstruct(position) for position in range(index.ntotal)]

In [None]:
#
# UMAP the GPT2 embedding into 2D
#
# n_neighbors is derived from the embedding dimension: int(sqrt(768)) = 27
reducer_umap2 = umap.UMAP(n_neighbors=int(np.sqrt(index_dimension)),
                          n_components=2,
                          metric='cosine',
                          verbose=1)
umap_2_embedding = reducer_umap2.fit_transform(gpt2_embedding)

In [None]:
#
# Scatter plot in 2D
#
def plot_2d_embedding(embedding, emb_name):
    """Scatter-plot a 2D embedding, coloring each point by its year.

    The year is parsed from the module-level ``filenames`` list: the text
    between the first underscore and the following dot of each name.
    ``emb_name`` is used in the plot title.
    """
    years = [int(name.split('_')[1].partition('.')[0]) for name in filenames]
    xs = embedding[:, 0]
    ys = embedding[:, 1]
    points = plt.scatter(xs, ys, c=years)
    plt.title(f'{emb_name} of SOTU in 2D')
    plt.xlabel("X-axis")
    plt.ylabel("Y-axis")
    plt.colorbar(points, label="Year")
    plt.show()
#
plot_2d_embedding(umap_2_embedding, "UMAP")

In [None]:
#
# TriMAP the GPT2 embedding into 2D
#
reducer_trimap2 = trimap.TRIMAP(n_dims=2,
                                distance='cosine',
                                verbose=1)
# The list of vectors is converted to a NumPy array before fitting
trimap_2_embedding = reducer_trimap2.fit_transform(np.array(gpt2_embedding))
#
plot_2d_embedding(trimap_2_embedding, "TriMAP")

In [None]:
#
# PaCMAP the GPT2 embedding into 2D
#
# n_neighbors is derived from the embedding dimension: int(sqrt(768)) = 27
reducer_pacmap2 = pacmap.PaCMAP(n_neighbors=int(np.sqrt(index_dimension)),
                                n_components=2,
                                distance='angular',
                                verbose=1)
pacmap_2_embedding = reducer_pacmap2.fit_transform(gpt2_embedding)
#
plot_2d_embedding(pacmap_2_embedding, "PaCMAP")

In [None]:
#
# UMAP the GPT2 embedding into 3D
#
# Same settings as the 2D UMAP reducer, except n_components=3
reducer_umap3 = umap.UMAP(n_neighbors=int(np.sqrt(index_dimension)),
                          n_components=3,
                          metric='cosine',
                          verbose=1)
umap_3_embedding = reducer_umap3.fit_transform(gpt2_embedding)

In [None]:
#
# Scatter plot in 3D
#
def plot_3d_embedding(embedding, emb_name):
    """Scatter-plot a 3D embedding, coloring each point by its year.

    The year is parsed from the module-level ``filenames`` list: the text
    between the first underscore and the following dot of each name.
    ``emb_name`` is used in the plot title.
    """
    years = [int(name.split('_')[1].partition('.')[0]) for name in filenames]
    figure = plt.figure()
    axes = figure.add_subplot(111, projection='3d')
    xs, ys, zs = embedding[:, 0], embedding[:, 1], embedding[:, 2]
    points = axes.scatter(xs, ys, zs, c=years)
    axes.set_title(f'{emb_name} of SOTU in 3D')
    axes.set_xlabel("X-axis")
    axes.set_ylabel("Y-axis")
    axes.set_zlabel("Z-axis")
    figure.colorbar(points, label="Year")
    plt.show()
#
plot_3d_embedding(umap_3_embedding, "UMAP")

In [None]:
#
# TriMAP the GPT2 embedding into 3D
#
reducer_trimap3 = trimap.TRIMAP(n_dims=3,
                                distance='cosine',
                                verbose=1)
# The list of vectors is converted to a NumPy array before fitting
trimap_3_embedding = reducer_trimap3.fit_transform(np.array(gpt2_embedding))
#
plot_3d_embedding(trimap_3_embedding, "TriMAP")

In [None]:
#
# PaCMAP the GPT2 embedding into 3D
#
# Same settings as the 2D PaCMAP reducer, except n_components=3
reducer_pacmap3 = pacmap.PaCMAP(n_neighbors=int(np.sqrt(index_dimension)),
                                n_components=3,
                                distance='angular',
                                verbose=1)
pacmap_3_embedding = reducer_pacmap3.fit_transform(gpt2_embedding)
#
plot_3d_embedding(pacmap_3_embedding, "PaCMAP")

# Diagrams for temporal clustering of SOTU speeches

In [None]:
#
# k-means clustering (k=2) on the given embedding, followed by a bar diagram of the cluster labels by year
#
def plot_clustering_diagram(embedding):
    """Cluster *embedding* into two groups and draw one colored bar per row.

    Rows are paired with the years parsed from the module-level ``filenames``
    list, sorted by year, and drawn as equal-height bars: blue for cluster 0
    and red for cluster 1. Only the tick labels 1790, 1928 and 2018 are
    left visible.
    """
    model = KMeans(n_clusters=2, random_state=42)
    assignments = model.fit_predict(embedding)
    years = [int(name.split('_')[1].partition('.')[0]) for name in filenames]

    # One (Year, Cluster) row per entry, ordered by year
    year_cluster_pairs = np.array([years, assignments]).T
    table = pd.DataFrame(data=year_cluster_pairs,
                         columns=['Year', 'Cluster']).sort_values('Year')

    # Bar color follows the cluster label
    bar_colors = table['Cluster'].map({0: 'blue', 1: 'red'})

    # Every bar has the same height; only the color carries information
    table['Value'] = 1.0

    axes = table.plot(x='Year', y='Value', kind='bar', legend=False, width=1.0, color=bar_colors)

    plt.title('Cluster Labels')
    plt.xlabel('Year')
    plt.ylabel('')

    # Hide every tick label except the three reference years
    kept_years = [1790, 1928, 2018]
    for tick in axes.get_xticklabels():
        if int(tick.get_text()) in kept_years:
            continue
        tick.set_visible(False)

    plt.yticks([])

    plt.show()

In [None]:
#
# Clusters in UMAP 3D embedding: k-means (k=2) labels drawn as one colored bar per speech, ordered by year
#
plot_clustering_diagram(umap_3_embedding)

A pandas dataframe where the temporal separation of clusters can be checked, and the watershed year(s) can be easily determined by inspection. 

In [None]:
#
# Dataframe with years labelled with 0 / 1 for clustering
#
def df_clustering(embedding):
    """Return a year-sorted DataFrame of k-means (k=2) labels for *embedding*.

    The frame has the columns ``Year`` (parsed from the module-level
    ``filenames`` list) and ``Cluster`` (the label assigned by k-means).
    """
    assignments = KMeans(n_clusters=2, random_state=42).fit_predict(embedding)
    years = [int(name.split('_')[1].partition('.')[0]) for name in filenames]

    # One (Year, Cluster) row per entry
    year_cluster_pairs = np.array([years, assignments]).T
    table = pd.DataFrame(data=year_cluster_pairs, columns=['Year', 'Cluster'])

    return table.sort_values('Year')

In [None]:
# Year/cluster table for the UMAP 3D embedding; the bare name displays it in the notebook
df_ripple_umap = df_clustering(umap_3_embedding)
df_ripple_umap

In [None]:
#
# Clusters in TriMAP 3D embedding: k-means (k=2) labels drawn as one colored bar per speech, ordered by year
#
plot_clustering_diagram(trimap_3_embedding)

In [None]:
# Year/cluster table for the TriMAP 3D embedding; the bare name displays it in the notebook
df_ripple_trimap = df_clustering(trimap_3_embedding)
df_ripple_trimap

In [None]:
#
# Clusters in PaCMAP 3D embedding: k-means (k=2) labels drawn as one colored bar per speech, ordered by year
#
plot_clustering_diagram(pacmap_3_embedding)

In [None]:
# Year/cluster table for the PaCMAP 3D embedding; the bare name displays it in the notebook
df_ripple_pacmap = df_clustering(pacmap_3_embedding)
df_ripple_pacmap

In [None]:
#
# Clusters in the initial GPT2 embedding (the vectors reconstructed from the FAISS index, without dimension reduction)
#
plot_clustering_diagram(gpt2_embedding)

In [None]:
# Year/cluster table for the initial GPT2 embedding; the bare name displays it in the notebook
df_ripple_gpt2 = df_clustering(gpt2_embedding)
df_ripple_gpt2

# Authorship attribution from embeddings

TOP1 and TOP5 nearest neighbors in the GPT-2 embedding

In [None]:
#
# Calculating TOP1 / TOP5 accuracy from nearest neighbours in the GPT-2 embedding
#
def compute_accuracy(test_top_k):
    """Return macro-averaged TOP1 and TOPk authorship accuracy.

    Args:
        test_top_k: Dict mapping a category key of the form
            ``"<prefix>_<author>"`` to a list of test results. Each result is
            a pair whose first element is the list of neighbour file names,
            ordered from closest to farthest; the author of a neighbour is
            the part of its name before the first underscore.

    Returns:
        A pair ``(top_1_acc, top_k_acc)``. Accuracy is computed per category
        and then averaged over the categories that have at least one test
        result. Categories without results are skipped so they do not drag
        the average down. ``(0.0, 0.0)`` is returned when no category has
        any result.
    """
    top_1_per_category = []
    top_k_per_category = []
    #
    for category, results in test_top_k.items():
        # A category without test results has no defined accuracy
        if not results:
            continue
        X = category.split('_')[1]
        top_1_hits = 0
        top_k_hits = 0
        for res in results:
            authors = [name.split('_')[0] for name in res[0]]
            # An empty neighbour list counts as a miss for both metrics
            top_1_hits += authors[:1] == [X]
            top_k_hits += X in authors
        n_tests = len(results)
        top_1_per_category.append(top_1_hits / n_tests)
        top_k_per_category.append(top_k_hits / n_tests)
    #
    if not top_1_per_category:
        return 0.0, 0.0
    #
    n_categories = len(top_1_per_category)
    top_1_acc = sum(top_1_per_category) / n_categories
    top_k_acc = sum(top_k_per_category) / n_categories
    return top_1_acc, top_k_acc

In [None]:
# For every *.test.txt file in each sotu_* directory under the current directory,
# collect its 5 nearest neighbours in the GPT-2 index
test_top_5 = test_text_files_in_sotu_directories(".", index, filenames, top_k=5)

In [None]:
#
# TOP1 accuracy, TOP5 accuracy
#
top_1_acc, top_5_acc = compute_accuracy(test_top_5)
print(f'Accuracy: TOP1 {top_1_acc}, TOP5 {top_5_acc}')

TOP1 and TOP5 nearest neighbors after UMAP (3D): the Zeitgeist and general politics dominate authorship in temporal clustering

In [None]:
# Flat L2 index over the 3-dimensional UMAP coordinates.
# Rows are assumed to be in the same order as `filenames` -- the printed counts should match.
umap_index = faiss.IndexFlatL2(3)
umap_index.add(umap_3_embedding)
print(umap_index.ntotal, len(filenames), umap_index.is_trained)

In [None]:
# Same nearest-neighbour test as for the GPT-2 index, now searching the UMAP 3D index
test_top_5_umap = test_text_files_in_sotu_directories(".", umap_index, filenames, top_k=5)

In [None]:
#
# TOP1 accuracy, TOP5 accuracy for neighbours found in the UMAP 3D index
#
top_1_acc_umap, top_5_acc_umap = compute_accuracy(test_top_5_umap)
print(f'Accuracy: TOP1 {top_1_acc_umap}, TOP5 {top_5_acc_umap}')