To apply hierarchical clustering and k-means clustering to a tabular dataset with categorical variables, the following steps can be taken:

1. Load the data into a pandas dataframe.
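2. Preprocess the data by converting the categorical variables into numeric vectors, for example one-hot encoded vectors using the `pd.get_dummies()` function. (The code in this notebook instead embeds the free-text `CATEGORY` column with a pre-trained BERT model and clusters those embeddings.)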
3. Apply the clustering algorithm of choice, such as hierarchical clustering or k-means clustering, to the preprocessed data.
4. Analyze the resulting clusters, such as by visualizing them using t-SNE or by examining the characteristics of the data points in each cluster.
5. (Optional) Use the resulting clusters to make predictions or to guide further analysis of the data.

It's important to note that the choice of clustering algorithm and the preprocessing steps may vary depending on the specific dataset and research question at hand. It's also important to carefully choose the number of clusters in order to avoid overfitting or underfitting the data. Various methods, such as the elbow method or silhouette analysis, can be used to determine an appropriate number of clusters.


In [None]:
# conda env create -f environment.yml -n torch
# conda activate torch

In [None]:
import pandas as pd
import numpy as np

data_path = 'data/2223RX19_Project_Data.xlsx'

# Load the project spreadsheet into a DataFrame.
df = pd.read_excel(data_path)

# Text column to embed, one entry per row of ``df``.
# Blank Excel cells are read as NaN (a float) and numeric cells as numbers,
# neither of which is valid tokenizer input. They are converted to strings
# (blanks become '') instead of being dropped, so ``categories`` keeps a
# one-to-one positional alignment with the rows of ``df``.
categories = df['CATEGORY'].fillna('').astype(str)


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch 
# Pick the best available accelerator instead of assuming Apple-silicon MPS,
# so this cell also runs on CUDA or CPU-only machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the tokenizer and the matching pre-trained encoder, then move the
# encoder's weights to the selected device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)



In [None]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
# Define a dataset class
class CategoryDataset(Dataset):
    """Map-style dataset that returns raw category values by position.

    Args:
        categories: Any iterable of category values (list, pandas Series, ...).
        tokenizer: Stored on the instance as ``self.tokenizer``; the dataset
            itself never calls it, tokenization happens outside.
    """

    def __init__(self, categories, tokenizer):
        # Materialise into a list so that ``__getitem__`` is always positional.
        # Indexing a pandas Series with an integer is a *label* lookup, which
        # raises KeyError (or returns the wrong row) whenever the Series does
        # not carry a default 0..n-1 index, e.g. after filtering rows.
        self.categories = list(categories)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of category entries."""
        return len(self.categories)

    def __getitem__(self, idx):
        """Return the category stored at position ``idx``."""
        return self.categories[idx]

# Wrap the category strings so a DataLoader can batch them.
dataset = CategoryDataset(categories, tokenizer)

# Batches of raw strings; order is preserved (no shuffling), which the
# positional writes into ``embeddings_matrix`` below rely on.
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size)

# One row per category. The width is read from the model config rather than
# hard-coded so a different encoder can be swapped in without editing this.
embeddings_matrix = np.zeros((len(categories), model.config.hidden_size))

# Send inputs to wherever the model's weights live instead of assuming 'mps'.
model_device = model.device

with torch.no_grad():
    # tqdm takes its total from len(dataloader), which already accounts for a
    # final partial batch.
    for i, batch in enumerate(tqdm(dataloader)):
        # Tokenize the batch; shorter strings are padded to the longest one.
        tokens = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt"
        ).to(model_device)

        # Per-token hidden states from the encoder.
        hidden_states = model(**tokens).last_hidden_state

        # Mean-pool over real tokens only. A plain mean over the sequence axis
        # would also average the padding positions, making an embedding depend
        # on how long the other strings in its batch happen to be.
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        # Store this batch's embeddings in the rows matching its positions.
        start = i * batch_size
        embeddings_matrix[start:start + len(batch)] = pooled.cpu().numpy()



In [None]:
from sklearn.cluster import KMeans

from pathlib import Path

# Number of clusters to partition the embeddings into.
num_clusters = 5

# n_init is pinned explicitly: its default changed between scikit-learn
# releases, so leaving it implicit would make the clustering depend on the
# installed version even though random_state is fixed.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)

# Fit on the embedding matrix (one row per row of ``df``).
kmeans.fit(embeddings_matrix)

# Attach each row's cluster assignment to the original data.
df['cluster_label'] = kmeans.labels_

# Make sure the output directory exists; to_excel does not create it.
output_path = Path('export/2223RX19_Project_Data_With_Bert_Cluster_Label.xlsx')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path)


In [None]:
# Print, for every cluster, the company IDs of the rows assigned to it.
grouped = df.groupby('cluster_label')
for cluster_id, members in grouped:
    company_ids = list(members["COMPANY_ID"])
    print(f'Cluster {cluster_id}: {company_ids}')

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import inspect

# Project the BERT embedding matrix down to 2-D for plotting.
# scikit-learn requires perplexity to be smaller than the number of samples,
# so cap it for small datasets.
perplexity = min(40, len(embeddings_matrix) - 1)

# The iteration-count keyword was renamed from ``n_iter`` to ``max_iter`` in
# newer scikit-learn releases; use whichever the installed version accepts.
tsne_params = inspect.signature(TSNE).parameters
iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'

# random_state is fixed so the layout is reproducible between runs.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=perplexity,
    random_state=42,
    **{iter_kwarg: 300},
)
tsne_results = tsne.fit_transform(embeddings_matrix)

# Put the 2-D coordinates into a dataframe, one row per embedded category.
tsne_df = pd.DataFrame(data=tsne_results, columns=['tsne_1', 'tsne_2'])

# Scatter the projected points, coloured by their k-means cluster label.
plt.figure(figsize=(16,10))
plt.scatter(tsne_df['tsne_1'], tsne_df['tsne_2'], c=df['cluster_label'], cmap='tab20')
plt.title('t-SNE Clustering of Categories')
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.show()