In [1]:
# Select the GPU for this kernel: order CUDA devices by PCI bus id and expose
# only device index 1. Set in the first cell so the variables are in place
# before later cells import CUDA-backed libraries.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import os
import sys

# Absolute path of the directory one level above the current working directory
# (presumably where the project-local `classifier` package lives).
module_path = os.path.abspath(os.pardir)

# Only append when absent, so re-running this cell does not add duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from classifier import Data

# Build the dataset wrapper around the IMDB debug CSV.
# `polarities` pairs each sentiment name with an integer id.
# NOTE(review): `data_label` / `target_label` presumably name the text and
# label columns of the CSV — confirm against `Data`.
dataset = Data(
    data_path="../data/imdb._debug.csv",
    polarities=dict(negative=0, positive=1),
    data_label='review',
    target_label='sentiment',
)

In [4]:
from classifier._neural import Encoder

# Configure the encoder from a plain settings mapping.
# NOTE(review): `layers=[11]` presumably selects which hidden layer(s) of the
# model are used for the embedding — confirm against `Encoder`.
enc = Encoder(dict(model='bert-base-uncased', layers=[11]))

In [5]:
# Encode the column named by `dataset.data_label`, passing the dataset path as `label`.
# NOTE(review): presumably this stores the embeddings on `dataset.data` under
# `enc.col_name` — confirm against `Encoder.df_encode`.
enc.df_encode(
    dataset.data,
    col=dataset.data_label,
    label=dataset.data_path,
)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3, random_state: "int | None" = 0) -> np.ndarray:
    """Reduce `data` to `dim` components with multidimensional scaling (MDS).

    MDS starts from a random configuration, so without a seed the reduced
    coordinates (and every metric computed from them) differ between runs.
    The seed is therefore fixed by default to keep the notebook reproducible.

    Args:
        data: array handed unchanged to `MDS.fit_transform`.
        dim: number of components of the reduced embedding.
        random_state: seed forwarded to `MDS`; pass `None` to get the
            previous unseeded, non-deterministic behaviour.

    Returns:
        The array produced by `MDS.fit_transform` for `data`.
    """
    return MDS(n_components=dim, random_state=random_state).fit_transform(data)

In [7]:
import torch

# Stack the per-row embedding tensors into a single tensor and convert it to NumPy.
# `.detach().cpu()` is a no-op for plain CPU tensors, but prevents `.numpy()` from
# raising if the stored tensors live on the GPU or still track gradients.
embed_col: np.ndarray = torch.stack(dataset.data[enc.col_name].tolist()).detach().cpu().numpy()

In [8]:
# Empty accumulator for the cluster-analysis result rows.
cluster_records: list = list()

In [9]:
import numpy as np
from scipy.spatial import distance

# For every target dimensionality: reduce the embeddings, then record each
# class's centroid and dispersion plus the distance between the two centroids.
# NOTE: rows are appended to `cluster_records`; re-running this cell without
# re-creating that list first duplicates the rows.
for i in [384, 192, 96, 48, 24, 12, 6, 3]:

    # create record to keep row data
    record: dict = {'dim': i}

    # if reduction size is equal to encoder output dim, skip manifold reduction
    if i == enc.dim:
        dataset.data['reduced_embeds'] = list(embed_col)
    else:
        dataset.data['reduced_embeds'] = list(manifold_reduction(embed_col, dim=i))

    # compute centroid means and dispersion for each cluster
    for label, group in dataset.data.groupby(dataset.target_label):
        # stack this group's vectors once and reuse them for both statistics
        points = np.stack(group['reduced_embeds'].tolist(), axis=0)
        record[f'centroid_{label}'] = np.mean(points, axis=0).tolist()
        # dispersion = sum of all pairwise distances inside the group
        record[f'dispersion_{label}'] = np.sum(distance.pdist(points))

    # The lookups below require the group labels to be exactly 'positive' and
    # 'negative'; any other label values make them raise KeyError.
    # `cdist` on two single points returns a (1, 1) matrix, so take its only
    # element to store a plain number instead of a nested array in the CSV.
    record['distance'] = float(
        distance.cdist([record['centroid_positive']], [record['centroid_negative']])[0, 0]
    )

    cluster_records.append(record)

In [10]:
import pandas as pd

# Tabulate the collected records (one row per record) and write them to a CSV
# file in the current working directory.
(
    pd.DataFrame
    .from_records(cluster_records)
    .to_csv('./cluster_analysis.csv')
)