In [1]:
# Pin the GPU seen by this kernel via CUDA environment variables.
# PCI_BUS_ID asks CUDA to number devices by PCI bus order, so the index below
# refers to the same physical card across runs.
# NOTE(review): these presumably need to be set before any CUDA-using library
# initialises the driver, hence the first cell — confirm.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# Expose only the device with index 1 to this process.
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import os
import sys

# Resolve the notebook's parent directory to an absolute path so that imports
# from it (e.g. the `classifier` package used below) can be found.
module_path = os.path.abspath(os.pardir)

# Register the directory once; re-running this cell must not add duplicates.
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [3]:
from classifier import Data

# Load the debug IMDB CSV: text is read from the `review` column, the target
# from the `sentiment` column, and the two sentiment names are given the
# integer codes 0 (negative) and 1 (positive).
dataset = Data(
    data_path="../data/imdb._debug.csv",
    polarities={"negative": 0, "positive": 1},
    data_label="review",
    target_label="sentiment",
)

In [4]:
from classifier._neural import Encoder

# Configure the encoder with the `bert-base-uncased` model.
# NOTE(review): `layers=[11]` presumably selects the hidden layer(s) to take
# embeddings from — confirm against `Encoder`.
enc = Encoder(dict(model='bert-base-uncased', layers=[11]))

In [5]:
# Run the encoder over the text column of the dataset frame.
# NOTE(review): `label` receives the data *path* — presumably used as a
# cache/progress identifier rather than a column name; confirm.
enc.df_encode(
    dataset.data,
    col=dataset.data_label,
    label=dataset.data_path,
)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
from sklearn.manifold import MDS

def manifold_reduction(
        data: np.ndarray,
        dim: int = 3,
        random_state: int = None
) -> np.ndarray:
    """Project `data` down to `dim` components with multidimensional scaling.

    :param data: array handed unchanged to `MDS.fit_transform`
        (assumed to be one row per sample — TODO confirm with callers).
    :param dim: number of output components (`n_components` of `MDS`).
    :param random_state: optional seed forwarded to `MDS`. MDS starts from a
        random configuration, so results differ between runs unless a seed is
        given. The default `None` keeps the previous, unseeded behaviour.
    :return: the embedding returned by `MDS.fit_transform`.
    """
    reducer = MDS(n_components=dim, random_state=random_state)
    return reducer.fit_transform(data)

In [33]:
from typing import Generator
import torch
import numpy as np
import pandas as pd
from scipy.spatial import distance


def analysis(
        data: pd.DataFrame,
        dim: list = None
) -> Generator:
    """Yield per-dimension cluster statistics for the encoded dataset.

    For every target dimension the embeddings are (optionally) reduced, then
    for each class in `dataset.target_label` the centroid and the summed
    pairwise distance ("dispersion") of its points are computed, plus the
    distance between the positive and negative centroids.

    Relies on notebook globals: `enc` (`col_name`, `dim`), `dataset`
    (`target_label`) and `manifold_reduction`.

    :param data: frame holding one tensor per row in column `enc.col_name`
        and the class in column `dataset.target_label`. The column
        `reduced_embeds` is (over)written in place on every iteration.
    :param dim: dimensions to evaluate; defaults to
        `[768, 384, 192, 96, 48, 24, 12, 6, 3]`.
    :return: generator of dicts with keys `dim`, `centroid_<label>`,
        `dispersion_<label>` and `distance` (a 1x1 array as returned by
        `scipy.spatial.distance.cdist`).
    :raises KeyError: if the classes are not named `positive` and `negative`.
    """
    if dim is None:
        dim = [768, 384, 192, 96, 48, 24, 12, 6, 3]

    # stack the per-row tensors into a single (rows x features) numpy array
    embed_col: np.ndarray = torch.stack(data[enc.col_name].tolist()).numpy()

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        # (the loop variable is `d`; the former reference to `i` was undefined
        # or, worse, a stale notebook global)
        if d == enc.dim:
            data['reduced_embeds'] = list(embed_col)
        else:
            data['reduced_embeds'] = list(manifold_reduction(embed_col, dim=d))

        # compute centroid means and dispersion for each cluster
        for label, group in data.groupby(dataset.target_label):
            # stack once and reuse for both statistics
            points = np.stack(group['reduced_embeds'].tolist(), axis=0)
            record[f'centroid_{label}'] = np.mean(points, axis=0).tolist()
            record[f'dispersion_{label}'] = np.sum(distance.pdist(points))

        # euclidean distance between the two class centroids
        record['distance'] = distance.cdist([record['centroid_positive']], [record['centroid_negative']])

        yield record

In [10]:
import pandas as pd

# Drain the `analysis` generator (one record per evaluated dimension) and
# tabulate the records as a DataFrame.
cluster_analysis = pd.DataFrame.from_records(
    [record for record in analysis(dataset.data)]
)
# Bare last expression so the notebook renders the table.
cluster_analysis

[{'dim': 384,
  'centroid_negative': [-0.0687857840888946,
   0.032247533419599544,
   0.08225221678921629,
   -0.023262264430269672,
   0.0007421664535308681,
   -0.08416309467755631,
   0.012965938765337233,
   0.04796676728682954,
   0.03614021051170572,
   -0.06680501931671931,
   -0.07800188223383588,
   0.01997775499931005,
   -0.009801279597623206,
   -0.023054975562396947,
   0.02757508930382853,
   -0.010425695820537997,
   -0.08152642769495097,
   -0.017812506159076962,
   -0.043242175574091615,
   -0.01063365819181469,
   0.0404502875127472,
   0.03570051478731751,
   -0.08200069823375661,
   0.06414302628453583,
   -0.09934100906287242,
   0.02121242474821545,
   -0.10203991639181241,
   -0.04721100904559511,
   -0.053482350793685116,
   -0.05935280635452682,
   -0.0640217220721497,
   -0.04622248408119946,
   0.060209207040948474,
   0.03588661577878602,
   -0.0238003871383918,
   0.0746596493679772,
   0.05502544159892326,
   -0.02588032940838792,
   0.075147924387329,
  

In [11]:
# Persist the per-dimension statistics as CSV in the notebook's working directory.
cluster_analysis.to_csv(path_or_buf='./cluster_analysis.csv')