In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from classifier import Data

dataset = Data(
    data_path = "../data/imdb._debug.csv",
    polarities = {
      "negative": 0,
      "positive": 1
    },
    data_label='review',
    target_label='sentiment'
)

In [4]:
from classifier._neural import Encoder

enc = Encoder({
    'model': 'bert-base-uncased',
    'layers': [11]
})

In [5]:
enc.df_encode(dataset.data, col=dataset.data_label, label=dataset.data_path)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3) -> np.ndarray:
    return MDS(n_components=dim).fit_transform(data)

In [7]:
from typing import Generator
import torch
import numpy as np
import pandas as pd
from scipy.spatial import distance


def analysis(
        data: pd.DataFrame,
        dim: list = None
) -> Generator:
    if dim is None:
        dim = [768, 384, 192, 96, 48, 24, 12, 6, 3]

    embed_col: np.ndarray = torch.stack(data[enc.col_name].tolist()).numpy()

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == enc.dim:
            data['reduced_embeds'] = list(embed_col)
        else:
            data['reduced_embeds'] = list(manifold_reduction(embed_col, dim=d))

        # compute centroid means and dispersion for each cluster
        for label, group in data.groupby(dataset.target_label):
            record[f'centroid_{label}'] = np.mean(np.stack(group['reduced_embeds'].tolist(), axis=0), axis=0).tolist()
            record[f'dispersion_{label}'] = np.sum(distance.pdist(group['reduced_embeds'].tolist()))

        record['distance'] = distance.cdist([record['centroid_positive']], [record['centroid_negative']])

        yield record

In [8]:
import pandas as pd

cluster_analysis = pd.DataFrame.from_records(list(analysis(dataset.data)))
cluster_analysis

Unnamed: 0,dim,centroid_negative,dispersion_negative,centroid_positive,dispersion_positive,distance
0,768,"[-0.05271393805742264, -0.11926598846912384, 0...",2937.050055,"[-0.15186531841754913, -0.03394192084670067, 0...",740.52971,[[3.0358365653848836]]
1,384,"[0.03381492741932707, -0.04140750866488096, 0....",2939.310436,"[-0.07023100310167933, 0.08600021030398339, -0...",741.413989,[[3.00779268375431]]
2,192,"[0.04444726154356465, 0.05418843001541997, -0....",2939.130094,"[-0.09231354320586505, -0.1125452008012569, 0....",741.335642,[[3.0080284294568846]]
3,96,"[0.021120708305122034, -0.09543997923958604, 0...",2938.916278,"[-0.04386608647986888, 0.19822149534375566, -0...",741.575287,[[2.9822537816317483]]
4,48,"[0.10484707443004394, -0.11290730010583819, -0...",2936.515351,"[-0.21775930843162966, 0.23449977714289483, 0....",739.894916,[[3.0641653027630857]]
5,24,"[0.4245985475647291, 0.11885078813885468, -0.4...",2932.804027,"[-0.8818585218652066, -0.24684394459608286, 0....",738.263493,[[3.113300690621971]]
6,12,"[0.15022901715408205, 0.05824877986172345, 0.3...",2926.27704,"[-0.31201411255078576, -0.12097823509742583, -...",743.144644,[[2.8600940965315838]]
7,6,"[0.33139201186619477, -0.22066486211003652, -0...",2888.5426,"[-0.6882757169528664, 0.4583039443823837, 1.31...",734.453189,[[3.1294672222006956]]
8,3,"[0.6324503236432255, -0.40516824460721473, 0.4...",2788.403979,"[-1.313550672182084, 0.8415032772611386, -1.03...",738.424212,[[2.771629463885227]]


In [9]:
cluster_analysis.to_csv('./cluster_analysis.csv')