In [1]:
# Set CUDA device ordering to PCI bus ID and expose only the device with index 1.
# NOTE(review): the index is machine-specific, and this presumably has to run
# before any library initialises CUDA in this kernel — confirm on other hosts.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import os
import sys

# Absolute path of the parent of the current working directory; it is appended
# to sys.path (once) so packages located there can be imported by later cells.
module_path = os.path.abspath(os.pardir)

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from classifier import Data

# Build the dataset wrapper from the CSV at `data_path`.
# `polarities` maps each sentiment name to an integer id; `data_label` and
# `target_label` name the columns handed to Data (presumably the review text
# and its sentiment class — confirm against classifier.Data).
dataset = Data(
    data_path="../data/imdb._debug.csv",
    polarities={"negative": 0, "positive": 1},
    data_label="review",
    target_label="sentiment",
)

In [4]:
from classifier._neural import Encoder

# Encoder configuration: the model identifier plus a list of layer indices.
# How Encoder interprets `layers` is defined in classifier._neural (not shown here).
enc = Encoder(dict(model='bert-base-uncased', layers=[11]))

In [5]:
# Run the encoder over the column named by `dataset.data_label`.
# Later cells read the embeddings from the column named `enc.col_name`, so this
# call presumably stores them on `dataset.data` — confirm against Encoder.df_encode.
enc.df_encode(
    dataset.data,
    col=dataset.data_label,
    label=dataset.data_path,
)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3, random_state: int = 0) -> np.ndarray:
    """Embed ``data`` into ``dim`` components with multidimensional scaling (MDS).

    Parameters
    ----------
    data : np.ndarray
        Sample matrix handed to ``MDS.fit_transform`` (one row per sample).
    dim : int, default 3
        Number of components of the returned embedding.
    random_state : int or None, default 0
        Seed forwarded to ``MDS``. MDS is initialised randomly, so without a
        seed two runs on the same input give different coordinates and the
        notebook's results cannot be reproduced. Pass ``None`` to get the
        previous, unseeded behaviour.

    Returns
    -------
    np.ndarray
        The coordinates returned by ``MDS.fit_transform`` (one row per sample,
        ``dim`` columns).
    """
    return MDS(n_components=dim, random_state=random_state).fit_transform(data)

In [7]:
from typing import Generator
import torch
import numpy as np
import pandas as pd
from scipy.spatial import distance


def analysis(
        data: pd.DataFrame,
        dim: list = None
) -> Generator:
    """Yield per-dimension cluster statistics for the encoded dataset.

    For every target dimensionality in ``dim`` the embeddings are (optionally)
    reduced with ``manifold_reduction`` and then grouped by the dataset's target
    column. The input frame is never modified.

    Relies on the notebook globals ``enc`` (``col_name``, ``dim``),
    ``dataset`` (``target_label``) and ``manifold_reduction``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding one embedding tensor per row in column ``enc.col_name``
        and the class label in column ``dataset.target_label``.
    dim : list, optional
        Target dimensionalities to evaluate. Defaults to
        ``[768, 576, 384, 192, 96, 48, 24, 12, 6, 3]``.

    Yields
    ------
    dict
        One record per dimensionality with the keys ``dim``,
        ``centroid_<label>`` (mean vector of the group, as a list),
        ``dispersion_<label>`` (sum of all pairwise distances inside the group)
        and ``distance`` (distance between the ``positive`` and ``negative``
        centroids).

    Raises
    ------
    KeyError
        If the target column does not contain both a ``positive`` and a
        ``negative`` group, since ``distance`` is computed from exactly those
        two centroids.
    """
    if dim is None:
        dim = [768, 576, 384, 192, 96, 48, 24, 12, 6, 3]

    # Stack the per-row tensors into one matrix. detach()/cpu() are no-ops for
    # plain CPU tensors but keep .numpy() from raising on GPU or grad-tracking ones.
    embed_col: np.ndarray = torch.stack(data[enc.col_name].tolist()).detach().cpu().numpy()

    # Only the label column is needed for grouping. Selecting it with a list
    # yields a new frame, so the caller's DataFrame is left untouched.
    labels: pd.DataFrame = data[[dataset.target_label]]

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == enc.dim:
            reduced = embed_col
        else:
            reduced = manifold_reduction(embed_col, dim=d)

        # attach the (reduced) vectors to a private copy instead of mutating `data`
        frame = labels.assign(reduced_embeds=list(reduced))

        # compute centroid means and dispersion for each cluster
        for label, group in frame.groupby(dataset.target_label):
            points = np.stack(group['reduced_embeds'].tolist(), axis=0)
            record[f'centroid_{label}'] = points.mean(axis=0).tolist()
            record[f'dispersion_{label}'] = np.sum(distance.pdist(points))

        record['distance'] = distance.cdist([record['centroid_positive']], [record['centroid_negative']]).item(0)

        yield record

In [8]:
import pandas as pd

# Drain the `analysis` generator and tabulate its records, one row per
# evaluated dimensionality.
cluster_analysis = pd.DataFrame.from_records(
    [record for record in analysis(dataset.data)]
)
cluster_analysis

Unnamed: 0,dim,centroid_negative,dispersion_negative,centroid_positive,dispersion_positive,distance
0,768,"[-0.05271393805742264, -0.11926598846912384, 0...",2937.050055,"[-0.15186531841754913, -0.03394192084670067, 0...",740.52971,3.035837
1,576,"[0.08174261896866489, -0.03410594663792309, 0....",2939.585108,"[-0.16977313170415018, 0.07083542763260951, -0...",741.552227,3.001077
2,384,"[0.007851863867035775, -0.0682076032248066, -0...",2938.939624,"[-0.01630771726230506, 0.14166194515921363, 0....",741.071362,3.027568
3,192,"[-0.05404731739650713, -0.1041588318174636, -0...",2939.063456,"[0.11225212074659163, 0.21632988146703983, 0.0...",741.168906,3.016663
4,96,"[0.14252597411451307, 0.07733253231225075, 0.1...",2939.251506,"[-0.29601548469937333, -0.16061372095621324, -...",741.960208,2.982381
5,48,"[0.1783247117716797, 0.1471603913533501, -0.09...",2937.898678,"[-0.3703667090642578, -0.30564081281080396, 0....",741.572652,2.990593
6,24,"[-0.1354294089260453, -0.153818090111205, 0.02...",2935.510216,"[0.2812764646925557, 0.31946834100019506, -0.0...",741.940899,2.965794
7,12,"[-0.08231881094074066, 0.166067634822193, -0.1...",2925.519007,"[0.1709698381076923, -0.34490970309224694, 0.3...",742.339787,2.911227
8,6,"[0.061759393908460226, 0.1884280416646303, 0.4...",2896.603703,"[-0.12826951042526347, -0.3913505480726937, -1...",740.376417,2.800732
9,3,"[0.31934071544289533, -0.8876141610736561, 0.1...",2786.847898,"[-0.6632461013044748, 1.8435063345375926, -0.3...",731.610991,2.936529


In [9]:
# Export only the scalar columns; the centroid vectors are left out of the CSV.
(
    cluster_analysis
    .loc[:, ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']]
    .to_csv('./cluster_analysis.csv')
)