In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from classifier import Data

dataset = Data(
    data_path = "../data/imdb.train.sample.csv",
    polarities = {
      "negative": 0,
      "positive": 1
    },
    data_label='review',
    target_label='sentiment'
)

In [4]:
from classifier._neural import Encoder

enc = Encoder({
    'model': 'textattack/bert-base-uncased-imdb',
    'layers': [11]
})

In [5]:
enc.df_encode(dataset.data, col=dataset.data_label, label=dataset.data_path)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3) -> np.ndarray:
    return MDS(n_components=dim).fit_transform(data)

In [7]:
from typing import Generator
import torch
import numpy as np
import pandas as pd
from scipy.spatial import distance


def analysis(
        data: pd.DataFrame,
        dim: list = None
) -> Generator:
    if dim is None:
        dim = [768, 576, 384, 192, 96, 48, 24, 12, 6, 3]

    embed_col: np.ndarray = torch.stack(data[enc.col_name].tolist()).numpy()

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == enc.dim:
            data['reduced_embeds'] = list(embed_col)
        else:
            data['reduced_embeds'] = list(manifold_reduction(embed_col, dim=d))

        # compute centroid means and dispersion for each cluster
        for label, group in data.groupby(dataset.target_label):
            record[f'centroid_{label}'] = np.mean(np.stack(group['reduced_embeds'].tolist(), axis=0), axis=0).tolist()
            record[f'dispersion_{label}'] = np.sum(distance.pdist(group['reduced_embeds'].tolist()))

        record['distance'] = distance.cdist([record['centroid_positive']], [record['centroid_negative']]).item(0)

        yield record

In [8]:
import pandas as pd

cluster_analysis = pd.DataFrame.from_records(list(analysis(dataset.data)))
cluster_analysis[['dim', 'dispersion_positive', 'dispersion_negative', 'distance']]

Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1709004.0,1349562.0,22.700799
1,576,1709415.0,1349937.0,22.701091
2,384,1709337.0,1349875.0,22.701175
3,192,1709047.0,1349683.0,22.701461
4,96,1708582.0,1349409.0,22.701736
5,48,1707326.0,1348580.0,22.702967
6,24,1704465.0,1346718.0,22.7053
7,12,1697297.0,1341795.0,22.712486
8,6,1675842.0,1327215.0,22.739022
9,3,1587790.0,1272563.0,22.910621


In [9]:
import pandas as pd

cluster_analysis = pd.DataFrame.from_records(list(analysis(dataset.data)))
cluster_analysis[['dim', 'dispersion_positive', 'dispersion_negative', 'distance']]

Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1709004.0,1349562.0,22.700799
1,576,1709429.0,1349943.0,22.701073
2,384,1709368.0,1349906.0,22.70108
3,192,1709093.0,1349721.0,22.701409
4,96,1708548.0,1349361.0,22.701763
5,48,1707356.0,1348583.0,22.702989
6,24,1704511.0,1346745.0,22.705339
7,12,1697146.0,1341956.0,22.713308
8,6,1675997.0,1326900.0,22.738131
9,3,1580486.0,1271556.0,22.918353


In [10]:
cluster_analysis[['dim', 'dispersion_positive', 'dispersion_negative', 'distance']].to_csv('./cluster_analysis.csv')