In [1]:
# Select the GPU for this kernel through environment variables.
# CUDA_DEVICE_ORDER=PCI_BUS_ID makes CUDA number devices by PCI bus position,
# and CUDA_VISIBLE_DEVICES=5 then exposes only the device with index 5.
# Keep these comments on their own lines: %env treats the rest of its line as
# the value to assign.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# so that modules living one level above this notebook can be imported below.
module_path = os.path.abspath(os.pardir)

# Append only once so that re-running the cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from classifier import Data

# Load the review data from the debug CSV. The author's alternative input,
# kept here for reference, is "../data/imdb.train.sample.csv".
# `polarities` maps the sentiment names to integer ids; `data_label` and
# `target_label` name the text column and the class column respectively.
dataset = Data(
    data_path="../data/imdb._debug.csv",
    polarities={"negative": 0, "positive": 1},
    data_label='review',
    target_label='sentiment',
)

In [4]:
from classifier._neural import Encoder

# Configure the encoder with the bert-base-uncased checkpoint.
# NOTE(review): 'layers': [11] presumably selects which hidden layer(s) the
# embeddings are taken from -- confirm against classifier._neural.Encoder.
enc = Encoder(dict(model='bert-base-uncased', layers=[11]))

In [24]:
# Run the encoder over the text column of the dataset and time the call.
# NOTE(review): presumably this stores the embeddings in
# dataset.data[enc.col_name], which analysis() reads later, and uses `label`
# (here the CSV path) as an identifier for the run -- confirm in Encoder.df_encode.
%time enc.df_encode(dataset.data, col=dataset.data_label, label=dataset.data_path)

                                                                             

CPU times: user 1min 18s, sys: 13.8 s, total: 1min 32s
Wall time: 23.4 s




In [6]:
### START ANALYSIS

import numpy as np
import pandas as pd

In [7]:
# Hyperparameter: the target dimensionalities to analyse, widest first.
# analysis() uses the embeddings unchanged for an entry equal to enc.dim and
# runs manifold_reduction() for every other entry.
# NOTE(review): 768 is presumably the encoder's output width -- confirm enc.dim.
DIMS: list = [768, 576, 384, 192, 96, 48, 24, 12, 6, 3]

In [17]:
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3, random_state: int = 0) -> np.ndarray:
    """Project ``data`` into ``dim`` dimensions with multidimensional scaling.

    The estimator was previously created without a seed, so the reduced
    coordinates (and every metric derived from them) changed from run to run.
    The seed is now explicit and fixed by default, which makes a
    Restart & Run All reproduce the same table.

    Args:
        data: 2-D array with one sample per row.
        dim: number of output components passed to ``MDS(n_components=...)``.
        random_state: seed forwarded to ``MDS``. ``MDS`` also accepts ``None``,
            which restores the former unseeded behaviour.

    Returns:
        The array returned by ``MDS.fit_transform`` (one row per sample,
        ``dim`` columns).
    """
    reducer = MDS(n_components=dim, random_state=random_state)
    return reducer.fit_transform(data)

In [9]:
from scipy.spatial import distance

def metric_computation(record: dict, groups: pd.core.groupby.GroupBy) -> None:
    """Add per-class centroid, dispersion and centroid distance to ``record``.

    ``record`` is modified in place; nothing is returned. For every
    ``(label, group)`` pair in ``groups`` two keys are written:

    * ``centroid_<label>``: element-wise mean of the group's
      ``reduced_embeds`` vectors, as a plain list.
    * ``dispersion_<label>``: sum of all pairwise distances between the
      group's vectors (``pdist`` with its default metric). This is a raw sum,
      so it grows with the number of rows in the group.

    Finally ``distance`` is set to the ``cdist`` distance between
    ``centroid_positive`` and ``centroid_negative``; a ``KeyError`` is raised
    if either of those two labels was not among the groups.
    """
    for label, members in groups:
        # Materialise the column once and reuse it for both statistics.
        vectors = members['reduced_embeds'].tolist()

        record[f'centroid_{label}'] = np.stack(vectors, axis=0).mean(axis=0).tolist()
        record[f'dispersion_{label}'] = distance.pdist(vectors).sum()

    # cdist yields a 1x1 matrix for two single points; extract its only entry.
    centroid_gap = distance.cdist(
        [record['centroid_positive']],
        [record['centroid_negative']],
    )
    record['distance'] = centroid_gap.item(0)

In [10]:
from typing import Generator
import torch

def analysis(data: pd.DataFrame, dim: list) -> Generator:
    """Yield one record of cluster metrics for each requested dimensionality.

    The embeddings stored in ``data[enc.col_name]`` are stacked into a single
    array. For every entry ``d`` of ``dim`` they are either used unchanged
    (``d == enc.dim``) or reduced with ``manifold_reduction``; the rows are
    then grouped by ``dataset.target_label`` and handed to
    ``metric_computation``.

    The reduced vectors are attached to a fresh frame created with
    ``DataFrame.assign``. The previous version wrote a ``reduced_embeds``
    column into ``data`` itself, silently changing the caller's DataFrame on
    every iteration.

    Relies on the notebook-level objects ``enc`` (``col_name``, ``dim``) and
    ``dataset`` (``target_label``).

    Args:
        data: frame holding one embedding tensor per row in ``enc.col_name``
            plus the class column named by ``dataset.target_label``.
        dim: target dimensionalities to evaluate, in the order to yield them.

    Yields:
        dict: ``{'dim': d, ...}`` extended in place by ``metric_computation``.
    """
    # .detach().cpu() leave plain CPU tensors unchanged, but keep .numpy() from
    # raising if the stored embeddings track gradients or sit on a GPU.
    stacked = torch.stack(data[enc.col_name].tolist())
    embed_col: np.ndarray = stacked.detach().cpu().numpy()

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == enc.dim:
            reduced = embed_col
        else:
            reduced = manifold_reduction(embed_col, dim=d)

        # list(...) splits the 2-D array into one vector per row; assign()
        # returns a new frame and leaves the caller's `data` untouched.
        frame = data.assign(reduced_embeds=list(reduced))

        metric_computation(record, frame.groupby(dataset.target_label))

        yield record

In [22]:
# Run analysis() for every entry of DIMS and collect the yielded records into
# a DataFrame with one row per dimensionality; %time reports the cost.
%time cluster_analysis = pd.DataFrame.from_records(list(analysis(dataset.data, DIMS)))

CPU times: user 3.22 s, sys: 47.5 ms, total: 3.27 s
Wall time: 1.02 s


In [23]:
# Show the scalar metrics only; the centroid_* columns hold whole vectors.
cluster_analysis.loc[:, ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']]

Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,740.529907,2937.050467,3.035837
1,576,741.492362,2939.475522,3.004565
2,384,740.875161,2939.113108,3.020616
3,192,741.109321,2938.997528,3.020126
4,96,741.468815,2938.516747,3.001927
5,48,740.265767,2936.923393,3.050645
6,24,739.67587,2934.071553,3.034331
7,12,741.596591,2925.47561,2.896466
8,6,733.021711,2886.248638,3.175714
9,3,728.718354,2788.146261,2.872169


In [None]:
# Persist the scalar metrics next to the notebook.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if consumers of the file do not expect it.
cluster_analysis.loc[:, ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']].to_csv('./cluster_analysis.csv')