In [1]:
# Select the GPU this notebook may use. Must run before any CUDA-using library
# initialises, since these variables are read at CUDA start-up.
# NOTE(review): device index 5 is specific to the original machine - adjust as needed.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
import os
import sys

# Put the parent directory of the notebook's working directory on the import
# path so that project-local packages can be imported further down.
module_path = os.path.abspath(os.pardir)

already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [None]:
# Directory that receives the per-dataset / per-encoder `analysis.<label>.csv` files
# written at the end of the notebook. This cell must be executed before that export.
OUT_PATH: str = '../results/manifolds'

In [3]:
# Common path prefix of the dataset files; the full path is later built as
# f"{DATA_PATH}.{name}.csv".
DATA_PATH: str = '../data/imdb'
# (label, name) pairs: `label` becomes the key in the `datasets` dict,
# `name` is the file-name part inserted between DATA_PATH and ".csv".
DATASETS: list = [
    ('train', 'sample.train'),
    ('test', 'sample.test')
]

In [4]:
# Target dimensionalities analysed for every encoder. A value equal to the
# encoder's own output dimension is analysed without manifold reduction.
DIMS: list = [768, 576, 384, 192, 96, 48, 24, 12, 6, 3]
# (label, ref) pairs: `label` keys the `encoders` dict and the result names,
# `ref` is handed to the Encoder as its 'model' setting
# (presumably a Hugging Face model id - confirm against Encoder).
MODELS: list = [
    ('base', 'bert-base-uncased'),
    ('textattack', 'textattack/bert-base-uncased-imdb'),
    ('fabriceyhc', 'fabriceyhc/bert-base-uncased-imdb'),
    ('wakaka', 'Wakaka/bert-finetuned-imdb')
]

In [5]:
### Load Datasets into memory

In [6]:
# Keyword arguments shared by every `Data(...)` construction below:
# the class-name -> integer mapping plus the names of the text and label columns.
data_config: dict = dict(
    polarities={'negative': 0, 'positive': 1},
    data_label='text',
    target_label='sentiment',
)

In [1]:
from typing import Dict
from modules import Data

# One Data object per configured split, keyed by the split label.
# The file path is assembled from the shared prefix and the split's file-name part.
datasets: Dict[str, Data] = dict(
    (label, Data(data_path=f"{DATA_PATH}.{name}.csv", **data_config))
    for (label, name) in DATASETS
)

ModuleNotFoundError: No module named 'classifier'

In [8]:
# Show every loaded split together with its class balance.
# The label column is taken from the dataset itself (the same attribute the
# analysis groups by) instead of repeating the literal column name here.
for label, dataset in datasets.items():
    display(dataset.data)
    display(dataset.data[dataset.target_label].value_counts(normalize=True))

Unnamed: 0,text,sentiment
0,The acting in this movie was superb. As an ama...,positive
1,Watching That Lady In Ermine I was wondering w...,negative
2,"Why?!! This was an insipid, uninspired and emb...",negative
3,"Although the plot was a bit sappy at times, an...",positive
4,This is one of the few comedies I can watch ag...,positive
...,...,...
1240,This film has a special place in my heart as t...,negative
1241,OK me and a friend rented this a few days ago ...,negative
1242,Five-year-old Michael sees his mother getting ...,negative
1243,Every movie critic and metal head hated this m...,positive


positive    0.515663
negative    0.484337
Name: sentiment, dtype: float64

Unnamed: 0,text,sentiment
0,one of my favorite lines in Shakespeare.i.e. *...,positive
1,A poor basketball movie. A gruff coach with a ...,negative
2,"Based on the book ""Space Vampires"" by Colin Wi...",positive
3,There are about ten minutes about half way thr...,negative
4,>>> Great News there is a BBC DVD release sche...,positive
...,...,...
1235,Sheba Shayne (Pam Grier) receives a telegram i...,negative
1236,"On one level, Hari Om is a film using a famili...",positive
1237,"OK, so I gotta start this review by saying i w...",negative
1238,My daughter already wrote a review of this mov...,positive


negative    0.514516
positive    0.485484
Name: sentiment, dtype: float64

In [9]:
### Load Encoder into Memory

In [10]:
# Settings merged into every Encoder configuration next to the per-model 'model' entry.
# NOTE(review): 'layers' presumably selects which hidden layer(s) the embeddings
# are taken from - confirm against the Encoder implementation.
encoder_config: dict = {
    'layers': [11]
}

In [11]:
from classifier._neural import Encoder

# One Encoder per configured model, keyed by the model label. Each encoder gets
# its own config dict: the model reference first, then the shared encoder settings
# (which therefore win on a key clash).
encoders: Dict[str, Encoder] = dict(
    (label, Encoder({'model': ref, **encoder_config}))
    for (label, ref) in MODELS
)

In [12]:
### Compute manifolds and measure centroid distance and cluster dispersion

In [13]:
import numpy as np
import pandas as pd

In [14]:
from sklearn.manifold import MDS

def manifold_reduction(data: np.ndarray, dim: int = 3, random_state=None) -> np.ndarray:
    """Project ``data`` to ``dim`` dimensions with multidimensional scaling (MDS).

    Args:
        data: Array handed to ``MDS.fit_transform`` (one row per sample).
        dim: Number of output components.
        random_state: Optional seed (or ``numpy.random.RandomState``) forwarded
            to ``MDS``. MDS starts from a random configuration, so pass a fixed
            value to make the embedding reproducible. The default ``None``
            keeps the previous, unseeded behaviour.

    Returns:
        The embedded coordinates returned by ``MDS.fit_transform``.
    """
    return MDS(n_components=dim, random_state=random_state).fit_transform(data)

In [15]:
from scipy.spatial import distance

def metric_computation(record: dict, groups: pd.core.groupby.GroupBy) -> None:
    """Fill ``record`` in place with per-group cluster metrics.

    For every ``(label, group)`` pair in ``groups`` two entries are written:

    * ``centroid_<label>``: mean of the group's ``reduced_embeds`` vectors (as a list)
    * ``dispersion_<label>``: sum of all pairwise distances inside the group

    Afterwards ``record['distance']`` is set to the distance between the
    ``positive`` and ``negative`` centroids, so both labels must be present
    among the groups (otherwise a ``KeyError`` is raised).
    """
    for label, group in groups:
        vectors = group['reduced_embeds'].tolist()

        record[f'centroid_{label}'] = np.stack(vectors, axis=0).mean(axis=0).tolist()
        record[f'dispersion_{label}'] = np.sum(distance.pdist(vectors))

    positive_centroid = record['centroid_positive']
    negative_centroid = record['centroid_negative']

    # cdist on two single-row inputs gives a 1x1 matrix; take its only entry
    record['distance'] = distance.cdist([positive_centroid], [negative_centroid]).item(0)

In [16]:
from typing import Generator
import torch

def reduce_analyse(data: pd.DataFrame, col: str, dim: list, default_dim: int = 768,
                   target_col: str = None) -> Generator:
    """Yield one metrics record per requested dimensionality.

    The embeddings stored in ``data[col]`` are stacked into a single array. For
    every entry of ``dim`` they are reduced to that many dimensions, stored in
    ``data['reduced_embeds']`` and passed, grouped by the target column, to
    ``metric_computation``.

    Args:
        data: Frame holding one embedding tensor per row in column ``col``.
            It is modified in place: the column ``reduced_embeds`` is
            (over)written on every iteration.
        col: Name of the column containing the embedding tensors.
        dim: Target dimensionalities to analyse, in order.
        default_dim: Dimensionality of the raw embeddings; for this value the
            manifold reduction is skipped and the raw vectors are used.
        target_col: Column to group by. If omitted, the module-level
            ``dataset.target_label`` is used (legacy behaviour, which depends
            on whatever ``dataset`` is bound to when the generator is consumed).
            Prefer passing it explicitly.

    Yields:
        dict: ``{'dim': d, ...}`` extended by ``metric_computation``.
    """
    # Move to host memory and drop autograd tracking before converting;
    # `.numpy()` raises for CUDA tensors and tensors that require grad.
    embed_col: np.ndarray = torch.stack(data[col].tolist()).detach().cpu().numpy()

    if target_col is None:
        # legacy fallback: hidden dependency on the notebook-level loop variable
        target_col = dataset.target_label

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == default_dim:
            data['reduced_embeds'] = list(embed_col)
        else:
            data['reduced_embeds'] = list(manifold_reduction(embed_col, dim=d))

        metric_computation(record, data.groupby(target_col))

        yield record

In [17]:
# Columns of each result frame that are exported to CSV and displayed;
# the per-class centroid vectors are left out.
output_cols: list = ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']

In [18]:
# Result frames keyed by '<data_label>.<enc_label>'; one row per analysed dimensionality.
# Re-running this cell discards previously computed results.
results: Dict[str, pd.DataFrame] = {}

In [19]:
# Encode every dataset with every encoder and analyse the embeddings at each
# dimensionality in DIMS. The inner loop variable has to be called `dataset`:
# reduce_analyse reads that module-level name to find the grouping column.
for enc_label, encoder in encoders.items():
    for data_label, dataset in datasets.items():
        # adds the encoder's embedding column (encoder.col_name) to the frame
        encoder.df_encode(dataset.data, col=dataset.data_label)

        results[f'{data_label}.{enc_label}'] = pd.DataFrame.from_records([
            *reduce_analyse(dataset.data, encoder.col_name, DIMS, default_dim=encoder.dim)
        ])

                                                                                                                                                                                                                                                                              

In [20]:
# Write every result table to OUT_PATH and show it.
# The output directory is created first so a fresh checkout does not fail on to_csv.
os.makedirs(OUT_PATH, exist_ok=True)

# The loop variable is deliberately not called `dataset`: that module-level name
# is read by reduce_analyse and must keep pointing at a Data object.
for label, frame in results.items():
    frame[output_cols].to_csv(f'{OUT_PATH}/analysis.{label}.csv')
    display(frame[output_cols])

Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1780549.0,1511064.0,1.215327
1,576,1781745.0,1512164.0,1.145085
2,384,1781667.0,1512090.0,1.141992
3,192,1781040.0,1511510.0,1.160019
4,96,1779856.0,1510311.0,1.189642
5,48,1777581.0,1508217.0,1.226943
6,24,1771273.0,1502219.0,1.374433
7,12,1759527.0,1490951.0,1.5383
8,6,1736990.0,1467700.0,1.60085
9,3,1685935.0,1413828.0,1.318991


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1555449.0,1680930.0,1.173917
1,576,1556606.0,1682037.0,1.100364
2,384,1556409.0,1681852.0,1.107525
3,192,1556013.0,1681282.0,1.115471
4,96,1554800.0,1680074.0,1.14575
5,48,1552124.0,1677225.0,1.220297
6,24,1546479.0,1670641.0,1.363173
7,12,1535531.0,1657568.0,1.551356
8,6,1508335.0,1625345.0,1.909222
9,3,1471065.0,1563120.0,1.532055


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,2080085.0,1761406.0,7.90821
1,576,2080430.0,1761678.0,7.910161
2,384,2080154.0,1761415.0,7.911783
3,192,2079200.0,1760514.0,7.918288
4,96,2077196.0,1758636.0,7.930053
5,48,2072722.0,1754415.0,7.959053
6,24,2063231.0,1745557.0,8.016581
7,12,2041085.0,1724909.0,8.148785
8,6,1985406.0,1673575.0,8.459311
9,3,1818896.0,1530549.0,9.251512


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1854011.0,1996711.0,7.360264
1,576,1854369.0,1996996.0,7.361293
2,384,1854081.0,1996713.0,7.363674
3,192,1853283.0,1995866.0,7.368602
4,96,1851391.0,1993884.0,7.382469
5,48,1847360.0,1989571.0,7.410671
6,24,1838861.0,1980605.0,7.467608
7,12,1819118.0,1959699.0,7.600142
8,6,1769348.0,1906282.0,7.921906
9,3,1620701.0,1757998.0,8.749805


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1803098.0,1919905.0,22.903667
1,576,1803555.0,1920438.0,22.903796
2,384,1803466.0,1920355.0,22.903874
3,192,1803199.0,1920147.0,22.904074
4,96,1802619.0,1919707.0,22.904353
5,48,1801146.0,1918544.0,22.905467
6,24,1798087.0,1916102.0,22.908108
7,12,1789945.0,1909628.0,22.915375
8,6,1766668.0,1889920.0,22.937814
9,3,1663101.0,1804950.0,23.122424


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,1892527.0,2477473.0,20.314917
1,576,1893023.0,2477948.0,20.315408
2,384,1892957.0,2477871.0,20.315493
3,192,1892713.0,2477653.0,20.31567
4,96,1892192.0,2477131.0,20.316334
5,48,1890933.0,2476012.0,20.317581
6,24,1888090.0,2473019.0,20.32073
7,12,1880559.0,2465967.0,20.329727
8,6,1858032.0,2443480.0,20.361308
9,3,1762150.0,2352780.0,20.590927


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,2433107.0,1920625.0,11.704244
1,576,2433626.0,1920946.0,11.707232
2,384,2433385.0,1920715.0,11.707926
3,192,2432659.0,1919993.0,11.710755
4,96,2430984.0,1918393.0,11.717164
5,48,2427405.0,1914943.0,11.73065
6,24,2419387.0,1907246.0,11.762039
7,12,2399986.0,1888622.0,11.842136
8,6,2344958.0,1839915.0,12.067655
9,3,2200707.0,1708544.0,12.63445


Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,2101408.0,2188378.0,11.394895
1,576,2101885.0,2188790.0,11.3972
2,384,2101656.0,2188540.0,11.398204
3,192,2100948.0,2187746.0,11.401071
4,96,2099524.0,2186181.0,11.405367
5,48,2096108.0,2182420.0,11.420528
6,24,2088705.0,2174193.0,11.452305
7,12,2070243.0,2154695.0,11.532118
8,6,2023704.0,2104365.0,11.734304
9,3,1893067.0,1967083.0,12.306471
