In [1]:
# Select the GPU for this kernel: order CUDA devices by PCI bus ID and expose only device 5.
# NOTE(review): the recorded output of this cell reports CUDA_VISIBLE_DEVICES=1, so that
# output is stale relative to the value set here — re-run the cell to confirm.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [2]:
import os
import sys

# Absolute path of the parent of the current working directory; it is put on
# sys.path so that imports can be resolved from there.
module_path = os.path.abspath(os.pardir)

# Register the directory only once, so re-running this cell does not pile up duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from classifier import Data

# Mapping from sentiment label to integer class id, handed to Data as `polarities`.
SENTIMENT_POLARITIES = {"negative": 0, "positive": 1}

# Dataset wrapper over the sampled training CSV; 'review' is passed as the data
# (text) column and 'sentiment' as the target column.
dataset = Data(
    data_path="../imdb.train.sample.csv",
    polarities=SENTIMENT_POLARITIES,
    data_label='review',
    target_label='sentiment',
)

In [1]:
from classifier._neural import Encoder

# Encoder settings: the pretrained model name and the list of layer indices.
# presumably `layers` selects which hidden layers feed the embedding — TODO confirm in Encoder
encoder_config = dict(model='bert-base-uncased', layers=[11])

enc = Encoder(encoder_config)

In [5]:
# Encode the dataset's text column with the encoder configured above.
# NOTE(review): `label` receives the dataset's file path, not a label column —
# confirm this is what Encoder.df_encode expects.
enc.df_encode(
    dataset.data,
    col=dataset.data_label,
    label=dataset.data_path,
)

                                                                                                                                                                                                                                                                                                                                                                          

In [6]:
import numpy as np
# from sklearn.manifold import MDS
from sklearn.decomposition import KernelPCA

def manifold_reduction(data: np.ndarray, dim: int = 3, random_state: int = 0) -> np.ndarray:
    """Reduce ``data`` to ``dim`` components with scikit-learn's ``KernelPCA``.

    Args:
        data: 2-D array with one sample per row.
        dim: number of components to keep.
        random_state: seed forwarded to ``KernelPCA``. It only matters for the
            solvers that use random initialisation; fixing it keeps repeated
            runs of the notebook reproducible. Pass ``None`` to restore the
            previous unseeded behaviour.

    Returns:
        The transformed samples, one row per input sample.
    """
    reducer = KernelPCA(n_components=dim, random_state=random_state)
    return reducer.fit_transform(data)

In [7]:
from typing import Generator
import torch
import numpy as np
import pandas as pd
from scipy.spatial import distance


def analysis(
        data: pd.DataFrame,
        dim: list = None,
        *,
        embed_col: str = None,
        target_col: str = None,
        native_dim: int = None
) -> Generator:
    """Yield per-dimension cluster statistics for the embeddings stored in ``data``.

    For every target dimensionality in ``dim`` the embeddings are reduced (unless
    the dimensionality equals ``native_dim``) and, for each label group, the
    centroid and the dispersion are computed.

    The input frame is never modified.

    Args:
        data: frame holding one embedding tensor per row plus a label column.
        dim: dimensionalities to evaluate; defaults to a fixed ladder from 768 down to 3.
        embed_col: name of the column with the embedding tensors; defaults to
            ``enc.col_name`` from the notebook namespace.
        target_col: name of the label column; defaults to ``dataset.target_label``
            from the notebook namespace.
        native_dim: dimensionality for which reduction is skipped; defaults to
            ``enc.dim`` from the notebook namespace.

    Yields:
        One dict per entry of ``dim`` with the keys ``'dim'``,
        ``'centroid_<label>'`` and ``'dispersion_<label>'`` for each label, and
        ``'distance'`` (distance between the ``positive`` and ``negative`` centroids).

    Raises:
        KeyError: if the labels do not include both ``positive`` and ``negative``.
    """
    if dim is None:
        dim = [768, 576, 384, 192, 96, 48, 24, 12, 6, 3]

    # Fall back to the notebook-level encoder/dataset only for settings the caller left out.
    if embed_col is None:
        embed_col = enc.col_name
    if target_col is None:
        target_col = dataset.target_label
    if native_dim is None:
        native_dim = enc.dim

    embeddings: np.ndarray = torch.stack(data[embed_col].tolist()).numpy()

    # Positional row indices of each label group. The grouping does not depend on the
    # target dimensionality, so it is computed once; indexing the reduced array by
    # position avoids writing a scratch column into the caller's frame.
    group_rows: dict = data.groupby(target_col).indices

    for d in dim:

        # create record to keep row data
        record: dict = {'dim': d}

        # if reduction size is equal to encoder output dim, skip manifold reduction
        if d == native_dim:
            reduced = embeddings
        else:
            reduced = manifold_reduction(embeddings, dim=d)

        # compute centroid means and dispersion for each cluster
        for label, rows in group_rows.items():
            members = reduced[rows]
            record[f'centroid_{label}'] = np.mean(members, axis=0).tolist()
            # NOTE(review): dispersion is the *sum* of all pairwise distances, so it grows
            # with group size; values of differently sized groups are not directly comparable.
            record[f'dispersion_{label}'] = np.sum(distance.pdist(members))

        record['distance'] = distance.cdist([record['centroid_positive']], [record['centroid_negative']]).item(0)

        yield record

In [8]:
import pandas as pd

# Run the analysis generator to completion and collect its records into one frame,
# one row per evaluated dimensionality.
analysis_records = [record for record in analysis(dataset.data)]
cluster_analysis = pd.DataFrame.from_records(analysis_records)

# Show only the scalar columns; the centroid columns hold full vectors.
summary_columns = ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']
cluster_analysis[summary_columns]

Unnamed: 0,dim,dispersion_positive,dispersion_negative,distance
0,768,740.52971,2937.050055,3.035837
1,576,740.970098,2938.980243,3.028671
2,384,741.140917,2939.276179,3.018186
3,192,740.566328,2938.740434,3.038683
4,96,741.184813,2938.294281,3.002429
5,48,741.260948,2938.00997,2.994337
6,24,740.255272,2933.946146,3.054566
7,12,743.686835,2927.45791,2.840172
8,6,729.966628,2886.378546,3.221255
9,3,737.22831,2783.300043,2.846027


In [None]:
import pandas as pd

# NOTE(review): this cell repeats the previous analysis cell; running it recomputes
# every reduction and rebinds `cluster_analysis`.
cluster_analysis = pd.DataFrame.from_records(
    list(analysis(dataset.data))
)
cluster_analysis.loc[:, ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']]

In [None]:
# Persist the scalar summary columns next to the notebook (the index is written too).
export_columns = ['dim', 'dispersion_positive', 'dispersion_negative', 'distance']
cluster_summary = cluster_analysis[export_columns]
cluster_summary.to_csv('./cluster_analysis.csv')

In [4]:
import pandas as pd

# Load the complete training CSV and display it as the cell output.
full_train_path = '../data/imdb.train.csv'
full = pd.read_csv(full_train_path)
full

Unnamed: 0,review,sentiment
0,John Carpenter's Halloween is quite frankly a ...,positive
1,This is a strange sex comedy because there`s v...,negative
2,My friends and I were just discussing how frus...,positive
3,"The premise is ridiculous, the characters unbe...",negative
4,I enjoyed every moment of this beautiful film ...,positive
...,...,...
20165,"The plot of 'Edison' was decent, but one actor...",negative
20166,Amateurism best describes the film adaptation ...,negative
20167,After having seen the movie the first question...,negative
20168,I was taken to this film by a friend and was s...,positive


In [5]:
# Draw a 5% random sample of the full training set. The seed is fixed so that a
# re-run (e.g. Restart & Run All) returns the same rows instead of a new subset.
full.sample(frac=0.05, random_state=42)

Unnamed: 0,review,sentiment
8216,Stewart Moss stars as a scientist who is on a ...,negative
6122,This fantastic whodunit is an early prototype ...,positive
4364,The 1997 low-key indie dramedy Henry Fool woul...,positive
5517,I just saw the movie on tv. I really enjoyed i...,positive
3326,"In Iran, women are not permitted to attend men...",positive
...,...,...
10960,"Seeing ""Moonstruck"" after so many years is a r...",positive
15614,I figured that any horror film with Orson Well...,negative
2413,"Better than the typical made-for-tv movie, INV...",positive
5939,A Blair Witch-War Movie that is as much of a l...,negative
