In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from BertForSequenceClassificationOutputPooled import *
from BertTM import *

### Load pretrained (optional: the fine-tuned cell below reassigns `model` and `tokenizer`)

In [7]:
# Stock `bert-base-uncased` tokenizer and classifier; the two flags ask the model to
# include attentions and hidden states in its outputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    'bert-base-uncased',
    output_attentions=True,
    output_hidden_states=True,
)
labels = torch.tensor([1]).unsqueeze(0)

# Demo sentences, encoded to id tensors (batch dimension of 1) and to readable token lists.
cls_, sep_ = '[CLS]', '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
input_list, token_list = [], []
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sent, add_special_tokens=True)
    tokens = [cls_, *tokenizer.tokenize(sent), sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

### Load fine-tuned

In [2]:
output_dir = "model_save_attention_1epoch"

# Restore the fine-tuned classifier and its vocabulary from `output_dir`; the two flags
# ask the model to include attentions and hidden states in its outputs.
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    output_dir,
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = BertTokenizer.from_pretrained(output_dir)
labels = torch.tensor([1]).unsqueeze(0)

# Demo sentences, encoded to id tensors (batch dimension of 1) and to readable token lists.
cls_, sep_ = '[CLS]', '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
input_list, token_list = [], []
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sent, add_special_tokens=True)
    tokens = [cls_, *tokenizer.tokenize(sent), sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

### Test that attention and vectorization work

In [3]:
# Attention scores for the demo sentences; the scores at index 1 are summed as a
# sanity check (the sum is computed but not displayed: it is not the cell's last expression).
attentions = get_attention(sentences, model, tokenizer, method='first')
second_sentence_scores = [pair[1] for pair in attentions[1]]
np.sum(second_sentence_scores)

# Stack the per-sentence vectors and display the resulting array shape.
vectorized = vectorize(sentences, model, tokenizer)
sentence_matrix = torch.stack(vectorized).detach().numpy()
sentence_matrix.shape

(2, 768)

In [4]:
# Spot-check on one review; the recorded output is a list holding one list of (token, weight) pairs.
get_attention(["this movie was extremely bad"], model, tokenizer, method = 'first')

[[('this', 0.30418563),
  ('movie', 0.16935529),
  ('was', 0.16058609),
  ('extremely', 0.12557101),
  ('bad', 0.24030195)]]

## Topic Model

In [5]:
#!wget https://raw.githubusercontent.com/huseinzol05/NLP-Models-Tensorflow/master/text-classification/data/negative/negative

# Split the file on newlines and drop the final piece (presumably the empty string
# left by a trailing newline; verify against the data file).
with open('negative') as fopen:
    negative = fopen.read().split('\n')
negative = negative[:-1]
len(negative)

5330

In [6]:
batch_size = 10  # number of sentences passed to vectorize / get_attention per call
ngram = (1, 3)  # (smallest, largest) n-gram size, both inclusive, handed to generate_ngram
n_topics = 10  # number of KMeans clusters, i.e. rows of the topic-component matrix

In [7]:
from sklearn.cluster import KMeans
from tqdm import tqdm

# Work on the first 100 reviews only.
negative = negative[:100]
rows, attentions = [], []
counter = 0  # number of batches completed so far
for i in range(0, len(negative), batch_size):
    # Progress line every 50 batches.  `i` rows precede the current batch, so it is the
    # real number of rows already processed (the batch counter is not a row count).
    if counter % 50 == 0:
        print(f"Processed {i} rows out of {len(negative)}.")
    index = min(i + batch_size, len(negative))
    batch = negative[i:index]
    # One entry per batch in `rows`; attention results are flattened to one entry per sentence.
    rows.append(vectorize(batch, model, tokenizer))
    attentions.extend(get_attention(batch, model, tokenizer))
    counter += 1

Processed 0 rows out of 100.


In [8]:
import json
# Parse the stopword list, then print its size followed by its first five entries.
with open('stopwords-en.json') as fopen:
    stopwords = json.loads(fopen.read())
for preview in (len(stopwords), stopwords[:5]):
    print(preview)

1298
["'ll", "'tis", "'twas", "'ve", '10']


In [9]:
def generate_ngram(seq, ngram = (1, 3)):
    """
    Collect the n-grams of every size in an inclusive range.

    Parameters
    ----------
    seq : sequence
        items to build n-grams from; it is traversed once per n-gram size.
    ngram : tuple of int
        first element is the smallest size, last element the largest (inclusive).

    Returns
    -------
    list of tuple
        all n-grams of the smallest size first, then the next size, and so on.
    """
    smallest, largest = ngram[0], ngram[-1]
    return [
        gram
        for size in range(smallest, largest + 1)
        for gram in ngrams_generator(seq, size)
    ]

def _pad_sequence(
    sequence,
    n,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    sequence = iter(sequence)
    if pad_left:
        sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)
    if pad_right:
        sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))
    return sequence


def ngrams_generator(
    sequence,
    n,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    """
    Lazily yield the contiguous n-grams of a sequence.

    Parameters
    ----------
    sequence : iterable
        tokenized items.
    n : int
        ngram size.
    pad_left, pad_right, left_pad_symbol, right_pad_symbol
        forwarded unchanged to `_pad_sequence`.

    Yields
    ------
    tuple
        one tuple of `n` consecutive items per window position; nothing is
        yielded when fewer than `n` items are available.
    """
    padded = _pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Prime the sliding window with the first n - 1 items; stop quietly if the
    # input runs out before the window can be primed.
    window = []
    still_needed = n - 1
    while still_needed > 0:
        try:
            window.append(next(padded))
        except StopIteration:
            return
        still_needed -= 1

    # Every further item completes a window: emit a snapshot, then slide by one.
    for token in padded:
        window.append(token)
        yield tuple(window)
        window.pop(0)

In [10]:
# Join the per-batch entries of `rows` along axis 0 into one sequence of sentence vectors.
concat = np.concatenate(rows, axis = 0)
# Each item is detached and converted with .numpy() -- presumably torch tensors; TODO confirm what vectorize returns.
concat = [item.detach().numpy() for item in concat]
# Final float32 array; this is what KMeans is fitted on in the next cell.
concat = np.asarray(concat, dtype=np.float32)

In [11]:
# Cluster the sentence vectors; each row's cluster id is used as its topic index.
kmeans = KMeans(n_clusters = n_topics, random_state = 0).fit(concat)
labels = kmeans.labels_

# Remove stopword tokens from every sentence's (token, score) pairs.  A set gives
# constant-time membership tests instead of scanning the stopword list per token.
stopword_set = set(stopwords)
overall, filtered_a = [], []
for a in attentions:
    f = [pair for pair in a if pair[0] not in stopword_set]
    overall.extend(f)
    filtered_a.append(f)

# Vocabulary: every distinct n-gram string over the pooled tokens.  Sorting makes the
# column order -- and therefore argsort tie-breaking on `components` -- reproducible
# from one kernel session to the next (plain set iteration order is not).
o_ngram = generate_ngram(overall, ngram)
features = sorted({' '.join(pair[0] for pair in gram) for gram in o_ngram})
# Direct feature -> column lookup; replaces a list scan (`in` + `.index`) per n-gram.
feature_index = {feature: col for col, feature in enumerate(features)}

# components[t, c] accumulates the mean score of n-gram c over the sentences in topic t.
components = np.zeros((n_topics, len(features)))
for no, topic in enumerate(labels):
    if (no + 1) % 500 == 0:
        print('processed %d'%(no + 1))
    for gram in generate_ngram(filtered_a[no], ngram):
        word = ' '.join(pair[0] for pair in gram)
        col = feature_index.get(word)
        if col is not None:
            components[topic, col] += np.mean([pair[1] for pair in gram])

In [12]:
def print_topics_modelling(
    topics, feature_names, sorting, n_words = 20, return_df = True
):
    """
    Collect the top-ranked feature names of every topic.

    Parameters
    ----------
    topics : int
        number of topics; rows 0 .. topics - 1 of `sorting` are read.
    feature_names : sequence
        feature labels, indexed by the values stored in `sorting`.
    sorting : 2-D integer array
        `sorting[i, k]` is the index of the k-th ranked feature of topic i.
    n_words : int
        how many ranked features to keep per topic.
    return_df : bool
        return a pandas DataFrame when True, a plain dict otherwise.

    Returns
    -------
    pandas.DataFrame or dict
        one `'topic <i>'` column / key per topic holding its `n_words` feature names.

    Raises
    ------
    ImportError
        if `return_df` is True and pandas cannot be imported.
    """
    if return_df:
        try:
            import pandas as pd
        # Only a failed import means "pandas is missing"; a bare `except` would also
        # swallow unrelated errors such as KeyboardInterrupt.
        except ImportError as err:
            raise ImportError(
                'pandas not installed. Please install it and try again or set `return_df = False`'
            ) from err
    df = {
        'topic %d' % (i): [feature_names[sorting[i, k]] for k in range(n_words)]
        for i in range(topics)
    }
    if return_df:
        return pd.DataFrame.from_dict(df)
    return df

In [13]:
# Rank each topic's features by descending score and show the top 10 of each.
# `n_topics` is passed instead of a literal 10 so the table always matches the
# number of rows in `components`.
print_topics_modelling(
    n_topics,
    feature_names = np.array(features),
    sorting = np.argsort(components)[:, ::-1],
    n_words = 10,
    return_df = True,
)

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,analyze,compelling,silly,overboard,movie,characters,watch,subplots,owes,time impossible
1,comedy,earnest,silly tedious,film,film,film,watch annoying,examination,sonnenfeld owes,guy
2,romantic comedy,earnest heavyhanded,dull,film overboard,coma,script,character,watching,owes frank,silly tedious
3,movie,heavyhanded,simplistic silly,dominates,hey,action,easy watch,moments subplots,cameo,farce parody comedy
4,romantic,sour,simplistic silly tedious,prison,sports,feels,easy watch annoying,woman,barry sonnenfeld owes,gems field roughage
5,inconsequential romantic comedy,film,tedious,seagal,sports movie,crowd,lead character,moments,sonnenfeld owes frank,report totalitarian
6,odd,survived,gloom,rollerball film overboard,worse,action clichs,watch annoying demeanour,robotic,owes frank pug,sundance
7,ceos,farcical sour,simplistic,koolaid,american sports movie,mixed,annoying,movie,movie,covers huge
8,film,sentimental,kinda,orange prison,pc,gags,lead,funny,story,bullets hit sascha
9,spiritual,code,hotel,field,relative,feels cold,demeanour lead character,freeman,baboon cameo,haunted house
