In [2]:
import json
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn

from BertForSequenceClassificationOutputPooled import *
from BertTM import *
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses
from sklearn.cluster import KMeans

### Load pretrained

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationOutputPooled.from_pretrained('bert-base-uncased', 
                                                              output_attentions=True, 
                                                              output_hidden_states=True)
labels = torch.tensor([1]).unsqueeze(0)
input_list = []
token_list = []
cls_ = '[CLS]'
sep_ = '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sentences[i], add_special_tokens=True)
    tokens = [cls_] + tokenizer.tokenize(sentences[i]) + [sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

### Load fine-tuned

In [4]:
output_dir = "../bert-classifier-pytorch/model_save_attention_1epoch"

# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassificationOutputPooled.from_pretrained(output_dir,
                                                      output_attentions = True, 
                                                      output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained(output_dir)
labels = torch.tensor([1]).unsqueeze(0)
input_list = []
token_list = []
cls_ = '[CLS]'
sep_ = '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']
for i, sent in enumerate(sentences):
    inputs = tokenizer.encode_plus(sentences[i], add_special_tokens=True)
    tokens = [cls_] + tokenizer.tokenize(sentences[i]) + [sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

In [146]:
word_embedding_model = models.BERT(output_dir, max_seq_length = 240,)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

st_model = SentenceTransformer(modules=[word_embedding_model, pooling_model],
                               #device=torch.device("cuda")
                              )

### Test that attention and vectorization work

In [3]:
attentions = get_attention(sentences, model, tokenizer, method = 'first')
np.sum([tpl[1] for tpl in attentions[1]])

vectorized = vectorize(sentences, model, tokenizer)
torch.stack(vectorized).detach().numpy().shape

(2, 768)

In [4]:
get_attention(["this movie was extremely bad"], model, tokenizer, method = 'first')

[[('this', 0.30418563),
  ('movie', 0.16935529),
  ('was', 0.16058609),
  ('extremely', 0.12557101),
  ('bad', 0.24030195)]]

## Topic Model

In [7]:
df = pd.read_csv("nlwx_2020_hashtags_no_rt_predictions.csv")
data = df['text']

In [3]:
batch_size = 20
ngram = (1, 3)
n_topics = 10

#### Pooled

In [7]:
negative = negative[:100]
rows, attentions = [], []
counter = 0
for i in range(0, len(data), batch_size):
    index = min(i + batch_size, len(data))
    rows.append(vectorize(data[i:index], model, tokenizer))
    attentions.extend(get_attention(data[i:index], model, tokenizer))
    if counter % 50 == 0:
        print(f"Processed {counter} rows out of {len(data)}.")
    counter += 1

Processed 0 rows out of 100.


#### Sentence embeddings

In [88]:
rows = st_model.encode(data)

In [None]:
attentions = get_attention(data, model, tokenizer)

In [168]:
rows, attentions = [], []
counter = 0
for i in range(0, len(data)):
    #index = min(i + batch_size, len(data))
    rows.extend(st_model.encode([data[i]]))
    attentions.extend(get_attention([data[i]], model, tokenizer))
    counter += 1
    if counter % 500 == 0:
        print(f"Processed {counter} rows out of {len(data)}.")

Processed 500 rows out of 21797.
Processed 1000 rows out of 21797.
Processed 1500 rows out of 21797.
Processed 2000 rows out of 21797.
Processed 2500 rows out of 21797.
Processed 3000 rows out of 21797.
Processed 3500 rows out of 21797.
Processed 4000 rows out of 21797.
Processed 4500 rows out of 21797.
Processed 5000 rows out of 21797.
Processed 5500 rows out of 21797.
Processed 6000 rows out of 21797.
Processed 6500 rows out of 21797.
Processed 7000 rows out of 21797.
Processed 7500 rows out of 21797.
Processed 8000 rows out of 21797.
Processed 8500 rows out of 21797.
Processed 9000 rows out of 21797.
Processed 9500 rows out of 21797.
Processed 10000 rows out of 21797.
Processed 10500 rows out of 21797.
Processed 11000 rows out of 21797.
Processed 11500 rows out of 21797.
Processed 12000 rows out of 21797.
Processed 12500 rows out of 21797.
Processed 13000 rows out of 21797.
Processed 13500 rows out of 21797.
Processed 14000 rows out of 21797.
Processed 14500 rows out of 21797.
Proce

In [4]:
with open('stopwords-en.json') as fopen:
    stopwords = json.load(fopen)
print(len(stopwords))
print(stopwords[:5])

1298
["'ll", "'tis", "'twas", "'ve", '10']


In [173]:
concat = np.concatenate(rows, axis = 0)
#concat = [item.detach().numpy() for item in concat]
concat = np.asarray(concat, dtype=np.float32)

In [188]:
all_model_data = []

for i in range(len(rows)):
    all_model_data.append((data[i], df.prediction[i], attentions[i], rows[i]))
    
pickle.dump(all_model_data, open(f"attentions_sent_embeddings.pkl", "wb" ))

In [44]:
all_model_data = pickle.load(open("attentions_sent_embeddings.pkl", "rb"))
texts, _, attentions, rows = zip(*all_model_data)

In [70]:
%%time

print("Fitting kmeans model.")
rows = rows[:1000]
attentions = attentions[:1000]
kmeans = KMeans(n_clusters = n_topics, random_state = 0).fit(rows)
labels = kmeans.labels_

Fitting kmeans model.
CPU times: user 2.21 s, sys: 596 ms, total: 2.8 s
Wall time: 1.47 s


In [71]:
%%time

overall, filtered_a = [], []
print("Filtering attentions.")
for a in attentions:
    f = [i for i in a if i[0] not in stopwords]
    overall.extend(f)
    filtered_a.append(f)

Filtering attentions.
CPU times: user 404 ms, sys: 0 ns, total: 404 ms
Wall time: 402 ms


In [73]:
%%time

print("Generating ngrams.")
o_ngram = generate_ngram(overall, ngram)
features = []
for i in o_ngram:
    features.append(' '.join([w[0] for w in i]))
features = list(set(features))


Generating ngrams.
CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 108 ms


In [76]:
%%time

import time

print(
"""
Determining cluster components. This will take awhile. 
Progress will be printed for every 100th processed property.
""")

components = {}
words_label = {}
start_time = time.time()
for idx, label in enumerate(labels):
    if label not in components:
        components[label] = {}
        words_label[label] = []
    else:
        f = generate_ngram(filtered_a[idx], ngram)
        for w in f:
            word = ' '.join([r[0] for r in w])
            score = np.mean([r[1] for r in w])
            if word in features:
                if word in components[label]:
                    components[label][word] += score
                else:
                    components[label][word] = score
                words_label[label].append(word)
    if (idx + 1) % 100 == 0:
        print(f'Processed {(idx + 1)} texts in {round(time.time() - start_time, 2)} seconds.')
            
print(f"Finished determining cluster components. Total time {round(time.time() - start_time, 2)} seconds.")


Determining cluster components. This will take awhile. 
Progress will be printed for every 100th processed property.

Processed 100 texts in 1.6 seconds.
Processed 200 texts in 3.3 seconds.
Processed 300 texts in 5.12 seconds.
Processed 400 texts in 6.99 seconds.
Processed 500 texts in 8.7 seconds.
Processed 600 texts in 10.51 seconds.
Processed 700 texts in 12.22 seconds.
Processed 800 texts in 13.96 seconds.
Processed 900 texts in 15.63 seconds.
Processed 1000 texts in 17.39 seconds.
Finished determining cluster components. Total time 17.39 seconds.
CPU times: user 17.4 s, sys: 20 ms, total: 17.4 s
Wall time: 17.4 s


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
#tf_idf_corpus = [[w[0] for w in text] for text in filtered_a]

def dummy_fun(doc):
    return doc

tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None) 

tf_idf_corpus = [[item for item in words_label[key]] for key in range(0,10)]
transformed = tfidf_vectorizer.fit_transform(tf_idf_corpus)

In [87]:
index_value={i[1]:i[0] for i in tfidf_vectorizer.vocabulary_.items()}
fully_indexed = []
for row in transformed:
    fully_indexed.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

In [101]:
len(fully_indexed)

10

In [132]:
components_tfidf_attn = {}
components_tfidf = {}
for k1 in components:
    components_tfidf_attn[k1] = {}
    components_tfidf[k1] = {}
    for k2 in components[k1]:
        components_tfidf_attn[k1][k2] = fully_indexed[k1][k2] * components[k1][k2]
        components_tfidf[k1][k2] = fully_indexed[k1][k2]

In [133]:
topics_attn = topics_df(
    10,
    components,
    n_words = 10)

pickle.dump(topics_attn, open("topics_sent_embed.pickle", "wb"))

In [134]:
topics_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,.,#,#,.,power,.,.,stay,.,#
1,snow,.,newfoundland,hope,.,snow,#,stay safe,power,.
2,storm,snowmaggedon2020,.,thinking,#,winds,storm,safe,buried,@
3,#,snow,# nlwx,safe,!,storm,snow,!,",",newfoundland
4,",",# snowmaggedon2020,/,stay,",",wind,blizzard,.,outage,!
5,…,newfoundland,nlblizzard2020,#,snow,#,stormageddon2020,. stay,evacuated,# newfoundland
6,hospital,blizzard,https,newfoundland,…,…,newfoundland,. stay safe,power outage,","
7,shovel,storm,nlwx,!,house,airport,",",",",…,[UNK]
8,labour,!,# nlblizzard2020,",",power .,gusts,!,newfoundland,trapped,'
9,… https,",",nlstorm,hoping,. #,. john,wind,#,snow,…


In [129]:
topics_tfidf = topics_df(
    10,
    components_tfidf,
    n_words = 10)

topics_tfidf

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,/,/,/,.,/,/,/,/,/,/
1,.,#,#,/,.,.,.,.,.,#
2,#,.,.,#,#,#,#,#,:,.
3,:,:,:,safe,:,:,:,stay safe,https,:
4,https,: /,https,https,https,/ .,https,safe,https :,https
5,https :,https,https :,:,https :,https,https :,stay,: /,https :
6,: /,https :,: /,https :,: /,https :,: /,:,/ /,: /
7,/ /,/ /,/ /,: /,/ /,: /,/ /,https,/ .,/ /
8,/ .,/ .,/ .,/ /,/ .,/ /,/ .,https :,. /,/ .
9,. /,. /,. /,/ .,. /,. /,. /,: /,https : /,. /


In [130]:
topics_tfidf_attn = topics_df(
    10,
    components_tfidf_attn,
    n_words = 10)

topics_tfidf_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,.,#,#,.,.,.,.,stay safe,.,#
1,#,.,/,#,#,#,#,stay,power,.
2,/,/,.,safe,power,/,/,safe,…,/
3,…,snowmaggedon2020,https,hope,/,:,:,.,/,@
4,",",https,https :,thinking,",",winds,https,!,",",!
5,… https,# snowmaggedon2020,:,!,!,snow,",",#,:,https
6,snow,https :,# nlwx,newfoundland,…,…,snow,. stay,#,'
7,… https :,:,https : /,/,https,wind,!,. stay safe,… https,","
8,hospital,https : /,nlwx,",",:,airport,storm,newfoundland,… https :,…
9,:,newfoundland,. /,stay,… https,… https,blizzard,",",buried,https :
